Module 3 — Numpy

Introduction to Numpy Array

azam sayeed
10 min readSep 17, 2019
  • Linear algebra library in Python
  • Used for performing mathematical and logical operations on Arrays
  • Provides features for operations on multi-dimensional arrays and matrices in Python

create Numpy Array

  • Most Important object defined in NumPy is an N dimensional array type called ndarray
  • Describes the collection of items of same type
  • Items can be accessed using a zero based index
  • Each element in an ndarray has the same data type (dtype), so every item occupies a block of memory of the same size.
#1D array: always pass the elements as a single list or tuple
import numpy as np
a= np.array([1,2,3])#pass list of arrays **
b= np.array((1,2,3))
print(a)
print(b)
o/p:
[1 2 3]
[1 2 3]
#2D array - () or [] is accepted
c = np.array([[1,2,3],[4,5,6]])
print(c)
c = np.array(((1,2,3),(4,5,6)))
print(c)
c = np.array([1,2,3],[4,5,6]) #<- brackets missing here
print (c)
o/p:
[[1 2 3]
[4 5 6]]
[[1 2 3]
[4 5 6]]
...
TypeError
: data type not understood

Initializing Numpy Array

zeros, arange,linspace,full,random.random

#Initialize an Array of x, y dimension with 0's
import numpy as np
a = np.zeros((1,4))
b = np.zeros((3,4))
print(a)
print(b)
o/p:
[[0. 0. 0. 0.]]
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
#Arrange the numbers between x and y with an interval of z
import numpy as np
np.arange(1,10,2) # here 10 (end till) is exclusive
o/p: array([1, 3, 5, 7, 9])
np.arange(10,20,2)
o/p: array([10, 12, 14, 16, 18])
#Arranging z numbers between x and y where z is specified
np.linspace(5,10,10) #here 10 (end till) is inclusive and fn name is #linspace
o/p:
array([ 5. , 5.55555556, 6.11111111, 6.66666667, 7.22222222, 7.77777778, 8.33333333, 8.88888889, 9.44444444, 10. ])
np.linspace(0,10,6)
array([ 0., 2., 4., 6., 8., 10.])
#Filling same number in an array of dimension x,y
np.full((2,3),6)
o/p: array([[6,6,6],
[6,6,6]])
#Filling random number in an array of dimension x,y
np.random.random((2,3))
o/p: array([[0.13770999, 0.03505756, 0.2594829 ],
[0.9559493 , 0.19415785, 0.27867208]])

Numpy Array — Inspection

ndarray.shape — Returns a tuple containing the array dimensions; it can also be assigned to in order to resize the array

Shape,size,reshape,ndim,dtype

#Inspecting Array - Checking the Size of the array (ndarray.shape)
a = np.array([[2,3,4],[4,4,6]])
print(a.shape)
o/p: (2,3)
s = np.array([[1,2,3,4],[2,3,4,6],[6,7,8,9]])
print(s.shape)
o/p:(3,4)
#Resize the Array
a = np.array([[2,3,4],[4,4,6]])
a.shape = (3,2)
print(a)
o/p:
[[2 3]
[4 4]
[4 6]]
a = np.array([[2,3,4,4],[4,4,6,6]])
a.shape = (8,1) # Trick: x*y = Total no of elements in array (factors)
print(a)
b.shape =(8,2) # not possible gives error
print(b)
o/p:
[[2]
[3]
[4]
[4]
[4]
[4]
[6]
[6]]
...
AttributeError: 'tuple' object has no attribute 'shape'
#Return the Dimension of array
a= np.arange(24)
print(a)
print(a.ndim)
b= a.reshape(2,4,3) #cal the factors of 24: 1 ,2 ,3 ,4 ,6 ,8 ,12 ,24
print(b)
print(b.ndim)
o/p:
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
1
[[[ 0 1 2]
[ 3 4 5]
[ 6 7 8]
[ 9 10 11]]

[[12 13 14]
[15 16 17]
[18 19 20]
[21 22 23]]]
3
b=a.reshape(12,2)
print(b.ndim)
o/p:
2
#Note: c = a only creates a second name for the same array, so changing c.shape changes a as well
c=a
c.shape=(2,4,3)
print(a.ndim)
print(c.ndim)
o/p:
3
3
#Find the Number of elements in an array
d= np.array([[1,2,3,4],[3,4,5,6],[7,8,9,10]])
print(d.size)
print(d)
o/p:
12
array([[ 1, 2, 3, 4],
[ 3, 4, 5, 6],
[ 7, 8, 9, 10]])
d= np.array([[1,2,3,4],[3,4,5,6],[7,8,9,10,11]])
print(d.size)
print(d)
o/p:
3
array([list([1, 2, 3, 4]), list([3, 4, 5, 6]), list([7, 8, 9, 10, 11])],
dtype=object)
#Find the datatype of the array
a=np.arange(24, dtype =float)
print(a.dtype)
print(a)
a=np.arange(24)
print(a.dtype)
print(a)
o/p:
float64
[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17.
18. 19. 20. 21. 22. 23.]
int32
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]

Numpy Array — Mathematics

Numpy array Mathematics :Addition

Sum,Multiply,divide,exp,sqrt,sin,cos,log

axis=0 sums down each column (column wise), axis=1 sums across each row (row wise)

import numpy as np
np.sum([10,20])
o/p: 30
a,b= 10,20
print(np.sum([a,b])) # a+b is also correct
print(np.sum(a,b)) #incorrect: b is treated as the axis argument, so [] brackets are needed
o/p:
30
...
AxisError
: axis 20 is out of bounds for array of dimension 0
np.sum([[1,2],[5,6]],axis=0)
array([6, 8])
np.sum([[1,2],[5,6]],axis=1)
array([ 3, 11])
np.sum([[1,2],[5,6]])
14

Numpy array Mathematics : Other Fns

np.subtract(10,20) # notice unlike np.sum [] brackets not reqd
o/p: -10
np.multiply(2,3)
o/p: 6
np.divide(10,5)
o/p: 2.0
a= np.array([2,4,6])
b= np.array([1,2,3])
print(np.subtract(a,b))
print(np.multiply(a,b))
print(np.divide(a,b))
o/p:
[1 2 3]
[ 2 8 18]
[2. 2. 2.]
#exp,sqrt , sin ,cos ,log
print("Exponent: ",np.exp(a)) # e to the power 2,4,6
print("Square root :",np.sqrt(a))
print("Sin :",np.sin(a))
print("Cos :",np.cos(a))
print("log :",np.log(a))
o/p:
Exponent: [ 7.3890561 54.59815003 403.42879349]
Square root : [1.41421356 2. 2.44948974]
Sin : [ 0.90929743 -0.7568025 -0.2794155 ]
Cos : [-0.41614684 -0.65364362 0.96017029]
log : [0.69314718 1.38629436 1.79175947]
import numpy
my_array = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
print(numpy.ceil(my_array))
print(numpy.floor(my_array))
o/p:
[ 2. 3. 4. 5. 6. 7. 8. 9. 10.]
[1. 2. 3. 4. 5. 6. 7. 8. 9.]

Array Comparison

equal,array_equal

#Element wise comparison
a =[1,2,4]
b= [2,4,4]
c= [1,2,4]
np.equal(b,c)
o/p: array([False, False, True])
#Array wise comparison
a =[1,2,4]
b= [2,4,4]
c= [1,2,4]
print(np.array_equal(a,b))
print(np.array_equal(a,c))
o/p:
False
True

Aggregate Functions (statistical methods)

sum,min,mean,median,corrcoef,std

# Aggregate (statistical) functions applied to plain Python lists.
a = [1, 2, 4]
b = [2, 4, 4]
c = [1, 2, 4]
# String literals need straight ASCII quotes; typographic quotes are a
# SyntaxError in Python.
print("Sum: ", np.sum(a))
print("Minimum Value: ", np.min(a))
print("Mean: ", np.mean(a))
print("Median: ", np.median(a))
# corrcoef is given a single 1-D sequence here, so it reports that variable's
# correlation with itself.
print("Correlation coefficient: ", np.corrcoef(a))
print("Standard deviation: ", np.std(a))
o/p:
Sum: 7
Minimum Value: 1
Mean: 2.3333333333333335
Median: 2.0
Correlation coefficient: 1.0
Standard deviation: 1.247219128924647

Numpy broadcasting


import numpy as np
a = np.array([[0,0,0],[1,2,3],[4,5,6],[5,6,7]])
b = np.array([[0,1,2]])
print("First Array: \n",a,'\n')
print("Second Array: \n",b,'\n')
print("First Array + Second Array: \n",a+b,'\n')
o/p:
First Array:
[[0 0 0]
[1 2 3]
[4 5 6]
[5 6 7]]

Second Array:
[[0 1 2]]

First Array + Second Array:
[[0 1 2]
[1 3 5]
[4 6 8]
[5 7 9]]

Numpy Array — Array Manipulation in Python

concatenate,hstack,vstack,column_stack,hsplit

a = np.array([1,2,3])
b= np.array([4,5,6])
np.concatenate((a,b))
o/p:
array([1, 2, 3, 4, 5, 6])
import numpy
array_1 = numpy.array([[1,2,3],[0,0,0]])
array_2 = numpy.array([[0,0,0],[7,8,9]])
print(numpy.concatenate((array_1, array_2), axis = 0))#same as #default no axis parameter
print(numpy.concatenate((array_1, array_2), axis = 1))
o/p:
[[1 2 3]
[0 0 0]
[0 0 0]
[7 8 9]]
[[1 2 3 0 0 0]
[0 0 0 7 8 9]]
#stack arrays horizontally (column wise), similar to concatenate
np.hstack((a,b))
o/p:
array([1, 2, 3, 4, 5, 6])
#stack array row wise: Vertically
np.vstack((a,b))
o/p:
array([[1, 2, 3],
[4, 5, 6]])
#combining Column- wise
np.column_stack((a,b))
o/p:
array([[1, 4],
[2, 5],
[3, 6]])

Splitting Array

x = np.arange(16).reshape(4,4)
print(x,"\n\n")
print(np.hsplit(x,2))
print("\n\n",np.hsplit(x,[3]))
print("\n\n",np.hsplit(x,[2,3]))
o/p:
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]
[12 13 14 15]]


[array([[ 0, 1],
[ 4, 5],
[ 8, 9],
[12, 13]]), array([[ 2, 3],
[ 6, 7],
[10, 11],
[14, 15]])]


[array([[ 0, 1, 2],
[ 4, 5, 6],
[ 8, 9, 10],
[12, 13, 14]]), array([[ 3],
[ 7],
[11],
[15]])]


[array([[ 0, 1],
[ 4, 5],
[ 8, 9],
[12, 13]]), array([[ 2],
[ 6],
[10],
[14]]), array([[ 3],
[ 7],
[11],
[15]])]
print(np.vsplit(x,2))
[array([[0, 1, 2, 3],
[4, 5, 6, 7]]), array([[ 8, 9, 10, 11],
[12, 13, 14, 15]])]

Indexing and Slicing

Indexing -refers to the position of the item in the array

a= ['m','o','n','t','y',' ','p','y','t','h','o','n']
print(a[2:9])
o/p:
['n', 't', 'y', ' ', 'p', 'y', 't']
a =np.array([[1,2,3],[4,5,6],[7,8,9]])
print("point 1:",a[0],"\n",a[:1],"\n")
print("point 2:",a[0,1:],"\n",a[:1,1:],"\n")
print("point 4:",a[:2,1:],"\n")
print("point 5:",a[1:,1:],"\n")
More Examples — Illustration
o/p:
point 1: [1 2 3]
[[1 2 3]]

point 2: [2 3]
[[2 3]]

point 4: [[2 3]
[5 6]]

point 5: [[5 6]
[8 9]]

Numpy Vs List

why prefer numpy over list

  • consumes less Memory
  • Faster and more convenient

An Example to demonstrate performance between numpy over list

#Numpy vs List: Memory size
import numpy as np
import sys
#define a list
# list(...) materialises the elements; a bare range() is a lazy object whose
# own size says nothing about the memory a list of 1000 ints needs.
l = list(range(1000))
# The list container holds one reference per element and every element is a
# separate int object, so both are counted.  Multiplying the size of a range
# object by len(l) (which is where a figure such as 48000 comes from) measures
# neither of them.
list_bytes = sys.getsizeof(l) + sum(sys.getsizeof(item) for item in l)
print("Size of a list:", list_bytes)
#define a numpy array
a = np.arange(1000)
# size * itemsize is the size of the raw data buffer only (equal to a.nbytes)
print("Size of an array: ", a.size * a.itemsize)
o/p:
Size of a list: 48000
Size of an array: 4000
#In terms of processing time
import time
import numpy as np
def using_List(n=10000):
    """Time an element-wise addition of two ``n``-element Python lists.

    Args:
        n: Number of elements in each list. Defaults to 10000, the size the
            benchmark was originally written for.

    Returns:
        Elapsed time in seconds (float), including building the two lists.
    """
    # perf_counter is monotonic and high resolution; time.time() can tick too
    # coarsely to measure a run this short.
    t1 = time.perf_counter()
    # Build real lists: range() objects are lazy, so adding their items would
    # not exercise list storage at all.
    X = list(range(n))
    Y = list(range(n))
    # The result is discarded on purpose; only the elapsed time is returned.
    z = [x + y for x, y in zip(X, Y)]
    return time.perf_counter() - t1
def using_Numpy(n=10000):
    """Time an element-wise addition of two ``n``-element NumPy arrays.

    Args:
        n: Number of elements in each array. Defaults to 10000, the size the
            benchmark was originally written for.

    Returns:
        Elapsed time in seconds (float), including creating the two arrays.
    """
    # perf_counter is monotonic and high resolution; time.time() can tick too
    # coarsely to measure a run this short.
    t1 = time.perf_counter()
    a = np.arange(n)
    b = np.arange(n)
    # Vectorised addition; the result is discarded, only the time is returned.
    z = a + b
    return time.perf_counter() - t1
list_time =using_List()
numpy_time =using_Numpy()
print(list_time, numpy_time)
o/p:
0.007995367050170898 0.016004323959350586

SciPy

  • Used for Scientific computing and technical computing
  • Contains modules for optimization, linear algebra, integration, interpolation, special functions, FFT, signal and image processing
Sub packages
  1. scipy.cluster
  • provides kmeans() function
  • useful while making k clusters of data based on attributes
  • clustering — the process of dividing a dataset into groups whose members share common attributes

video explaining clustering process:

Elbow plot, variance, centroids, mean

Distance between 2 points = sqrt((x2-x1)² + (y2-y1)²)

Example :

Dataset : https://www.dropbox.com/s/dm0ucvlj1hfw0dl/somecars1.xlsx?dl=0

1. Import data

import pandas as pd
#importing data
data = pd.read_excel('somecars1.xlsx')
#print the dataset
data

2. Divide the dataset into clusters using scipy.cluster.vq: kmeans returns the centroids, and vq (vector quantization) returns, for each record, the index of the cluster it is assigned to

#import libraries
import pandas as pd
from scipy.cluster.vq import kmeans, vq
#importing data
data = pd.read_excel('somecars1.xlsx')
#find out centroids with the help of kmeans functions
#k, number of clusters required
centroid, _ = kmeans(data,3)
#find out the cluster index for each record with vector quantization function
#vq(data,centroid)
idx, _ = vq(data,centroid)
#print the cluster index array
idx
o/p:
array([2, 2, 2, 0, 1, 0, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 2, 1, 2, 1, 2])
#also print the centroids
centroid

3. Difference After Data whitening

#import libraries
import pandas as pd
from scipy.cluster.vq import kmeans, vq, whiten
#importing data
data = pd.read_excel('somecars1.xlsx')
#whiten data
data = whiten(data)
#find out centroids with the help of kmeans functions
#k, number of clusters required
centroid, _ = kmeans(data,3)
#find out the cluster index for each record with vector quantization function
#vq(data,centroid)
idx, _ = vq(data,centroid)
#print the cluster index array
idx
o/p:
array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2])
#also print the centroids
centroid

Example with scatter plot

from scipy.cluster.vq import kmeans,vq,whiten 
import matplotlib.pyplot as plt
new1=[0.1,1,3,0.2,1,0.5,12,0.2,10,0.5,10,2,5,0.8]
plt.scatter(range(len(new1)),new1)
centroid,_ = kmeans(new1,2)
idx, _ = vq(new1,centroid)
idx

2) Scipy.stats

#import numpy
import numpy as np
#create the marks array
coffee = np.array([15,18,20,26,32,38,32,24,21,16,13,11,14])
from scipy import stats
#find the zscore
print(stats.zscore(coffee))
print(coffee.mean(), coffee.std())
#let us see the data distribution by plotting it
# Bind pyplot to the name `plt`, which is the name the plot call below uses.
import matplotlib.pyplot as plt
# One x position per observation, instead of a hard-coded 13.
plt.plot(range(len(coffee)),coffee)
#import numpy
import numpy as np
from scipy import stats
#create the numpy array consisting of frequency of people going to gym and frequency of smoking
obs = np.array([[7,1,3],[87,18,84],[12,3,4],[9,1,7]])
#since we are looking for only the p value, ignore the rest
_,p,_,_ = stats.chi2_contingency(obs)
#print p
p
o/p:
0.4828421694654563

chi square statistical measure Explanation:

3) scipy.optimize

#generate one function and plot with matplotlib
#import matplotlib
import matplotlib.pyplot as plt
#import numpy
import numpy as np
x= np.arange(0.0,1.0,0.1)
#create function
def f(x):
    """Return ``-exp(-(x - 0.7)**2)``.

    The exponent is zero at ``x = 0.7`` and negative everywhere else, so the
    function has its minimum value of -1 at ``x = 0.7``. Works element-wise
    on NumPy arrays as well as on scalars.
    """
    return -np.exp(-(x-0.7)**2)
#plot function
plt.plot(x,f(x),'o-')
plt.grid()
#find the x value at which the function reaches its minimum
from scipy import optimize
#generating the function
import numpy as np
def f(x):
    """Objective for the minimiser: ``-exp(-(x - 0.7)**2)``.

    The exponent is zero at ``x = 0.7`` and negative everywhere else, so the
    minimum value of -1 is reached at ``x = 0.7``.
    """
    return -np.exp(-(x-0.7)**2)
#find the minimum of the function
result = optimize .minimize_scalar(f)
#now find the corresponding x value
x_min = result.x
#print the x value
x_min
o/p:
0.6999999997839409

3) scipy.integrate

#import scipy integrate
import scipy.integrate as intg
#create one function to find the integration
def integrad(x):
    """Integrand ``x**2``; its integral over [0, 1] is 1/3."""
    return x**2
#apply quad() function, get only the answer, ignore rest
ans,_ = intg.quad(integrad,0,1)
#print ans
ans
o/p:
0.33333333333333337

dblquad is used for double integration

4) scipy.linalg

#import scipy linalg package
from scipy import linalg
#import numpy to create the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
#find determinant
linalg.det(data)
o/p:
-1.1842378929335004e-15
#import scipy linalg package
from scipy import linalg
#import numpy to create the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
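#find inverse (this matrix is singular, its rows are linearly dependent, so the huge values returned are numerical noise)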
linalg.inv(data)
o/p:
array([[-1.18515780e+15, 2.37031559e+15, -1.18515780e+15],
[ 2.37031559e+15, -4.74063119e+15, 2.37031559e+15],
[-1.18515780e+15, 2.37031559e+15, -1.18515780e+15]])
#Eigen values of a square matrix
#import scipy linalg package
from scipy import linalg
#import numpy to create the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
#find eigenvalues
linalg.eigvals(data)
o/p:
array([ 1.29282032e+01+0.j, -9.28203230e-01+0.j, 6.16237757e-16+0.j])

5) scipy.fftpack and scipy.signal are mainly used in electronics applications, which are beyond the scope of this topic

--

--

No responses yet