Module 3 — Numpy

Introduction to Numpy Array

10 min readSep 17, 2019

NumPy is a linear algebra library for Python.
It is used for performing mathematical and logical operations on arrays.
It provides features for operations on multi-dimensional arrays and matrices in Python.

create Numpy Array

The most important object defined in NumPy is an N-dimensional array type called ndarray.
It describes a collection of items of the same type.
Items can be accessed using a zero-based index.
Each element in an ndarray is an object of the array's data type (dtype), and every item in the ndarray occupies the same size of block in memory.

#1D array: pass a single sequence of values (a list or a tuple)
import numpy as np
a= np.array([1,2,3])#pass list of arrays **
b= np.array((1,2,3))
print(a)
print(b)
o/p:
[1 2 3]
[1 2 3]#2D array - () or [] is accepted
c = np.array([[1,2,3],[4,5,6]])
print(c)
c = np.array(((1,2,3),(4,5,6)))
print(c)
c = np.array([1,2,3],[4,5,6]) #<- brackets missing here
print (c)o/p:
[[1 2 3]
 [4 5 6]]
[[1 2 3]
 [4 5 6]]...
TypeError: data type not understood

Initializing Numpy Array

zeros, arange,linspace,full,random.random

#Initialize an Array of x, y dimension with 0's
import numpy as np
a = np.zeros((1,4))
b = np.zeros((3,4))
print(a)
print(b)o/p:
[[0. 0. 0. 0.]][[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]#Arrange the numbers between x and y with an interval of z
import numpy as np
np.arange(1,10,2) # here 10 (end till) is exclusive
o/p: array([1, 3, 5, 7, 9])np.arange(10,20,2)
o/p: array([10, 12, 14, 16, 18])#Arranging z numbers between x and y where z is specified
np.linspace(5,10,10) #here 10 (end till) is inclusive and fn name is #linspace
o/p: 
array([ 5.        ,  5.55555556,  6.11111111,  6.66666667,  7.22222222,
        7.77777778,  8.33333333,  8.88888889,  9.44444444, 10.        ])
np.linspace(0,10,6)
o/p:
array([ 0.,  2.,  4.,  6.,  8., 10.])
#Filling same number in an array of dimension x,y
np.full((2,3),6)
o/p: array([[6,6,6],
            [6,6,6]])#Filling random number in an array of dimension x,y
np.random.random((2,3))
o/p: array([[0.13770999, 0.03505756, 0.2594829 ],
       [0.9559493 , 0.19415785, 0.27867208]])

Numpy Array — Inspection

ndarray.shape — returns a tuple containing the array dimensions; it can also be assigned to in order to resize (reshape) the array

Shape,size,reshape,ndim,dtype

#Inspecting Array - Checking the Size of the array (ndarray.shape)
a = np.array([[2,3,4],[4,4,6]])
print(a.shape)
o/p: (2,3)
s = np.array([[1,2,3,4],[2,3,4,6],[6,7,8,9]])
print(s.shape)
o/p:(3,4)#Resize the Array
a = np.array([[2,3,4],[4,4,6]])
a.shape = (3,2)
print(a)
o/p:
[[2 3]
 [4 4]
 [4 6]]
a = np.array([[2,3,4,4],[4,4,6,6]])
a.shape = (8,1) # Trick: x*y = Total no of elements in array (factors)
print(a)
a.shape =(8,2) # not possible (8*2 is not equal to the 8 elements), gives error
print(a)
o/p:
[[2]
 [3]
 [4]
 [4]
 [4]
 [4]
 [6]
 [6]]
...
ValueError: cannot reshape array of size 8 into shape (8,2)
#Return the Dimension of array
a= np.arange(24)
print(a)
print(a.ndim)
b= a.reshape(2,4,3) #calculate the factors of 24: 1, 2, 3, 4, 6, 8, 12, 24
print(b)
print(b.ndim)
o/p:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
1
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]
  [ 9 10 11]]

 [[12 13 14]
  [15 16 17]
  [18 19 20]
  [21 22 23]]]
3b=a.reshape(12,2)
print(b.ndim)
o/p:
2#Note: Shape changes affects original array as well
c=a
c.shape=(2,4,3)
print(a.ndim)
print(c.ndim)o/p:
3
3#Find the Number of elements in an array
d= np.array([[1,2,3,4],[3,4,5,6],[7,8,9,10]])
print(d.size)
print(d)
o/p:
12
array([[ 1,  2,  3,  4],
       [ 3,  4,  5,  6],
       [ 7,  8,  9, 10]])d= np.array([[1,2,3,4],[3,4,5,6],[7,8,9,10,11]])
print(d.size)
print(d)
o/p:
3array([list([1, 2, 3, 4]), list([3, 4, 5, 6]), list([7, 8, 9, 10, 11])],
      dtype=object)#Find the datatype of the array
a=np.arange(24, dtype =float)
print(a.dtype)
print(a)a=np.arange(24)
print(a.dtype)
print(a)o/p:
float64
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23.]
int32
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]

Numpy Array — Mathematics

Numpy array Mathematics :Addition

Sum,Multiply,divide,exp,sqrt,sin,cos,log
axis =0 is column wise, axis=1 is row wise

import numpy as np
np.sum([10,20])
o/p: 30
a,b= 10,20
print(np.sum([a,b])) # a+b is also correct
print(np.sum(a,b)) # incorrect: [] brackets are needed, otherwise b is interpreted as the axis
o/p:
30
...
AxisError: axis 20 is out of bounds for array of dimension 0np.sum([[1,2],[5,6]],axis=0)
array([6, 8])np.sum([[1,2],[5,6]],axis=1)
array([ 3, 11])np.sum([[1,2],[5,6]])
14

Numpy array Mathematics : Other Fns

np.subtract(10,20) # notice unlike np.sum [] brackets not reqd
o/p: -10
np.multiply(2,3)
o/p: 6
np.divide(10,5)
o/p: 2.0
a= np.array([2,4,6])
b= np.array([1,2,3])
print(np.subtract(a,b))
print(np.multiply(a,b))
print(np.divide(a,b))
o/p:
[1 2 3]
[ 2  8 18]
[2. 2. 2.]#exp,sqrt , sin ,cos ,log
print("Exponent: ",np.exp(a)) # e to the power 2,4,6
print("Square root :",np.sqrt(a))
print("Sin :",np.sin(a))
print("Cos :",np.cos(a))
print("log :",np.log(a))
o/p:
Exponent:  [  7.3890561   54.59815003 403.42879349]
Square root : [1.41421356 2.         2.44948974]
Sin : [ 0.90929743 -0.7568025  -0.2794155 ]
Cos : [-0.41614684 -0.65364362  0.96017029]
log : [0.69314718 1.38629436 1.79175947]import numpy 
my_array = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
print(numpy.ceil(my_array))
print(numpy.floor(my_array))
o/p:
[ 2.  3.  4.  5.  6.  7.  8.  9. 10.]
[1. 2. 3. 4. 5. 6. 7. 8. 9.]

Array Comparison

equal,array_equal

#Element wise comparison
a =[1,2,4]
b= [2,4,4]
c= [1,2,4]
np.equal(b,c)
o/p: array([False, False,  True])
#Array wise comparison
a =[1,2,4]
b= [2,4,4]
c= [1,2,4]
print(np.array_equal(a,b))
print(np.array_equal(a,c))
o/p: 
False
True

Aggregate Functions (statistical methods)

sum,min,mean,median,corrcoef,std

# Aggregate (statistical) functions applied directly to a plain Python list.
a = [1, 2, 4]
b = [2, 4, 4]
c = [1, 2, 4]
# The string literals must use straight ASCII quotes; the typographic quotes
# that were here before are not valid Python and raise a SyntaxError.
print("Sum: ", np.sum(a))
print("Minimum Value: ", np.min(a))
print("Mean: ", np.mean(a))
print("Median: ", np.median(a))
# With a single 1-D input there is only one variable, so this prints 1.0.
print("Correlation coefficient: ", np.corrcoef(a))
print("Standard deviation: ", np.std(a))
o/p:
Sum:  7
Minimum Value:  1
Mean:  2.3333333333333335
Median:  2.0
Correlation coefficient:  1.0
Standard deviation:  1.247219128924647

Numpy broadcasting


import numpy as np
a = np.array([[0,0,0],[1,2,3],[4,5,6],[5,6,7]])
b = np.array([[0,1,2]])print("First Array: \n",a,'\n')
print("Second Array: \n",b,'\n')
print("First Array + Second Array: \n",a+b,'\n')o/p:
First Array: 
 [[0 0 0]
 [1 2 3]
 [4 5 6]
 [5 6 7]] 

Second Array: 
 [[0 1 2]] 

First Array + Second Array: 
 [[0 1 2]
 [1 3 5]
 [4 6 8]
 [5 7 9]]

Numpy Array — Array Manipulation in Python

concatenate,hstack,vstack,column_stack,hsplit,vsplit

a = np.array([1,2,3])
b= np.array([4,5,6])
np.concatenate((a,b))
o/p:
array([1, 2, 3, 4, 5, 6])import numpy 
array_1 = numpy.array([[1,2,3],[0,0,0]])
array_2 = numpy.array([[0,0,0],[7,8,9]]) 
print(numpy.concatenate((array_1, array_2), axis = 0))#same as #default no axis parameter
print(numpy.concatenate((array_1, array_2), axis = 1))
o/p:[[1 2 3]
 [0 0 0]
 [0 0 0]
 [7 8 9]][[1 2 3 0 0 0]
 [0 0 0 7 8 9]]
#stack arrays horizontally (column wise), similar to concatenate
np.hstack((a,b))
o/p: 
array([1, 2, 3, 4, 5, 6])#stack array row wise: Vertically
np.vstack((a,b))
o/p:
array([[1, 2, 3],
       [4, 5, 6]])#combining Column- wise
np.column_stack((a,b))
o/p:
array([[1, 4],
       [2, 5],
       [3, 6]])

Splitting Array

x = np.arange(16).reshape(4,4)
print(x,"\n\n")
print(np.hsplit(x,2))
print("\n\n",np.hsplit(x,[3]))
print("\n\n",np.hsplit(x,[2,3]))
o/p:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]] 


[array([[ 0,  1],
       [ 4,  5],
       [ 8,  9],
       [12, 13]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11],
       [14, 15]])]


 [array([[ 0,  1,  2],
       [ 4,  5,  6],
       [ 8,  9, 10],
       [12, 13, 14]]), array([[ 3],
       [ 7],
       [11],
       [15]])]


 [array([[ 0,  1],
       [ 4,  5],
       [ 8,  9],
       [12, 13]]), array([[ 2],
       [ 6],
       [10],
       [14]]), array([[ 3],
       [ 7],
       [11],
       [15]])]print(np.vsplit(x,2))
[array([[0, 1, 2, 3],
       [4, 5, 6, 7]]), array([[ 8,  9, 10, 11],
       [12, 13, 14, 15]])]

Indexing and Slicing

Indexing -refers to the position of the item in the array

a= ['m','o','n','t','y',' ','p','y','t','h','o','n']
print(a[2:9])
o/p:
['n', 't', 'y', ' ', 'p', 'y', 't']

a =np.array([[1,2,3],[4,5,6],[7,8,9]])
print("point 1:",a[0],"\n",a[:1],"\n")
print("point 2:",a[0,1:],"\n",a[:1,1:],"\n")
print("point 4:",a[:2,1:],"\n")
print("point 5:",a[1:,1:],"\n")

o/p:
point 1: [1 2 3] 
 [[1 2 3]] 

point 2: [2 3] 
 [[2 3]] 

point 4: [[2 3]
 [5 6]] 

point 5: [[5 6]
 [8 9]]

Numpy Vs List

why prefer numpy over list

consumes less Memory
Faster and more convenient

An Example to demonstrate performance between numpy over list

#Numpy vs List: Memory size
import numpy as np
import sys
#define a list
l=list(range(1000))
print("Size of a list:",sys.getsizeof(5)*len(l)) # approx.: size of one int object * number of elements
#define a numpy array
a =np.arange(1000)
print("Size of an array: ",a.size*a.itemsize)
o/p:
Size of a list: 28000
Size of an array:  4000
#In terms of processing time
import time
import numpy as np
def using_List():
    """Time the element-wise addition of two 10,000-element Python lists.

    Returns:
        Elapsed time in seconds (float) for building both lists and adding
        them element by element.
    """
    t1 = time.perf_counter()
    # Build real lists: a bare range() is a lazy sequence, not a list, so
    # timing it would not measure list performance.
    X = list(range(10000))
    Y = list(range(10000))
    # The sum itself is discarded; only the elapsed time matters.
    z = [x + y for x, y in zip(X, Y)]
    return time.perf_counter() - t1


def using_Numpy():
    """Time the vectorized addition of two 10,000-element NumPy arrays.

    Returns:
        Elapsed time in seconds (float) for building both arrays and adding
        them with a single vectorized expression.
    """
    t1 = time.perf_counter()
    a = np.arange(10000)
    b = np.arange(10000)
    # The sum itself is discarded; only the elapsed time matters.
    z = a + b
    return time.perf_counter() - t1


# A single run is noisy; repeat the measurement (or use timeit) before
# drawing conclusions from the two numbers.
list_time = using_List()
numpy_time = using_Numpy()
print(list_time, numpy_time)
o/p:
0.007995367050170898 0.016004323959350586

SciPy

Used for scientific and technical computing
Contains modules for optimization, linear algebra, integration, interpolation, special functions, FFT, signal and image processing

scipy.cluster

Provides the kmeans() function
Useful for partitioning data into k clusters based on attributes
Clustering — the process of dividing a dataset into groups whose members share common attributes

video explaining clustering process:

Elbow plot, variance, centroids, mean

Distance between 2 points: $d = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2}$

Example :

Dataset : https://www.dropbox.com/s/dm0ucvlj1hfw0dl/somecars1.xlsx?dl=0

1. Import data

import pandas as pd
#importing data
data = pd.read_excel('somecars1.xlsx')
#print the dataset
data

2. Divide the dataset into clusters using scipy.cluster.vq: kmeans() returns the centroids, and vq() performs vector quantization, i.e. for each record it returns the index of the centroid (cluster) that the record is assigned to.

#import libraries
import pandas  as pd
from  scipy.cluster.vq import kmeans, vq
#importing data
data = pd.read_excel('somecars1.xlsx') 
#find out centroids with the help of kmeans functions
#k, number of clusters required
centroid, _ = kmeans(data,3)
#find out the cluster index for each record with vector quantization function
#vq(data,centroid)
idx, _ = vq(data,centroid)
#print the cluster index array
idx
o/p:
array([2, 2, 2, 0, 1, 0, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 2, 1, 2, 1, 2])

#also print the centroids
centroid

3. Difference After Data whitening

#import libraries
import pandas  as pd
from  scipy.cluster.vq import kmeans, vq, whiten 
#importing data
data = pd.read_excel('somecars1.xlsx') 
#whiten data
data = whiten(data)
#find out centroids with the help of kmeans functions
#k, number of clusters required
centroid, _ = kmeans(data,3)
#find out the cluster index for each record with vector quantization function
#vq(data,centroid)
idx, _ = vq(data,centroid)
#print the cluster index array
idx
o/p:
array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2])

#also print the centroids
centroid

Example with scatter plot

from scipy.cluster.vq import kmeans,vq,whiten 
import matplotlib.pyplot as plt 
new1=[0.1,1,3,0.2,1,0.5,12,0.2,10,0.5,10,2,5,0.8] 
plt.scatter(range(len(new1)),new1)
centroid,_ = kmeans(new1,2) 
idx, _ = vq(new1,centroid)
idx

2) Scipy.stats

#import numpy
import numpy as np 
#create the coffee array (the values to standardise)
coffee = np.array([15,18,20,26,32,38,32,24,21,16,13,11,14])
from scipy import stats
#find the zscore of every value in the array
print(stats.zscore(coffee))
#mean and standard deviation of the same data, for reference
print(coffee.mean(), coffee.std())
#let us see the data distribution by plotting it
#the module must be bound to the name plt, because that is the name used below
import matplotlib.pyplot as plt
#x axis is simply the position of each value, so derive it from the array length
plt.plot(range(len(coffee)),coffee)
#import numpy
import numpy as np
from scipy import stats
#create the numpy array consisting of frequency of people going to gym and frequency of smoking  
obs = np.array([[7,1,3],[87,18,84],[12,3,4],[9,1,7]])
#since we are looking only for the p value, ignore the rest
_,p,_,_ = stats.chi2_contingency(obs)
#print p
p
o/p: 
0.4828421694654563

chi square statistical measure Explanation:

Chi-Square Test

You research two groups and put them in categories single, married or divorced: The numbers are definitely different…

www.mathsisfun.com

3) scipy.optimize

#generate one function and plot with matplotlib
#import matplotlib
import matplotlib.pyplot as plt
#import numpy
import numpy as np
x= np.arange(0.0,1.0,0.1)
#create function
def f(x):
    """Return the negated Gaussian bump -exp(-(x - 0.7)^2).

    Accepts a scalar or a NumPy array (np.exp is applied element-wise).
    The value is lowest, -1, at x = 0.7.
    """
    squared_offset = (x - 0.7) ** 2
    return -np.exp(-squared_offset)
#plot function
plt.plot(x,f(x),'o-')
plt.grid()#find at which x value we get the minimum function
from scipy import optimize
#generating the function
import numpy as np
def f(x):
    """Objective to minimise: an upside-down bell curve centred at x = 0.7.

    Computes -exp(-(x - 0.7)^2) for a scalar or NumPy array x.
    """
    shifted = x - 0.7
    bell = np.exp(-(shifted ** 2))
    return -bell
#find the minimum of the function
result = optimize .minimize_scalar(f)
#now find the corresponding x value
x_min = result.x
#print the x value
x_min
o/p:
0.6999999997839409

3) scipy.integrate

#import scipy integrate
import scipy.integrate as intg
#create one function to find the integration
def integrad(x):
    """Integrand for the quad() example: return x squared."""
    squared = pow(x, 2)
    return squared
#apply quad() function, get only the answer, ignore rest
ans,_ = intg.quad(integrad,0,1)
#print ans
ans
o/p:
0.33333333333333337

dblquad is used for double integration

4) scipy.linalg

#import scipy linalg package
from scipy import linalg
#import numpy to the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
#find determinant 
linalg.det(data)
o/p:
-1.1842378929335004e-15#import scipy linalg package
from scipy import linalg
#import numpy to the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
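#find inverse (note: this matrix is singular - its determinant shown above is ~0 - so the huge values below are floating-point artifacts, not a true inverse)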
linalg.inv(data)
o/p:
array([[-1.18515780e+15,  2.37031559e+15, -1.18515780e+15],
       [ 2.37031559e+15, -4.74063119e+15,  2.37031559e+15],
       [-1.18515780e+15,  2.37031559e+15, -1.18515780e+15]])#Eigen values of a square matrix
#import scipy linalg package
from scipy import linalg
#import numpy to the square matrix
import numpy as np
data = np.array([[1,2,3],[3,4,5],[5,6,7]])
#find eigenvalues
linalg.eigvals(data)
o/p:
array([ 1.29282032e+01+0.j, -9.28203230e-01+0.j,  6.16237757e-16+0.j])

5) scipy.fftpack and scipy.signal are mainly used in electronics (signal processing) applications and are beyond the scope of this topic

Links for Exploring Image Processing using Scipy:

3.3. Scikit-image: image processing - Scipy lecture notes

Author: Emmanuelle Gouillart scikit-image is a Python package dedicated to image processing, and using natively NumPy…

scipy-lectures.org