# Statistical functions with NumPy¶

In [ ]:
import numpy as np

In [ ]:
arr = np.random.rand(100000)

In [ ]:
np.amin(arr)

Out[ ]:
1.2487471540811867e-05
In [ ]:
np.amax(arr)

Out[ ]:
0.9999991775057848
In [ ]:
np.mean(arr)

Out[ ]:
0.5000285410829219
In [ ]:
np.std(arr)

Out[ ]:
0.28858064751802887
In [ ]:
np.median(arr)

Out[ ]:
0.500066650992059
In [ ]:
np.var(arr)

Out[ ]:
0.08327879012192481
In [ ]:
np.percentile(arr, 50)

Out[ ]:
0.500066650992059
In [ ]:
np.median(arr)

Out[ ]:
0.5000945785815393
In [ ]:
np.percentile(arr, 75)

Out[ ]:
0.7497093639966632
In [ ]:
np.percentile(arr, [10, 30, 60])

Out[ ]:
array([0.10068439, 0.30077677, 0.60000558])
In [ ]:
np.percentile(arr, [25, 75])

Out[ ]:
array([0.25037751, 0.74986553])
In [ ]:
np.percentile(arr, 10)

Out[ ]:
0.10093826911393286
In [ ]:
np.percentile(arr, 90)

Out[ ]:
0.899997898924828
In [ ]:
np.percentile(arr, 30)

Out[ ]:
0.3005185124573417
In [ ]:
iqr = np.percentile(arr, 75) - np.percentile(arr, 25)

In [ ]:
print(iqr)

0.49948802790270597

In [ ]:
quartiles = np.percentile(arr, [25, 75])

In [ ]:
print(quartiles)

[0.25037751 0.74986553]

In [ ]:
iqr = quartiles[1] - quartiles[0]

In [ ]:
print(iqr)

0.49948802790270597


#### Observe the time difference between the following two methods of computing the IQR¶

In [ ]:
%%time
iqr = np.percentile(arr, 75) - np.percentile(arr, 25)

CPU times: user 5.31 ms, sys: 10 µs, total: 5.32 ms
Wall time: 7.53 ms

In [ ]:
%%time
quartiles = np.percentile(arr, [75, 25])
iqr = quartiles[0] - quartiles[1]

CPU times: user 3.51 ms, sys: 0 ns, total: 3.51 ms
Wall time: 3.53 ms


#### If the array size is a bit larger, the performance difference would be more significant¶

In [ ]:
arr2 = np.random.rand(100000000)

In [ ]:
%%time
iqr = np.percentile(arr2, 75) - np.percentile(arr2, 25)

CPU times: user 2.65 s, sys: 6.95 ms, total: 2.65 s
Wall time: 2.65 s

In [ ]:
print(iqr)

0.5000233875522627

In [ ]:
%%time
quartiles = np.percentile(arr2, [75, 25])
iqr = quartiles[0] - quartiles[1]

CPU times: user 1.96 s, sys: 1.99 ms, total: 1.96 s
Wall time: 1.95 s

In [ ]:
print(iqr)

0.5000233875522627


### Z score¶

• Z score - how far a particular point is from the mean, measured in standard deviations,
• a negative value implies the point lies to the left of (below) the mean and
• a positive value implies the point lies to the right of (above) the mean
In [ ]:
(arr - np.mean(arr))/np.std(arr) # z-score: one value per array element
# each value is the signed distance of that element from the mean,
# expressed in units of the array's standard deviation

Out[ ]:
array([ 1.10533211, -1.52392851,  0.71986322, ..., -1.66676396,
-0.51762155, -0.01055   ])

### histogram() with this array gives two arrays: one holds the counts per bin and the other holds the bin edges¶

In [ ]:
np.histogram(arr)

Out[ ]:
(array([ 9916, 10043,  9992, 10031, 10013, 10071,  9890,  9993, 10051,
10000]),
array([1.24874715e-05, 1.00011156e-01, 2.00009825e-01, 3.00008494e-01,
4.00007163e-01, 5.00005832e-01, 6.00004501e-01, 7.00003170e-01,
8.00001839e-01, 9.00000509e-01, 9.99999178e-01]))
In [ ]:
np.histogram(arr, 5)

Out[ ]:
(array([19959, 20023, 20084, 19883, 20051]),
array([1.24874715e-05, 2.00009825e-01, 4.00007163e-01, 6.00004501e-01,
8.00001839e-01, 9.99999178e-01]))
In [ ]:
np.histogram(arr, bins = [0, .25, .27, 1])

Out[ ]:
(array([24966,  1998, 73036]), array([0.  , 0.25, 0.27, 1.  ]))
In [ ]:
bins = [0, .25, .5, .75, 1]


# digitize¶

• gives an array where the numbers indicate the bin in which they are in
In [ ]:
np.digitize(arr, bins)

Out[ ]:
array([4, 1, 3, ..., 1, 2, 2])
In [ ]:
arr3 = np.random.randint(10, 20, 10)

In [ ]:
arr3

Out[ ]:
array([19, 17, 15, 13, 13, 15, 12, 12, 14, 18])
In [ ]:
bins = [10, 14, 18, 20] # here the bins are 10-14, 14-18, 18-20

In [ ]:
np.digitize(arr3, bins) # 19 in bin 3, 17 in bin2, 15 in bin 2, 13 in bin1 and so on

Out[ ]:
array([3, 2, 2, 1, 1, 2, 1, 1, 2, 3])

## Consider an example with realtime dataset with height, weight and age¶

In [ ]:
height = np.random.randint(100, 180, 10)
weight = np.random.randint(40, 150, 10)
age = np.random.randint(10, 80, 10)

In [ ]:
height

Out[ ]:
array([173, 124, 113, 169, 163, 144, 113, 164, 106, 166])
In [ ]:
weight

Out[ ]:
array([ 85, 125, 135,  76,  61,  58, 125, 132, 113,  94])
In [ ]:
age

Out[ ]:
array([77, 37, 13, 49, 47, 19, 58, 42, 49, 23])
In [ ]:
np.min(weight)

Out[ ]:
58
In [ ]:
np.max(weight)

Out[ ]:
135
In [ ]:
np.min(height)

Out[ ]:
106
In [ ]:
np.max(height)

Out[ ]:
173
In [ ]:
arr_concat = np.concatenate((weight, height, age))

In [ ]:
print(arr_concat)

[ 85 125 135  76  61  58 125 132 113  94 173 124 113 169 163 144 113 164
106 166  77  37  13  49  47  19  58  42  49  23]

In [ ]:
np.amin(arr_concat)

Out[ ]:
13

#### the expectation from the following line of code is to get the min of height, weight and age separately, but concatenate() produces one flat array with a single min — hence use vstack()¶

In [ ]:
np.concatenate((weight, height, age)).shape

Out[ ]:
(30,)

#### unlike concatenate(), which appends the next dataset to the previous dataset¶

In [ ]:
np.vstack((height, weight, age))

Out[ ]:
array([[173, 124, 113, 169, 163, 144, 113, 164, 106, 166],
[ 85, 125, 135,  76,  61,  58, 125, 132, 113,  94],
[ 77,  37,  13,  49,  47,  19,  58,  42,  49,  23]])
In [ ]:
np.vstack((height, weight, age)).shape

Out[ ]:
(3, 10)
In [ ]:
arr4 = np.vstack((height, weight, age))

In [ ]:
arr4

Out[ ]:
array([[173, 124, 113, 169, 163, 144, 113, 164, 106, 166],
[ 85, 125, 135,  76,  61,  58, 125, 132, 113,  94],
[ 77,  37,  13,  49,  47,  19,  58,  42,  49,  23]])
In [ ]:
np.amin(arr4, axis=1)

Out[ ]:
array([106,  58,  13])
In [ ]:
np.amax(arr4, axis=0)

Out[ ]:
array([173, 125, 135, 169, 163, 144, 125, 164, 113, 166])

#### unlike the concatenate() function, which gives the min across the combined height, weight and age values, amin with axis=1 gives one min per row¶

In [ ]:
np.amin(arr4, axis=1)

Out[ ]:
array([106,  58,  13])

# Rules of Statistics¶

In [ ]:
import numpy as np


## 1. Mean subtracted array has zero mean¶

In [ ]:
base_mean_data = np.random.rand(10000000)

In [ ]:
base_mean = base_mean_data - np.mean(base_mean_data)

In [ ]:
print(base_mean)

[ 0.31959795 -0.48022501  0.04116783 ...  0.23665416  0.03643021
-0.45697251]

In [ ]:
print(np.mean(base_mean))

-3.907985046680551e-18


## Computing mean with smaller set of values¶

• generally, the mean of a sample is close to the mean of the population
• a smaller dataset's mean need not be the same as the mean of the population, so,
• we check empirically how big a sample should be to get a close estimate of the population mean
In [ ]:
import matplotlib.pyplot as plt

In [ ]:
arr = np.random.randint(1, 100, 100)

In [ ]:
arr[:10]

Out[ ]:
array([33, 26, 75, 38, 17, 21, 87, 29, 38, 57])
In [ ]:
arr[0]

Out[ ]:
33
In [ ]:
np.mean(33)

Out[ ]:
33.0
In [ ]:
np.mean([33, 26])

Out[ ]:
29.5
In [ ]:
print(arr[:10])

[33 26 75 38 17 21 87 29 38 57]

In [ ]:
print(arr[0:0])

[]

In [ ]:
print(arr[0:1])

[33]


#### the outcome of the following code remains almost constant between 30 - 40 samples and thereafter...¶

• hence, 30 - 40 samples can be considered a good sample size whose mean resembles that of the population
In [ ]:
print(arr[:10])

[33 26 75 38 17 21 87 29 38 57]

In [ ]:
# Running mean of the first i elements: the extracted text lost the loop-body
# indentation, which is syntactically invalid Python — restored here.
for i in range(1, 50):
    # prefix of the first i elements of arr
    arr1 = arr[0:i]
    # print: sample size, the newest element included, and the running mean
    # (the mean visibly stabilizes once i reaches roughly 30-40)
    print(i, arr[i-1], np.mean(arr1))

1 33 33.0
2 26 29.5
3 75 44.666666666666664
4 38 43.0
5 17 37.8
6 21 35.0
7 87 42.42857142857143
8 29 40.75
9 38 40.44444444444444
10 57 42.1
11 50 42.81818181818182
12 54 43.75
13 15 41.53846153846154
14 66 43.285714285714285
15 15 41.4
16 26 40.4375
17 51 41.05882352941177
18 19 39.833333333333336
19 36 39.63157894736842
20 99 42.6
21 88 44.76190476190476
22 33 44.22727272727273
23 47 44.34782608695652
24 76 45.666666666666664
25 6 44.08
26 47 44.19230769230769
27 2 42.629629629629626
28 22 41.892857142857146
29 5 40.62068965517241
30 42 40.666666666666664
31 61 41.32258064516129
32 36 41.15625
33 90 42.63636363636363
34 27 42.1764705882353
35 74 43.08571428571429
36 48 43.22222222222222
37 23 42.67567567567568
38 39 42.578947368421055
39 19 41.97435897435897
40 28 41.625
41 15 40.97560975609756
42 10 40.23809523809524
43 68 40.883720930232556
44 4 40.04545454545455
45 46 40.17777777777778
46 81 41.06521739130435
47 96 42.234042553191486
48 30 41.979166666666664
49 3 41.183673469387756


#### cumsum() sums all the elements up to and including the current element of the array¶

• first time, first number
• second time, first two numbers,
• third time, first three numbers and so on till the end of the array
In [ ]:
np.cumsum(arr)

Out[ ]:
array([  33,   59,  134,  172,  189,  210,  297,  326,  364,  421,  471,
525,  540,  606,  621,  647,  698,  717,  753,  852,  940,  973,
1020, 1096, 1102, 1149, 1151, 1173, 1178, 1220, 1281, 1317, 1407,
1434, 1508, 1556, 1579, 1618, 1637, 1665, 1680, 1690, 1758, 1762,
1808, 1889, 1985, 2015, 2018, 2102, 2162, 2200, 2226, 2310, 2359,
2428, 2477, 2491, 2521, 2544, 2616, 2624, 2685, 2716, 2808, 2843,
2867, 2903, 2921, 2967, 2994, 3013, 3097, 3114, 3119, 3187, 3232,
3287, 3332, 3375, 3447, 3462, 3521, 3543, 3570, 3572, 3668, 3709,
3786, 3873, 3928, 4001, 4046, 4086, 4132, 4164, 4251, 4279, 4306,
4327])
In [ ]:
np.cumsum(arr)/(np.arange(1, 101))

Out[ ]:
array([33.        , 29.5       , 44.66666667, 43.        , 37.8       ,
35.        , 42.42857143, 40.75      , 40.44444444, 42.1       ,
42.81818182, 43.75      , 41.53846154, 43.28571429, 41.4       ,
40.4375    , 41.05882353, 39.83333333, 39.63157895, 42.6       ,
44.76190476, 44.22727273, 44.34782609, 45.66666667, 44.08      ,
44.19230769, 42.62962963, 41.89285714, 40.62068966, 40.66666667,
41.32258065, 41.15625   , 42.63636364, 42.17647059, 43.08571429,
43.22222222, 42.67567568, 42.57894737, 41.97435897, 41.625     ,
40.97560976, 40.23809524, 40.88372093, 40.04545455, 40.17777778,
41.06521739, 42.23404255, 41.97916667, 41.18367347, 42.04      ,
42.39215686, 42.30769231, 42.        , 42.77777778, 42.89090909,
43.35714286, 43.45614035, 42.94827586, 42.72881356, 42.4       ,
42.8852459 , 42.32258065, 42.61904762, 42.4375    , 43.2       ,
43.07575758, 42.79104478, 42.69117647, 42.33333333, 42.38571429,
42.16901408, 41.84722222, 42.42465753, 42.08108108, 41.58666667,
41.93421053, 41.97402597, 42.14102564, 42.17721519, 42.1875    ,
42.55555556, 42.2195122 , 42.42168675, 42.17857143, 42.        ,
41.53488372, 42.16091954, 42.14772727, 42.53932584, 43.03333333,
43.16483516, 43.48913043, 43.50537634, 43.46808511, 43.49473684,
43.375     , 43.82474227, 43.66326531, 43.49494949, 43.27      ])
In [ ]:
means = np.cumsum(arr)/np.arange(1, 101)

In [ ]:
means[:10]

Out[ ]:
array([33.        , 29.5       , 44.66666667, 43.        , 37.8       ,
35.        , 42.42857143, 40.75      , 40.44444444, 42.1       ])

#### how do scaling and shifting affect the mean and median¶

In [ ]:
ex_arr = np.random.randint(1, 100, 100)

In [ ]:
np.mean(ex_arr)

Out[ ]:
48.85
In [ ]:
ex_arr[:10]

Out[ ]:
array([29, 88, 93, 45, 41, 75, 78, 14,  6, 69])
In [ ]:
np.median(ex_arr)

Out[ ]:
44.0

#### adding outliers to the end of the array¶

In [ ]:
ex_arr = np.append(ex_arr, [4000, 2000])

In [ ]:
ex_arr[-10:]

Out[ ]:
array([  30,   74,   23,   24,   49,   66,   94,   22, 4000, 2000])

#### observe the difference between mean and median values, on par with the previous observations¶

• in the previous instance, the mean and median were very near to each other
• whereas after the addition of outliers, they are very far from each other
• observation: the mean is very sensitive to outliers whereas the median is not
In [ ]:
np.mean(ex_arr)

Out[ ]:
106.7156862745098
In [ ]:
np.median(ex_arr)

Out[ ]:
44.5

#### Effect of scaling on mean and median¶

In [ ]:
sca_arr = np.random.randint(1, 100, 100)

In [ ]:
np.mean(sca_arr)

Out[ ]:
47.2
In [ ]:
np.median(sca_arr)

Out[ ]:
49.0
In [ ]:
sca_arr1 = 2.5 * sca_arr + 10.02


#### SCALING the array by a linear coefficient and adding a constant is equivalent to applying the same transform to the mean (and median) of the array¶

In [ ]:
print(np.mean(sca_arr1),
2.5 * np.mean(sca_arr) + 10.02)

128.02000000000004 128.02

In [ ]:
print(np.mean(2.5 * sca_arr + 10.02),
2.5 * np.mean(sca_arr) + 10.02)

128.02000000000004 128.02

In [ ]:
print(np.median(sca_arr1), 2.5 * np.median(sca_arr) + 10.02)

132.52 132.52


#### the outcome of SCALING is different for variance and standard deviation: the variance scales by the square of the coefficient, the standard deviation by its absolute value, and the added constant affects neither¶

In [ ]:
print(np.var(sca_arr1), 2.5 * np.var(sca_arr) + 10.02)

4602.000000000001 1850.8200000000002

In [ ]:
print(np.std(sca_arr1), 2.5 * np.std(sca_arr) + 10.02)

67.83804242458652 77.85804242458651

In [ ]: