import numpy as np
import pandas as pd

Creating dataframe objects¶

arr = np.random.randint(0, 10, (5, 3))

arr

array([[2, 9, 7],
       [5, 4, 5],
       [8, 6, 4],
       [4, 3, 2],
       [6, 3, 3]])

df = pd.DataFrame(arr)

df

df.values

array([[2, 9, 7],
       [5, 4, 5],
       [8, 6, 4],
       [4, 3, 2],
       [6, 3, 3]])

df.index

RangeIndex(start=0, stop=5, step=1)

df.columns

RangeIndex(start=0, stop=3, step=1)

for c in df.columns:
    print(c)

0
1
2

df.values[0]

array([2, 9, 7])

df.index = ['R1', 'R2', 'R3', 'R4', 'R5']
df.columns = ['C1', 'C2', 'C3']

df

df.loc['R3', 'C2']

6

df.iloc[2, 1]

6

type(df.iloc[2:4, 1:3])

pandas.core.frame.DataFrame

df.loc['R3':'R5', 'C2':'C3']

df.iloc[0]

C1    2
C2    9
C3    7
Name: R1, dtype: int64

df.iloc[:, 0]

R1    2
R2    5
R3    8
R4    4
R5    6
Name: C1, dtype: int64

df.shape

(5, 3)

df.T

Task on creating dataframes

def create_df(nRows, nCols, maxRand=10):
    arr = np.random.randint(0, maxRand, (nRows, nCols))
    df = pd.DataFrame(arr)
    df.index = ['R' + str(x) for x in np.arange(1, nRows+1)]
    df.columns = ['C' + str(x) for x in np.arange(1, nCols+1)]
    return df

create_df(5, 3)

create_df(2, 5)

mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series([4879, 12104, 12756, 3475, 6792, 142984, 120536, 51118, 49528, 2370], 
                     index=['Mercury', 'Venus', 'Earth', 'Moon', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])

mass

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
dtype: float64

diameter

Mercury      4879
Venus       12104
Earth       12756
Moon         3475
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2370
dtype: int64

df = pd.DataFrame({'Mass': mass, 'Diameter': diameter})

df

df['Mass']

Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: Mass, dtype: float64

df['Diameter']

Earth       12756
Jupiter    142984
Mars         6792
Mercury      4879
Moon         3475
Neptune     49528
Pluto        2370
Saturn     120536
Uranus      51118
Venus       12104
Name: Diameter, dtype: int64

df['Earth']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Earth'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-37-b07be539a774> in <module>()
----> 1 df['Earth']

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Earth'

df['Mass']['Earth']

5.97

df.Mass.Earth

5.97

df['pop'] = 0

df

df['pop']['Earth'] = 8000000000

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

df

df['Mass'] is df.Mass

True

df['pop'] is df.pop

False

df

df

df.loc['Earth']

Mass        5.970000e+00
Diameter    1.275600e+04
pop         8.000000e+09
Name: Earth, dtype: float64

df.loc[:,'Mass']

Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: Mass, dtype: float64

def create_mean_row(df):
    # df.loc['Col_Mean'] = [np.mean(df[col]) for col in df.columns]
    df.loc['Col_Mean'] = df.mean()
    return df

df

df.loc['Col_Mean'] = 0

df

np.mean(df['Mass'])

266.66266

df.drop('Col_Mean', inplace=True)

df.drop('pop', axis=1, inplace=True)

df

np.mean(df['Mass'])

296.29184444444445

df.loc['Col_Mean'] = [np.mean(df['Mass']), np.mean(df['Diameter'])]

df

df.drop('Col_Mean')

create_mean_row(df)

df = create_df(5, 3)

df

df.mean()

C1    4.2
C2    5.6
C3    7.0
dtype: float64

df.mean(axis=1)

R1    4.666667
R2    7.666667
R3    6.333333
R4    6.333333
R5    3.000000
dtype: float64

df['Row_Mean'] = df.mean(axis=1)

df.loc['Col_Mean'] = df.mean()

df

df.median()

C1          4.600000
C2          5.800000
C3          6.500000
Row_Mean    5.966667
dtype: float64

df.min()

C1          0.0
C2          3.0
C3          6.0
Row_Mean    3.0
dtype: float64

df.max()

C1          7.000000
C2          8.000000
C3          9.000000
Row_Mean    7.666667
dtype: float64

df.quantile(0.25)

C1          4.05
C2          4.40
C3          6.00
Row_Mean    4.90
Name: 0.25, dtype: float64

df.drop('Row_Mean', axis=1)

df.describe()

mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series([4879, 12104, 12756, 3475, 6792, 142984, 120536, 51118, 49528, 2370], 
                     index=['Mercury', 'Venus', 'Earth', 'Moon', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])

planets = pd.DataFrame({'mass': mass, 'diameter': diameter})

planets.describe()

import seaborn as sns

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

df = sns.load_dataset('planets')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB

df.head()

df.tail()

df.describe()

Go through each row of the dataframe and delete it (drop) if any of the columns is null

for r in df.index:
    for c in df.columns:
        if pd.isnull(df.loc[r, c]):
            df.drop(r, inplace=True)
            break

df.describe()

for i, r in df.iterrows():
    print(i)
    print(r)
    break

0
method            Radial Velocity
number                          1
orbital_period              269.3
mass                          7.1
distance                     77.4
year                         2006
Name: 0, dtype: object

for i, r in df.iterrows():
    if pd.isnull(r).any():
        df.drop(i, inplace=True)

df.describe()

df.dropna(inplace=True)

df.describe()

Filter and show only those rows which have planets that are found in the 2010s and method is 'Radial Velocity' or 'Transit' and distance is large (> 75 percentile)

df_ = df.copy()
per_75 = df.distance.quantile(0.75)
for i, r in df_.iterrows():
    if r['year'] < 2010:
        df_.drop(i, inplace=True)
        continue
    if r['method'] != 'Radial Velocity' and r['method'] != 'Transit':
        df_.drop(i, inplace=True)
        continue
    if r['distance'] < per_75:
        df_.drop(i, inplace=True)
        continue

df_.describe()

df_.tail()

per_75

59.3325

df_ = df.copy()
df_ = df_[
    (df_['year'] >= 2010) &
    ((df_['method'] == 'Radial Velocity') | (df_['method'] == 'Transit')) &
    (df_['distance'] > per_75)
]

df_.describe()

Modify the method column to have only the abbreviation of each method

df.method.unique()

array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',
       'Transit', 'Astrometry', 'Transit Timing Variations',
       'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',
       'Pulsation Timing Variations'], dtype=object)

s = 'Orbital Brightness Modulation'

''.join([x[0] for x in s.split(' ')])

'OBM'

short_names = {}
for s in df.method.unique():
    short_names[s] = ''.join([x[0] for x in s.split(' ')])

print(short_names)

{'Radial Velocity': 'RV', 'Imaging': 'I', 'Eclipse Timing Variations': 'ETV', 'Transit': 'T', 'Astrometry': 'A', 'Transit Timing Variations': 'TTV', 'Orbital Brightness Modulation': 'OBM', 'Microlensing': 'M', 'Pulsar Timing': 'PT', 'Pulsation Timing Variations': 'PTV'}

for i, r in df.iterrows():
    df.loc[i, 'short_method'] = short_names.get(r['method'], r['method'])

df.tail()

df = sns.load_dataset('planets')

def shorten_method(s):
    return short_names.get(s, s)

df['short_method'] = df['method'].apply(shorten_method)

df.head()

Count the number of planets discovered for each method type

Split the dataframe into smaller chunks (in this case they should have the same method name)
Apply some function in each smaller chunk (in this case it is the count function)
Aggregate the results from each chunk together

d = {}
for m in df.method.unique():
    d[m] = df[df.method == m]['method'].count()
print(d)

{'Radial Velocity': 553, 'Imaging': 38, 'Eclipse Timing Variations': 9, 'Transit': 397, 'Astrometry': 2, 'Transit Timing Variations': 4, 'Orbital Brightness Modulation': 3, 'Microlensing': 23, 'Pulsar Timing': 5, 'Pulsation Timing Variations': 1}

df.groupby('method')['method'].count()

method
Astrometry                         2
Eclipse Timing Variations          9
Imaging                           38
Microlensing                      23
Orbital Brightness Modulation      3
Pulsar Timing                      5
Pulsation Timing Variations        1
Radial Velocity                  553
Transit                          397
Transit Timing Variations          4
Name: method, dtype: int64

d = {}
for m in df.method.unique():
    d[m] = df[df.method == m]['distance'].mean()
print(d)

{'Radial Velocity': 51.60020754716983, 'Imaging': 67.7159375, 'Eclipse Timing Variations': 315.36, 'Transit': 599.2980803571429, 'Astrometry': 17.875, 'Transit Timing Variations': 1104.3333333333333, 'Orbital Brightness Modulation': 1180.0, 'Microlensing': 4144.0, 'Pulsar Timing': 1200.0, 'Pulsation Timing Variations': nan}

df.groupby('method')['distance'].mean()

method
Astrometry                         17.875000
Eclipse Timing Variations         315.360000
Imaging                            67.715937
Microlensing                     4144.000000
Orbital Brightness Modulation    1180.000000
Pulsar Timing                    1200.000000
Pulsation Timing Variations              NaN
Radial Velocity                    51.600208
Transit                           599.298080
Transit Timing Variations        1104.333333
Name: distance, dtype: float64

Find out what fraction of planets have been found in the last decade (i.e., in 2010s) across each method type

Filter the data for given condition (in this case planet found in last decade)
Split (in this case across method)
Apply (in this case just count)
Aggregate (to represent the final result)

s_2010s = df[df.year >= 2010].groupby('method')['method'].count()

s_allTime = df.groupby('method')['method'].count()

s_2010s/s_allTime

method
Astrometry                       1.000000
Eclipse Timing Variations        0.666667
Imaging                          0.473684
Microlensing                     0.565217
Orbital Brightness Modulation    1.000000
Pulsar Timing                    0.200000
Pulsation Timing Variations           NaN
Radial Velocity                  0.388788
Transit                          0.843829
Transit Timing Variations        1.000000
Name: method, dtype: float64

Find a dataset of Nifty numbers for 2018 and 2019 - daily numbers open, close, high, low

nifty50_2018 = pd.read_csv('NIFTY50_2018.csv')

nifty50_2018.head()

nifty50_2018 = pd.read_csv('NIFTY50_2018.csv', index_col=0)

nifty50_2018.head()

nifty50_2018.loc['31 Dec 2018']

Open     10913.20
High     10923.55
Low      10853.20
Close    10862.55
Name: 31 Dec 2018, dtype: float64

nifty50_2018['Open']

Date
31 Dec 2018    10913.20
28 Dec 2018    10820.95
27 Dec 2018    10817.90
26 Dec 2018    10635.45
24 Dec 2018    10780.90
                 ...   
05 Jan 2018    10534.25
04 Jan 2018    10469.40
03 Jan 2018    10482.65
02 Jan 2018    10477.55
01 Jan 2018    10531.70
Name: Open, Length: 246, dtype: float64

nifty50_2019 = pd.read_csv('NIFTY50_2019.csv', index_col=0)

nifty50_2019.head()

print(nifty50_2018.shape, nifty50_2019.shape)

(246, 4) (245, 4)

nifty50 = pd.concat([nifty50_2018, nifty50_2019])

nifty50.shape

(491, 4)

niftynext50_2019 = pd.read_csv('NIFTYNext50_2019.csv', index_col = 0)

niftynext50_2019.head()

nifty_2019 = pd.concat([nifty50_2019, niftynext50_2019], axis=1)

nifty_2019.shape

(245, 8)

nifty_2019.head()

nifty_2019 = pd.concat([nifty50_2019, niftynext50_2019], axis=1,
                       keys=['nifty50', 'niftynext50'])

nifty_2019.shape

(245, 8)

nifty_2019.head()

nifty_2019['nifty50'].loc['31 Dec 2019']

Open     12247.10
High     12247.10
Low      12151.80
Close    12168.45
Name: 31 Dec 2019, dtype: float64

Tasks on the NIFTY datasets:

In 2019, in how many days was the NIFTY50 volatile (high > 105% of low)
In 2019, in how many days was the NIFTYNEXT50 volatile (high > 105% of low)
In 2019, how many days belonged to the four classes NIFTY50 volatile / non-volatile and NIFTYNext50 volatile / non-volatile
Compute the mean, median, std, var of closing values for each weekday in NIFTY50 for 2019
Compute the mean, median, std, var of closing values for each month in NIFTY50 for 2019
On the days in which NIFTY50 closed higher than the open, what was the mean of (close - open) for NIFTYNext50
In 2019, how many days had the day's high lower than the previous day's low in NIFTY50
In 2019, on how many days did the day's close exceed the 30 day moving average in NIFTY50 (exclude first month)

	Mass	Diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2370.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	12104.0
Col_Mean	296.291844	40654.2

	Mass	Diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2370.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	12104.0
Col_Mean	296.291844	40654.2

	mass	diameter
count	9.000000	10.00000
mean	296.291844	40654.20000
std	627.786429	51541.39142
min	0.014600	2370.00000
25%	0.642000	5357.25000
50%	5.970000	12430.00000
75%	102.000000	50720.50000
max	1898.000000	142984.00000

	method	number	orbital_period	mass	distance	year
0	Radial Velocity	1	269.300	7.10	77.40	2006
1	Radial Velocity	1	874.774	2.21	56.95	2008
2	Radial Velocity	1	763.000	2.60	19.84	2011
3	Radial Velocity	1	326.030	19.40	110.62	2007
4	Radial Velocity	1	516.220	10.50	119.47	2009

	number	orbital_period	mass	distance	year
count	1035.000000	992.000000	513.000000	808.000000	1035.000000
mean	1.785507	2002.917596	2.638161	264.069282	2009.070531
std	1.240976	26014.728304	3.818617	733.116493	3.972567
min	1.000000	0.090706	0.003600	1.350000	1989.000000
25%	1.000000	5.442540	0.229000	32.560000	2007.000000
50%	1.000000	39.979500	1.260000	55.250000	2010.000000
75%	2.000000	526.005000	3.040000	178.500000	2012.000000
max	7.000000	730000.000000	25.000000	8500.000000	2014.000000

Consulting - Quality

Fundamentals of DataScience Week 10 - Class Demo

Creating dataframe objects¶

	Mass	Diameter
Earth	5.9700	12756
Jupiter	1898.0000	142984
Mars	0.6420	6792
Mercury	0.3300	4879
Moon	NaN	3475
Neptune	102.0000	49528
Pluto	0.0146	2370
Saturn	568.0000	120536
Uranus	86.8000	51118
Venus	4.8700	12104

	C1	C2	C3	Row_Mean
R1	4.0	4.0	6.0	4.666667
R2	7.0	7.0	9.0	7.666667
R3	5.0	8.0	6.0	6.333333
R4	5.0	6.0	8.0	6.333333
R5	0.0	3.0	6.0	3.000000
Col_Mean	4.2	5.6	7.0	5.600000

	C1	C2	C3	Row_Mean
count	6.000000	6.000000	6.000000	6.000000
mean	4.200000	5.600000	7.000000	5.600000
std	2.315167	1.854724	1.264911	1.611073
min	0.000000	3.000000	6.000000	3.000000
25%	4.050000	4.400000	6.000000	4.900000
50%	4.600000	5.800000	6.500000	5.966667
75%	5.000000	6.750000	7.750000	6.333333
max	7.000000	8.000000	9.000000	7.666667

	method	number	orbital_period	mass	distance	year
1030	Transit	1	3.941507	NaN	172.0	2006
1031	Transit	1	2.615864	NaN	148.0	2007
1032	Transit	1	3.191524	NaN	174.0	2007
1033	Transit	1	4.125083	NaN	293.0	2008
1034	Transit	1	4.187757	NaN	260.0	2008

	number	orbital_period	mass	distance	year
count	498.00000	498.000000	498.000000	498.000000	498.000000
mean	1.73494	835.778671	2.509320	52.068213	2007.377510
std	1.17572	1469.128259	3.636274	46.596041	4.167284
min	1.00000	1.328300	0.003600	1.350000	1989.000000
25%	1.00000	38.272250	0.212500	24.497500	2005.000000
50%	1.00000	357.000000	1.245000	39.940000	2009.000000
75%	2.00000	999.600000	2.867500	59.332500	2011.000000
max	6.00000	17337.500000	25.000000	354.000000	2014.000000

	number	orbital_period	mass	distance	year
count	50.000000	50.000000	50.000000	50.000000	50.000000
mean	1.300000	763.904808	3.322740	133.142600	2011.360000
std	0.505076	966.789870	3.648002	70.378699	1.120496
min	1.000000	2.703390	0.770000	65.620000	2010.000000
25%	1.000000	255.555000	1.325000	80.205000	2011.000000
50%	1.000000	550.500000	1.875000	121.070000	2011.000000
75%	2.000000	873.625000	3.400000	150.097500	2012.000000
max	3.000000	5584.000000	20.600000	354.000000	2014.000000

	method	number	orbital_period	mass	distance	year
620	Radial Velocity	1	745.70000	5.300	307.69	2011
627	Radial Velocity	1	16.20000	1.250	223.21	2010
636	Radial Velocity	1	124.60000	9.180	149.25	2013
649	Transit	1	2.70339	1.470	178.00	2013
784	Radial Velocity	3	580.00000	0.947	135.00	2012

	Date	Open	High	Low	Close
0	31 Dec 2018	10913.20	10923.55	10853.20	10862.55
1	28 Dec 2018	10820.95	10893.60	10817.15	10859.90
2	27 Dec 2018	10817.90	10834.20	10764.45	10779.80
3	26 Dec 2018	10635.45	10747.50	10534.55	10729.85
4	24 Dec 2018	10780.90	10782.30	10649.25	10663.50

	Open	High	Low	Close
Date
31 Dec 2018	10913.20	10923.55	10853.20	10862.55
28 Dec 2018	10820.95	10893.60	10817.15	10859.90
27 Dec 2018	10817.90	10834.20	10764.45	10779.80
26 Dec 2018	10635.45	10747.50	10534.55	10729.85
24 Dec 2018	10780.90	10782.30	10649.25	10663.50

	Open	High	Low	Close
Date
31 Dec 2019	12247.10	12247.10	12151.80	12168.45
30 Dec 2019	12274.90	12286.45	12213.80	12255.85
27 Dec 2019	12172.90	12258.45	12157.90	12245.80
26 Dec 2019	12211.85	12221.55	12118.85	12126.55
24 Dec 2019	12269.25	12283.70	12202.10	12214.55

	Open	High	Low	Close
Date
31 Dec 2019	28495.00	28549.50	28270.25	28307.55
30 Dec 2019	28528.95	28612.95	28406.70	28484.85
27 Dec 2019	28354.50	28500.25	28319.90	28476.80
26 Dec 2019	28409.10	28435.25	28259.75	28280.25
24 Dec 2019	28423.70	28430.40	28318.75	28382.85