Fundamentals of DataScience Week 10 - Class Demo
FDS_W10-Pandas
In [1]:
import numpy as np
import pandas as pd

Creating dataframe objects

In [2]:
arr = np.random.randint(0, 10, (5, 3))
In [3]:
arr
Out[3]:
array([[2, 9, 7],
       [5, 4, 5],
       [8, 6, 4],
       [4, 3, 2],
       [6, 3, 3]])
In [4]:
df = pd.DataFrame(arr)
In [5]:
df
Out[5]:
0 1 2
0 2 9 7
1 5 4 5
2 8 6 4
3 4 3 2
4 6 3 3
In [6]:
df.values
Out[6]:
array([[2, 9, 7],
       [5, 4, 5],
       [8, 6, 4],
       [4, 3, 2],
       [6, 3, 3]])
In [7]:
df.index
Out[7]:
RangeIndex(start=0, stop=5, step=1)
In [8]:
df.columns
Out[8]:
RangeIndex(start=0, stop=3, step=1)
In [9]:
for c in df.columns:
    print(c)
0
1
2
In [10]:
df.values[0]
Out[10]:
array([2, 9, 7])
In [11]:
df.index = ['R1', 'R2', 'R3', 'R4', 'R5']
df.columns = ['C1', 'C2', 'C3']
In [12]:
df
Out[12]:
C1 C2 C3
R1 2 9 7
R2 5 4 5
R3 8 6 4
R4 4 3 2
R5 6 3 3
In [13]:
df.loc['R3', 'C2']
Out[13]:
6
In [14]:
df.iloc[2, 1]
Out[14]:
6
In [15]:
type(df.iloc[2:4, 1:3])
Out[15]:
pandas.core.frame.DataFrame
In [16]:
df.loc['R3':'R5', 'C2':'C3']
Out[16]:
C2 C3
R3 6 4
R4 3 2
R5 3 3
In [17]:
df.iloc[0]
Out[17]:
C1    2
C2    9
C3    7
Name: R1, dtype: int64
In [18]:
df.iloc[:, 0]
Out[18]:
R1    2
R2    5
R3    8
R4    4
R5    6
Name: C1, dtype: int64
In [25]:
df.shape
Out[25]:
(5, 3)
In [26]:
df.T
Out[26]:
R1 R2 R3 R4 R5
C1 2 5 8 4 6
C2 9 4 6 3 3
C3 7 5 4 2 3

Task on creating dataframes

In [27]:
def create_df(nRows, nCols, maxRand=10):
    arr = np.random.randint(0, maxRand, (nRows, nCols))
    df = pd.DataFrame(arr)
    df.index = ['R' + str(x) for x in np.arange(1, nRows+1)]
    df.columns = ['C' + str(x) for x in np.arange(1, nCols+1)]
    return df
In [28]:
create_df(5, 3)
Out[28]:
C1 C2 C3
R1 4 0 9
R2 3 8 3
R3 2 4 9
R4 9 3 0
R5 9 9 7
In [29]:
create_df(2, 5)
Out[29]:
C1 C2 C3 C4 C5
R1 3 6 8 5 2
R2 4 3 4 4 7
In [30]:
mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series([4879, 12104, 12756, 3475, 6792, 142984, 120536, 51118, 49528, 2370], 
                     index=['Mercury', 'Venus', 'Earth', 'Moon', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
In [31]:
mass
Out[31]:
Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
dtype: float64
In [32]:
diameter
Out[32]:
Mercury      4879
Venus       12104
Earth       12756
Moon         3475
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2370
dtype: int64
In [33]:
df = pd.DataFrame({'Mass': mass, 'Diameter': diameter})
In [34]:
df
Out[34]:
Mass Diameter
Earth 5.9700 12756
Jupiter 1898.0000 142984
Mars 0.6420 6792
Mercury 0.3300 4879
Moon NaN 3475
Neptune 102.0000 49528
Pluto 0.0146 2370
Saturn 568.0000 120536
Uranus 86.8000 51118
Venus 4.8700 12104
In [35]:
df['Mass']
Out[35]:
Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: Mass, dtype: float64
In [36]:
df['Diameter']
Out[36]:
Earth       12756
Jupiter    142984
Mars         6792
Mercury      4879
Moon         3475
Neptune     49528
Pluto        2370
Saturn     120536
Uranus      51118
Venus       12104
Name: Diameter, dtype: int64
In [37]:
df['Earth']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Earth'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-37-b07be539a774> in <module>()
----> 1 df['Earth']

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Earth'
In [45]:
df['Mass']['Earth']
Out[45]:
5.97
In [46]:
df.Mass.Earth
Out[46]:
5.97
In [51]:
df['pop'] = 0
In [53]:
df
Out[53]:
Mass Diameter pop
Earth 5.9700 12756 0
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
In [54]:
df['pop']['Earth'] = 8000000000
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [55]:
df
Out[55]:
Mass Diameter pop
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
In [56]:
df['Mass'] is df.Mass
Out[56]:
True
In [57]:
df['pop'] is df.pop
Out[57]:
False
In [58]:
df
Out[58]:
Mass Diameter pop
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
In [59]:
df
Out[59]:
Mass Diameter pop
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
In [60]:
df.loc['Earth']
Out[60]:
Mass        5.970000e+00
Diameter    1.275600e+04
pop         8.000000e+09
Name: Earth, dtype: float64
In [61]:
df.loc[:,'Mass']
Out[61]:
Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: Mass, dtype: float64
In [78]:
def create_mean_row(df):
    # df.loc['Col_Mean'] = [np.mean(df[col]) for col in df.columns]
    df.loc['Col_Mean'] = df.mean()
    return df
In [62]:
df
Out[62]:
Mass Diameter pop
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
In [63]:
df.loc['Col_Mean'] = 0
In [64]:
df
Out[64]:
Mass Diameter pop
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2370 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 12104 0
Col_Mean 0.0000 0 0
In [65]:
np.mean(df['Mass'])
Out[65]:
266.66266
In [69]:
df.drop('Col_Mean', inplace=True)
In [70]:
df.drop('pop', axis=1, inplace=True)
In [71]:
df
Out[71]:
Mass Diameter
Earth 5.9700 12756
Jupiter 1898.0000 142984
Mars 0.6420 6792
Mercury 0.3300 4879
Moon NaN 3475
Neptune 102.0000 49528
Pluto 0.0146 2370
Saturn 568.0000 120536
Uranus 86.8000 51118
Venus 4.8700 12104
In [72]:
np.mean(df['Mass'])
Out[72]:
296.29184444444445
In [73]:
df.loc['Col_Mean'] = [np.mean(df['Mass']), np.mean(df['Diameter'])]
In [74]:
df
Out[74]:
Mass Diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2370.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 12104.0
Col_Mean 296.291844 40654.2
In [76]:
df.drop('Col_Mean')
Out[76]:
Mass Diameter
Earth 5.9700 12756.0
Jupiter 1898.0000 142984.0
Mars 0.6420 6792.0
Mercury 0.3300 4879.0
Moon NaN 3475.0
Neptune 102.0000 49528.0
Pluto 0.0146 2370.0
Saturn 568.0000 120536.0
Uranus 86.8000 51118.0
Venus 4.8700 12104.0
In [79]:
create_mean_row(df)
Out[79]:
Mass Diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2370.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 12104.0
Col_Mean 296.291844 40654.2
In [80]:
df = create_df(5, 3)
In [82]:
df
Out[82]:
C1 C2 C3
R1 4 4 6
R2 7 7 9
R3 5 8 6
R4 5 6 8
R5 0 3 6
In [81]:
df.mean()
Out[81]:
C1    4.2
C2    5.6
C3    7.0
dtype: float64
In [83]:
df.mean(axis=1)
Out[83]:
R1    4.666667
R2    7.666667
R3    6.333333
R4    6.333333
R5    3.000000
dtype: float64
In [85]:
df['Row_Mean'] = df.mean(axis=1)
In [88]:
df.loc['Col_Mean'] = df.mean()
In [89]:
df
Out[89]:
C1 C2 C3 Row_Mean
R1 4.0 4.0 6.0 4.666667
R2 7.0 7.0 9.0 7.666667
R3 5.0 8.0 6.0 6.333333
R4 5.0 6.0 8.0 6.333333
R5 0.0 3.0 6.0 3.000000
Col_Mean 4.2 5.6 7.0 5.600000
In [90]:
df.median()
Out[90]:
C1          4.600000
C2          5.800000
C3          6.500000
Row_Mean    5.966667
dtype: float64
In [91]:
df.min()
Out[91]:
C1          0.0
C2          3.0
C3          6.0
Row_Mean    3.0
dtype: float64
In [92]:
df.max()
Out[92]:
C1          7.000000
C2          8.000000
C3          9.000000
Row_Mean    7.666667
dtype: float64
In [93]:
df.quantile(0.25)
Out[93]:
C1          4.05
C2          4.40
C3          6.00
Row_Mean    4.90
Name: 0.25, dtype: float64
In [94]:
df.drop('Row_Mean', axis=1)
Out[94]:
C1 C2 C3
R1 4.0 4.0 6.0
R2 7.0 7.0 9.0
R3 5.0 8.0 6.0
R4 5.0 6.0 8.0
R5 0.0 3.0 6.0
Col_Mean 4.2 5.6 7.0
In [95]:
df.describe()
Out[95]:
C1 C2 C3 Row_Mean
count 6.000000 6.000000 6.000000 6.000000
mean 4.200000 5.600000 7.000000 5.600000
std 2.315167 1.854724 1.264911 1.611073
min 0.000000 3.000000 6.000000 3.000000
25% 4.050000 4.400000 6.000000 4.900000
50% 4.600000 5.800000 6.500000 5.966667
75% 5.000000 6.750000 7.750000 6.333333
max 7.000000 8.000000 9.000000 7.666667
In [96]:
mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series([4879, 12104, 12756, 3475, 6792, 142984, 120536, 51118, 49528, 2370], 
                     index=['Mercury', 'Venus', 'Earth', 'Moon', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
In [97]:
planets = pd.DataFrame({'mass': mass, 'diameter': diameter})
In [98]:
planets.describe()
Out[98]:
mass diameter
count 9.000000 10.00000
mean 296.291844 40654.20000
std 627.786429 51541.39142
min 0.014600 2370.00000
25% 0.642000 5357.25000
50% 5.970000 12430.00000
75% 102.000000 50720.50000
max 1898.000000 142984.00000
In [99]:
import seaborn as sns
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [139]:
df = sns.load_dataset('planets')
In [102]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB
In [103]:
df.head()
Out[103]:
method number orbital_period mass distance year
0 Radial Velocity 1 269.300 7.10 77.40 2006
1 Radial Velocity 1 874.774 2.21 56.95 2008
2 Radial Velocity 1 763.000 2.60 19.84 2011
3 Radial Velocity 1 326.030 19.40 110.62 2007
4 Radial Velocity 1 516.220 10.50 119.47 2009
In [104]:
df.tail()
Out[104]:
method number orbital_period mass distance year
1030 Transit 1 3.941507 NaN 172.0 2006
1031 Transit 1 2.615864 NaN 148.0 2007
1032 Transit 1 3.191524 NaN 174.0 2007
1033 Transit 1 4.125083 NaN 293.0 2008
1034 Transit 1 4.187757 NaN 260.0 2008
In [105]:
df.describe()
Out[105]:
number orbital_period mass distance year
count 1035.000000 992.000000 513.000000 808.000000 1035.000000
mean 1.785507 2002.917596 2.638161 264.069282 2009.070531
std 1.240976 26014.728304 3.818617 733.116493 3.972567
min 1.000000 0.090706 0.003600 1.350000 1989.000000
25% 1.000000 5.442540 0.229000 32.560000 2007.000000
50% 1.000000 39.979500 1.260000 55.250000 2010.000000
75% 2.000000 526.005000 3.040000 178.500000 2012.000000
max 7.000000 730000.000000 25.000000 8500.000000 2014.000000

Go through each row of the dataframe and delete it (drop) if any of the columns is null

In [106]:
for r in df.index:
    for c in df.columns:
        if pd.isnull(df.loc[r, c]):
            df.drop(r, inplace=True)
            break
In [107]:
df.describe()
Out[107]:
number orbital_period mass distance year
count 498.00000 498.000000 498.000000 498.000000 498.000000
mean 1.73494 835.778671 2.509320 52.068213 2007.377510
std 1.17572 1469.128259 3.636274 46.596041 4.167284
min 1.00000 1.328300 0.003600 1.350000 1989.000000
25% 1.00000 38.272250 0.212500 24.497500 2005.000000
50% 1.00000 357.000000 1.245000 39.940000 2009.000000
75% 2.00000 999.600000 2.867500 59.332500 2011.000000
max 6.00000 17337.500000 25.000000 354.000000 2014.000000
In [110]:
for i, r in df.iterrows():
    print(i)
    print(r)
    break
0
method            Radial Velocity
number                          1
orbital_period              269.3
mass                          7.1
distance                     77.4
year                         2006
Name: 0, dtype: object
In [115]:
for i, r in df.iterrows():
    if pd.isnull(r).any():
        df.drop(i, inplace=True)
In [118]:
df.describe()
Out[118]:
number orbital_period mass distance year
count 1035.000000 992.000000 513.000000 808.000000 1035.000000
mean 1.785507 2002.917596 2.638161 264.069282 2009.070531
std 1.240976 26014.728304 3.818617 733.116493 3.972567
min 1.000000 0.090706 0.003600 1.350000 1989.000000
25% 1.000000 5.442540 0.229000 32.560000 2007.000000
50% 1.000000 39.979500 1.260000 55.250000 2010.000000
75% 2.000000 526.005000 3.040000 178.500000 2012.000000
max 7.000000 730000.000000 25.000000 8500.000000 2014.000000
In [119]:
df.dropna(inplace=True)
In [120]:
df.describe()
Out[120]:
number orbital_period mass distance year
count 498.00000 498.000000 498.000000 498.000000 498.000000
mean 1.73494 835.778671 2.509320 52.068213 2007.377510
std 1.17572 1469.128259 3.636274 46.596041 4.167284
min 1.00000 1.328300 0.003600 1.350000 1989.000000
25% 1.00000 38.272250 0.212500 24.497500 2005.000000
50% 1.00000 357.000000 1.245000 39.940000 2009.000000
75% 2.00000 999.600000 2.867500 59.332500 2011.000000
max 6.00000 17337.500000 25.000000 354.000000 2014.000000

Filter and show only those rows which have planets that are found in the 2010s and method is 'Radial Velocity' or 'Transit' and distance is large (> 75 percentile)

In [130]:
df_ = df.copy()
per_75 = df.distance.quantile(0.75)
for i, r in df_.iterrows():
    if r['year'] < 2010:
        df_.drop(i, inplace=True)
        continue
    if r['method'] != 'Radial Velocity' and r['method'] != 'Transit':
        df_.drop(i, inplace=True)
        continue
    if r['distance'] < per_75:
        df_.drop(i, inplace=True)
        continue
In [131]:
df_.describe()
Out[131]:
number orbital_period mass distance year
count 50.000000 50.000000 50.000000 50.000000 50.000000
mean 1.300000 763.904808 3.322740 133.142600 2011.360000
std 0.505076 966.789870 3.648002 70.378699 1.120496
min 1.000000 2.703390 0.770000 65.620000 2010.000000
25% 1.000000 255.555000 1.325000 80.205000 2011.000000
50% 1.000000 550.500000 1.875000 121.070000 2011.000000
75% 2.000000 873.625000 3.400000 150.097500 2012.000000
max 3.000000 5584.000000 20.600000 354.000000 2014.000000
In [134]:
df_.tail()
Out[134]:
method number orbital_period mass distance year
620 Radial Velocity 1 745.70000 5.300 307.69 2011
627 Radial Velocity 1 16.20000 1.250 223.21 2010
636 Radial Velocity 1 124.60000 9.180 149.25 2013
649 Transit 1 2.70339 1.470 178.00 2013
784 Radial Velocity 3 580.00000 0.947 135.00 2012
In [133]:
per_75
Out[133]:
59.3325
In [136]:
df_ = df.copy()
df_ = df_[
    (df_['year'] >= 2010) &
    ((df_['method'] == 'Radial Velocity') | (df_['method'] == 'Transit')) &
    (df_['distance'] > per_75)
]
In [137]:
df_.describe()
Out[137]:
number orbital_period mass distance year
count 50.000000 50.000000 50.000000 50.000000 50.000000
mean 1.300000 763.904808 3.322740 133.142600 2011.360000
std 0.505076 966.789870 3.648002 70.378699 1.120496
min 1.000000 2.703390 0.770000 65.620000 2010.000000
25% 1.000000 255.555000 1.325000 80.205000 2011.000000
50% 1.000000 550.500000 1.875000 121.070000 2011.000000
75% 2.000000 873.625000 3.400000 150.097500 2012.000000
max 3.000000 5584.000000 20.600000 354.000000 2014.000000

Modify the method column to have only the abbreviation of each method

In [140]:
df.method.unique()
Out[140]:
array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',
       'Transit', 'Astrometry', 'Transit Timing Variations',
       'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',
       'Pulsation Timing Variations'], dtype=object)
In [145]:
s = 'Orbital Brightness Modulation'
In [146]:
''.join([x[0] for x in s.split(' ')])
Out[146]:
'OBM'
In [147]:
short_names = {}
for s in df.method.unique():
    short_names[s] = ''.join([x[0] for x in s.split(' ')])
In [148]:
print(short_names)
{'Radial Velocity': 'RV', 'Imaging': 'I', 'Eclipse Timing Variations': 'ETV', 'Transit': 'T', 'Astrometry': 'A', 'Transit Timing Variations': 'TTV', 'Orbital Brightness Modulation': 'OBM', 'Microlensing': 'M', 'Pulsar Timing': 'PT', 'Pulsation Timing Variations': 'PTV'}
In [149]:
for i, r in df.iterrows():
    df.loc[i, 'short_method'] = short_names.get(r['method'], r['method'])
In [151]:
df.tail()
Out[151]:
method number orbital_period mass distance year short_method
1030 Transit 1 3.941507 NaN 172.0 2006 T
1031 Transit 1 2.615864 NaN 148.0 2007 T
1032 Transit 1 3.191524 NaN 174.0 2007 T
1033 Transit 1 4.125083 NaN 293.0 2008 T
1034 Transit 1 4.187757 NaN 260.0 2008 T
In [152]:
df = sns.load_dataset('planets')
In [153]:
def shorten_method(s):
    return short_names.get(s, s)
In [154]:
df['short_method'] = df['method'].apply(shorten_method)
In [155]:
df.head()
Out[155]:
method number orbital_period mass distance year short_method
0 Radial Velocity 1 269.300 7.10 77.40 2006 RV
1 Radial Velocity 1 874.774 2.21 56.95 2008 RV
2 Radial Velocity 1 763.000 2.60 19.84 2011 RV
3 Radial Velocity 1 326.030 19.40 110.62 2007 RV
4 Radial Velocity 1 516.220 10.50 119.47 2009 RV

Count the number of planets discovered for each method type

  1. Split the dataframe into smaller chunks (in this case they should have the same method name)
  2. Apply some function in each smaller chunk (in this case it is the count function)
  3. Aggregate the results from each chunk together
In [160]:
d = {}
for m in df.method.unique():
    d[m] = df[df.method == m]['method'].count()
print(d)
{'Radial Velocity': 553, 'Imaging': 38, 'Eclipse Timing Variations': 9, 'Transit': 397, 'Astrometry': 2, 'Transit Timing Variations': 4, 'Orbital Brightness Modulation': 3, 'Microlensing': 23, 'Pulsar Timing': 5, 'Pulsation Timing Variations': 1}
In [162]:
df.groupby('method')['method'].count()
Out[162]:
method
Astrometry                         2
Eclipse Timing Variations          9
Imaging                           38
Microlensing                      23
Orbital Brightness Modulation      3
Pulsar Timing                      5
Pulsation Timing Variations        1
Radial Velocity                  553
Transit                          397
Transit Timing Variations          4
Name: method, dtype: int64
In [163]:
d = {}
for m in df.method.unique():
    d[m] = df[df.method == m]['distance'].mean()
print(d)
{'Radial Velocity': 51.60020754716983, 'Imaging': 67.7159375, 'Eclipse Timing Variations': 315.36, 'Transit': 599.2980803571429, 'Astrometry': 17.875, 'Transit Timing Variations': 1104.3333333333333, 'Orbital Brightness Modulation': 1180.0, 'Microlensing': 4144.0, 'Pulsar Timing': 1200.0, 'Pulsation Timing Variations': nan}
In [164]:
df.groupby('method')['distance'].mean()
Out[164]:
method
Astrometry                         17.875000
Eclipse Timing Variations         315.360000
Imaging                            67.715937
Microlensing                     4144.000000
Orbital Brightness Modulation    1180.000000
Pulsar Timing                    1200.000000
Pulsation Timing Variations              NaN
Radial Velocity                    51.600208
Transit                           599.298080
Transit Timing Variations        1104.333333
Name: distance, dtype: float64

Find out what fraction of planets have been found in the last decade (i.e., in 2010s) across each method type

  1. Filter the data for given condition (in this case planet found in last decade)
  2. Split (in this case across method)
  3. Apply (in this case just count)
  4. Aggregate (to represent the final result)
In [170]:
s_2010s = df[df.year >= 2010].groupby('method')['method'].count()
In [171]:
s_allTime = df.groupby('method')['method'].count()
In [172]:
s_2010s/s_allTime
Out[172]:
method
Astrometry                       1.000000
Eclipse Timing Variations        0.666667
Imaging                          0.473684
Microlensing                     0.565217
Orbital Brightness Modulation    1.000000
Pulsar Timing                    0.200000
Pulsation Timing Variations           NaN
Radial Velocity                  0.388788
Transit                          0.843829
Transit Timing Variations        1.000000
Name: method, dtype: float64

Find a dataset of Nifty numbers for 2018 and 2019 - daily numbers open, close, high, low

In [173]:
nifty50_2018 = pd.read_csv('NIFTY50_2018.csv')
In [174]:
nifty50_2018.head()
Out[174]:
Date Open High Low Close
0 31 Dec 2018 10913.20 10923.55 10853.20 10862.55
1 28 Dec 2018 10820.95 10893.60 10817.15 10859.90
2 27 Dec 2018 10817.90 10834.20 10764.45 10779.80
3 26 Dec 2018 10635.45 10747.50 10534.55 10729.85
4 24 Dec 2018 10780.90 10782.30 10649.25 10663.50
In [175]:
nifty50_2018 = pd.read_csv('NIFTY50_2018.csv', index_col=0)
In [176]:
nifty50_2018.head()
Out[176]:
Open High Low Close
Date
31 Dec 2018 10913.20 10923.55 10853.20 10862.55
28 Dec 2018 10820.95 10893.60 10817.15 10859.90
27 Dec 2018 10817.90 10834.20 10764.45 10779.80
26 Dec 2018 10635.45 10747.50 10534.55 10729.85
24 Dec 2018 10780.90 10782.30 10649.25 10663.50
In [177]:
nifty50_2018.loc['31 Dec 2018']
Out[177]:
Open     10913.20
High     10923.55
Low      10853.20
Close    10862.55
Name: 31 Dec 2018, dtype: float64
In [178]:
nifty50_2018['Open']
Out[178]:
Date
31 Dec 2018    10913.20
28 Dec 2018    10820.95
27 Dec 2018    10817.90
26 Dec 2018    10635.45
24 Dec 2018    10780.90
                 ...   
05 Jan 2018    10534.25
04 Jan 2018    10469.40
03 Jan 2018    10482.65
02 Jan 2018    10477.55
01 Jan 2018    10531.70
Name: Open, Length: 246, dtype: float64
In [179]:
nifty50_2019 = pd.read_csv('NIFTY50_2019.csv', index_col=0)
In [180]:
nifty50_2019.head()
Out[180]:
Open High Low Close
Date
31 Dec 2019 12247.10 12247.10 12151.80 12168.45
30 Dec 2019 12274.90 12286.45 12213.80 12255.85
27 Dec 2019 12172.90 12258.45 12157.90 12245.80
26 Dec 2019 12211.85 12221.55 12118.85 12126.55
24 Dec 2019 12269.25 12283.70 12202.10 12214.55
In [182]:
print(nifty50_2018.shape, nifty50_2019.shape)
(246, 4) (245, 4)
In [183]:
nifty50 = pd.concat([nifty50_2018, nifty50_2019])
In [184]:
nifty50.shape
Out[184]:
(491, 4)
In [185]:
niftynext50_2019 = pd.read_csv('NIFTYNext50_2019.csv', index_col = 0)
In [186]:
niftynext50_2019.head()
Out[186]:
Open High Low Close
Date
31 Dec 2019 28495.00 28549.50 28270.25 28307.55
30 Dec 2019 28528.95 28612.95 28406.70 28484.85
27 Dec 2019 28354.50 28500.25 28319.90 28476.80
26 Dec 2019 28409.10 28435.25 28259.75 28280.25
24 Dec 2019 28423.70 28430.40 28318.75 28382.85
In [187]:
nifty_2019 = pd.concat([nifty50_2019, niftynext50_2019], axis=1)
In [188]:
nifty_2019.shape
Out[188]:
(245, 8)
In [189]:
nifty_2019.head()
Out[189]:
Open High Low Close Open High Low Close
Date
31 Dec 2019 12247.10 12247.10 12151.80 12168.45 28495.00 28549.50 28270.25 28307.55
30 Dec 2019 12274.90 12286.45 12213.80 12255.85 28528.95 28612.95 28406.70 28484.85
27 Dec 2019 12172.90 12258.45 12157.90 12245.80 28354.50 28500.25 28319.90 28476.80
26 Dec 2019 12211.85 12221.55 12118.85 12126.55 28409.10 28435.25 28259.75 28280.25
24 Dec 2019 12269.25 12283.70 12202.10 12214.55 28423.70 28430.40 28318.75 28382.85
In [190]:
nifty_2019 = pd.concat([nifty50_2019, niftynext50_2019], axis=1,
                       keys=['nifty50', 'niftynext50'])
In [191]:
nifty_2019.shape
Out[191]:
(245, 8)
In [192]:
nifty_2019.head()
Out[192]:
nifty50 niftynext50
Open High Low Close Open High Low Close
Date
31 Dec 2019 12247.10 12247.10 12151.80 12168.45 28495.00 28549.50 28270.25 28307.55
30 Dec 2019 12274.90 12286.45 12213.80 12255.85 28528.95 28612.95 28406.70 28484.85
27 Dec 2019 12172.90 12258.45 12157.90 12245.80 28354.50 28500.25 28319.90 28476.80
26 Dec 2019 12211.85 12221.55 12118.85 12126.55 28409.10 28435.25 28259.75 28280.25
24 Dec 2019 12269.25 12283.70 12202.10 12214.55 28423.70 28430.40 28318.75 28382.85
In [196]:
nifty_2019['nifty50'].loc['31 Dec 2019']
Out[196]:
Open     12247.10
High     12247.10
Low      12151.80
Close    12168.45
Name: 31 Dec 2019, dtype: float64

Tasks on the NIFTY datasets:

  1. In 2019, in how many days was the NIFTY50 volatile (high > 105% of low)

  2. In 2019, in how many days was the NIFTYNEXT50 volatile (high > 105% of low)

  3. In 2019, how many days belonged to the four classes NIFTY50 volatile / non-volatile and NIFTYNext50 volatile / non-volatile

  4. Compute the mean, median, std, var of closing values for each weekday in NIFTY50 for 2019

  5. Compute the mean, median, std, var of closing values for each month in NIFTY50 for 2019

  6. On the days in which NIFTY50 closed higher than the open, what was the mean of (close - open) for NIFTYNext50

  7. In 2019, how many days had the day's high lower than the previous day's low in NIFTY50

  8. In 2019, on how many days did the day's close exceed the 30 day moving average in NIFTY50 (exclude first month)

In [ ]:
 
In [ ]: