PANDAS DATAFRAME
class work
padhai_Pandas_dataframe
In [ ]:
import numpy as np
import pandas as pd

mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])

diameter = pd.Series([4879, 120104, 12756, 6792, 142984, 120536, 51118, 49528, 2376], index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])

  • DataFrame is nothing but a collection of series either row wise or col wise or both
  • it can be created from numpy array as well

DataFrame in pandas

Creating a two dimensional numpy array

In [ ]:
arr = np.random.randint(1, 30, (5, 3))
arr
Out[ ]:
array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])
In [ ]:
arr
Out[ ]:
array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])
In [ ]:
# Creating a dataframe from the numpy array
df = pd.DataFrame(arr)
In [ ]:
df
Out[ ]:
0 1 2
0 26 25 14
1 9 20 1
2 9 2 1
3 15 17 6
4 27 4 19
In [ ]:
df.values
Out[ ]:
array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])
In [ ]:
df.columns
Out[ ]:
RangeIndex(start=0, stop=3, step=1)
In [ ]:
df[2:3]
Out[ ]:
0 1 2
2 9 2 1
In [ ]:
df.index
Out[ ]:
RangeIndex(start=0, stop=5, step=1)
In [ ]:
for c in df.columns:
    print(c)
0
1
2
In [ ]:
df.index = ["R1", "R2", "R3", "R4", "R5"]
df.columns = ["C1", "C2", "C3"]
In [ ]:
df
Out[ ]:
C1 C2 C3
R1 26 25 14
R2 9 20 1
R3 9 2 1
R4 15 17 6
R5 27 4 19

Using Loc and iLoc functions

returns a single element

In [ ]:
df.loc['R3', 'C2']
Out[ ]:
2
In [ ]:
df.iloc[2,1]
Out[ ]:
2
In [ ]:
df.iloc[4,2]
Out[ ]:
19

returns a DataFrame

In [ ]:
df.iloc[2:4, 1:3]
Out[ ]:
C2 C3
R3 2 1
R4 17 6
In [ ]:
type(df.iloc[2:4, 1:3])
Out[ ]:
pandas.core.frame.DataFrame

returns a series

In [ ]:
arr
Out[ ]:
array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])
In [ ]:
df.iloc[0]
Out[ ]:
C1    26
C2    25
C3    14
Name: R1, dtype: int64
In [ ]:
type(df.iloc[0])
Out[ ]:
pandas.core.series.Series
In [ ]:
df.iloc[:2]
Out[ ]:
C1 C2 C3
R1 26 25 14
R2 9 20 1
In [ ]:
df.iloc[:1]
Out[ ]:
C1 C2 C3
R1 26 25 14
In [ ]:
df.iloc[0:3]
Out[ ]:
C1 C2 C3
R1 26 25 14
R2 9 20 1
R3 9 2 1
In [ ]:
df.iloc[:3]
Out[ ]:
C1 C2 C3
R1 26 25 14
R2 9 20 1
R3 9 2 1
In [ ]:
df.iloc[:,0]
Out[ ]:
R1    26
R2     9
R3     9
R4    15
R5    27
Name: C1, dtype: int64
In [ ]:
df.iloc[:,2]
Out[ ]:
R1    14
R2     1
R3     1
R4     6
R5    19
Name: C3, dtype: int64
In [ ]:
df.iloc[2,:]
Out[ ]:
C1    9
C2    2
C3    1
Name: R3, dtype: int64
In [ ]:
arr
Out[ ]:
array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])
In [ ]:
df.loc["R2"]
Out[ ]:
C1     9
C2    20
C3     1
Name: R2, dtype: int64
In [ ]:
df.shape
Out[ ]:
(5, 3)
In [ ]:
df.T
Out[ ]:
R1 R2 R3 R4 R5
C1 26 9 9 15 27
C2 25 20 2 17 4
C3 14 1 1 6 19

homework to create a dataframe

In [ ]:
def create_df(nrows, ncols):
    arr1 = np.random.randint(1, 30, (nrows, ncols))
    df = pd.DataFrame(arr1)
    print(arr1)
In [ ]:
%%time
create_df(3, 4)
[[20  9 20 25]
 [16 23 23 19]
 [12 27 23 17]]
CPU times: user 1.36 ms, sys: 0 ns, total: 1.36 ms
Wall time: 1.6 ms
In [ ]:
create_df(4, 5)
[[ 2  4 17 25 22]
 [23 13 13 16 22]
 [ 8 29 17 23  6]
 [23  3  4 24 13]]
In [ ]:
def createDF(nRows, nCols):
    df = pd.DataFrame(np.random.randint(1, 30, (nRows, nCols)))
    print(df)
In [ ]:
%%time
createDF(4, 3)
    0   1   2
0  29   3   2
1  29  10  20
2  13  11  22
3  28   8  12
CPU times: user 2.95 ms, sys: 0 ns, total: 2.95 ms
Wall time: 3.56 ms
In [ ]:
createDF(10, 10)
    0   1   2   3   4   5   6   7   8   9
0  25  10   2  29   1  17   8  20   6  25
1  10   4  23  14  29  20   5   8  13  16
2   5   7  17  24   2   6  26  22  23  21
3  24   4  19   4  12  27  18   8  26  29
4  16   9   9   3  13   2  15  25  23  13
5  18  12  16   9  16  14  24  22   7  17
6  10  10  12   9  15  27   7   1  20  24
7  27   4   9  22  27  10  20  21  20  22
8   4   7  27   7  21  21   8   8  16  12
9  11  18   2   3   6  17  18  18  27   5

as taught in class

In [ ]:
import numpy as np
import pandas as pd
In [ ]:
def create_DF(nRows, nCols, maxData=10):
    arr = np.random.randint(0, maxData, (nRows, nCols))
    df = pd.DataFrame(arr)
    df.index = ["R" + str(x) for x in np.arange(1, nRows + 1)]
    df.columns = ["C" + str(x) for x in np.arange(1, nCols + 1)]
    return df
In [ ]:
create_DF(4, 5, 50)
Out[ ]:
C1 C2 C3 C4 C5
R1 14 15 0 3 31
R2 38 45 21 23 34
R3 13 6 23 34 26
R4 30 27 46 39 9
In [ ]:
create_DF(2, 5)
Out[ ]:
C1 C2 C3 C4 C5
R1 7 8 1 6 1
R2 7 1 8 4 4

to create DataFrame by invoking multiple series objects

In [ ]:
mass = pd.Series([0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series([4879, 120104, 12756, 6792, 142984, 120536, 51118, 49528, 2376], 
                     index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
In [ ]:
mass
Out[ ]:
Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
dtype: float64
In [ ]:
diameter
Out[ ]:
Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
dtype: int64

in the dictionary, 'mass' and 'diameter' represents the names of the columns

rows will be the union of the first series dataset and the second series dataset

In [ ]:
df = pd.DataFrame({'mass': mass, 'diameter':diameter})
In [ ]:
df
Out[ ]:
mass diameter
Mercury 0.3300 4879
Venus 4.8700 120104
Earth 5.9700 12756
Mars 0.6420 6792
Jupiter 1898.0000 142984
Saturn 568.0000 120536
Uranus 86.8000 51118
Neptune 102.0000 49528
Pluto 0.0146 2376

adding a new element to Series Object

In [ ]:
diameter = diameter.append(pd.Series(3475, index=['Moon']))
In [ ]:
diameter
Out[ ]:
Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
Moon         3475
dtype: int64
In [ ]:
df_moon = pd.DataFrame({'mass': mass, 'diameter':diameter})

because DataFrame couldnot identify the item "Moon" in the first dataset, it is placed as NaN (Not a Number)

In [ ]:
df_moon
Out[ ]:
mass diameter
Earth 5.9700 12756
Jupiter 1898.0000 142984
Mars 0.6420 6792
Mercury 0.3300 4879
Moon NaN 3475
Neptune 102.0000 49528
Pluto 0.0146 2376
Saturn 568.0000 120536
Uranus 86.8000 51118
Venus 4.8700 120104
In [ ]:
df['mass']
Out[ ]:
Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
Name: mass, dtype: float64
In [ ]:
df['diameter']
Out[ ]:
Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
Name: diameter, dtype: int64
In [ ]:
df = pd.DataFrame({'mass': mass, 'diameter':diameter})

the following data is retrieved from the recent dataframe and nothing to do with Series

In [ ]:
df['mass']
Out[ ]:
Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: mass, dtype: float64
In [ ]:
df['diameter']
Out[ ]:
Earth       12756
Jupiter    142984
Mars         6792
Mercury      4879
Moon         3475
Neptune     49528
Pluto        2376
Saturn     120536
Uranus      51118
Venus      120104
Name: diameter, dtype: int64
In [ ]:
df['mass']['Earth']
Out[ ]:
5.97
In [ ]:
df.mass.Earth
Out[ ]:
5.97
In [ ]:
df['diameter']['Earth']
Out[ ]:
12756
In [ ]:
df.diameter.Earth
Out[ ]:
12756

adding a new column to an existing dataframe

In [ ]:
df['Population'] = 0
In [ ]:
df
Out[ ]:
mass diameter Population
Earth 5.9700 12756 0
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2376 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 120104 0
In [ ]:
df['Population']['Earth'] = 8000000000
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [ ]:
df
Out[ ]:
mass diameter Population
Earth 5.9700 12756 8000000000
Jupiter 1898.0000 142984 0
Mars 0.6420 6792 0
Mercury 0.3300 4879 0
Moon NaN 3475 0
Neptune 102.0000 49528 0
Pluto 0.0146 2376 0
Saturn 568.0000 120536 0
Uranus 86.8000 51118 0
Venus 4.8700 120104 0
In [ ]:
df['pop'] = 0
In [ ]:
df
Out[ ]:
mass diameter Population pop
Earth 5.9700 12756 8000000000 0
Jupiter 1898.0000 142984 0 0
Mars 0.6420 6792 0 0
Mercury 0.3300 4879 0 0
Moon NaN 3475 0 0
Neptune 102.0000 49528 0 0
Pluto 0.0146 2376 0 0
Saturn 568.0000 120536 0 0
Uranus 86.8000 51118 0 0
Venus 4.8700 120104 0 0
In [ ]:
df['mass'] == df.mass
Out[ ]:
Earth       True
Jupiter     True
Mars        True
Mercury     True
Moon       False
Neptune     True
Pluto       True
Saturn      True
Uranus      True
Venus       True
Name: mass, dtype: bool
In [ ]:
df.mass is df['mass']
Out[ ]:
True

method to access elements as series

In [ ]:
# returns series - horizontal
df.loc['Earth']
Out[ ]:
mass          5.970000e+00
diameter      1.275600e+04
Population    8.000000e+09
pop           0.000000e+00
Name: Earth, dtype: float64
In [ ]:
type(df.loc['Earth'])
Out[ ]:
pandas.core.series.Series
In [ ]:
# returns series - Vertical
df.loc[:, 'mass']
Out[ ]:
Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: mass, dtype: float64
In [ ]:
type(df.loc[:, 'mass'])
Out[ ]:
pandas.core.series.Series

the reason for FALSE as result is that pop is a built-in function, hence built-in f() takes precedence

  • this is one of the pitfalls with using . (Dot) notation hence it is advised to use [] notation
In [ ]:
df['pop'] is df.pop
Out[ ]:
False

Homework

  • how to create a new row
  • create a mew f() to create a new row which would be a MEAN to calculate the mean of the columns mass and diameter
In [ ]:
df2 = df
In [ ]:
df2
Out[ ]:
mass diameter Population pop
Earth 5.9700 12756 8000000000 0
Jupiter 1898.0000 142984 0 0
Mars 0.6420 6792 0 0
Mercury 0.3300 4879 0 0
Moon NaN 3475 0 0
Neptune 102.0000 49528 0 0
Pluto 0.0146 2376 0 0
Saturn 568.0000 120536 0 0
Uranus 86.8000 51118 0 0
Venus 4.8700 120104 0 0
In [ ]:
new_row = {'mass':1000, 'diameter':50000, 
                    'Population':0, 'pop':12345678}
df2.loc['New Row'] = new_row
In [ ]:
df2
Out[ ]:
mass diameter Population pop
Earth 5.9700 12756 8000000000 0
Jupiter 1898.0000 142984 0 0
Mars 0.6420 6792 0 0
Mercury 0.3300 4879 0 0
Moon NaN 3475 0 0
Neptune 102.0000 49528 0 0
Pluto 0.0146 2376 0 0
Saturn 568.0000 120536 0 0
Uranus 86.8000 51118 0 0
Venus 4.8700 120104 0 0
New Row 1000.0000 50000 0 12345678
In [ ]:
Mean_row = {'mass': np.mean(mass), 'diameter': np.mean(diameter)}
df2.loc['Mean_row'] = Mean_row
In [ ]:
df2
Out[ ]:
mass diameter Population pop
Earth 5.970000 12756.0 8.000000e+09 0.0
Jupiter 1898.000000 142984.0 0.000000e+00 0.0
Mars 0.642000 6792.0 0.000000e+00 0.0
Mercury 0.330000 4879.0 0.000000e+00 0.0
Moon NaN 3475.0 0.000000e+00 0.0
Neptune 102.000000 49528.0 0.000000e+00 0.0
Pluto 0.014600 2376.0 0.000000e+00 0.0
Saturn 568.000000 120536.0 0.000000e+00 0.0
Uranus 86.800000 51118.0 0.000000e+00 0.0
Venus 4.870000 120104.0 0.000000e+00 0.0
New Row 1000.000000 50000.0 0.000000e+00 12345678.0
Mean_row 296.291844 51454.8 NaN NaN

as taught in class

In [ ]:
df.loc['Col_Mean'] = 0
In [ ]:
df
Out[ ]:
mass diameter Population pop
Earth 5.970000 12756.0 8.000000e+09 0.0
Jupiter 1898.000000 142984.0 0.000000e+00 0.0
Mars 0.642000 6792.0 0.000000e+00 0.0
Mercury 0.330000 4879.0 0.000000e+00 0.0
Moon NaN 3475.0 0.000000e+00 0.0
Neptune 102.000000 49528.0 0.000000e+00 0.0
Pluto 0.014600 2376.0 0.000000e+00 0.0
Saturn 568.000000 120536.0 0.000000e+00 0.0
Uranus 86.800000 51118.0 0.000000e+00 0.0
Venus 4.870000 120104.0 0.000000e+00 0.0
New Row 1000.000000 50000.0 0.000000e+00 12345678.0
Mean_row 296.291844 51454.8 NaN NaN
Col_Mean 0.000000 0.0 0.000000e+00 0.0
In [ ]:
np.mean(df['mass'])
Out[ ]:
330.2432037037037

to delete a row or a column

In [ ]:
df.drop('Col_Mean')
Out[ ]:
mass diameter Population pop
Earth 5.970000 12756.0 8.000000e+09 0.0
Jupiter 1898.000000 142984.0 0.000000e+00 0.0
Mars 0.642000 6792.0 0.000000e+00 0.0
Mercury 0.330000 4879.0 0.000000e+00 0.0
Moon NaN 3475.0 0.000000e+00 0.0
Neptune 102.000000 49528.0 0.000000e+00 0.0
Pluto 0.014600 2376.0 0.000000e+00 0.0
Saturn 568.000000 120536.0 0.000000e+00 0.0
Uranus 86.800000 51118.0 0.000000e+00 0.0
Venus 4.870000 120104.0 0.000000e+00 0.0
New Row 1000.000000 50000.0 0.000000e+00 12345678.0
Mean_row 296.291844 51454.8 NaN NaN
In [ ]:
df.drop('pop', axis=1)
Out[ ]:
mass diameter Population
Earth 5.970000 12756.0 8.000000e+09
Jupiter 1898.000000 142984.0 0.000000e+00
Mars 0.642000 6792.0 0.000000e+00
Mercury 0.330000 4879.0 0.000000e+00
Moon NaN 3475.0 0.000000e+00
Neptune 102.000000 49528.0 0.000000e+00
Pluto 0.014600 2376.0 0.000000e+00
Saturn 568.000000 120536.0 0.000000e+00
Uranus 86.800000 51118.0 0.000000e+00
Venus 4.870000 120104.0 0.000000e+00
New Row 1000.000000 50000.0 0.000000e+00
Mean_row 296.291844 51454.8 NaN
Col_Mean 0.000000 0.0 0.000000e+00

to reflect the changes (deletion) in the original dataframe use INPLACE

In [ ]:
df
Out[ ]:
mass diameter Population pop
Earth 5.970000 12756.0 8.000000e+09 0.0
Jupiter 1898.000000 142984.0 0.000000e+00 0.0
Mars 0.642000 6792.0 0.000000e+00 0.0
Mercury 0.330000 4879.0 0.000000e+00 0.0
Moon NaN 3475.0 0.000000e+00 0.0
Neptune 102.000000 49528.0 0.000000e+00 0.0
Pluto 0.014600 2376.0 0.000000e+00 0.0
Saturn 568.000000 120536.0 0.000000e+00 0.0
Uranus 86.800000 51118.0 0.000000e+00 0.0
Venus 4.870000 120104.0 0.000000e+00 0.0
New Row 1000.000000 50000.0 0.000000e+00 12345678.0
Mean_row 296.291844 51454.8 NaN NaN
Col_Mean 0.000000 0.0 0.000000e+00 0.0
In [ ]:
df.drop('pop', axis = 1, inplace=True)
In [ ]:
df
Out[ ]:
mass diameter Population
Earth 5.970000 12756.0 8.000000e+09
Jupiter 1898.000000 142984.0 0.000000e+00
Mars 0.642000 6792.0 0.000000e+00
Mercury 0.330000 4879.0 0.000000e+00
Moon NaN 3475.0 0.000000e+00
Neptune 102.000000 49528.0 0.000000e+00
Pluto 0.014600 2376.0 0.000000e+00
Saturn 568.000000 120536.0 0.000000e+00
Uranus 86.800000 51118.0 0.000000e+00
Venus 4.870000 120104.0 0.000000e+00
New Row 1000.000000 50000.0 0.000000e+00
Mean_row 296.291844 51454.8 NaN
Col_Mean 0.000000 0.0 0.000000e+00
In [ ]:
df.drop('Col_Mean', inplace = True)
In [ ]:
df
Out[ ]:
mass diameter Population
Earth 5.970000 12756.0 8.000000e+09
Jupiter 1898.000000 142984.0 0.000000e+00
Mars 0.642000 6792.0 0.000000e+00
Mercury 0.330000 4879.0 0.000000e+00
Moon NaN 3475.0 0.000000e+00
Neptune 102.000000 49528.0 0.000000e+00
Pluto 0.014600 2376.0 0.000000e+00
Saturn 568.000000 120536.0 0.000000e+00
Uranus 86.800000 51118.0 0.000000e+00
Venus 4.870000 120104.0 0.000000e+00
New Row 1000.000000 50000.0 0.000000e+00
Mean_row 296.291844 51454.8 NaN
In [ ]:
df.drop('Population', axis = 1, inplace = True)
In [ ]:
df
Out[ ]:
mass diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2376.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 120104.0
New Row 1000.000000 50000.0
Mean_row 296.291844 51454.8
In [ ]:
df.drop('Mean_row', inplace = True)
In [ ]:
df.drop('New Row', inplace = True)
In [ ]:
df
Out[ ]:
mass diameter
Earth 5.9700 12756.0
Jupiter 1898.0000 142984.0
Mars 0.6420 6792.0
Mercury 0.3300 4879.0
Moon NaN 3475.0
Neptune 102.0000 49528.0
Pluto 0.0146 2376.0
Saturn 568.0000 120536.0
Uranus 86.8000 51118.0
Venus 4.8700 120104.0
In [ ]:
def create_mean_row(df):
    df.loc['Col_Mean'] = [np.mean(df[col]) for col in df.columns]
    return df
In [ ]:
create_mean_row(df)
Out[ ]:
mass diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2376.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 120104.0
Col_Mean 296.291844 51454.8
In [ ]:
def create_New_Mean_Row(df):
    df.loc['Col_Mean_row'] = df.mean()
    return df
In [ ]:
create_New_Mean_Row(df)
Out[ ]:
mass diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2376.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 120104.0
Col_Mean 296.291844 51454.8
Col_Mean_row 296.291844 51454.8
In [ ]:
dff = pd.DataFrame(arr)
In [ ]:
dff.mean()
Out[ ]:
0    17.2
1    13.6
2     8.2
dtype: float64
In [ ]:
dff.mean(axis = 1)
Out[ ]:
0    21.666667
1    10.000000
2     4.000000
3    12.666667
4    16.666667
dtype: float64
In [ ]:
dff
Out[ ]:
0 1 2
0 26 25 14
1 9 20 1
2 9 2 1
3 15 17 6
4 27 4 19
In [ ]:
dff['row_mean'] = dff.mean(axis = 1)
In [ ]:
dff
Out[ ]:
0 1 2 row_mean
0 26 25 14 21.666667
1 9 20 1 10.000000
2 9 2 1 4.000000
3 15 17 6 12.666667
4 27 4 19 16.666667
In [ ]:
dff
Out[ ]:
0 1 2 row_mean
0 26 25 14 21.666667
1 9 20 1 10.000000
2 9 2 1 4.000000
3 15 17 6 12.666667
4 27 4 19 16.666667
In [ ]:
dff.loc['col_mean'] = dff.mean()
In [ ]:
dff
Out[ ]:
0 1 2 row_mean
0 26.0 25.0 14.0 21.666667
1 9.0 20.0 1.0 10.000000
2 9.0 2.0 1.0 4.000000
3 15.0 17.0 6.0 12.666667
4 27.0 4.0 19.0 16.666667
col_mean 17.2 13.6 8.2 13.000000
In [ ]:
df.median()
Out[ ]:
mass           86.8
diameter    50323.0
dtype: float64
In [ ]:
dff.loc['col_median'] = dff.median()
In [ ]:
dff
Out[ ]:
0 1 2 row_mean
0 26.0 25.0 14.0 21.666667
1 9.0 20.0 1.0 10.000000
2 9.0 2.0 1.0 4.000000
3 15.0 17.0 6.0 12.666667
4 27.0 4.0 19.0 16.666667
col_mean 17.2 13.6 8.2 13.000000
col_median 16.1 15.3 7.1 12.833333
In [ ]:
dff['row_median'] = dff.median(axis = 1)
In [ ]:
dff
Out[ ]:
0 1 2 row_mean row_median
0 26.0 25.0 14.0 21.666667 23.333333
1 9.0 20.0 1.0 10.000000 9.500000
2 9.0 2.0 1.0 4.000000 3.000000
3 15.0 17.0 6.0 12.666667 13.833333
4 27.0 4.0 19.0 16.666667 17.833333
col_mean 17.2 13.6 8.2 13.000000 13.300000
col_median 16.1 15.3 7.1 12.833333 14.066667
In [ ]:
dff['row_stddev'] = df.std(axis=1)
In [ ]:
dff
Out[ ]:
0 1 2 row_mean row_median row_stddev
0 26.0 25.0 14.0 21.666667 23.333333 NaN
1 9.0 20.0 1.0 10.000000 9.500000 NaN
2 9.0 2.0 1.0 4.000000 3.000000 NaN
3 15.0 17.0 6.0 12.666667 13.833333 NaN
4 27.0 4.0 19.0 16.666667 17.833333 NaN
col_mean 17.2 13.6 8.2 13.000000 13.300000 NaN
col_median 16.1 15.3 7.1 12.833333 14.066667 NaN
In [ ]:
dff.loc['col_stddev'] = dff.std()
In [ ]:
dff
Out[ ]:
0 1 2 row_mean row_median row_stddev
0 26.000000 25.000000 14.000000 21.666667 23.333333 NaN
1 9.000000 20.000000 1.000000 10.000000 9.500000 NaN
2 9.000000 2.000000 1.000000 4.000000 3.000000 NaN
3 15.000000 17.000000 6.000000 12.666667 13.833333 NaN
4 27.000000 4.000000 19.000000 16.666667 17.833333 NaN
col_mean 17.200000 13.600000 8.200000 13.000000 13.300000 NaN
col_median 16.100000 15.300000 7.100000 12.833333 14.066667 NaN
col_stddev 7.232302 8.283288 6.580744 5.457265 6.353152 NaN
In [ ]:
dff.quantile(0.25)
Out[ ]:
0             9.000000
1             7.212466
2             4.750000
row_mean      8.864316
row_median    8.713288
row_stddev         NaN
Name: 0.25, dtype: float64
In [ ]:
dff.min()
Out[ ]:
0             7.232302
1             2.000000
2             1.000000
row_mean      4.000000
row_median    3.000000
row_stddev         NaN
dtype: float64
In [ ]:
dff.max()
Out[ ]:
0             27.000000
1             25.000000
2             19.000000
row_mean      21.666667
row_median    23.333333
row_stddev          NaN
dtype: float64
In [ ]:
dff.describe()
Out[ ]:
0 1 2 row_mean row_median row_stddev
count 8.000000 8.000000 8.000000 8.000000 8.000000 0.0
mean 15.816538 13.147911 7.860093 12.036325 12.652477 NaN
std 7.540872 7.916730 6.114476 5.709115 6.408984 NaN
min 7.232302 2.000000 1.000000 4.000000 3.000000 NaN
25% 9.000000 7.212466 4.750000 8.864316 8.713288 NaN
50% 15.550000 14.450000 6.840372 12.750000 13.566667 NaN
75% 19.400000 17.750000 9.650000 13.916667 15.008333 NaN
max 27.000000 25.000000 19.000000 21.666667 23.333333 NaN
In [ ]:
df
Out[ ]:
mass diameter
Earth 5.970000 12756.0
Jupiter 1898.000000 142984.0
Mars 0.642000 6792.0
Mercury 0.330000 4879.0
Moon NaN 3475.0
Neptune 102.000000 49528.0
Pluto 0.014600 2376.0
Saturn 568.000000 120536.0
Uranus 86.800000 51118.0
Venus 4.870000 120104.0
Col_Mean 296.291844 51454.8
Col_Mean_row 296.291844 51454.8
In [ ]:
mass = pd.Series(
    [0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146], 
                 index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
diameter = pd.Series(
    [4879, 120104, 12756, 6792, 142984, 120536, 51118, 49528, 2376], 
                     index=['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'])
In [ ]:
planets = pd.DataFrame({'mass': mass, 'diameter':diameter})
In [ ]:
planets.describe()
Out[ ]:
mass diameter
count 9.000000 9.000000
mean 296.291844 56785.888889
std 627.786429 56657.917916
min 0.014600 2376.000000
25% 0.642000 6792.000000
50% 5.970000 49528.000000
75% 102.000000 120104.000000
max 1898.000000 142984.000000
In [ ]:
planets.describe
Out[ ]:
<bound method NDFrame.describe of               mass  diameter
Mercury     0.3300      4879
Venus       4.8700    120104
Earth       5.9700     12756
Mars        0.6420      6792
Jupiter  1898.0000    142984
Saturn    568.0000    120536
Uranus     86.8000     51118
Neptune   102.0000     49528
Pluto       0.0146      2376>
In [1]:
!pip install nbconvert
%shell jupyter nbconvert --to html /content/testfile.ipynb
Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (5.6.1)
Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.6.1)
Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.4)
Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.1.1)
Requirement already satisfied: jinja2>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.11.3)
Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.6.0)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.7.1)
Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.1.0)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (1.5.0)
Requirement already satisfied: nbformat>=4.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.1.3)
Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.8.4)
Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.9.2)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2>=2.4->nbconvert) (2.0.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.4->nbconvert) (4.3.3)
Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.4->nbconvert) (0.2.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (0.18.1)
Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (4.11.2)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.10.0.2)
Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (5.4.0)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (21.4.0)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.7.0)
Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (1.15.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (0.5.1)
Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->bleach->nbconvert) (3.0.7)
In [ ]: