import numpy as np
import pandas as pd

# Planet masses, keyed by planet name (insertion order is the index order).
mass = pd.Series({
    'Mercury': 0.33,
    'Venus': 4.87,
    'Earth': 5.97,
    'Mars': 0.642,
    'Jupiter': 1898,
    'Saturn': 568,
    'Uranus': 86.8,
    'Neptune': 102,
    'Pluto': 0.0146,
})

# Planet diameters, keyed by the same planet names in the same order.
# NOTE(review): Venus (120104) is roughly 9x Earth (12756) here, which looks
# like a possible typo -- confirm against the data source before relying on it.
diameter = pd.Series({
    'Mercury': 4879,
    'Venus': 120104,
    'Earth': 12756,
    'Mars': 6792,
    'Jupiter': 142984,
    'Saturn': 120536,
    'Uranus': 51118,
    'Neptune': 49528,
    'Pluto': 2376,
})

A DataFrame is a collection of Series that share a common index: each column (and each row) can be retrieved as a Series.
It can also be created from a NumPy array.

DataFrame in pandas¶

Creating a two dimensional numpy array¶

arr = np.random.randint(1, 30, (5, 3))
arr

array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])

arr

array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])

# Creating a dataframe from the numpy array
df = pd.DataFrame(arr)

df

df.values

array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])

df.columns

RangeIndex(start=0, stop=3, step=1)

df[2:3]

df.index

RangeIndex(start=0, stop=5, step=1)

for c in df.columns:
    print(c)

0
1
2

df.index = ["R1", "R2", "R3", "R4", "R5"]
df.columns = ["C1", "C2", "C3"]

df

Using the loc and iloc indexers¶

returns a single element¶

df.loc['R3', 'C2']

2

df.iloc[2,1]

2

df.iloc[4,2]

19

returns a DataFrame¶

df.iloc[2:4, 1:3]

type(df.iloc[2:4, 1:3])

pandas.core.frame.DataFrame

returns a series¶

arr

array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])

df.iloc[0]

C1    26
C2    25
C3    14
Name: R1, dtype: int64

type(df.iloc[0])

pandas.core.series.Series

df.iloc[:2]

df.iloc[:1]

df.iloc[0:3]

df.iloc[:3]

df.iloc[:,0]

R1    26
R2     9
R3     9
R4    15
R5    27
Name: C1, dtype: int64

df.iloc[:,2]

R1    14
R2     1
R3     1
R4     6
R5    19
Name: C3, dtype: int64

df.iloc[2,:]

C1    9
C2    2
C3    1
Name: R3, dtype: int64

arr

array([[26, 25, 14],
       [ 9, 20,  1],
       [ 9,  2,  1],
       [15, 17,  6],
       [27,  4, 19]])

df.loc["R2"]

C1     9
C2    20
C3     1
Name: R2, dtype: int64

df.shape

(5, 3)

df.T

homework to create a dataframe¶

def create_df(nrows, ncols):
    """Build a DataFrame of random integers and echo the underlying array.

    Draws an ``nrows`` x ``ncols`` array with ``np.random.randint(1, 30, ...)``
    (values 1 through 29, the upper bound is exclusive), prints that array,
    and returns it wrapped in a DataFrame with default integer labels.

    Args:
        nrows: number of rows to generate.
        ncols: number of columns to generate.

    Returns:
        A ``pd.DataFrame`` holding the generated values.
    """
    arr1 = np.random.randint(1, 30, (nrows, ncols))
    print(arr1)
    # The frame used to be built into an unused local and thrown away;
    # return it so the function actually hands back what it creates.
    return pd.DataFrame(arr1)

%%time
create_df(3, 4)

[[20  9 20 25]
 [16 23 23 19]
 [12 27 23 17]]
CPU times: user 1.36 ms, sys: 0 ns, total: 1.36 ms
Wall time: 1.6 ms

create_df(4, 5)

[[ 2  4 17 25 22]
 [23 13 13 16 22]
 [ 8 29 17 23  6]
 [23  3  4 24 13]]

def createDF(nRows, nCols):
    """Print an ``nRows`` x ``nCols`` DataFrame of random integers.

    The values come from ``np.random.randint(1, 30, ...)``, i.e. 1 through 29.
    The frame is only written to stdout; nothing is returned.
    """
    shape = (nRows, nCols)
    random_values = np.random.randint(1, 30, size=shape)
    frame = pd.DataFrame(random_values)
    print(frame)

%%time
createDF(4, 3)

    0   1   2
0  29   3   2
1  29  10  20
2  13  11  22
3  28   8  12
CPU times: user 2.95 ms, sys: 0 ns, total: 2.95 ms
Wall time: 3.56 ms

createDF(10, 10)

    0   1   2   3   4   5   6   7   8   9
0  25  10   2  29   1  17   8  20   6  25
1  10   4  23  14  29  20   5   8  13  16
2   5   7  17  24   2   6  26  22  23  21
3  24   4  19   4  12  27  18   8  26  29
4  16   9   9   3  13   2  15  25  23  13
5  18  12  16   9  16  14  24  22   7  17
6  10  10  12   9  15  27   7   1  20  24
7  27   4   9  22  27  10  20  21  20  22
8   4   7  27   7  21  21   8   8  16  12
9  11  18   2   3   6  17  18  18  27   5

as taught in class¶

import numpy as np
import pandas as pd

def create_DF(nRows, nCols, maxData=10):
    """Return a labelled DataFrame filled with random integers.

    Cells are drawn with ``np.random.randint(0, maxData, ...)``, so every value
    lies in ``0 .. maxData - 1``. Rows are labelled ``R1 .. R<nRows>`` and
    columns ``C1 .. C<nCols>``.
    """
    values = np.random.randint(0, maxData, size=(nRows, nCols))
    row_labels = [f"R{i}" for i in range(1, nRows + 1)]
    col_labels = [f"C{j}" for j in range(1, nCols + 1)]
    return pd.DataFrame(values, index=row_labels, columns=col_labels)

create_DF(4, 5, 50)

create_DF(2, 5)

to create DataFrame by invoking multiple series objects¶

# Two Series that share the same planet labels, in the same order.
mass = pd.Series(
    data=[0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146],
    index=[
        'Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter',
        'Saturn', 'Uranus', 'Neptune', 'Pluto',
    ],
)
diameter = pd.Series(
    data=[4879, 120104, 12756, 6792, 142984, 120536, 51118, 49528, 2376],
    index=[
        'Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter',
        'Saturn', 'Uranus', 'Neptune', 'Pluto',
    ],
)

mass

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
dtype: float64

diameter

Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
dtype: int64

in the dictionary, the keys 'mass' and 'diameter' become the names of the columns¶

the rows will be the union of the index labels of the first Series and the second Series¶

df = pd.DataFrame({'mass': mass, 'diameter':diameter})

df

adding a new element to Series Object¶

# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat appends the new labelled value on every pandas version.
diameter = pd.concat([diameter, pd.Series(3475, index=['Moon'])])

diameter

Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
Moon         3475
dtype: int64

df_moon = pd.DataFrame({'mass': mass, 'diameter':diameter})

because the label "Moon" is missing from the first Series (mass), its mass value is filled in as NaN (Not a Number)¶

df_moon

df['mass']

Mercury       0.3300
Venus         4.8700
Earth         5.9700
Mars          0.6420
Jupiter    1898.0000
Saturn      568.0000
Uranus       86.8000
Neptune     102.0000
Pluto         0.0146
Name: mass, dtype: float64

df['diameter']

Mercury      4879
Venus      120104
Earth       12756
Mars         6792
Jupiter    142984
Saturn     120536
Uranus      51118
Neptune     49528
Pluto        2376
Name: diameter, dtype: int64

df = pd.DataFrame({'mass': mass, 'diameter':diameter})

the following data is retrieved from the most recent DataFrame and has nothing to do with the original Series¶

df['mass']

Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: mass, dtype: float64

df['diameter']

Earth       12756
Jupiter    142984
Mars         6792
Mercury      4879
Moon         3475
Neptune     49528
Pluto        2376
Saturn     120536
Uranus      51118
Venus      120104
Name: diameter, dtype: int64

df['mass']['Earth']

5.97

df.mass.Earth

5.97

df['diameter']['Earth']

12756

df.diameter.Earth

12756

adding a new column to an existing dataframe¶

df['Population'] = 0

df

# Assign through one .loc call. The chained form df['Population']['Earth'] = ...
# raises SettingWithCopyWarning and may write to a temporary copy instead of df.
df.loc['Earth', 'Population'] = 8000000000

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

df

df['pop'] = 0

df

df['mass'] == df.mass

Earth       True
Jupiter     True
Mars        True
Mercury     True
Moon       False
Neptune     True
Pluto       True
Saturn      True
Uranus      True
Venus       True
Name: mass, dtype: bool

df.mass is df['mass']

True

method to access elements as series¶

# returns series - horizontal
df.loc['Earth']

mass          5.970000e+00
diameter      1.275600e+04
Population    8.000000e+09
pop           0.000000e+00
Name: Earth, dtype: float64

type(df.loc['Earth'])

pandas.core.series.Series

# returns series - Vertical
df.loc[:, 'mass']

Earth         5.9700
Jupiter    1898.0000
Mars          0.6420
Mercury       0.3300
Moon             NaN
Neptune     102.0000
Pluto         0.0146
Saturn      568.0000
Uranus       86.8000
Venus         4.8700
Name: mass, dtype: float64

type(df.loc[:, 'mass'])

pandas.core.series.Series

the reason for the False result is that pop is an existing DataFrame method, so df.pop resolves to that method rather than to the 'pop' column¶

this is one of the pitfalls of using . (dot) notation, hence it is advised to use [] notation

df['pop'] is df.pop

False

Homework¶

how to create a new row
create a new function that adds a new row holding the mean of each of the columns mass and diameter

df2 = df

df2

new_row = {'mass':1000, 'diameter':50000, 
                    'Population':0, 'pop':12345678}
df2.loc['New Row'] = new_row

df2

Mean_row = {'mass': np.mean(mass), 'diameter': np.mean(diameter)}
df2.loc['Mean_row'] = Mean_row

df2

as taught in class¶

df.loc['Col_Mean'] = 0

df

np.mean(df['mass'])

330.2432037037037

to delete a row or a column¶

df.drop('Col_Mean')

df.drop('pop', axis=1)

to apply the changes (deletion) to the original dataframe, pass inplace=True¶

df

df.drop('pop', axis = 1, inplace=True)

df

df.drop('Col_Mean', inplace = True)

df

df.drop('Population', axis = 1, inplace = True)

df

df.drop('Mean_row', inplace = True)

df.drop('New Row', inplace = True)

df

def create_mean_row(df):
    """Add a ``'Col_Mean'`` row holding the mean of every column.

    The means are computed column by column with ``np.mean`` and written into
    ``df`` itself under the label ``'Col_Mean'``; the same (mutated) frame is
    returned.
    """
    column_means = []
    for name in df.columns:
        column_means.append(np.mean(df[name]))
    df.loc['Col_Mean'] = column_means
    return df

create_mean_row(df)

def create_New_Mean_Row(df):
    """Add a ``'Col_Mean_row'`` row holding ``df.mean()`` for each column.

    The row is written into ``df`` itself, and that same frame is returned.
    """
    column_means = df.mean(axis=0)
    df.loc['Col_Mean_row'] = column_means
    return df

create_New_Mean_Row(df)

dff = pd.DataFrame(arr)

dff.mean()

0    17.2
1    13.6
2     8.2
dtype: float64

dff.mean(axis = 1)

0    21.666667
1    10.000000
2     4.000000
3    12.666667
4    16.666667
dtype: float64

dff

dff['row_mean'] = dff.mean(axis = 1)

dff

dff

dff.loc['col_mean'] = dff.mean()

dff

df.median()

mass           86.8
diameter    50323.0
dtype: float64

dff.loc['col_median'] = dff.median()

dff

dff['row_median'] = dff.median(axis = 1)

dff

# Take the row-wise std of dff itself. The original used df (the planets frame),
# whose row labels do not match dff's, so alignment filled the column with NaN.
dff['row_stddev'] = dff.std(axis=1)

dff

dff.loc['col_stddev'] = dff.std()

dff

dff.quantile(0.25)

0             9.000000
1             7.212466
2             4.750000
row_mean      8.864316
row_median    8.713288
row_stddev         NaN
Name: 0.25, dtype: float64

dff.min()

0             7.232302
1             2.000000
2             1.000000
row_mean      4.000000
row_median    3.000000
row_stddev         NaN
dtype: float64

dff.max()

0             27.000000
1             25.000000
2             19.000000
row_mean      21.666667
row_median    23.333333
row_stddev          NaN
dtype: float64

dff.describe()

df

# Rebuild the two planet Series from (label, value) pairs, then combine them
# column-wise; both share the same labels, so no NaN rows are introduced.
mass = pd.Series(dict(zip(
    ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'],
    [0.33, 4.87, 5.97, 0.642, 1898, 568, 86.8, 102, 0.0146],
)))
diameter = pd.Series(dict(zip(
    ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto'],
    [4879, 120104, 12756, 6792, 142984, 120536, 51118, 49528, 2376],
)))

planets = pd.DataFrame(dict(mass=mass, diameter=diameter))

planets.describe()

planets.describe

<bound method NDFrame.describe of               mass  diameter
Mercury     0.3300      4879
Venus       4.8700    120104
Earth       5.9700     12756
Mars        0.6420      6792
Jupiter  1898.0000    142984
Saturn    568.0000    120536
Uranus     86.8000     51118
Neptune   102.0000     49528
Pluto       0.0146      2376>

!pip install nbconvert
%shell jupyter nbconvert --to html /content/testfile.ipynb

Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (5.6.1)
Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.6.1)
Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.4)
Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.1.1)
Requirement already satisfied: jinja2>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.11.3)
Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.6.0)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.7.1)
Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.1.0)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (1.5.0)
Requirement already satisfied: nbformat>=4.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.1.3)
Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.8.4)
Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.9.2)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2>=2.4->nbconvert) (2.0.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.4->nbconvert) (4.3.3)
Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.4->nbconvert) (0.2.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (0.18.1)
Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (4.11.2)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.10.0.2)
Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (5.4.0)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (21.4.0)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.7.0)
Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (1.15.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (0.5.1)
Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->bleach->nbconvert) (3.0.7)

	mass	diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2376.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	120104.0
New Row	1000.000000	50000.0
Mean_row	296.291844	51454.8

	mass	diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2376.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	120104.0
Col_Mean	296.291844	51454.8

	mass	diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2376.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	120104.0
Col_Mean	296.291844	51454.8
Col_Mean_row	296.291844	51454.8

	mass	diameter
Earth	5.970000	12756.0
Jupiter	1898.000000	142984.0
Mars	0.642000	6792.0
Mercury	0.330000	4879.0
Moon	NaN	3475.0
Neptune	102.000000	49528.0
Pluto	0.014600	2376.0
Saturn	568.000000	120536.0
Uranus	86.800000	51118.0
Venus	4.870000	120104.0
Col_Mean	296.291844	51454.8
Col_Mean_row	296.291844	51454.8

	mass	diameter
count	9.000000	9.000000
mean	296.291844	56785.888889
std	627.786429	56657.917916
min	0.014600	2376.000000
25%	0.642000	6792.000000
50%	5.970000	49528.000000
75%	102.000000	120104.000000
max	1898.000000	142984.000000

Consulting - Quality

PANDAS DATAFRAME

class work

DataFrame in pandas¶

Creating a two dimensional numpy array¶

Using Loc and iLoc functions¶

returns a single element¶

returns a DataFrame¶

returns a series¶

homework to create a dataframe¶

as taught in class¶

to create DataFrame by invoking multiple series objects¶

in the dictionary, 'mass' and 'diameter' represents the names of the columns¶

rows will be the union of the first series dataset and the second series dataset¶

adding a new element to Series Object¶

because the label "Moon" is missing from the first Series (mass), its mass value is filled in as NaN (Not a Number)¶

the following data is retrieved from the recent dataframe and nothing to do with Series¶

adding a new column to an existing dataframe¶

method to access elements as series¶

the reason for the False result is that pop is an existing DataFrame method, so df.pop resolves to that method rather than to the 'pop' column¶

Homework¶

as taught in class¶

to delete a row or a column¶

to reflect the changes (deletion) in the original dataframe use INPLACE¶

	mass	diameter
Mercury	0.3300	4879
Venus	4.8700	120104
Earth	5.9700	12756
Mars	0.6420	6792
Jupiter	1898.0000	142984
Saturn	568.0000	120536
Uranus	86.8000	51118
Neptune	102.0000	49528
Pluto	0.0146	2376

	0	1	2	row_mean
0	26	25	14	21.666667
1	9	20	1	10.000000
2	9	2	1	4.000000
3	15	17	6	12.666667
4	27	4	19	16.666667

	0	1	2	row_mean
0	26.0	25.0	14.0	21.666667
1	9.0	20.0	1.0	10.000000
2	9.0	2.0	1.0	4.000000
3	15.0	17.0	6.0	12.666667
4	27.0	4.0	19.0	16.666667
col_mean	17.2	13.6	8.2	13.000000

	0	1	2	row_mean	row_median	row_stddev
0	26.000000	25.000000	14.000000	21.666667	23.333333	NaN
1	9.000000	20.000000	1.000000	10.000000	9.500000	NaN
2	9.000000	2.000000	1.000000	4.000000	3.000000	NaN
3	15.000000	17.000000	6.000000	12.666667	13.833333	NaN
4	27.000000	4.000000	19.000000	16.666667	17.833333	NaN
col_mean	17.200000	13.600000	8.200000	13.000000	13.300000	NaN
col_median	16.100000	15.300000	7.100000	12.833333	14.066667	NaN
col_stddev	7.232302	8.283288	6.580744	5.457265	6.353152	NaN

	0	1	2	row_mean	row_median	row_stddev
count	8.000000	8.000000	8.000000	8.000000	8.000000	0.0
mean	15.816538	13.147911	7.860093	12.036325	12.652477	NaN
std	7.540872	7.916730	6.114476	5.709115	6.408984	NaN
min	7.232302	2.000000	1.000000	4.000000	3.000000	NaN
25%	9.000000	7.212466	4.750000	8.864316	8.713288	NaN
50%	15.550000	14.450000	6.840372	12.750000	13.566667	NaN
75%	19.400000	17.750000	9.650000	13.916667	15.008333	NaN
max	27.000000	25.000000	19.000000	21.666667	23.333333	NaN