import numpy as np
import pandas as pd
import seaborn as sns

Dealing with missing data

Numpy

x = np.array([1, 2, 3, 4, 5])

x.sum()

15

print(x.dtype)

int64

x = np.array([1, 2, 3, '--', 5])

print(x.dtype)

<U21

x.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     36 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     37          initial=_NoValue, where=True):
---> 38     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     39 
     40 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: cannot perform reduce with flexible type

x = np.array([1, 2, 3, None, 5])

x.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     36 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     37          initial=_NoValue, where=True):
---> 38     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     39 
     40 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

x = np.array([1, 2, 3, np.nan, 5])

x.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     36 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     37          initial=_NoValue, where=True):
---> 38     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     39 
     40 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

1 * np.nan

nan

x_b = np.array([True, True, True, False, True])

x[x_b]

array([1, 2, 3, 5], dtype=object)

x[x_b].mean()

2.75

m_x = np.ma.masked_array(x, mask = [0, 0, 0, 1, 0])

m_x.mean()

2.75

Dealing with missing data with Pandas

df = pd.read_csv("rooms.csv")

df.head()

df.dtypes

Room_Number     float64
Num_Students     object
Department       object
Occupied         object
dtype: object

%timeit np.arange(100000, dtype="int").sum()

1000 loops, best of 3: 264 µs per loop

%timeit np.arange(100000, dtype="object").sum()

100 loops, best of 3: 6.49 ms per loop

df.Room_Number.isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: Room_Number, dtype: bool

df.Room_Number.isnull().sum()

1

df.isnull()

df.isnull().sum()

Room_Number     1
Num_Students    3
Department      0
Occupied        1
dtype: int64

missing_values = ["NA", "n/a", "na"]

df = pd.read_csv("rooms.csv", 
                 na_values = missing_values)

df.isnull()

df.Num_Students.mean()

2.0

missing_values = ["NA", "n/a", "na", "Empty", "--"]

df = pd.read_csv("rooms.csv", 
                 na_values = missing_values)

df.isnull()

df.Department.unique()

array(['Mechanical', nan, 'Electrical', 'Chemical', 'Civil', 'CS'],
      dtype=object)

# Treat a missing occupancy flag as "N" (not occupied).
# Assign the result back instead of calling fillna(..., inplace=True) on
# df.Occupied: an in-place call on a column obtained from the frame is a
# chained update, which does not modify df when pandas Copy-on-Write is active.
df["Occupied"] = df["Occupied"].fillna("N")

df

def convert_to_binary(v):
    """Map an occupancy flag to a boolean.

    Args:
        v: A single cell value. Only a value equal to the string ``'Y'``
            counts as occupied.

    Returns:
        ``True`` when ``v == 'Y'``, otherwise ``False`` (this includes
        ``'N'``, any other string, ``None`` and NaN).
    """
    # bool() keeps the result a plain Python bool even when the comparison
    # itself yields some other truthy/falsy object.
    return bool(v == 'Y')

df.Occupied = df.Occupied.apply(convert_to_binary)

df

# Keep a second copy of Department so forward-fill and backward-fill can be
# compared side by side.
df["Dept2"] = df.Department

# Forward-fill: a missing department takes the value of the previous row.
# ffill() replaces the deprecated fillna(method="pad"); the result is assigned
# back rather than using a chained inplace=True call on the column.
df["Department"] = df["Department"].ffill()

df

# Backward-fill: a missing department takes the value of the next row.
df["Dept2"] = df["Dept2"].bfill()

df

# Replace missing student counts with the column median.
df["Num_Students"] = df["Num_Students"].fillna(df["Num_Students"].median())

df

# Fill the missing room number by interpolating between its neighbours
# (interpolate() defaults to linear interpolation).
df["Room_Number"] = df["Room_Number"].interpolate()

df

Open-ended descriptive statistics

http://research.aspiringminds.com/resources/#datasets

df = pd.read_excel("ameo_2015.xlsx")

df.head()

df.shape

(3998, 38)

df.isnull().sum().sum()

0

df.dtypes

ID                                int64
Salary                            int64
DOJ                      datetime64[ns]
DOL                              object
Designation                      object
JobCity                          object
Gender                           object
DOB                      datetime64[ns]
10percentage                    float64
10board                          object
12graduation                      int64
12percentage                    float64
12board                          object
CollegeID                         int64
CollegeTier                       int64
Degree                           object
Specialization                   object
collegeGPA                      float64
CollegeCityID                     int64
CollegeCityTier                   int64
CollegeState                     object
GraduationYear                    int64
English                           int64
Logical                           int64
Quant                             int64
Domain                          float64
ComputerProgramming               int64
ElectronicsAndSemicon             int64
ComputerScience                   int64
MechanicalEngg                    int64
ElectricalEngg                    int64
TelecomEngg                       int64
CivilEngg                         int64
conscientiousness               float64
agreeableness                   float64
extraversion                    float64
nueroticism                     float64
openess_to_experience           float64
dtype: object

df.Gender.unique()

array(['f', 'm'], dtype=object)

sns.violinplot(x='Gender', y='Salary', data=df);

df[['10percentage', '12percentage', 'collegeGPA', 'Gender']].groupby('Gender').mean()

df[['10percentage', '12percentage', 'collegeGPA', 'Gender']].groupby('Gender').median()

df[['conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience', 'Gender']].groupby('Gender').mean()

df[['conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience', 'Gender']].groupby('Gender').median()

df[['Salary', 'Gender']].groupby('Gender').mean()

th = df.Salary.mean()+df.Salary.std()

df['HighIncome'] = (df.Salary > th)

df.sample(10)

df.head()

df[['Salary', 'HighIncome', 'Gender']].groupby(['HighIncome', 'Gender']).mean()

df[['Salary', 'HighIncome', 'Gender']].groupby(['HighIncome', 'Gender']).count()

print('Low income female percentage', 917/(2809+917)*100)

Low income female percentage 24.610842726784757

print('High income female percentage', 40/(232+40)*100)

High income female percentage 14.705882352941178

df.CollegeTier.unique()

array([2, 1])

df[['CollegeTier', 'HighIncome', 'Salary']].groupby(['HighIncome', 'CollegeTier']).count()

print('Low income college tier 2 percentage is', 3492/(3492+234)*100)

Low income college tier 2 percentage is 93.71980676328504

print('High income college tier 2 percentage is', 209/(209+63)*100)

High income college tier 2 percentage is 76.83823529411765

df[['Gender', 'CollegeTier', 'Salary']].groupby(['CollegeTier', 'Gender']).count()

print('In college tier 1 female percentage is', 51/(246+51)*100)

In college tier 1 female percentage is 17.17171717171717

print('In college tier 2 female percentage is', 906/(906+2795)*100)

In college tier 2 female percentage is 24.479870305322883

Agriculture example

df = pd.read_csv('apy.csv', na_values="=")

df.head()

df.State_Name.unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ',
       'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

df.Crop_Year.unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2010, 1997, 1998, 1999,
       2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015])

df.dtypes

State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production       float64
dtype: object

df.Season.unique()

array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)

df.Crop.unique()

pd.to_numeric(df.Production)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "="

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-114-f0c12b31c09e> in <module>()
----> 1 pd.to_numeric(df.Production)

/usr/local/lib/python3.6/dist-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    148         try:
    149             values = lib.maybe_convert_numeric(
--> 150                 values, set(), coerce_numeric=coerce_numeric
    151             )
    152         except (ValueError, TypeError):

pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "=" at position 623

df.Production.isnull().sum()

3727

df.shape

(246091, 7)

df.dropna(inplace=True)

df.shape

(242364, 7)

sns.kdeplot(df.Production)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7e2dec4e0>

# Pass the series as x= explicitly: a bare positional vector is interpreted
# as `x` by older seaborn releases but as `data` by newer ones, which changes
# the orientation of the box.
sns.boxplot(x=df.Production)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7e2a36fd0>

# Pass the series as x= explicitly: a bare positional vector is interpreted
# as `x` by older seaborn releases but as `data` by newer ones, which changes
# the orientation of the box.
sns.boxplot(x=df.Area)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7e29cd048>

sns.kdeplot(df.Area)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7e2a265f8>

df[df.State_Name == "Karnataka"]['District_Name'].unique()

array(['BAGALKOT', 'BANGALORE RURAL', 'BELGAUM', 'BELLARY',
       'BENGALURU URBAN', 'BIDAR', 'BIJAPUR', 'CHAMARAJANAGAR',
       'CHIKBALLAPUR', 'CHIKMAGALUR', 'CHITRADURGA', 'DAKSHIN KANNAD',
       'DAVANGERE', 'DHARWAD', 'GADAG', 'GULBARGA', 'HASSAN', 'HAVERI',
       'KODAGU', 'KOLAR', 'KOPPAL', 'MANDYA', 'MYSORE', 'RAICHUR',
       'RAMANAGARA', 'SHIMOGA', 'TUMKUR', 'UDUPI', 'UTTAR KANNAD',
       'YADGIR'], dtype=object)

# Total area and production per state, crop and year.
# The numeric columns are selected explicitly: without the selection, newer
# pandas also tries to "sum" the remaining text columns (District_Name, Season).
df.groupby(['State_Name', 'Crop', 'Crop_Year'])[['Area', 'Production']].sum()

df[df.State_Name == "West Bengal"]['Crop'].unique()

array(['Rice', 'Jute', 'Mesta', 'Urad', 'Gram', 'Khesari', 'Masoor',
       'Moong(Green Gram)', 'Oilseeds total', 'Wheat', 'Arecanut',
       'Arhar/Tur', 'Coconut ', 'Dry chillies', 'Groundnut', 'Linseed',
       'Maize', 'Potato', 'Pulses total', 'Rapeseed &Mustard', 'Sesamum',
       'Sugarcane', 'Turmeric', 'Dry ginger', 'Sunflower',
       'Peas & beans (Pulses)', 'Cotton(lint)', 'Safflower', 'Garlic',
       'Barley', 'Bajra', 'Horse-gram', 'Other Kharif pulses', 'Soyabean',
       'Jowar', 'Niger seed', 'Sannhamp', 'Small millets', 'Tobacco',
       'Ragi', 'Other  Rabi pulses', 'Cardamom', 'Castor seed', 'Moth'],
      dtype=object)

# Total area and production per state and year.
# The numeric columns are selected explicitly: without the selection, newer
# pandas also tries to "sum" the remaining text columns
# (District_Name, Season, Crop).
df.groupby(['State_Name', 'Crop_Year'])[['Area', 'Production']].sum()

df_ = df.groupby(['State_Name', 'Crop_Year'])[['Area', 'Production']].sum()

# Turn the (State_Name, Crop_Year) index back into ordinary columns so they
# can be referenced by name when plotting.
df_.reset_index(inplace=True)

df_.head()

df_[['State_Name', 'Crop_Year']].groupby('State_Name').count()

sns.lineplot(x="Crop_Year", y="Production", data=df[df.State_Name == "Tamil Nadu"]);

sns.lineplot(x="Crop_Year", y="Production", data=df, hue="State_Name");

!pip3 install plotly_express

import plotly_express as px

px.scatter(df_, x="Area", y="Production", animation_frame="Crop_Year", 
           animation_group="State_Name", color="State_Name")

df_.sort_values('Crop_Year', inplace=True)

df[(df.State_Name == "Kerala") & (df.Crop_Year == 2000)].sort_values('Production')

# Restrict to four cereal crops, then total area and production per state and
# year. The numeric columns are selected explicitly so the text columns are
# never passed to sum().
df_ = (
    df[df.Crop.isin(['Rice', 'Wheat', 'Maize', 'Ragi'])]
    .groupby(['State_Name', 'Crop_Year'])[['Area', 'Production']]
    .sum()
)

df_.head()

df_.reset_index(inplace=True)

df_.sort_values('Crop_Year', inplace=True)

px.scatter(df_, x="Area", y="Production", animation_frame="Crop_Year", 
           animation_group="State_Name", color="State_Name")

df_['Efficiency'] = df_['Production'] / df_['Area']

px.scatter(df_, x="Area", y="Efficiency", size="Production", animation_frame="Crop_Year", 
           animation_group="State_Name", color="State_Name", range_y = [0.75, 5], range_x=[-1E6, 20E6])

End cell

!pip install nbconvert

Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (5.6.1)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (1.5.0)
Requirement already satisfied: nbformat>=4.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.2.0)
Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.9.2)
Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.4)
Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.6.0)
Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert) (4.1.0)
Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (5.1.1)
Requirement already satisfied: jinja2>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.11.3)
Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.8.4)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert) (0.7.1)
Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from nbconvert) (2.6.1)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2>=2.4->nbconvert) (2.0.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.4->nbconvert) (4.3.3)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.10.0.2)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (21.4.0)
Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (5.4.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (0.18.1)
Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (4.11.3)
Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (3.7.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (0.5.1)
Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (1.15.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->bleach->nbconvert) (3.0.7)

	ID	Salary	DOJ	DOL	Designation	JobCity	Gender	DOB	10percentage	10board	12graduation	12percentage	12board	CollegeID	CollegeTier	Degree	Specialization	collegeGPA	CollegeCityID	CollegeCityTier	CollegeState	GraduationYear	English	Logical	Quant	Domain	ComputerProgramming	ElectronicsAndSemicon	ComputerScience	MechanicalEngg	ElectricalEngg	TelecomEngg	CivilEngg	conscientiousness	agreeableness	extraversion	nueroticism	openess_to_experience
0	203097	420000	2012-06-01	present	senior quality engineer	Bangalore	f	1990-02-19	84.3	board ofsecondary education,ap	2007	95.8	board of intermediate education,ap	1141	2	B.Tech/B.E.	computer engineering	78.00	1141	0	Andhra Pradesh	2011	515	585	525	0.635979	445	-1	-1	-1	-1	-1	-1	0.9737	0.8128	0.5269	1.35490	-0.4455
1	579905	500000	2013-09-01	present	assistant manager	Indore	m	1989-10-04	85.4	cbse	2007	85.0	cbse	5807	2	B.Tech/B.E.	electronics and communication engineering	70.06	5807	0	Madhya Pradesh	2012	695	610	780	0.960603	-1	466	-1	-1	-1	-1	-1	-0.7335	0.3789	1.2396	-0.10760	0.8637
2	810601	325000	2014-06-01	present	systems engineer	Chennai	f	1992-08-03	85.0	cbse	2010	68.2	cbse	64	2	B.Tech/B.E.	information technology	70.00	64	0	Uttar Pradesh	2014	615	545	370	0.450877	395	-1	-1	-1	-1	-1	-1	0.2718	1.7109	0.1637	-0.86820	0.6721
3	267447	1100000	2011-07-01	present	senior software engineer	Gurgaon	m	1989-12-05	85.6	cbse	2007	83.6	cbse	6920	1	B.Tech/B.E.	computer engineering	74.64	6920	1	Delhi	2011	635	585	625	0.974396	615	-1	-1	-1	-1	-1	-1	0.0464	0.3448	-0.3440	-0.40780	-0.9194
4	343523	200000	2014-03-01	2015-03-01 00:00:00	get	Manesar	m	1991-02-27	78.0	cbse	2008	76.8	cbse	11368	2	B.Tech/B.E.	electronics and communication engineering	73.90	11368	0	Uttar Pradesh	2012	545	625	465	0.124502	-1	233	-1	-1	-1	-1	-1	-0.8810	-0.2793	-1.0697	0.09163	-0.1295

	10percentage	12percentage	collegeGPA
Gender
f	80.932894	77.007618	74.048056
m	76.979000	73.666636	70.679947

	10percentage	12percentage	collegeGPA
Gender
f	82.4	77.0	74.00
m	78.0	73.4	70.66

	conscientiousness	agreeableness	extraversion	nueroticism	openess_to_experience
Gender
f	0.121034	0.292444	0.012173	-0.179358	0.038246
m	-0.087826	0.100566	-0.000198	-0.165783	-0.193609

	conscientiousness	agreeableness	extraversion	nueroticism	openess_to_experience
Gender
f	0.2718	0.3789	0.0914	-0.23440	0.0973
m	-0.0154	0.2124	0.0914	-0.17277	-0.0943

Consulting - Quality

Handling Missing Data

Week 13 - Class Demo

Dealing with missing data

Numpy

Dealing with missing data with Pandas

Open-ended descriptive statistics

Agriculture example

End cell

	Room_Number	Num_Students	Department	Occupied
0	101.0	1	Mechanical	Y
1	102.0	NaN	Empty	N
2	103.0	3	Electrical	Y
3	104.0	2	Mechanical	Y
4	105.0	NaN	Chemical	N

	Room_Number	Num_Students	Department	Occupied
0	False	False	False	False
1	False	True	False	False
2	False	False	False	False
3	False	False	False	False
4	False	True	False	False
5	True	False	False	False
6	False	False	False	False
7	False	True	False	False
8	False	False	False	True
9	False	False	False	False

		Salary
HighIncome	Gender
False	f	271499.454744
False	m	272598.433606
True	f	832250.000000
True	m	785344.827586

	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production
0	Andaman and Nicobar Islands	NICOBARS	2000	Kharif	Arecanut	1254.0	2000.0
1	Andaman and Nicobar Islands	NICOBARS	2000	Kharif	Other Kharif pulses	2.0	1.0
2	Andaman and Nicobar Islands	NICOBARS	2000	Kharif	Rice	102.0	321.0
3	Andaman and Nicobar Islands	NICOBARS	2000	Whole Year	Banana	176.0	641.0
4	Andaman and Nicobar Islands	NICOBARS	2000	Whole Year	Cashewnut	720.0	165.0

			Area	Production
State_Name	Crop	Crop_Year
Andaman and Nicobar Islands	Arecanut	2000	4354.00	7200.00
		2001	4354.00	7300.00
		2002	4363.00	7350.00
		2003	4379.00	6707.00
		2004	4425.37	4781.05
...	...	...	...	...
West Bengal	Wheat	2010	316808.00	874415.00
		2011	315659.00	872895.00
		2012	321572.00	895927.00
		2013	331481.00	927837.00
		2014	334640.00	939254.00

	State_Name	Crop_Year	Area	Production
0	Andaman and Nicobar Islands	2000	44518.00	89060914.00
1	Andaman and Nicobar Islands	2001	41163.00	89718700.00
2	Andaman and Nicobar Islands	2002	45231.40	94387137.67
3	Andaman and Nicobar Islands	2003	44799.40	95296454.67
4	Andaman and Nicobar Islands	2004	45308.77	87186497.63

	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production
99868	Kerala	KOTTAYAM	2000	Kharif	Sesamum	6.0	1.0
100437	Kerala	MALAPPURAM	2000	Kharif	Ragi	4.0	3.0
99567	Kerala	KOLLAM	2000	Summer	Rice	4.0	4.0
98946	Kerala	KANNUR	2000	Kharif	Sesamum	10.0	6.0
98604	Kerala	IDUKKI	2000	Kharif	Ragi	8.0	7.0
...	...	...	...	...	...	...	...
101702	Kerala	THRISSUR	2000	Whole Year	Coconut	89472.0	540000000.0
98953	Kerala	KANNUR	2000	Whole Year	Coconut	96975.0	621000000.0
100445	Kerala	MALAPPURAM	2000	Whole Year	Coconut	110378.0	626000000.0
101425	Kerala	THIRUVANANTHAPURAM	2000	Whole Year	Coconut	88663.0	635000000.0
100162	Kerala	KOZHIKODE	2000	Whole Year	Coconut	128739.0	903000000.0

	Salary
Gender
f	294937.304075
m	311716.211772

		Salary
CollegeTier	Gender
1	f	51
1	m	246
2	f	906
2	m	2795