Dealing with missing data with Numpy¶

import numpy as np
import pandas as pd
import seaborn as sns

x = np.array([1,2,3,4,5,6,7,8,9,10])

x.sum()

55

print(x.dtype)

int64

x = np.array([1,2,3,4,5,6.2,7,8,9,10])

x.dtype

dtype('float64')

x.sum()

55.2

x = np.array([1,2,3,4,5,'--',7,8,9,10])

cannot perform reduce with flexible type¶

reduce operation means that the function sum() is going to reduce the given numbers to a single number
fexlible type means that the given dataset contains different datatypes and not single type or similar type
and is not able to perform such function because of different data types present in the dataset

x.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     47          initial=_NoValue, where=True):
---> 48     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     49 
     50 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: cannot perform reduce with flexible type

x.dtype

dtype('<U21')

# Using NONE seems to be good enough unlike the previous dataset, 
# but still the same error occurs when we perform any such operation

x = np.array([1,2,3,4,5,None,7,8,9,10])

x.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     47          initial=_NoValue, where=True):
---> 48     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     49 
     50 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

# even now, we get the same error and cannot perform any operations

x = np.array([1,2,3, np.nan ,7])

x.sum()

nan

25 * np.nan

nan

take a boolean array and map it on to the original array, so that it returns only True values and not False values
the number of True and False elements should be identical to the position of valid and missing values in the dataset.
the above method can be applied to an array of any dimension

x = np.array([1,2,3, np.nan ,7])

# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, True, True, False, True])

x[x_b]

array([1., 2., 3., 7.])

x[x_b].sum()

13.0

# the following boolean array doesnot work as the mapping is not exact and returns nan
# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, False, True, True, True])

x[x_b].sum()

nan

x = np.array([1,2,3, np.nan ,7])

# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, True, True, False, True])

x[x_b].mean()

3.25

Find the missing elements in a sequence¶

arr = [1,2,4,5,6,7,9,10, 12, 25, 30]    # serial numbers or numbers in sequence
missing = []                            # empty array to gather missing elements

for i in range(arr[0], arr[-1]+1):      # run a loop with a range of numbers in arr
    if i not in arr:                    # if number in range is not in given array
        missing.append(i)               # append that value to empty array
print(missing)                          # print the elements that are missing

[3, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29]

alternative method - to find missing elements in a sequence¶

arr = [1,4,5,7,9,10]
missing_values = [i for i in range(arr[0], arr[-1]+1) if i not in arr]
print(missing_values)

[2, 3, 6, 8]

arr = np.array([1,2,3, np.nan ,7])

len(arr)

5

using isnan() from Numpy¶

arr = np.array([1, 2, 3, np.nan, 7, 10, np.nan, 15])

bl_array = []
for i in arr:
    if np.isnan(i):
        bl_array.append(False)
    else:
        bl_array.append(True)
print(bl_array)

[True, True, True, False, True, True, False, True]

using isnumeric() from Numpy¶

arr = np.array([1, np.nan, None, 'missing', 3, '--' ,7])

import numbers

take close look, np.nan is not counted as non-number¶

bl_array = []
for i in arr:
#    if np.char.isnumeric(i):
#    if type(i) == int or type(i) == float:
    if isinstance(i, numbers.Number):
        bl_array.append(True)
#    elsif np.isnan(i):
#        bl_array.append(False)
    else:
        bl_array.append(False)
print(bl_array)

[True, True, False, False, True, False, True]

take close look, np.nan is not counted as non-number¶

arr = np.array([1, np.nan, None, 'missing', 3, '--' ,7])

bl_array = []
for i in arr:
    if type(i) == int or type(i) == float:
        bl_array.append(True)
    else:
        bl_array.append(False)
print(bl_array)

[True, True, False, False, True, False, True]

to work with mask functionality¶

arr = np.array([1, 2, None, 'missing', 3, '--' ,7])

bl_array = []
for i in arr:
    if type(i) == int or type(i) == float:
        bl_array.append(0)
    else:
        bl_array.append(1)
print(bl_array)

[0, 0, 1, 1, 0, 1, 0]

m_x = np.ma.masked_array(arr, mask = bl_array)

m_x

masked_array(data=[1, 2, --, --, 3, --, 7],
             mask=[False, False,  True,  True, False,  True, False],
       fill_value='?',
            dtype=object)

print(m_x)

[1 2 -- -- 3 -- 7]

m_x.sum()

13

m_x.mean()

3.25

# Dealing with missing data with Pandas¶

creating sample dataset with of the type of dictionary¶

room = {"room number": [101, 102, 103, 104, 105, '' , 107, 108, 109, 110],
            "num_students": [1, 2, "", 4, 'empty', 2, 3, 4, 'na', 3],
        "Department": ["Civil", "Civil", "", "Electrical", "n/a", "CS", "Metallurgy", "na", "Chemical", "Civil"],
        "Occupied": ['y', 'n', 'y', ' ', 'y', '--', 'n', 'na', '--', 'y']}

df = pd.DataFrame(room)

df

df.dtypes

room number     object
num_students    object
Department      object
Occupied        object
dtype: object

df['room number'] = pd.to_numeric(df['room number'])

df.dtypes

room number     float64
num_students     object
Department       object
Occupied         object
dtype: object

operations are much faster when the datatype is of int or float rather than of object type - follow the below example¶

%timeit np.arange(1000000, dtype="int").sum()

The slowest run took 5.55 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 1.66 ms per loop

%timeit np.arange(1000000, dtype="object").sum()

10 loops, best of 5: 74.3 ms per loop

df['room number'].isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: room number, dtype: bool

adds all the true values to give the count of null values¶

df['room number'].isnull().sum()

1

df.isnull()

df.isnull().sum()

room number     1
num_students    0
Department      0
Occupied        0
dtype: int64

rooms = pd.read_csv('/content/rooms.csv')

rooms

rooms.dtypes

room number     float64
num_students     object
department       object
occupied         object
dtype: object

rooms.isnull().sum()

room number     2
num_students    3
department      1
occupied        2
dtype: int64

missing_values = ['na', 'n/a', 'empty', 'NA', '--']

rooms = pd.read_csv('/content/rooms.csv', na_values = missing_values)

rooms

rooms.isnull()

rooms.department.unique()

array(['Civil', nan, 'Electrical', 'CS', 'Metallurgy', 'Chemical'],
      dtype=object)

rooms['room_number'].count()

10

Filling the missing values¶

to fill all the missing values or null values with string "N"¶

rooms['occupied'].fillna("n", inplace = True)

rooms

def Convert_Y_to_True(v):
    if v == "y":
        return True
    else: 
        return False

rooms['occupied'] = rooms['occupied'].apply(Convert_Y_to_True)

rooms

def Convert_Y_to_True(v):
    if v == True:
        return "y"
    else: 
        return "n"

rooms['occupied'] = rooms['occupied'].apply(Convert_Y_to_True)

rooms

# lambda x: 'found' if x == "Tie" else "not found"
rooms['occupied'] = rooms['occupied'].apply(lambda x: True if x == "y" else "False")

rooms

to fill the missing value with the value above the null value - with forward fill - ffill, also called 'pad'¶

rooms['Dept2'] = rooms['department']

rooms

rooms['Dept2'].fillna(method='ffill', inplace = True)

rooms

rooms['dept'] = rooms['department']

rooms

rooms['dept'] = rooms['dept'].fillna(method = 'bfill', inplace = True)

rooms

rooms.drop(['dept'], axis=1)

rooms['dept'] = rooms['department']

rooms

rooms['dept'].fillna(method = 'bfill', inplace = True)

rooms

rooms

rooms['num_students'].fillna(rooms['num_students'].median(), inplace = True)

rooms

to delete a particular cell in a column¶

rooms.at[6, 'room_number'] = np.nan

rooms

rooms.drop(106, inplace=True)

rooms

rooms['room_number'].interpolate(inplace = True)

rooms

Experiments on AMEO_15¶

df = pd.read_excel('/content/Ameo15.xlsx')

df.head()

df['10board'].unique()

array(['board ofsecondary education,ap', 'cbse', 'state board',
       'mp board bhopal', 'icse',
       'karnataka secondary school of examination', 'up',
       'karnataka state education examination board', 'ssc',
       'kerala state technical education', 0, 'bseb',
       'state board of secondary education, andhra pradesh',
       'matriculation', 'gujarat state board', 'karnataka state board',
       'wbbse', 'maharashtra state board', 'icse board', 'up board',
       'board of secondary education(bse) orissa',
       'little jacky matric higher secondary school',
       'uttar pradesh board', 'bsc,orissa', 'mp board', 'upboard',
       'matriculation board', 'j & k bord', 'rbse',
       'central board of secondary education', 'pseb', 'jkbose',
       'haryana board of school education,(hbse)', 'metric', 'ms board',
       'kseeb', 'stateboard', 'maticulation',
       'karnataka secondory education board', 'mumbai board', 'sslc',
       'kseb', 'board secondary  education', 'matric board',
       'board of secondary education',
       'west bengal board of secondary education',
       'jharkhand secondary examination board,ranchi', 'u p board',
       'bseb,patna', 'hsc', 'bse', 'sss pune',
       'karnataka education board (keeb)', 'kerala',
       'state board of secondary education( ssc)', 'gsheb',
       'up(allahabad)', 'nagpur', 'don bosco maatriculation school',
       'karnataka state secondary education board', 'maharashtra',
       'karnataka secondary education board',
       'himachal pradesh board of school education',
       'certificate of middle years program of ib',
       'karnataka board of secondary education',
       'board of secondary education rajasthan', 'uttarakhand board',
       'ua', 'board of secendary education orissa',
       'karantaka secondary education and examination borad', 'hbsc',
       'kseeb(karnataka secondary education examination board)',
       'cbse[gulf zone]', 'hbse', 'state(karnataka board)',
       'jharkhand accademic council',
       'jharkhand secondary examination board (ranchi)',
       'karnataka secondary education examination board', 'delhi board',
       'mirza ahmed ali baig', 'jseb', 'bse, odisha', 'bihar board',
       'maharashtra state(latur board)', 'rajasthan board', 'mpboard',
       'upbhsie', 'secondary board of rajasthan',
       'tamilnadu matriculation board', 'jharkhand secondary board',
       'board of secondary education,andhara pradesh', 'up baord',
       'state', 'board of intermediate education',
       'state board of secondary education,andhra pradesh',
       'up board , allahabad',
       'stjosephs girls higher sec school,dindigul', 'maharashtra board',
       'education board of kerala', 'board of ssc',
       'maharashtra state board pune',
       'board of school education harayana',
       'secondary school cerfificate', 'maharashtra sate board', 'ksseb',
       'bihar examination board, patna', 'latur',
       'board of secondary education, rajasthan', 'state borad hp',
       'cluny', 'bsepatna', 'up borad', 'ssc board of andrapradesh',
       'matric', 'bse,orissa', 'ssc-andhra pradesh', 'mp',
       'karnataka education board', 'mhsbse',
       'karnataka sslc board bangalore', 'karnataka', 'u p',
       'secondary school of education', 'state board of karnataka',
       'karnataka secondary board', 'andhra pradesh board ssc',
       'stjoseph of cluny matrhrsecschool,neyveli,cuddalore district',
       'hse,orissa', 'national public school', 'nagpur board',
       'jharkhand academic council', 'bsemp',
       'board of secondary education, andhra pradesh',
       'board of secondary education orissa',
       'board of secondary education,rajasthan(rbse)',
       'board of secondary education,ap',
       'board of secondary education,andhra pradesh',
       'jawahar navodaya vidyalaya', 'aisse',
       'karnataka board of higher education', 'bihar',
       'kerala state board', 'cicse', 'tn state board',
       'kolhapur divisional board, maharashtra',
       'bharathi matriculation school', 'uttaranchal state board',
       'wbbsce', 'mp state board', 'seba(assam)', 'anglo indian', 'gseb',
       'uttar pradesh', 'ghseb', 'board of school education uttarakhand',
       'msbshse,pune', 'tamilnadu state board', 'kerala university',
       'uttaranchal shiksha avam pariksha parishad',
       'bse(board of secondary education)',
       'bright way college, (up board)',
       'school secondary education, andhra pradesh',
       'secondary state certificate',
       'maharashtra state board of secondary and higher secondary education,pune',
       'andhra pradesh state board', 'stmary higher secondary', 'cgbse',
       'secondary school certificate', 'rajasthan board ajmer', 'mpbse',
       'pune board', 'cbse ', 'board of secondary education,orissa',
       'maharashtra state board,pune', 'up bord',
       'kiran english medium high school', 'state board (jac, ranchi)',
       'gujarat board', 'state board ', 'sarada high scchool',
       'kalaimagal matriculation higher secondary school',
       'karnataka board', 'maharastra board', 'sslc board',
       'ssc maharashtra board', 'tamil nadu state', 'uttrakhand board',
       'bihar secondary education board,patna',
       'haryana board of school education',
       'sri kannika parameswari highier secondary school, udumalpet',
       'ksseb(karnataka state board)', 'nashik board',
       'jharkhand secondary education board', 'himachal pradesh board',
       'maharashtra satate board',
       'maharashtra state board mumbai divisional board',
       'dav public school,hehal',
       'state board of secondary education, ap',
       'rajasthan board of secondary education', 'hsce',
       'karnataka secondary education',
       'board of secondary education,odisha', 'maharashtra nasik board',
       'west bengal board of secondary examination (wbbse)',
       'holy cross matriculation hr sec school', 'cbsc', 'apssc',
       'bseb patna', 'kolhapur', 'bseb, patna', 'up board allahabad',
       'biharboard', 'nagpur board,nagpur', 'pune', 'gyan bharati school',
       'rbse,ajmer', 'board of secondaray education',
       'secondary school education', 'state bord', 'jbse,jharkhand',
       'hse', 'madhya pradesh board', 'bihar school examination board',
       'west bengal board of secondary eucation', 'state boardmp board ',
       'icse board , new delhi',
       'board of secondary education (bse) orissa',
       'maharashtra state board for ssc',
       'board of secondary school education', 'latur board',
       "stmary's convent inter college", 'nagpur divisional board',
       'ap state board', 'cgbse raipur', 'uttranchal board', 'ksbe',
       'central board of secondary education, new delhi',
       'bihar school examination board patna', 'cbse board',
       'sslc,karnataka', 'mp-bse', 'up bourd', 'dav public school sec 14',
       'board of school education haryana',
       'council for indian school certificate examination',
       'aurangabad board', 'j&k state board of school education',
       'maharashtra state board of secondary and higher secondary education',
       'maharashtra state boar of secondary and higher secondary education',
       'ssc regular', 'karnataka state examination board', 'nasik',
       'west bengal  board of secondary education', 'up board,allahabad',
       'bseb ,patna',
       'state board - west bengal board of secondary education : wbbse',
       'maharashtra state board of secondary & higher secondary education',
       'delhi public school', 'karnataka secondary eduction',
       'secondary education board of rajasthan',
       'maharashtra board, pune', 'rbse (state board)', 'apsche',
       'board of  secondary education',
       'board of high school and intermediate education uttarpradesh',
       'kea', 'board of secondary education - andhra pradesh',
       'ap state board for secondary education', 'seba',
       'punjab school education board, mohali',
       'jharkhand acedemic council', 'hse,board',
       'board of ssc education andhra pradesh', 'up-board', 'bse,odisha',
       'kebb', 'ssm school', 'upb', 'chhattishgarh', 'vjrschool',
       'west bengal board', 'wbbose', 'uttar pradesh(up) board', 'cisce',
       'matriculaton', 'sse', 'stanthony', 'kses',
       'council of indian school certificate examinations',
       'secondary school certificate(ssc)',
       'punjab school education board', 'c b s e', 'secondary school',
       'rajasthan board of secondary education,ajmer,rajasthan', 'aislc',
       'state board of gujarat', 'uttaranchal board',
       'state board of andhra pradesh',
       'maharashtra state board of technical education', 'ua board',
       'kerala board for public examination',
       'lions matric higher secondary school, dindigul',
       'jharkhand board', 'kerala state, board of examination',
       'kendriya vidyalaya island grounds', 'bse, orissa',
       'jharkhand academic council,ranchi', 'pseb mohali',
       'aligarh muslim university', 'karnataka higher secondary board',
       'thslc,board of technical education, kerala',
       'state syllabus( karnataka  state sslc board)',
       'andhra pradesh board of secondary education', 'ap state',
       'haryana board', 'karnataka state secondary education',
       'dasmesh public school', 'dsse',
       'central board of secondary education(cbse)',
       'karnataka secondary education examination',
       'sri venkateshwara matriculation school', 'gshseb',
       'jamia millia islamia', 'matriculation examination',
       'jharkhand secondary examination board', 'punjab board',
       'up boadr', 'm p board', 'board of secondary education hyderabad',
       'gseb, gandhinagar', ' board of secondary education',
       'state board of tamilnadu', 'board of secondary education, orissa',
       'karnataka secondary examination board',
       "st clare's sr sec school", 'state board, kerala', 'jac',
       'tamil nadu matriculation board',
       'state boadr of school education', 'ssc,ap', 'msbshse',
       'mp board of secondary education', 'b s e b, patna',
       'karanataka state board',
       'indian certificate for secondary education',
       'board of secondary education,ajmer(rajasthan)',
       'board of secondery education', "stxavier's school",
       'state board(tamil)', 'nalanda vidhya bhavan high school',
       'matriculation examinatopm'], dtype=object)

df.tail()

df.isna().any().count()

38

df.isna().any()

ID                       False
Salary                   False
DOJ                      False
DOL                      False
Designation              False
JobCity                  False
Gender                   False
DOB                      False
10percentage             False
10board                  False
12graduation             False
12percentage             False
12board                  False
CollegeID                False
CollegeTier              False
Degree                   False
Specialization           False
collegeGPA               False
CollegeCityID            False
CollegeCityTier          False
CollegeState             False
GraduationYear           False
English                  False
Logical                  False
Quant                    False
Domain                   False
ComputerProgramming      False
ElectronicsAndSemicon    False
ComputerScience          False
MechanicalEngg           False
ElectricalEngg           False
TelecomEngg              False
CivilEngg                False
conscientiousness        False
agreeableness            False
extraversion             False
nueroticism              False
openess_to_experience    False
dtype: bool

df.isnull()

df.isnull().any()

ID                       False
Salary                   False
DOJ                      False
DOL                      False
Designation              False
JobCity                  False
Gender                   False
DOB                      False
10percentage             False
10board                  False
12graduation             False
12percentage             False
12board                  False
CollegeID                False
CollegeTier              False
Degree                   False
Specialization           False
collegeGPA               False
CollegeCityID            False
CollegeCityTier          False
CollegeState             False
GraduationYear           False
English                  False
Logical                  False
Quant                    False
Domain                   False
ComputerProgramming      False
ElectronicsAndSemicon    False
ComputerScience          False
MechanicalEngg           False
ElectricalEngg           False
TelecomEngg              False
CivilEngg                False
conscientiousness        False
agreeableness            False
extraversion             False
nueroticism              False
openess_to_experience    False
dtype: bool

df.dtypes

id                                int64
salary                            int64
doj                      datetime64[ns]
dol                              object
designation                      object
jobcity                          object
gender                           object
dob                      datetime64[ns]
10percentage                    float64
10board                          object
12graduation                      int64
12percentage                    float64
12board                          object
collegeid                         int64
collegetier                       int64
degree                           object
specialization                   object
collegegpa                      float64
collegecityid                     int64
collegecitytier                   int64
collegestate                     object
graduationyear                    int64
english                           int64
logical                           int64
quant                             int64
domain                          float64
computer_programming              int64
electronics_semicon               int64
computer_science                  int64
mechanical_engg                   int64
electrical_engg                   int64
telecom_engg                      int64
civil_engg                        int64
conscientious                   float64
agreeableness                   float64
extraversion                    float64
nueroticism                     float64
openess_to_experience           float64
dtype: object

df.Gender.unique()

array(['f', 'm'], dtype=object)

sns.violinplot(x = 'gender', y='salary', data = df)

<matplotlib.axes._subplots.AxesSubplot at 0x7efee622b890>

df[['10percentage', '12percentage', 'collegegpa', 'gender']].groupby('gender').mean()

df[['10percentage', '12percentage', 'collegegpa', 'gender']].groupby('gender').median()

df[['conscientious', 'agreeableness', 
    'extraversion', 'nueroticism', 'openess_to_experience',
    'gender']].groupby('gender').mean()

df[['conscientious', 'agreeableness', 
    'extraversion', 'nueroticism', 'openess_to_experience',
    'gender']].groupby('gender').median()

df[['salary', 'gender']].groupby('gender').mean()

define a threshold for salary¶

df['salary'].mean()

305605.49290651147

df['salary'].mean() + df['salary'].std()

508615.72317936417

th = df['salary'].mean() + df['salary'].std()

df['HighIncome'] = (df.salary > th)

df.sample(10)

df[['salary', 'HighIncome', 'gender']].groupby(['HighIncome', 'gender']).mean()

df[['salary', 'HighIncome', 'gender']].groupby(['HighIncome', 'gender']).count()

print("Low income female percentage ", 1264/3825*100)

Low income female percentage  33.04575163398693

print("High income female percentage ", 76/333*100)

High income female percentage  22.822822822822822

print("Low income female percentage ", 1264/(1264+3825)*100)

Low income female percentage  24.837885635684813

print("High income female percentage ", 76/(76+333)*100)

High income female percentage  18.581907090464547

df_ = df

df_ = df_[['electrical_engg', 'telecom_engg', 'civil_engg' ]].sample(30)

df_

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498 entries, 0 to 5497
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     5498 non-null   int64         
 1   salary                 5498 non-null   int64         
 2   doj                    5498 non-null   datetime64[ns]
 3   dol                    5498 non-null   object        
 4   designation            5498 non-null   object        
 5   jobcity                5498 non-null   object        
 6   gender                 5498 non-null   object        
 7   dob                    5498 non-null   datetime64[ns]
 8   10percentage           5498 non-null   float64       
 9   10board                5498 non-null   object        
 10  12graduation           5498 non-null   int64         
 11  12percentage           5498 non-null   float64       
 12  12board                5498 non-null   object        
 13  collegeid              5498 non-null   int64         
 14  collegetier            5498 non-null   int64         
 15  degree                 5498 non-null   object        
 16  specialization         5498 non-null   object        
 17  collegegpa             5498 non-null   float64       
 18  collegecityid          5498 non-null   int64         
 19  collegecitytier        5498 non-null   int64         
 20  collegestate           5498 non-null   object        
 21  graduationyear         5498 non-null   int64         
 22  english                5498 non-null   int64         
 23  logical                5498 non-null   int64         
 24  quant                  5498 non-null   int64         
 25  domain                 5498 non-null   float64       
 26  computer_programming   5498 non-null   int64         
 27  electronics_semicon    5498 non-null   int64         
 28  computer_science       5498 non-null   int64         
 29  mechanical_engg        5498 non-null   int64         
 30  electrical_engg        5498 non-null   int64         
 31  telecom_engg           5498 non-null   int64         
 32  civil_engg             5498 non-null   int64         
 33  conscientious          5498 non-null   float64       
 34  agreeableness          5498 non-null   float64       
 35  extraversion           5498 non-null   float64       
 36  nueroticism            5498 non-null   float64       
 37  openess_to_experience  5498 non-null   float64       
 38  HighIncome             5498 non-null   bool          
 39  EnggTrade              63 non-null     object        
dtypes: bool(1), datetime64[ns](2), float64(9), int64(18), object(10)
memory usage: 1.6+ MB

def trade_categoriese(row):
    if row['electrical_engg'] > 0:
        return 'ee'
    elif row['telecom_engg'] > 0:
        return 'te'
    elif row['civil_engg'] > 0:
        return 'ce'
    elif row['computer_programming'] > 0:
        return 'cp'
    elif row['electronics_semicon'] > 0:
        return 'es'
    elif row['computer_science'] > 0:
        return 'cs'
    elif row['mechanical_engg'] > 0:
        return 'me'

df['EnggTrade'] = df.apply(lambda row: trade_categoriese(row), axis=1)

df.head()

dx = df[['gender', "EnggTrade", 'salary', ]].groupby(['EnggTrade', 'gender']).mean()

dx

dx.style.highlight_max(color='green').highlight_min(color='red')

!pip install nbconvert

%shell jupyter nbconvert --to html /content/testfile.ipynb

	id	salary	doj	dol	designation	jobcity	gender	dob	10percentage	10board	...	computer_science	mechanical_engg	electrical_engg	telecom_engg	civil_engg	conscientious	agreeableness	extraversion	nueroticism	openess_to_experience
0	203097	420000	2012-06-01	present	senior quality engineer	Bangalore	f	1990-02-19	84.3	board ofsecondary education,ap	...	-1	-1	-1	-1	-1	0.9737	0.8128	0.5269	1.35490	-0.4455
1	579905	500000	2013-09-01	present	assistant manager	Indore	m	1989-10-04	85.4	cbse	...	-1	-1	-1	-1	-1	-0.7335	0.3789	1.2396	-0.10760	0.8637
2	810601	325000	2014-06-01	present	systems engineer	Chennai	f	1992-08-03	85.0	cbse	...	-1	-1	-1	-1	-1	0.2718	1.7109	0.1637	-0.86820	0.6721
3	267447	1100000	2011-07-01	present	senior software engineer	Gurgaon	m	1989-12-05	85.6	cbse	...	-1	-1	-1	-1	-1	0.0464	0.3448	-0.3440	-0.40780	-0.9194
4	343523	200000	2014-03-01	2015-03-01 00:00:00	get	Manesar	m	1991-02-27	78.0	cbse	...	-1	-1	-1	-1	-1	-0.8810	-0.2793	-1.0697	0.09163	-0.1295

	ID	Salary	DOJ	DOL	Designation	JobCity	Gender	DOB	10percentage	10board	...	ComputerScience	MechanicalEngg	ElectricalEngg	TelecomEngg	CivilEngg	conscientiousness	agreeableness	extraversion	nueroticism	openess_to_experience
5493	407755	?	?	?	?	?	m	1991-01-10	82.88	kseeb	...	-1	-1	-1	-1	-1	-0.7651	-1.5273	-1.0697	0.0035	-1.5513
5494	575154	?	?	?	?	?	m	1991-06-26	87.00	cbse	...	-1	-1	-1	366	-1	0.2718	0.0459	0.3174	0.7798	-0.4776
5495	216598	?	?	?	?	?	m	1989-01-12	84.50	cbse	...	-1	-1	-1	-1	-1	-0.1082	-0.2793	0.5269	-0.9953	0.6603
5496	919755	?	?	?	?	?	m	1991-08-10	63.33	cbse	...	-1	-1	-1	-1	-1	-0.8772	-1.4526	0.1637	-0.1076	-2.2021
5497	1037308	?	?	?	?	?	m	1990-10-23	66.00	cbse	...	-1	438	-1	-1	-1	-0.3027	-1.2861	0.0100	0.2727	-0.6692

	10percentage	12percentage	collegegpa
gender
f	81.039321	76.802306	74.112179
m	77.087557	73.886965	70.686414

	10percentage	12percentage	collegegpa
gender
f	82.5	77.0	74.0
m	78.2	73.8	71.0

	conscientious	agreeableness	extraversion	nueroticism	openess_to_experience
gender
f	0.144125	0.314236	0.037868	-0.157495	0.060083
m	-0.096661	0.105828	0.007920	-0.144813	-0.189094

Consulting - Quality

HANDLING MISSING DATA

CW - Overall Understanding

Dealing with missing data with Numpy¶

cannot perform reduce with flexible type¶

Find the missing elements in a sequence¶

alternative method - to find missing elements in a sequence¶

using isnan() from Numpy¶

using isnumeric() from Numpy¶

take close look, np.nan is not counted as non-number¶

take close look, np.nan is not counted as non-number¶

to work with mask functionality¶

# Dealing with missing data with Pandas¶

creating sample dataset with of the type of dictionary¶

operations are much faster when the datatype is of int or float rather than of object type - follow the below example¶

adds all the true values to give the count of null values¶

Filling the missing values¶

to fill all the missing values or null values with string "N"¶

to fill the missing value with the value above the null value - with forward fill - ffill, also called 'pad'¶

to delete a particular cell in a column¶

Experiments on AMEO_15¶

define a threshold for salary¶

	room number	num_students	Department	Occupied
0	101	1	Civil	y
1	102	2	Civil	n
2	103			y
3	104	4	Electrical
4	105	empty	n/a	y
5		2	CS	--
6	107	3	Metallurgy	n
7	108	4	na	na
8	109	na	Chemical	--
9	110	3	Civil	y

	room number	num_students	Department	Occupied
0	False	False	False	False
1	False	False	False	False
2	False	False	False	False
3	False	False	False	False
4	False	False	False	False
5	False	False	False	False
6	False	False	False	False
7	False	False	False	False
8	False	False	False	False
9	False	False	False	False

	room_number	num_students	department	occupied
0	101	1.0	Civil	True
1	102	2.0	Civil	False
2	103	NaN	NaN	True
3	104	4.0	Electrical	False
4	105	NaN	NaN	True
5	106	2.0	CS	False
6	107	3.0	Metallurgy	False
7	108	4.0	NaN	False
8	109	NaN	Chemical	False
9	110	3.0	Civil	True

	room_number	num_students	department	occupied	Dept2	dept
0	101.0	1.0	Civil	True	Civil	Civil
1	102.0	2.0	Civil	False	Civil	Civil
2	103.0	3.0	NaN	True	Civil	Electrical
3	104.0	4.0	Electrical	False	Electrical	Electrical
4	105.0	3.0	NaN	True	Electrical	CS
5	106.0	2.0	CS	False	CS	CS
6	NaN	3.0	Metallurgy	False	Metallurgy	Metallurgy
7	108.0	4.0	NaN	False	Metallurgy	Chemical
8	109.0	3.0	Chemical	False	Chemical	Chemical
9	110.0	3.0	Civil	True	Civil	Civil
106	NaN	NaN	NaN	NaN	NaN	NaN

	id	salary	doj	dol	designation	jobcity	gender	dob	10percentage	10board	...	mechanical_engg	electrical_engg	telecom_engg	civil_engg	conscientious	agreeableness	extraversion	nueroticism	openess_to_experience	HighIncome
1021	407662	360000	2013-06-01	2015-06-01 00:00:00	java software engineer	Bangalore	m	1989-12-31	81.60	cbse	...	-1	-1	-1	-1	-0.1082	0.8518	-0.1626	-0.7603	-0.2875	False
3590	242719	220000	2011-11-01	2013-05-01 00:00:00	software developer	Bangalore	m	1986-10-13	58.00	0	...	-1	-1	-1	-1	0.0464	0.5008	-0.1988	-1.2303	0.8183	False
2246	1179971	120000	2014-12-01	present	android developer	Kochi/Cochin	m	1992-10-03	80.00	state board	...	-1	-1	-1	-1	0.2718	-1.2861	-1.5270	-1.2486	-0.8608	False
1806	1088588	300000	2014-07-01	present	technical consultant	Chittor	m	1993-07-02	88.50	state board	...	438	-1	-1	-1	0.7027	0.0459	-0.9122	-0.1076	-0.6692	False
58	1063642	200000	2013-06-01	2014-01-01 00:00:00	project engineer	Gaziabaad	m	1990-03-27	63.30	state board	...	-1	-1	-1	-1	-1.0208	-0.6201	-0.2974	-0.1076	-1.2440	False
3412	351702	480000	2012-07-01	present	senior engineer	Pune	m	1989-11-05	81.60	0	...	-1	-1	-1	-1	-1.3447	-0.9033	-0.8157	-0.0552	-1.7093	False
1124	1072371	240000	2015-02-01	present	software developer	Noida	m	1991-06-03	78.66	cbse	...	-1	-1	-1	-1	0.1282	0.2124	0.4711	0.0192	1.2470	False
4573	852122	360000	2013-09-01	present	assistant system engineer	Tadepally Gudem	m	1988-08-12	61.40	cbse	...	-1	-1	-1	-1	-0.8772	-0.7866	0.0100	-0.1076	-2.0105	False
3888	604821	280000	2013-05-01	2014-05-01 00:00:00	test technician	Tadepally Gudem	f	1991-11-14	81.20	cbse	...	-1	-1	-1	-1	0.8463	-0.1206	0.1637	-0.4879	0.0973	False
2703	1059888	500000	2014-08-01	present	software engineer	Chennai	m	1993-06-04	94.00	state board	...	-1	-1	-1	-1	0.5591	0.5454	-0.6048	0.5262	0.0973	False

		salary
HighIncome	gender
False	f	270181.962025
False	m	269781.437908
True	f	723223.684211
True	m	756246.246246

	electrical_engg	telecom_engg	civil_engg
3321	-1	-1	-1
2850	-1	-1	-1
3030	-1	-1	-1
900	-1	446	-1
1840	-1	-1	-1
1537	-1	-1	-1
911	-1	-1	-1
3078	-1	-1	-1
5451	-1	-1	-1
3492	-1	-1	-1
4387	-1	-1	-1
2007	-1	393	-1
3564	-1	-1	-1
373	-1	-1	-1
4471	-1	-1	-1
2799	-1	-1	-1
2365	-1	-1	-1
641	612	-1	-1
3119	-1	-1	-1
1209	-1	446	-1
1280	-1	-1	-1
5238	-1	-1	-1
3418	-1	-1	-1
864	-1	-1	-1
3687	-1	-1	-1
3680	-1	260	-1
146	-1	-1	-1
2465	-1	-1	-1
961	-1	-1	-1
4214	-1	-1	-1

		salary
EnggTrade	gender
ce	f	334000.000000
ce	m	341862.745098
cp	f	298923.303835
cp	m	317064.347826
cs	f	216666.666667
cs	m	225000.000000
ee	f	259270.833333
ee	m	268513.513514
es	f	293269.230769
es	m	305334.394904
me	f	319642.857143
me	m	300147.208122
te	f	293565.573770
te	m	281629.353234

	salary
gender
f	295876.865672
m	308740.740741