HANDLING MISSING DATA
CW - Overall Understanding
handling

Dealing with missing data with Numpy

In [ ]:
import numpy as np
import pandas as pd
import seaborn as sns
In [ ]:
x = np.array([1,2,3,4,5,6,7,8,9,10])
In [ ]:
x.sum()
Out[ ]:
55
In [ ]:
print(x.dtype)
int64
In [ ]:
x = np.array([1,2,3,4,5,6.2,7,8,9,10])
In [ ]:
x.dtype
Out[ ]:
dtype('float64')
In [ ]:
x.sum()
Out[ ]:
55.2
In [ ]:
x = np.array([1,2,3,4,5,'--',7,8,9,10])

cannot perform reduce with flexible type

  • reduce operation means that the function sum() is going to reduce the given numbers to a single number
  • fexlible type means that the given dataset contains different datatypes and not single type or similar type
  • and is not able to perform such function because of different data types present in the dataset
In [ ]:
x.sum()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     47          initial=_NoValue, where=True):
---> 48     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     49 
     50 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: cannot perform reduce with flexible type
In [ ]:
x.dtype
Out[ ]:
dtype('<U21')
In [ ]:
# Using NONE seems to be good enough unlike the previous dataset, 
# but still the same error occurs when we perform any such operation

x = np.array([1,2,3,4,5,None,7,8,9,10])
In [ ]:
x.sum()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-c6d45b513c2c> in <module>()
----> 1 x.sum()

/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     47          initial=_NoValue, where=True):
---> 48     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     49 
     50 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
In [ ]:
# even now, we get the same error and cannot perform any operations

x = np.array([1,2,3, np.nan ,7])
In [ ]:
x.sum()
Out[ ]:
nan
In [ ]:
25 * np.nan
Out[ ]:
nan
  • take a boolean array and map it on to the original array, so that it returns only True values and not False values
  • the number of True and False elements should be identical to the position of valid and missing values in the dataset.
  • the above method can be applied to an array of any dimension
In [ ]:
x = np.array([1,2,3, np.nan ,7])
In [ ]:
# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, True, True, False, True])
In [ ]:
x[x_b]
Out[ ]:
array([1., 2., 3., 7.])
In [ ]:
x[x_b].sum()
Out[ ]:
13.0
In [ ]:
# the following boolean array doesnot work as the mapping is not exact and returns nan
# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, False, True, True, True])
In [ ]:
x[x_b].sum()
Out[ ]:
nan
In [ ]:
x = np.array([1,2,3, np.nan ,7])
In [ ]:
# x = np.array([1,     2,    3,   np.nan, 7])
x_b = np.array([True, True, True, False, True])
In [ ]:
x[x_b].mean()
Out[ ]:
3.25

Find the missing elements in a sequence

In [ ]:
arr = [1,2,4,5,6,7,9,10, 12, 25, 30]    # serial numbers or numbers in sequence
missing = []                            # empty array to gather missing elements

for i in range(arr[0], arr[-1]+1):      # run a loop with a range of numbers in arr
    if i not in arr:                    # if number in range is not in given array
        missing.append(i)               # append that value to empty array
print(missing)                          # print the elements that are missing
[3, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29]

alternative method - to find missing elements in a sequence

In [ ]:
arr = [1,4,5,7,9,10]
missing_values = [i for i in range(arr[0], arr[-1]+1) if i not in arr]
print(missing_values)
[2, 3, 6, 8]
In [ ]:
arr = np.array([1,2,3, np.nan ,7])
In [ ]:
len(arr)
Out[ ]:
5

using isnan() from Numpy

In [ ]:
arr = np.array([1, 2, 3, np.nan, 7, 10, np.nan, 15])

bl_array = []
for i in arr:
    if np.isnan(i):
        bl_array.append(False)
    else:
        bl_array.append(True)
print(bl_array)
[True, True, True, False, True, True, False, True]

using isnumeric() from Numpy

In [ ]:
arr = np.array([1, np.nan, None, 'missing', 3, '--' ,7])
In [ ]:
import numbers

take close look, np.nan is not counted as non-number

In [ ]:
bl_array = []
for i in arr:
#    if np.char.isnumeric(i):
#    if type(i) == int or type(i) == float:
    if isinstance(i, numbers.Number):
        bl_array.append(True)
#    elsif np.isnan(i):
#        bl_array.append(False)
    else:
        bl_array.append(False)
print(bl_array)
[True, True, False, False, True, False, True]

take close look, np.nan is not counted as non-number

In [ ]:
arr = np.array([1, np.nan, None, 'missing', 3, '--' ,7])

bl_array = []
for i in arr:
    if type(i) == int or type(i) == float:
        bl_array.append(True)
    else:
        bl_array.append(False)
print(bl_array)
[True, True, False, False, True, False, True]

to work with mask functionality

In [ ]:
arr = np.array([1, 2, None, 'missing', 3, '--' ,7])

bl_array = []
for i in arr:
    if type(i) == int or type(i) == float:
        bl_array.append(0)
    else:
        bl_array.append(1)
print(bl_array)
[0, 0, 1, 1, 0, 1, 0]
In [ ]:
m_x = np.ma.masked_array(arr, mask = bl_array)
In [ ]:
m_x
Out[ ]:
masked_array(data=[1, 2, --, --, 3, --, 7],
             mask=[False, False,  True,  True, False,  True, False],
       fill_value='?',
            dtype=object)
In [ ]:
print(m_x)
[1 2 -- -- 3 -- 7]
In [ ]:
m_x.sum()
Out[ ]:
13
In [ ]:
m_x.mean()
Out[ ]:
3.25

# Dealing with missing data with Pandas

creating sample dataset with of the type of dictionary

In [ ]:
room = {"room number": [101, 102, 103, 104, 105, '' , 107, 108, 109, 110],
            "num_students": [1, 2, "", 4, 'empty', 2, 3, 4, 'na', 3],
        "Department": ["Civil", "Civil", "", "Electrical", "n/a", "CS", "Metallurgy", "na", "Chemical", "Civil"],
        "Occupied": ['y', 'n', 'y', ' ', 'y', '--', 'n', 'na', '--', 'y']}

df = pd.DataFrame(room)
In [ ]:
df
Out[ ]:
room number num_students Department Occupied
0 101 1 Civil y
1 102 2 Civil n
2 103 y
3 104 4 Electrical
4 105 empty n/a y
5 2 CS --
6 107 3 Metallurgy n
7 108 4 na na
8 109 na Chemical --
9 110 3 Civil y
In [ ]:
df.dtypes
Out[ ]:
room number     object
num_students    object
Department      object
Occupied        object
dtype: object
In [ ]:
df['room number'] = pd.to_numeric(df['room number'])
In [ ]:
df.dtypes
Out[ ]:
room number     float64
num_students     object
Department       object
Occupied         object
dtype: object

operations are much faster when the datatype is of int or float rather than of object type - follow the below example

In [ ]:
%timeit np.arange(1000000, dtype="int").sum()
The slowest run took 5.55 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 1.66 ms per loop
In [ ]:
%timeit np.arange(1000000, dtype="object").sum()
10 loops, best of 5: 74.3 ms per loop
In [ ]:
df['room number'].isnull()
Out[ ]:
0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: room number, dtype: bool

adds all the true values to give the count of null values

In [ ]:
df['room number'].isnull().sum()
Out[ ]:
1
In [ ]:
df.isnull()
Out[ ]:
room number num_students Department Occupied
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
5 False False False False
6 False False False False
7 False False False False
8 False False False False
9 False False False False
In [ ]:
df.isnull().sum()
Out[ ]:
room number     1
num_students    0
Department      0
Occupied        0
dtype: int64
In [ ]:
rooms = pd.read_csv('/content/rooms.csv')
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1 Civil y
1 102 2 Civil n
2 103 NaN NaN y
3 104 4 Electrical NaN
4 105 empty NaN y
5 106 2 CS --
6 107 3 Metallurgy n
7 108 4 na na
8 109 na Chemical --
9 110 3 Civil y
In [ ]:
rooms.dtypes
Out[ ]:
room number     float64
num_students     object
department       object
occupied         object
dtype: object
In [ ]:
rooms.isnull().sum()
Out[ ]:
room number     2
num_students    3
department      1
occupied        2
dtype: int64
In [ ]:
missing_values = ['na', 'n/a', 'empty', 'NA', '--']
In [ ]:
rooms = pd.read_csv('/content/rooms.csv', na_values = missing_values)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1.0 Civil y
1 102 2.0 Civil n
2 103 NaN NaN y
3 104 4.0 Electrical NaN
4 105 NaN NaN y
5 106 2.0 CS NaN
6 107 3.0 Metallurgy n
7 108 4.0 NaN NaN
8 109 NaN Chemical NaN
9 110 3.0 Civil y
In [ ]:
rooms.isnull()
Out[ ]:
room_number num_students department occupied
0 False False False False
1 False False False False
2 False True True False
3 False False False True
4 False True True False
5 False False False True
6 False False False False
7 False False True True
8 False True False True
9 False False False False
In [ ]:
rooms.department.unique()
Out[ ]:
array(['Civil', nan, 'Electrical', 'CS', 'Metallurgy', 'Chemical'],
      dtype=object)
In [ ]:
rooms['room_number'].count()
Out[ ]:
10

Filling the missing values

to fill all the missing values or null values with string "N"

In [ ]:
rooms['occupied'].fillna("n", inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1.0 Civil y
1 102 2.0 Civil n
2 103 NaN NaN y
3 104 4.0 Electrical n
4 105 NaN NaN y
5 106 2.0 CS n
6 107 3.0 Metallurgy n
7 108 4.0 NaN n
8 109 NaN Chemical n
9 110 3.0 Civil y
In [ ]:
def Convert_Y_to_True(v):
    if v == "y":
        return True
    else: 
        return False
In [ ]:
rooms['occupied'] = rooms['occupied'].apply(Convert_Y_to_True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1.0 Civil True
1 102 2.0 Civil False
2 103 NaN NaN True
3 104 4.0 Electrical False
4 105 NaN NaN True
5 106 2.0 CS False
6 107 3.0 Metallurgy False
7 108 4.0 NaN False
8 109 NaN Chemical False
9 110 3.0 Civil True
In [ ]:
def Convert_Y_to_True(v):
    if v == True:
        return "y"
    else: 
        return "n"
In [ ]:
rooms['occupied'] = rooms['occupied'].apply(Convert_Y_to_True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1.0 Civil y
1 102 2.0 Civil n
2 103 NaN NaN y
3 104 4.0 Electrical n
4 105 NaN NaN y
5 106 2.0 CS n
6 107 3.0 Metallurgy n
7 108 4.0 NaN n
8 109 NaN Chemical n
9 110 3.0 Civil y
In [ ]:
# lambda x: 'found' if x == "Tie" else "not found"
rooms['occupied'] = rooms['occupied'].apply(lambda x: True if x == "y" else "False")
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied
0 101 1.0 Civil True
1 102 2.0 Civil False
2 103 NaN NaN True
3 104 4.0 Electrical False
4 105 NaN NaN True
5 106 2.0 CS False
6 107 3.0 Metallurgy False
7 108 4.0 NaN False
8 109 NaN Chemical False
9 110 3.0 Civil True

to fill the missing value with the value above the null value - with forward fill - ffill, also called 'pad'

In [ ]:
rooms['Dept2'] = rooms['department']
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2
0 101 1.0 Civil True Civil
1 102 2.0 Civil False Civil
2 103 NaN NaN True NaN
3 104 4.0 Electrical False Electrical
4 105 NaN NaN True NaN
5 106 2.0 CS False CS
6 107 3.0 Metallurgy False Metallurgy
7 108 4.0 NaN False NaN
8 109 NaN Chemical False Chemical
9 110 3.0 Civil True Civil
In [ ]:
rooms['Dept2'].fillna(method='ffill', inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2
0 101 1.0 Civil True Civil
1 102 2.0 Civil False Civil
2 103 NaN NaN True Civil
3 104 4.0 Electrical False Electrical
4 105 NaN NaN True Electrical
5 106 2.0 CS False CS
6 107 3.0 Metallurgy False Metallurgy
7 108 4.0 NaN False Metallurgy
8 109 NaN Chemical False Chemical
9 110 3.0 Civil True Civil
In [ ]:
rooms['dept'] = rooms['department']
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil Civil
1 102 2.0 Civil False Civil Civil
2 103 NaN NaN True Civil NaN
3 104 4.0 Electrical False Electrical Electrical
4 105 NaN NaN True Electrical NaN
5 106 2.0 CS False CS CS
6 107 3.0 Metallurgy False Metallurgy Metallurgy
7 108 4.0 NaN False Metallurgy NaN
8 109 NaN Chemical False Chemical Chemical
9 110 3.0 Civil True Civil Civil
In [ ]:
rooms['dept'] = rooms['dept'].fillna(method = 'bfill', inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil None
1 102 2.0 Civil False Civil None
2 103 NaN NaN True Civil None
3 104 4.0 Electrical False Electrical None
4 105 NaN NaN True Electrical None
5 106 2.0 CS False CS None
6 107 3.0 Metallurgy False Metallurgy None
7 108 4.0 NaN False Metallurgy None
8 109 NaN Chemical False Chemical None
9 110 3.0 Civil True Civil None
In [ ]:
rooms.drop(['dept'], axis=1)
Out[ ]:
room_number num_students department occupied Dept2
0 101 1.0 Civil True Civil
1 102 2.0 Civil False Civil
2 103 NaN NaN True Civil
3 104 4.0 Electrical False Electrical
4 105 NaN NaN True Electrical
5 106 2.0 CS False CS
6 107 3.0 Metallurgy False Metallurgy
7 108 4.0 NaN False Metallurgy
8 109 NaN Chemical False Chemical
9 110 3.0 Civil True Civil
In [ ]:
rooms['dept'] = rooms['department']
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil Civil
1 102 2.0 Civil False Civil Civil
2 103 NaN NaN True Civil NaN
3 104 4.0 Electrical False Electrical Electrical
4 105 NaN NaN True Electrical NaN
5 106 2.0 CS False CS CS
6 107 3.0 Metallurgy False Metallurgy Metallurgy
7 108 4.0 NaN False Metallurgy NaN
8 109 NaN Chemical False Chemical Chemical
9 110 3.0 Civil True Civil Civil
In [ ]:
rooms['dept'].fillna(method = 'bfill', inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil Civil
1 102 2.0 Civil False Civil Civil
2 103 NaN NaN True Civil Electrical
3 104 4.0 Electrical False Electrical Electrical
4 105 NaN NaN True Electrical CS
5 106 2.0 CS False CS CS
6 107 3.0 Metallurgy False Metallurgy Metallurgy
7 108 4.0 NaN False Metallurgy Chemical
8 109 NaN Chemical False Chemical Chemical
9 110 3.0 Civil True Civil Civil
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil Civil
1 102 2.0 Civil False Civil Civil
2 103 NaN NaN True Civil Electrical
3 104 4.0 Electrical False Electrical Electrical
4 105 NaN NaN True Electrical CS
5 106 2.0 CS False CS CS
6 107 3.0 Metallurgy False Metallurgy Metallurgy
7 108 4.0 NaN False Metallurgy Chemical
8 109 NaN Chemical False Chemical Chemical
9 110 3.0 Civil True Civil Civil
In [ ]:
rooms['num_students'].fillna(rooms['num_students'].median(), inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101 1.0 Civil True Civil Civil
1 102 2.0 Civil False Civil Civil
2 103 3.0 NaN True Civil Electrical
3 104 4.0 Electrical False Electrical Electrical
4 105 3.0 NaN True Electrical CS
5 106 2.0 CS False CS CS
6 107 3.0 Metallurgy False Metallurgy Metallurgy
7 108 4.0 NaN False Metallurgy Chemical
8 109 3.0 Chemical False Chemical Chemical
9 110 3.0 Civil True Civil Civil

to delete a particular cell in a column

In [ ]:
rooms.at[6, 'room_number'] = np.nan
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101.0 1.0 Civil True Civil Civil
1 102.0 2.0 Civil False Civil Civil
2 103.0 3.0 NaN True Civil Electrical
3 104.0 4.0 Electrical False Electrical Electrical
4 105.0 3.0 NaN True Electrical CS
5 106.0 2.0 CS False CS CS
6 NaN 3.0 Metallurgy False Metallurgy Metallurgy
7 108.0 4.0 NaN False Metallurgy Chemical
8 109.0 3.0 Chemical False Chemical Chemical
9 110.0 3.0 Civil True Civil Civil
106 NaN NaN NaN NaN NaN NaN
In [ ]:
rooms.drop(106, inplace=True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101.0 1.0 Civil True Civil Civil
1 102.0 2.0 Civil False Civil Civil
2 103.0 3.0 NaN True Civil Electrical
3 104.0 4.0 Electrical False Electrical Electrical
4 105.0 3.0 NaN True Electrical CS
5 106.0 2.0 CS False CS CS
6 NaN 3.0 Metallurgy False Metallurgy Metallurgy
7 108.0 4.0 NaN False Metallurgy Chemical
8 109.0 3.0 Chemical False Chemical Chemical
9 110.0 3.0 Civil True Civil Civil
In [ ]:
rooms['room_number'].interpolate(inplace = True)
In [ ]:
rooms
Out[ ]:
room_number num_students department occupied Dept2 dept
0 101.0 1.0 Civil True Civil Civil
1 102.0 2.0 Civil False Civil Civil
2 103.0 3.0 NaN True Civil Electrical
3 104.0 4.0 Electrical False Electrical Electrical
4 105.0 3.0 NaN True Electrical CS
5 106.0 2.0 CS False CS CS
6 107.0 3.0 Metallurgy False Metallurgy Metallurgy
7 108.0 4.0 NaN False Metallurgy Chemical
8 109.0 3.0 Chemical False Chemical Chemical
9 110.0 3.0 Civil True Civil Civil

Experiments on AMEO_15

In [ ]:
df = pd.read_excel('/content/Ameo15.xlsx')
In [ ]:
df.head()
Out[ ]:
id salary doj dol designation jobcity gender dob 10percentage 10board ... computer_science mechanical_engg electrical_engg telecom_engg civil_engg conscientious agreeableness extraversion nueroticism openess_to_experience
0 203097 420000 2012-06-01 present senior quality engineer Bangalore f 1990-02-19 84.3 board ofsecondary education,ap ... -1 -1 -1 -1 -1 0.9737 0.8128 0.5269 1.35490 -0.4455
1 579905 500000 2013-09-01 present assistant manager Indore m 1989-10-04 85.4 cbse ... -1 -1 -1 -1 -1 -0.7335 0.3789 1.2396 -0.10760 0.8637
2 810601 325000 2014-06-01 present systems engineer Chennai f 1992-08-03 85.0 cbse ... -1 -1 -1 -1 -1 0.2718 1.7109 0.1637 -0.86820 0.6721
3 267447 1100000 2011-07-01 present senior software engineer Gurgaon m 1989-12-05 85.6 cbse ... -1 -1 -1 -1 -1 0.0464 0.3448 -0.3440 -0.40780 -0.9194
4 343523 200000 2014-03-01 2015-03-01 00:00:00 get Manesar m 1991-02-27 78.0 cbse ... -1 -1 -1 -1 -1 -0.8810 -0.2793 -1.0697 0.09163 -0.1295

5 rows × 38 columns

In [ ]:
df['10board'].unique()
Out[ ]:
array(['board ofsecondary education,ap', 'cbse', 'state board',
       'mp board bhopal', 'icse',
       'karnataka secondary school of examination', 'up',
       'karnataka state education examination board', 'ssc',
       'kerala state technical education', 0, 'bseb',
       'state board of secondary education, andhra pradesh',
       'matriculation', 'gujarat state board', 'karnataka state board',
       'wbbse', 'maharashtra state board', 'icse board', 'up board',
       'board of secondary education(bse) orissa',
       'little jacky matric higher secondary school',
       'uttar pradesh board', 'bsc,orissa', 'mp board', 'upboard',
       'matriculation board', 'j & k bord', 'rbse',
       'central board of secondary education', 'pseb', 'jkbose',
       'haryana board of school education,(hbse)', 'metric', 'ms board',
       'kseeb', 'stateboard', 'maticulation',
       'karnataka secondory education board', 'mumbai board', 'sslc',
       'kseb', 'board secondary  education', 'matric board',
       'board of secondary education',
       'west bengal board of secondary education',
       'jharkhand secondary examination board,ranchi', 'u p board',
       'bseb,patna', 'hsc', 'bse', 'sss pune',
       'karnataka education board (keeb)', 'kerala',
       'state board of secondary education( ssc)', 'gsheb',
       'up(allahabad)', 'nagpur', 'don bosco maatriculation school',
       'karnataka state secondary education board', 'maharashtra',
       'karnataka secondary education board',
       'himachal pradesh board of school education',
       'certificate of middle years program of ib',
       'karnataka board of secondary education',
       'board of secondary education rajasthan', 'uttarakhand board',
       'ua', 'board of secendary education orissa',
       'karantaka secondary education and examination borad', 'hbsc',
       'kseeb(karnataka secondary education examination board)',
       'cbse[gulf zone]', 'hbse', 'state(karnataka board)',
       'jharkhand accademic council',
       'jharkhand secondary examination board (ranchi)',
       'karnataka secondary education examination board', 'delhi board',
       'mirza ahmed ali baig', 'jseb', 'bse, odisha', 'bihar board',
       'maharashtra state(latur board)', 'rajasthan board', 'mpboard',
       'upbhsie', 'secondary board of rajasthan',
       'tamilnadu matriculation board', 'jharkhand secondary board',
       'board of secondary education,andhara pradesh', 'up baord',
       'state', 'board of intermediate education',
       'state board of secondary education,andhra pradesh',
       'up board , allahabad',
       'stjosephs girls higher sec school,dindigul', 'maharashtra board',
       'education board of kerala', 'board of ssc',
       'maharashtra state board pune',
       'board of school education harayana',
       'secondary school cerfificate', 'maharashtra sate board', 'ksseb',
       'bihar examination board, patna', 'latur',
       'board of secondary education, rajasthan', 'state borad hp',
       'cluny', 'bsepatna', 'up borad', 'ssc board of andrapradesh',
       'matric', 'bse,orissa', 'ssc-andhra pradesh', 'mp',
       'karnataka education board', 'mhsbse',
       'karnataka sslc board bangalore', 'karnataka', 'u p',
       'secondary school of education', 'state board of karnataka',
       'karnataka secondary board', 'andhra pradesh board ssc',
       'stjoseph of cluny matrhrsecschool,neyveli,cuddalore district',
       'hse,orissa', 'national public school', 'nagpur board',
       'jharkhand academic council', 'bsemp',
       'board of secondary education, andhra pradesh',
       'board of secondary education orissa',
       'board of secondary education,rajasthan(rbse)',
       'board of secondary education,ap',
       'board of secondary education,andhra pradesh',
       'jawahar navodaya vidyalaya', 'aisse',
       'karnataka board of higher education', 'bihar',
       'kerala state board', 'cicse', 'tn state board',
       'kolhapur divisional board, maharashtra',
       'bharathi matriculation school', 'uttaranchal state board',
       'wbbsce', 'mp state board', 'seba(assam)', 'anglo indian', 'gseb',
       'uttar pradesh', 'ghseb', 'board of school education uttarakhand',
       'msbshse,pune', 'tamilnadu state board', 'kerala university',
       'uttaranchal shiksha avam pariksha parishad',
       'bse(board of secondary education)',
       'bright way college, (up board)',
       'school secondary education, andhra pradesh',
       'secondary state certificate',
       'maharashtra state board of secondary and higher secondary education,pune',
       'andhra pradesh state board', 'stmary higher secondary', 'cgbse',
       'secondary school certificate', 'rajasthan board ajmer', 'mpbse',
       'pune board', 'cbse ', 'board of secondary education,orissa',
       'maharashtra state board,pune', 'up bord',
       'kiran english medium high school', 'state board (jac, ranchi)',
       'gujarat board', 'state board ', 'sarada high scchool',
       'kalaimagal matriculation higher secondary school',
       'karnataka board', 'maharastra board', 'sslc board',
       'ssc maharashtra board', 'tamil nadu state', 'uttrakhand board',
       'bihar secondary education board,patna',
       'haryana board of school education',
       'sri kannika parameswari highier secondary school, udumalpet',
       'ksseb(karnataka state board)', 'nashik board',
       'jharkhand secondary education board', 'himachal pradesh board',
       'maharashtra satate board',
       'maharashtra state board mumbai divisional board',
       'dav public school,hehal',
       'state board of secondary education, ap',
       'rajasthan board of secondary education', 'hsce',
       'karnataka secondary education',
       'board of secondary education,odisha', 'maharashtra nasik board',
       'west bengal board of secondary examination (wbbse)',
       'holy cross matriculation hr sec school', 'cbsc', 'apssc',
       'bseb patna', 'kolhapur', 'bseb, patna', 'up board allahabad',
       'biharboard', 'nagpur board,nagpur', 'pune', 'gyan bharati school',
       'rbse,ajmer', 'board of secondaray education',
       'secondary school education', 'state bord', 'jbse,jharkhand',
       'hse', 'madhya pradesh board', 'bihar school examination board',
       'west bengal board of secondary eucation', 'state boardmp board ',
       'icse board , new delhi',
       'board of secondary education (bse) orissa',
       'maharashtra state board for ssc',
       'board of secondary school education', 'latur board',
       "stmary's convent inter college", 'nagpur divisional board',
       'ap state board', 'cgbse raipur', 'uttranchal board', 'ksbe',
       'central board of secondary education, new delhi',
       'bihar school examination board patna', 'cbse board',
       'sslc,karnataka', 'mp-bse', 'up bourd', 'dav public school sec 14',
       'board of school education haryana',
       'council for indian school certificate examination',
       'aurangabad board', 'j&k state board of school education',
       'maharashtra state board of secondary and higher secondary education',
       'maharashtra state boar of secondary and higher secondary education',
       'ssc regular', 'karnataka state examination board', 'nasik',
       'west bengal  board of secondary education', 'up board,allahabad',
       'bseb ,patna',
       'state board - west bengal board of secondary education : wbbse',
       'maharashtra state board of secondary & higher secondary education',
       'delhi public school', 'karnataka secondary eduction',
       'secondary education board of rajasthan',
       'maharashtra board, pune', 'rbse (state board)', 'apsche',
       'board of  secondary education',
       'board of high school and intermediate education uttarpradesh',
       'kea', 'board of secondary education - andhra pradesh',
       'ap state board for secondary education', 'seba',
       'punjab school education board, mohali',
       'jharkhand acedemic council', 'hse,board',
       'board of ssc education andhra pradesh', 'up-board', 'bse,odisha',
       'kebb', 'ssm school', 'upb', 'chhattishgarh', 'vjrschool',
       'west bengal board', 'wbbose', 'uttar pradesh(up) board', 'cisce',
       'matriculaton', 'sse', 'stanthony', 'kses',
       'council of indian school certificate examinations',
       'secondary school certificate(ssc)',
       'punjab school education board', 'c b s e', 'secondary school',
       'rajasthan board of secondary education,ajmer,rajasthan', 'aislc',
       'state board of gujarat', 'uttaranchal board',
       'state board of andhra pradesh',
       'maharashtra state board of technical education', 'ua board',
       'kerala board for public examination',
       'lions matric higher secondary school, dindigul',
       'jharkhand board', 'kerala state, board of examination',
       'kendriya vidyalaya island grounds', 'bse, orissa',
       'jharkhand academic council,ranchi', 'pseb mohali',
       'aligarh muslim university', 'karnataka higher secondary board',
       'thslc,board of technical education, kerala',
       'state syllabus( karnataka  state sslc board)',
       'andhra pradesh board of secondary education', 'ap state',
       'haryana board', 'karnataka state secondary education',
       'dasmesh public school', 'dsse',
       'central board of secondary education(cbse)',
       'karnataka secondary education examination',
       'sri venkateshwara matriculation school', 'gshseb',
       'jamia millia islamia', 'matriculation examination',
       'jharkhand secondary examination board', 'punjab board',
       'up boadr', 'm p board', 'board of secondary education hyderabad',
       'gseb, gandhinagar', ' board of secondary education',
       'state board of tamilnadu', 'board of secondary education, orissa',
       'karnataka secondary examination board',
       "st clare's sr sec school", 'state board, kerala', 'jac',
       'tamil nadu matriculation board',
       'state boadr of school education', 'ssc,ap', 'msbshse',
       'mp board of secondary education', 'b s e b, patna',
       'karanataka state board',
       'indian certificate for secondary education',
       'board of secondary education,ajmer(rajasthan)',
       'board of secondery education', "stxavier's school",
       'state board(tamil)', 'nalanda vidhya bhavan high school',
       'matriculation examinatopm'], dtype=object)
In [ ]:
df.tail()
Out[ ]:
ID Salary DOJ DOL Designation JobCity Gender DOB 10percentage 10board ... ComputerScience MechanicalEngg ElectricalEngg TelecomEngg CivilEngg conscientiousness agreeableness extraversion nueroticism openess_to_experience
5493 407755 ? ? ? ? ? m 1991-01-10 82.88 kseeb ... -1 -1 -1 -1 -1 -0.7651 -1.5273 -1.0697 0.0035 -1.5513
5494 575154 ? ? ? ? ? m 1991-06-26 87.00 cbse ... -1 -1 -1 366 -1 0.2718 0.0459 0.3174 0.7798 -0.4776
5495 216598 ? ? ? ? ? m 1989-01-12 84.50 cbse ... -1 -1 -1 -1 -1 -0.1082 -0.2793 0.5269 -0.9953 0.6603
5496 919755 ? ? ? ? ? m 1991-08-10 63.33 cbse ... -1 -1 -1 -1 -1 -0.8772 -1.4526 0.1637 -0.1076 -2.2021
5497 1037308 ? ? ? ? ? m 1990-10-23 66.00 cbse ... -1 438 -1 -1 -1 -0.3027 -1.2861 0.0100 0.2727 -0.6692

5 rows × 38 columns

In [ ]:
df.isna().any().count()
Out[ ]:
38
In [ ]:
df.isna().any()
Out[ ]:
ID                       False
Salary                   False
DOJ                      False
DOL                      False
Designation              False
JobCity                  False
Gender                   False
DOB                      False
10percentage             False
10board                  False
12graduation             False
12percentage             False
12board                  False
CollegeID                False
CollegeTier              False
Degree                   False
Specialization           False
collegeGPA               False
CollegeCityID            False
CollegeCityTier          False
CollegeState             False
GraduationYear           False
English                  False
Logical                  False
Quant                    False
Domain                   False
ComputerProgramming      False
ElectronicsAndSemicon    False
ComputerScience          False
MechanicalEngg           False
ElectricalEngg           False
TelecomEngg              False
CivilEngg                False
conscientiousness        False
agreeableness            False
extraversion             False
nueroticism              False
openess_to_experience    False
dtype: bool
In [ ]:
df.isnull()
Out[ ]:
ID Salary DOJ DOL Designation JobCity Gender DOB 10percentage 10board ... ComputerScience MechanicalEngg ElectricalEngg TelecomEngg CivilEngg conscientiousness agreeableness extraversion nueroticism openess_to_experience
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5493 False False False False False False False False False False ... False False False False False False False False False False
5494 False False False False False False False False False False ... False False False False False False False False False False
5495 False False False False False False False False False False ... False False False False False False False False False False
5496 False False False False False False False False False False ... False False False False False False False False False False
5497 False False False False False False False False False False ... False False False False False False False False False False

5498 rows × 38 columns

In [ ]:
df.isnull().any()
Out[ ]:
ID                       False
Salary                   False
DOJ                      False
DOL                      False
Designation              False
JobCity                  False
Gender                   False
DOB                      False
10percentage             False
10board                  False
12graduation             False
12percentage             False
12board                  False
CollegeID                False
CollegeTier              False
Degree                   False
Specialization           False
collegeGPA               False
CollegeCityID            False
CollegeCityTier          False
CollegeState             False
GraduationYear           False
English                  False
Logical                  False
Quant                    False
Domain                   False
ComputerProgramming      False
ElectronicsAndSemicon    False
ComputerScience          False
MechanicalEngg           False
ElectricalEngg           False
TelecomEngg              False
CivilEngg                False
conscientiousness        False
agreeableness            False
extraversion             False
nueroticism              False
openess_to_experience    False
dtype: bool
In [ ]:
df.dtypes
Out[ ]:
id                                int64
salary                            int64
doj                      datetime64[ns]
dol                              object
designation                      object
jobcity                          object
gender                           object
dob                      datetime64[ns]
10percentage                    float64
10board                          object
12graduation                      int64
12percentage                    float64
12board                          object
collegeid                         int64
collegetier                       int64
degree                           object
specialization                   object
collegegpa                      float64
collegecityid                     int64
collegecitytier                   int64
collegestate                     object
graduationyear                    int64
english                           int64
logical                           int64
quant                             int64
domain                          float64
computer_programming              int64
electronics_semicon               int64
computer_science                  int64
mechanical_engg                   int64
electrical_engg                   int64
telecom_engg                      int64
civil_engg                        int64
conscientious                   float64
agreeableness                   float64
extraversion                    float64
nueroticism                     float64
openess_to_experience           float64
dtype: object
In [ ]:
df.Gender.unique()
Out[ ]:
array(['f', 'm'], dtype=object)
In [ ]:
sns.violinplot(x = 'gender', y='salary', data = df)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efee622b890>
In [ ]:
df[['10percentage', '12percentage', 'collegegpa', 'gender']].groupby('gender').mean()
Out[ ]:
10percentage 12percentage collegegpa
gender
f 81.039321 76.802306 74.112179
m 77.087557 73.886965 70.686414
In [ ]:
df[['10percentage', '12percentage', 'collegegpa', 'gender']].groupby('gender').median()
Out[ ]:
10percentage 12percentage collegegpa
gender
f 82.5 77.0 74.0
m 78.2 73.8 71.0
In [ ]:
df[['conscientious', 'agreeableness', 
    'extraversion', 'nueroticism', 'openess_to_experience',
    'gender']].groupby('gender').mean()
Out[ ]:
conscientious agreeableness extraversion nueroticism openess_to_experience
gender
f 0.144125 0.314236 0.037868 -0.157495 0.060083
m -0.096661 0.105828 0.007920 -0.144813 -0.189094
In [ ]:
df[['conscientious', 'agreeableness', 
    'extraversion', 'nueroticism', 'openess_to_experience',
    'gender']].groupby('gender').median()
Out[ ]:
conscientious agreeableness extraversion nueroticism openess_to_experience
gender
f 0.2718 0.3789 0.1637 -0.2344 0.0973
m -0.0154 0.2124 0.0914 -0.1727 -0.0943
In [ ]:
df[['salary', 'gender']].groupby('gender').mean()
Out[ ]:
salary
gender
f 295876.865672
m 308740.740741

define a threshold for salary

In [ ]:
df['salary'].mean()
Out[ ]:
305605.49290651147
In [ ]:
df['salary'].mean() + df['salary'].std()
Out[ ]:
508615.72317936417
In [ ]:
th = df['salary'].mean() + df['salary'].std()
In [ ]:
df['HighIncome'] = (df.salary > th)
In [ ]:
df.sample(10)
Out[ ]:
id salary doj dol designation jobcity gender dob 10percentage 10board ... mechanical_engg electrical_engg telecom_engg civil_engg conscientious agreeableness extraversion nueroticism openess_to_experience HighIncome
1021 407662 360000 2013-06-01 2015-06-01 00:00:00 java software engineer Bangalore m 1989-12-31 81.60 cbse ... -1 -1 -1 -1 -0.1082 0.8518 -0.1626 -0.7603 -0.2875 False
3590 242719 220000 2011-11-01 2013-05-01 00:00:00 software developer Bangalore m 1986-10-13 58.00 0 ... -1 -1 -1 -1 0.0464 0.5008 -0.1988 -1.2303 0.8183 False
2246 1179971 120000 2014-12-01 present android developer Kochi/Cochin m 1992-10-03 80.00 state board ... -1 -1 -1 -1 0.2718 -1.2861 -1.5270 -1.2486 -0.8608 False
1806 1088588 300000 2014-07-01 present technical consultant Chittor m 1993-07-02 88.50 state board ... 438 -1 -1 -1 0.7027 0.0459 -0.9122 -0.1076 -0.6692 False
58 1063642 200000 2013-06-01 2014-01-01 00:00:00 project engineer Gaziabaad m 1990-03-27 63.30 state board ... -1 -1 -1 -1 -1.0208 -0.6201 -0.2974 -0.1076 -1.2440 False
3412 351702 480000 2012-07-01 present senior engineer Pune m 1989-11-05 81.60 0 ... -1 -1 -1 -1 -1.3447 -0.9033 -0.8157 -0.0552 -1.7093 False
1124 1072371 240000 2015-02-01 present software developer Noida m 1991-06-03 78.66 cbse ... -1 -1 -1 -1 0.1282 0.2124 0.4711 0.0192 1.2470 False
4573 852122 360000 2013-09-01 present assistant system engineer Tadepally Gudem m 1988-08-12 61.40 cbse ... -1 -1 -1 -1 -0.8772 -0.7866 0.0100 -0.1076 -2.0105 False
3888 604821 280000 2013-05-01 2014-05-01 00:00:00 test technician Tadepally Gudem f 1991-11-14 81.20 cbse ... -1 -1 -1 -1 0.8463 -0.1206 0.1637 -0.4879 0.0973 False
2703 1059888 500000 2014-08-01 present software engineer Chennai m 1993-06-04 94.00 state board ... -1 -1 -1 -1 0.5591 0.5454 -0.6048 0.5262 0.0973 False

10 rows × 39 columns

In [ ]:
df[['salary', 'HighIncome', 'gender']].groupby(['HighIncome', 'gender']).mean()
Out[ ]:
salary
HighIncome gender
False f 270181.962025
m 269781.437908
True f 723223.684211
m 756246.246246
In [ ]:
df[['salary', 'HighIncome', 'gender']].groupby(['HighIncome', 'gender']).count()
Out[ ]:
salary
HighIncome gender
False f 1264
m 3825
True f 76
m 333
In [ ]:
print("Low income female percentage ", 1264/3825*100)
Low income female percentage  33.04575163398693
In [ ]:
print("High income female percentage ", 76/333*100)
High income female percentage  22.822822822822822
In [ ]:
print("Low income female percentage ", 1264/(1264+3825)*100)
Low income female percentage  24.837885635684813
In [ ]:
print("High income female percentage ", 76/(76+333)*100)
High income female percentage  18.581907090464547
In [ ]:
df_ = df
In [ ]:
df_ = df_[['electrical_engg', 'telecom_engg', 'civil_engg' ]].sample(30)
In [ ]:
df_
Out[ ]:
electrical_engg telecom_engg civil_engg
3321 -1 -1 -1
2850 -1 -1 -1
3030 -1 -1 -1
900 -1 446 -1
1840 -1 -1 -1
1537 -1 -1 -1
911 -1 -1 -1
3078 -1 -1 -1
5451 -1 -1 -1
3492 -1 -1 -1
4387 -1 -1 -1
2007 -1 393 -1
3564 -1 -1 -1
373 -1 -1 -1
4471 -1 -1 -1
2799 -1 -1 -1
2365 -1 -1 -1
641 612 -1 -1
3119 -1 -1 -1
1209 -1 446 -1
1280 -1 -1 -1
5238 -1 -1 -1
3418 -1 -1 -1
864 -1 -1 -1
3687 -1 -1 -1
3680 -1 260 -1
146 -1 -1 -1
2465 -1 -1 -1
961 -1 -1 -1
4214 -1 -1 -1
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498 entries, 0 to 5497
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     5498 non-null   int64         
 1   salary                 5498 non-null   int64         
 2   doj                    5498 non-null   datetime64[ns]
 3   dol                    5498 non-null   object        
 4   designation            5498 non-null   object        
 5   jobcity                5498 non-null   object        
 6   gender                 5498 non-null   object        
 7   dob                    5498 non-null   datetime64[ns]
 8   10percentage           5498 non-null   float64       
 9   10board                5498 non-null   object        
 10  12graduation           5498 non-null   int64         
 11  12percentage           5498 non-null   float64       
 12  12board                5498 non-null   object        
 13  collegeid              5498 non-null   int64         
 14  collegetier            5498 non-null   int64         
 15  degree                 5498 non-null   object        
 16  specialization         5498 non-null   object        
 17  collegegpa             5498 non-null   float64       
 18  collegecityid          5498 non-null   int64         
 19  collegecitytier        5498 non-null   int64         
 20  collegestate           5498 non-null   object        
 21  graduationyear         5498 non-null   int64         
 22  english                5498 non-null   int64         
 23  logical                5498 non-null   int64         
 24  quant                  5498 non-null   int64         
 25  domain                 5498 non-null   float64       
 26  computer_programming   5498 non-null   int64         
 27  electronics_semicon    5498 non-null   int64         
 28  computer_science       5498 non-null   int64         
 29  mechanical_engg        5498 non-null   int64         
 30  electrical_engg        5498 non-null   int64         
 31  telecom_engg           5498 non-null   int64         
 32  civil_engg             5498 non-null   int64         
 33  conscientious          5498 non-null   float64       
 34  agreeableness          5498 non-null   float64       
 35  extraversion           5498 non-null   float64       
 36  nueroticism            5498 non-null   float64       
 37  openess_to_experience  5498 non-null   float64       
 38  HighIncome             5498 non-null   bool          
 39  EnggTrade              63 non-null     object        
dtypes: bool(1), datetime64[ns](2), float64(9), int64(18), object(10)
memory usage: 1.6+ MB
In [ ]:
def trade_categoriese(row):
    if row['electrical_engg'] > 0:
        return 'ee'
    elif row['telecom_engg'] > 0:
        return 'te'
    elif row['civil_engg'] > 0:
        return 'ce'
    elif row['computer_programming'] > 0:
        return 'cp'
    elif row['electronics_semicon'] > 0:
        return 'es'
    elif row['computer_science'] > 0:
        return 'cs'
    elif row['mechanical_engg'] > 0:
        return 'me'
In [ ]:
df['EnggTrade'] = df.apply(lambda row: trade_categoriese(row), axis=1)
In [ ]:
df.head()
Out[ ]:
id salary doj dol designation jobcity gender dob 10percentage 10board ... electrical_engg telecom_engg civil_engg conscientious agreeableness extraversion nueroticism openess_to_experience HighIncome EnggTrade
0 203097 420000 2012-06-01 present senior quality engineer Bangalore f 1990-02-19 84.3 board ofsecondary education,ap ... -1 -1 -1 0.9737 0.8128 0.5269 1.35490 -0.4455 False cp
1 579905 500000 2013-09-01 present assistant manager Indore m 1989-10-04 85.4 cbse ... -1 -1 -1 -0.7335 0.3789 1.2396 -0.10760 0.8637 False es
2 810601 325000 2014-06-01 present systems engineer Chennai f 1992-08-03 85.0 cbse ... -1 -1 -1 0.2718 1.7109 0.1637 -0.86820 0.6721 False cp
3 267447 1100000 2011-07-01 present senior software engineer Gurgaon m 1989-12-05 85.6 cbse ... -1 -1 -1 0.0464 0.3448 -0.3440 -0.40780 -0.9194 True cp
4 343523 200000 2014-03-01 2015-03-01 00:00:00 get Manesar m 1991-02-27 78.0 cbse ... -1 -1 -1 -0.8810 -0.2793 -1.0697 0.09163 -0.1295 False es

5 rows × 40 columns

In [ ]:
dx = df[['gender', "EnggTrade", 'salary', ]].groupby(['EnggTrade', 'gender']).mean()
In [ ]:
dx
Out[ ]:
salary
EnggTrade gender
ce f 334000.000000
m 341862.745098
cp f 298923.303835
m 317064.347826
cs f 216666.666667
m 225000.000000
ee f 259270.833333
m 268513.513514
es f 293269.230769
m 305334.394904
me f 319642.857143
m 300147.208122
te f 293565.573770
m 281629.353234
In [ ]:
dx.style.highlight_max(color='green').highlight_min(color='red')
Out[ ]:
    salary
EnggTrade gender  
ce f 334000.000000
m 341862.745098
cp f 298923.303835
m 317064.347826
cs f 216666.666667
m 225000.000000
ee f 259270.833333
m 268513.513514
es f 293269.230769
m 305334.394904
me f 319642.857143
m 300147.208122
te f 293565.573770
m 281629.353234
In [ ]:
!pip install nbconvert
In [ ]:
%shell jupyter nbconvert --to html /content/testfile.ipynb