# Difference makes the DIFFERENCE
import numpy as np
import pandas as pd
import seaborn as sns
# An all-integer list yields an integer array.
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
x.sum()
print(x.dtype)

# One float among the values (6.2) makes the whole array floating point.
x = np.array([1, 2, 3, 4, 5, 6.2, 7, 8, 9, 10])
x.dtype
x.sum()
# A single non-numeric placeholder ('--') makes NumPy store every element as
# text, so sum() raises TypeError. The error is caught and reported so that
# this demonstration does not abort the statements that follow it.
x = np.array([1, 2, 3, 4, 5, '--', 7, 8, 9, 10])
try:
    x.sum()
except TypeError as err:
    print("sum() failed on the '--' array:", err)
x.dtype

# Using None instead of '--' keeps the numbers as Python objects (object
# dtype), but sum() still raises TypeError: None cannot be added to an int.
x = np.array([1, 2, 3, 4, 5, None, 7, 8, 9, 10])
try:
    x.sum()
except TypeError as err:
    print("sum() failed on the None array:", err)
# Either way, no arithmetic is possible while the placeholder is in the array.
# np.nan keeps the array numeric, but it propagates: any arithmetic that
# touches it (a sum, a product) evaluates to nan.
x = np.array([1, 2, 3, np.nan, 7])
x.sum()
25 * np.nan

# A boolean mask with False at the NaN position selects only the valid values.
x = np.array([1, 2, 3, np.nan, 7])
# values:        1,    2,    3,    nan,   7
x_b = np.array([True, True, True, False, True])
x[x_b]
x[x_b].sum()

# This mask hides the 2 instead of the NaN, so the NaN is still selected
# and the sum is nan again.
# values:        1,    2,     3,    nan,  7
x_b = np.array([True, False, True, True, True])
x[x_b].sum()

# With the correct mask the mean is taken over the four valid values only.
x = np.array([1, 2, 3, np.nan, 7])
# values:        1,    2,    3,    nan,   7
x_b = np.array([True, True, True, False, True])
x[x_b].mean()
# Find the gaps in a run of serial numbers.
# The scanned range goes from arr[0] to arr[-1], so the first and last
# entries are taken as the lower and upper bounds of the sequence.
arr = [1, 2, 4, 5, 6, 7, 9, 10, 12, 25, 30]
missing = []  # collects every number of the range that is absent from arr
present = set(arr)  # set membership avoids rescanning the list per number
for i in range(arr[0], arr[-1] + 1):
    if i not in present:
        missing.append(i)
print(missing)
# Same gap search as a one-liner: every number between the first and last
# entry, minus the numbers that are present, in ascending order.
arr = [1, 4, 5, 7, 9, 10]
missing_values = sorted(set(range(arr[0], arr[-1] + 1)).difference(arr))
print(missing_values)
# len() counts the NaN entry like any other element.
arr = np.array([1, 2, 3, np.nan, 7])
len(arr)

# Build a validity mask: True where a real value is present, False at NaN.
# np.isnan works on the whole array at once, so no explicit loop is needed;
# tolist() turns the result into a plain list of Python bools.
arr = np.array([1, 2, 3, np.nan, 7, 10, np.nan, 15])
bl_array = (~np.isnan(arr)).tolist()
print(bl_array)
import numbers

# Mixed placeholders (None, 'missing', '--') force an object array, so each
# element keeps its own Python type and can be tested individually.
arr = np.array([1, np.nan, None, 'missing', 3, '--', 7])
# True for anything that is a number, False for every other placeholder.
# NOTE: np.nan is a float, so it is flagged True here as well.
bl_array = [isinstance(i, numbers.Number) for i in arr]
print(bl_array)
# Same validity mask, this time by exact type: only plain int and float
# elements count as numeric (subclasses are deliberately not matched).
# NOTE: np.nan is a float, so it is flagged True here as well.
arr = np.array([1, np.nan, None, 'missing', 3, '--', 7])
bl_array = [type(i) in (int, float) for i in arr]
print(bl_array)
# Build a mask in the convention np.ma expects: 1 marks an entry to hide,
# 0 marks an entry to keep. Plain int/float elements are kept.
arr = np.array([1, 2, None, 'missing', 3, '--', 7])
bl_array = [0 if type(i) in (int, float) else 1 for i in arr]
print(bl_array)

# Wrapping the data with the mask lets sum() and mean() skip hidden entries.
m_x = np.ma.masked_array(arr, mask=bl_array)
m_x
print(m_x)
m_x.sum()
m_x.mean()
# Hand-made hostel table with several different "missing" markers
# ('', ' ', 'empty', 'na', 'n/a', '--') mixed into the columns.
room = {
    "room number": [101, 102, 103, 104, 105, '', 107, 108, 109, 110],
    "num_students": [1, 2, "", 4, 'empty', 2, 3, 4, 'na', 3],
    "Department": ["Civil", "Civil", "", "Electrical", "n/a", "CS",
                   "Metallurgy", "na", "Chemical", "Civil"],
    "Occupied": ['y', 'n', 'y', ' ', 'y', '--', 'n', 'na', '--', 'y'],
}
df = pd.DataFrame(room)
df
df.dtypes

# Convert the room numbers to a numeric column and look at the dtypes again.
df['room number'] = pd.to_numeric(df['room number'])
df.dtypes
# IPython %timeit magics (notebook syntax, not plain Python).
# Time the sum of one million values stored with dtype="int" ...
%timeit np.arange(1000000, dtype="int").sum()
# ... against the same range stored with dtype="object", to compare the cost
# of the two storage types.
%timeit np.arange(1000000, dtype="object").sum()
# Null flags and null count for the room-number column.
room_number_nulls = df['room number'].isnull()
room_number_nulls
room_number_nulls.sum()

# Null flags for the whole frame, then the number of nulls per column.
frame_nulls = df.isnull()
frame_nulls
frame_nulls.sum()
# First pass: load the CSV with pandas' default missing-value detection
# and check the dtypes and per-column null counts.
rooms = pd.read_csv('/content/rooms.csv')
rooms
rooms.dtypes
rooms.isnull().sum()

# Second pass: additionally treat these placeholder strings as missing.
missing_values = ['na', 'n/a', 'empty', 'NA', '--']
rooms = pd.read_csv('/content/rooms.csv', na_values=missing_values)
rooms
rooms.isnull()
rooms['department'].unique()
rooms['room_number'].count()
# Treat a missing occupancy flag as "n".
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write the inplace call only changes a
# temporary object and leaves `rooms` untouched.
rooms['occupied'] = rooms['occupied'].fillna("n")
rooms
def Convert_Y_to_True(v):
    """Return True when ``v`` equals the string "y", otherwise False.

    The comparison is exact, so "Y", " y" or any non-string value gives False.
    """
    return bool(v == "y")
# Run every occupancy flag through Convert_Y_to_True and store the result
# back in the same column.
occupied_converted = rooms['occupied'].apply(Convert_Y_to_True)
rooms['occupied'] = occupied_converted
rooms
def Convert_Y_to_True(v):
    """Return "y" when ``v`` equals True, otherwise "n".

    NOTE: despite its name this is the reverse mapping (boolean -> "y"/"n"),
    and it replaces the earlier function of the same name from here on.
    """
    # Equality rather than identity on purpose: values that merely compare
    # equal to True (e.g. 1) are also mapped to "y".
    return "y" if v == True else "n"  # noqa: E712
# Convert_Y_to_True now refers to the most recent definition, which maps
# True to "y" and everything else to "n".
rooms['occupied'] = rooms['occupied'].apply(Convert_Y_to_True)
rooms

# The same kind of mapping written inline, following the pattern
#   lambda x: 'found' if x == "Tie" else "not found"
# Both outcomes are real booleans; the previous version returned the
# string "False" for the negative case, leaving a mixed bool/str column.
rooms['occupied'] = rooms['occupied'].apply(lambda x: x == "y")
rooms
# Dept2: copy of the department column with each gap filled from the
# previous valid row (forward fill).
rooms['Dept2'] = rooms['department'].ffill()
rooms

# dept: copy of the department column with each gap filled from the next
# valid row (backward fill).
# The filled Series is assigned directly. The earlier attempt assigned the
# return value of fillna(..., inplace=True), which is None and wiped the
# column; the following drop() discarded its result, and the column then had
# to be rebuilt. fillna(method=...) is also deprecated in favour of
# ffill()/bfill().
rooms['dept'] = rooms['department'].bfill()
rooms
# Replace missing student counts with the column median. Assigned back
# rather than filled "inplace" on the selected column, which does not update
# the frame under pandas Copy-on-Write.
rooms['num_students'] = rooms['num_students'].fillna(rooms['num_students'].median())
rooms

# Blank out one room number (index label 6) to create a gap to interpolate.
rooms.at[6, 'room_number'] = np.nan
rooms

# NOTE(review): this removes the row whose index label is 106 and raises
# KeyError if no such label exists — confirm this is the intended row.
rooms.drop(106, inplace=True)
rooms

# Fill the remaining room-number gaps by interpolating between neighbours
# (again assigned back instead of an inplace call on the selected column).
rooms['room_number'] = rooms['room_number'].interpolate()
rooms
# Load the Ameo dataset and take a first look at it.
df = pd.read_excel('/content/Ameo15.xlsx')
df.head()
df['10board'].unique()
df.tail()

# Number of columns that contain at least one missing value.
# (.count() on the boolean Series would just return the number of columns.)
df.isna().any().sum()
df.isna().any()
df.isnull()
df.isnull().any()
df.dtypes

# The column is addressed as 'gender' (lower case) everywhere else in this
# analysis, so the same spelling is used here.
# NOTE(review): confirm against the spreadsheet header.
df['gender'].unique()
# Salary distribution per gender.
sns.violinplot(data=df, x='gender', y='salary')

# Academic scores per gender: mean, then median.
academic_cols = ['10percentage', '12percentage', 'collegegpa']
df.groupby('gender')[academic_cols].mean()
df.groupby('gender')[academic_cols].median()

# Personality scores per gender: mean, then median.
personality_cols = ['conscientious', 'agreeableness', 'extraversion',
                    'nueroticism', 'openess_to_experience']
df.groupby('gender')[personality_cols].mean()
df.groupby('gender')[personality_cols].median()

# Mean salary per gender and overall.
df.groupby('gender')[['salary']].mean()
df['salary'].mean()

# High-income threshold: one standard deviation above the mean salary.
th = df['salary'].mean() + df['salary'].std()
th
df['HighIncome'] = df['salary'] > th
df.sample(10)

# Mean salary and head-count for every (HighIncome, gender) combination.
df.groupby(['HighIncome', 'gender'])[['salary']].mean()
df.groupby(['HighIncome', 'gender'])[['salary']].count()
# Head-counts per income group.
# NOTE(review): presumably read off the (HighIncome, gender) count table;
# verify them against the data whenever the dataset changes.
low_income_female, low_income_other = 1264, 3825
high_income_female, high_income_other = 76, 333

# A percentage share is female / (female + other). Dividing by the other
# group's count alone gives a ratio between the groups, not a share, so only
# the corrected figures are printed.
low_income_female_pct = low_income_female / (low_income_female + low_income_other) * 100
high_income_female_pct = high_income_female / (high_income_female + high_income_other) * 100
print("Low income female percentage ", low_income_female_pct)
print("High income female percentage ", high_income_female_pct)
# Random sample of 30 rows, restricted to three of the trade-score columns.
df_ = df[['electrical_engg', 'telecom_engg', 'civil_engg']].sample(30)
df_

# Column overview of the full frame.
df.info()
def trade_categoriese(row):
    """Return the trade code of the first trade column with a positive value.

    ``row`` is anything indexable by column name (a DataFrame row, a dict).
    Columns are checked in the fixed order listed below and the first one
    whose value is greater than 0 decides the result. Columns after the
    first match are not read.

    Returns the two-letter code, or None when no column is positive.
    """
    # Order matters: it is the priority used when several columns are positive.
    trade_codes = (
        ('electrical_engg', 'ee'),
        ('telecom_engg', 'te'),
        ('civil_engg', 'ce'),
        ('computer_programming', 'cp'),
        ('electronics_semicon', 'es'),
        ('computer_science', 'cs'),
        ('mechanical_engg', 'me'),
    )
    for column, code in trade_codes:
        if row[column] > 0:
            return code
    return None
# Label every row with its engineering trade code.
df['EnggTrade'] = df.apply(trade_categoriese, axis=1)
df.head()

# Mean salary for every (trade, gender) combination, with the highest value
# shaded green and the lowest shaded red.
dx = df.groupby(['EnggTrade', 'gender'])[['salary']].mean()
dx
dx.style.highlight_max(color='green').highlight_min(color='red')
# Notebook shell escapes (IPython syntax, not plain Python).
# Install nbconvert with pip ...
!pip install nbconvert
# ... then convert the notebook at /content/testfile.ipynb to HTML.
# NOTE(review): %shell and the /content path look Colab-specific — confirm
# the environment this is meant to run in.
%shell jupyter nbconvert --to html /content/testfile.ipynb