# Difference makes the DIFFERENCE
import numpy as np
import pandas as pd
import seaborn as sns
# --- How NumPy arrays behave with different "missing value" markers --------
# Baseline: an all-integer array sums without trouble.
x = np.array([1, 2, 3, 4, 5])
x.sum()
print(x.dtype)

# A string placeholder ('--') changes the dtype NumPy infers for the array.
x = np.array([1, 2, 3, '--', 5])
print(x.dtype)
# Summing this array is expected to fail; report the error instead of letting
# it abort every statement that follows when the file runs as a script.
try:
    x.sum()
except TypeError as err:
    print("sum() failed for the '--' placeholder:", err)

# None as the placeholder: adding an int to None is a TypeError as well.
x = np.array([1, 2, 3, None, 5])
try:
    x.sum()
except TypeError as err:
    print("sum() failed for the None placeholder:", err)

# np.nan keeps the array numeric, but NaN propagates through arithmetic.
x = np.array([1, 2, 3, np.nan, 5])
x.sum()
1 * np.nan

# Option 1: select only the valid entries with a boolean index.
x_b = np.array([True, True, True, False, True])
x[x_b]
x[x_b].mean()

# Option 2: a masked array; a mask value of 1 hides the element at that
# position (here the NaN) from the computation.
m_x = np.ma.masked_array(x, mask=[0, 0, 0, 1, 0])
m_x.mean()
# Read the rooms table using pandas' default missing-value handling, then
# inspect the first rows and the dtype pandas assigned to each column.
df = pd.read_csv(filepath_or_buffer="rooms.csv")
df.head()
df.dtypes
# Time summing 100000 elements stored with dtype "int" versus dtype "object".
# The IPython ``%timeit`` magic is not valid Python outside a notebook, so the
# standard-library ``timeit`` module does the measurement instead.
import timeit

sum_timings = {}  # dtype name -> best observed seconds per call
for _dtype in ("int", "object"):
    # Three repeats of five calls each; keep the fastest repeat.
    _runs = timeit.repeat(
        lambda: np.arange(100000, dtype=_dtype).sum(), number=5, repeat=3
    )
    sum_timings[_dtype] = min(_runs) / 5
    print("dtype=%s:" % _dtype, sum_timings[_dtype], "s per loop")
# --- Finding missing values in the rooms table ------------------------------
# What pandas flags as missing with its default settings.
df.Room_Number.isnull()
df.Room_Number.isnull().sum()
df.isnull()
df.isnull().sum()

# Re-read the file, naming extra strings that should be parsed as missing.
missing_values = ["NA", "n/a", "na"]
df = pd.read_csv("rooms.csv",
                 na_values=missing_values)
df.isnull()
df.Num_Students.mean()

# Extend the marker list with "Empty" and "--" and read the file once more.
missing_values = ["NA", "n/a", "na", "Empty", "--"]
df = pd.read_csv("rooms.csv",
                 na_values=missing_values)
df.isnull()
df.Department.unique()

# Fill missing occupancy flags with "N". The result is assigned back to the
# column instead of calling fillna(inplace=True) on `df.Occupied`: that chained
# in-place form operates on an intermediate Series and does not update `df`
# when pandas Copy-on-Write is active.
df["Occupied"] = df["Occupied"].fillna("N")
df
def convert_to_binary(v: object) -> bool:
    """Return True when *v* equals the string ``'Y'``, otherwise False.

    The comparison result is passed through ``bool()`` so the function always
    yields a plain ``bool``, exactly as the explicit if/else form would.
    """
    return bool(v == 'Y')
# --- Filling the gaps in the rooms table -------------------------------------
# Every fill below assigns its result back to the column. Calling an in-place
# method on `df.<column>` is a chained operation on an intermediate Series and
# does not update `df` when pandas Copy-on-Write is active.

# Turn the 'Y' / other occupancy flag into a boolean column.
df["Occupied"] = df["Occupied"].apply(convert_to_binary)
df

# Keep a second copy of Department so both fill directions can be compared.
df["Dept2"] = df["Department"]
# Forward fill: carry the last seen value down into the gaps
# (replaces the deprecated fillna(method="pad")).
df["Department"] = df["Department"].ffill()
df
# Backward fill: pull the next seen value up into the gaps
# (replaces the deprecated fillna(method="bfill")).
df["Dept2"] = df["Dept2"].bfill()
df

# Replace missing student counts with the column median.
df["Num_Students"] = df["Num_Students"].fillna(df["Num_Students"].median())
df

# Estimate missing room numbers from neighbouring rows (interpolate() defaults).
df["Room_Number"] = df["Room_Number"].interpolate()
df
# --- AMEO 2015 workbook: load and first look ---------------------------------
# From here on `df` refers to this table instead of the rooms data.
df = pd.read_excel(io="ameo_2015.xlsx")
df.head()
df.shape
# Total number of missing cells across the whole table.
df.isna().sum().sum()
df.dtypes
# Distinct labels present in the Gender column.
df["Gender"].unique()
# Salary distribution drawn separately for each Gender label.
sns.violinplot(data=df, x="Gender", y="Salary");
# --- Per-gender summaries ----------------------------------------------------
# Column groups that are summarised for each Gender label.
score_columns = ['10percentage', '12percentage', 'collegeGPA']
trait_columns = ['conscientiousness', 'agreeableness', 'extraversion',
                 'nueroticism', 'openess_to_experience']
by_gender = df.groupby('Gender')

# Academic scores: mean, then median.
by_gender[score_columns].mean()
by_gender[score_columns].median()
# Personality-trait columns: mean, then median.
by_gender[trait_columns].mean()
by_gender[trait_columns].median()
# Average salary per gender.
by_gender[['Salary']].mean()
# --- High-income flag --------------------------------------------------------
# Threshold: one standard deviation above the mean salary.
salary = df["Salary"]
th = salary.mean() + salary.std()
df["HighIncome"] = salary.gt(th)
df.sample(n=10)
df.head()

# Mean salary and row count for every (HighIncome, Gender) combination.
income_by_gender = df.groupby(["HighIncome", "Gender"])[["Salary"]]
income_by_gender.mean()
income_by_gender.count()

# NOTE(review): the figures below are typed in by hand, presumably copied from
# the count table above; they will not follow changes in the data.
print('Low income female percentage', 917 / (2809 + 917) * 100)
print('High income female percentage', 40 / (232 + 40) * 100)
# --- College tier breakdown --------------------------------------------------
df["CollegeTier"].unique()

# Row counts for every (HighIncome, CollegeTier) combination.
df.groupby(["HighIncome", "CollegeTier"])[["Salary"]].count()
# NOTE(review): hand-entered figures, presumably read off the table above.
print('Low income college tier 2 percentage is', 3492 / (3492 + 234) * 100)
print('High income college tier 2 percentage is', 209 / (209 + 63) * 100)

# Row counts for every (CollegeTier, Gender) combination.
df.groupby(["CollegeTier", "Gender"])[["Salary"]].count()
# NOTE(review): hand-entered figures, presumably read off the table above.
print('In college tier 1 female percentage is', 51 / (246 + 51) * 100)
print('In college tier 2 female percentage is', 906 / (906 + 2795) * 100)
# --- apy.csv: load and drop incomplete rows ----------------------------------
# From here on `df` refers to this table; "=" is parsed as a missing value.
df = pd.read_csv(filepath_or_buffer='apy.csv', na_values="=")
df.head()
df["State_Name"].unique()
df["Crop_Year"].unique()
df.dtypes
df["Season"].unique()
df["Crop"].unique()
# NOTE(review): the converted Series is not stored anywhere, so this line only
# checks that Production can be parsed as numbers - confirm that is intended.
pd.to_numeric(df["Production"])
df["Production"].isna().sum()
# Table shape before and after removing rows that contain any missing value.
df.shape
df.dropna(inplace=True)
df.shape
# Density estimate and box plot for Production, then the same pair for Area.
production = df["Production"]
area = df["Area"]
sns.kdeplot(production)
sns.boxplot(production)
sns.boxplot(area)
sns.kdeplot(area)
# --- Aggregating by state and year -------------------------------------------
# The sums pass numeric_only=True so that only numeric columns are totalled.
# Without it, recent pandas versions also "sum" the text columns
# (District_Name, Season, Crop), which concatenates their strings.
df[df.State_Name == "Karnataka"]['District_Name'].unique()
df.groupby(['State_Name', 'Crop', 'Crop_Year']).sum(numeric_only=True)
df[df.State_Name == "West Bengal"]['Crop'].unique()
df.groupby(['State_Name', 'Crop_Year']).sum(numeric_only=True)

# One row per (State_Name, Crop_Year), with the group keys as ordinary columns.
df_ = df.groupby(['State_Name', 'Crop_Year']).sum(numeric_only=True)
df_.reset_index(inplace=True)
df_.head()
# Number of distinct years available for each state.
df_[['State_Name', 'Crop_Year']].groupby('State_Name').count()

# Production over time: one state first, then every state coloured by name.
# These plots use the un-aggregated rows of `df`.
sns.lineplot(x="Crop_Year", y="Production", data=df[df.State_Name == "Tamil Nadu"]);
sns.lineplot(x="Crop_Year", y="Production", data=df, hue="State_Name");
# --- Animated Area vs Production scatter, one frame per year -----------------
# The notebook-only `!pip3 install` shell escape is not valid Python. Import
# plotly_express and install it with the running interpreter's pip only when
# the import fails.
try:
    import plotly_express as px
except ImportError:
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "pip", "install", "plotly_express"],
        check=True,
    )
    import plotly_express as px

# Sort by year before plotting. Originally this sort came after the plot and
# just before `df_` is rebuilt, so it had no effect on anything.
# NOTE(review): presumably meant to order the animation frames - confirm.
df_.sort_values('Crop_Year', inplace=True)
px.scatter(df_, x="Area", y="Production", animation_frame="Crop_Year",
           animation_group="State_Name", color="State_Name")
# --- Restricting the analysis to four cereal crops ---------------------------
# Look at Kerala's rows for the year 2000, ordered by Production.
df[(df.State_Name == "Kerala") & (df.Crop_Year == 2000)].sort_values('Production')

# Per-state, per-year totals for Rice, Wheat, Maize and Ragi only.
# numeric_only=True keeps the sum to the numeric columns; without it, recent
# pandas versions also concatenate the text columns (District_Name, Season, Crop).
df_ = (
    df[df.Crop.isin(['Rice', 'Wheat', 'Maize', 'Ragi'])]
    .groupby(['State_Name', 'Crop_Year'])
    .sum(numeric_only=True)
)
df_.head()
df_.reset_index(inplace=True)
# Sort by year before the data is handed to the animated plots.
df_.sort_values('Crop_Year', inplace=True)
px.scatter(df_, x="Area", y="Production", animation_frame="Crop_Year",
           animation_group="State_Name", color="State_Name")

# Production per unit of Area.
# NOTE(review): rows whose summed Area is 0 would give inf/NaN here - confirm
# that cannot occur in this data.
df_['Efficiency'] = df_['Production'] / df_['Area']
px.scatter(df_, x="Area", y="Efficiency", size="Production", animation_frame="Crop_Year",
           animation_group="State_Name", color="State_Name", range_y=[0.75, 5], range_x=[-1E6, 20E6])