import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv('/content/clean_agri_ds.csv')

df.head(2)

df.describe()

df.drop(['Len State', 'Len Dist', 'Len Season', 'Len Crop'], inplace = True, axis=1)

df.head(2)

df.set_index(['Unnamed: 0'])

df.rename(columns = {'Unnamed: 0':"ColIndex"}, inplace = True)

df.set_index(['ColIndex'])

df['Crop_Year'] = df['Crop_Year'].astype(str)

df['ColIndex'] = df['ColIndex'].astype(str)

df.describe()

Conditionally retrieving data from a dataframe with QUERY¶

DataFrame.query(expr, inplace=False, **kwargs)

df.loc[df.State_Name == "Andhra Pradesh", 'District_Name'].unique()

array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)

df.loc[df['State_Name'] == 'Andhra Pradesh']['District_Name'].unique()

array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)

df.query('State_Name == "Andhra Pradesh"')['District_Name'].unique()

array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)

df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')

df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi"')

df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi" and Crop_Year == "2011"' )

df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Crop_Year == "2011" and Season == "Rabi" and Crop == "Onion"')

df_dist = df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')

df_dist.head(1)

def getDistNames_forState(strStateName):
    print(df.query('State_Name == "' + strStateName + '"')['Crop'].unique())

getDistNames_forState('Andhra Pradesh')

['Moong(Green Gram)' 'Rice' 'Coconut ' 'Cabbage' 'Banana' 'Jowar' 'Gram'
 'Sugarcane' 'Urad' 'Dry chillies' 'Sesamum' 'Groundnut' 'Ragi' 'Bajra'
 'Horse-gram' 'Sweet potato' 'Tapioca' 'Cotton(lint)' 'Tobacco'
 'Small millets' 'Onion' 'Mesta' 'Maize' 'Turmeric' 'Rapeseed &Mustard'
 'Other Kharif pulses' 'Citrus Fruit' 'Castor seed' 'Potato'
 'other oilseeds' 'Cashewnut' 'Tomato' 'Arhar/Tur' 'Brinjal'
 'Cowpea(Lobia)' 'Niger seed' 'Wheat' 'Coriander' 'Other Fresh Fruits'
 'Dry ginger' 'Orange' 'Papaya' 'Sunflower' 'Korra' 'Lemon' 'Bhindi'
 'Mango' 'Grapes' 'Safflower' 'Beans & Mutter(Vegetable)' 'Pome Granet'
 'Arecanut' 'Other  Rabi pulses' 'Samai' 'Soyabean' 'Masoor' 'Pome Fruit'
 'Sapota' 'other misc. pulses' 'Linseed' 'Ginger' 'Varagu' 'Garlic']

display(df['State_Name'].unique())

array(['Andhra Pradesh', 'Assam', 'Andaman and Nicobar Islands',
       'Arunachal Pradesh', 'Bihar'], dtype=object)

def getDistrict_Names_for_State(strStateName):
    print(df.loc[df['State_Name'] == strStateName, "District_Name"].iloc[:].unique())

getDistrict_Names_for_State("Arunachal Pradesh")

['PAPUM PARE' 'DIBANG VALLEY' 'LOWER DIBANG VALLEY' 'WEST KAMENG'
 'UPPER SIANG' 'LOHIT' 'CHANGLANG' 'EAST KAMENG' 'UPPER SUBANSIRI' 'ANJAW'
 'TAWANG' 'LOWER SUBANSIRI' 'WEST SIANG' 'EAST SIANG' 'KURUNG KUMEY'
 'TIRAP' 'LONGDING' 'NAMSAI']

getDistrict_Names_for_State("Andhra Pradesh")

['KRISHNA' 'GUNTUR' 'VISAKHAPATANAM' 'VIZIANAGARAM' 'WEST GODAVARI'
 'ANANTAPUR' 'KADAPA' 'SRIKAKULAM' 'EAST GODAVARI' 'PRAKASAM' 'KURNOOL'
 'CHITTOOR' 'SPSR NELLORE']

examples¶

df.loc[(df["B"] > 50) & (df["C"] == 900), "A"].values
df.loc[(df["B"] > 50) & (df["C"] == 900), "A"]

def getCropNames_for_State_District(strStateName, strDistName):
    print(df.loc[(df['State_Name'] == strStateName) & (df['District_Name'] == strDistName), "Crop"].unique())

getCropNames_for_State_District("Arunachal Pradesh", "WEST KAMENG")

['Small millets' 'Rapeseed &Mustard' 'Rice' 'Wheat' 'Dry ginger' 'Maize'
 'Soyabean' 'Oilseeds total' 'Dry chillies' 'Potato' 'Pulses total'
 'Groundnut' 'Sunflower']

getCropNames_for_State_District("Assam","KARBI ANGLONG" )

['Papaya' 'Rice' 'Paddy' 'Other  Rabi pulses' 'Arecanut' 'Sweet potato'
 'Masoor' 'Mesta' 'Rapeseed &Mustard' 'Potato' 'Niger seed' 'Arhar/Tur'
 'Maize' 'Orange' 'Moong(Green Gram)' 'Turmeric' 'Small millets' 'Urad'
 'Onion' 'Coconut ' 'Cotton(lint)' 'Linseed' 'Peas & beans (Pulses)'
 'Black pepper' 'Gram' 'Dry chillies' 'Tapioca' 'Blackgram' 'Tobacco'
 'Pineapple' 'Sugarcane' 'Banana' 'Jute' 'Wheat' 'Sesamum' 'Castor seed'
 'Ginger' 'Dry ginger']

df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].iloc[:].unique()

array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)

getDistrict_Names_for_State("Arunachal Pradesh")

['PAPUM PARE' 'DIBANG VALLEY' 'LOWER DIBANG VALLEY' 'WEST KAMENG'
 'UPPER SIANG' 'LOHIT' 'CHANGLANG' 'EAST KAMENG' 'UPPER SUBANSIRI' 'ANJAW'
 'TAWANG' 'LOWER SUBANSIRI' 'WEST SIANG' 'EAST SIANG' 'KURUNG KUMEY'
 'TIRAP' 'LONGDING' 'NAMSAI']

getDistrict_Names_for_State("Bihar")

['BANKA' 'KAIMUR (BHABUA)' 'KHAGARIA' 'MADHEPURA' 'BEGUSARAI' 'JAMUI'
 'AURANGABAD' 'KISHANGANJ' 'DARBHANGA' 'GAYA' 'BUXAR' 'BHAGALPUR'
 'MADHUBANI' 'BHOJPUR' 'GOPALGANJ' 'JEHANABAD' 'NALANDA' 'ARARIA'
 'KATIHAR' 'LAKHISARAI' 'MUNGER' 'ARWAL' 'MUZAFFARPUR' 'NAWADA']

print(getCropNames_for_State_District("Bihar", "MADHEPURA"))

['Rapeseed &Mustard' 'Khesari' 'Rice' 'Wheat' 'Maize' 'Linseed' 'Mesta'
 'Peas & beans (Pulses)' 'Potato' 'Turmeric' 'Jute' 'Moong(Green Gram)'
 'Sunflower' 'Masoor' 'Banana' 'Sweet potato' 'Urad' 'Garlic' 'Ragi'
 'Sannhamp' 'Onion' 'Jowar' 'Arhar/Tur' 'Dry ginger' 'Tobacco' 'Sugarcane'
 'Barley' 'Safflower' 'Gram' 'Small millets' 'Coriander' 'Horse-gram'
 'Sesamum' 'Other Kharif pulses' 'Bajra' 'Dry chillies' 'Castor seed'
 'Blackgram' 'Groundnut' 'Other  Rabi pulses']
None

ArPr_Crops = list(getCropNames_for_State_District("Bihar", "MADHEPURA"))

['Rapeseed &Mustard' 'Khesari' 'Rice' 'Wheat' 'Maize' 'Linseed' 'Mesta'
 'Peas & beans (Pulses)' 'Potato' 'Turmeric' 'Jute' 'Moong(Green Gram)'
 'Sunflower' 'Masoor' 'Banana' 'Sweet potato' 'Urad' 'Garlic' 'Ragi'
 'Sannhamp' 'Onion' 'Jowar' 'Arhar/Tur' 'Dry ginger' 'Tobacco' 'Sugarcane'
 'Barley' 'Safflower' 'Gram' 'Small millets' 'Coriander' 'Horse-gram'
 'Sesamum' 'Other Kharif pulses' 'Bajra' 'Dry chillies' 'Castor seed'
 'Blackgram' 'Groundnut' 'Other  Rabi pulses']

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-64-5d1d5cf6e6ad> in <module>()
----> 1 ArPr_Crops = list(getCropNames_for_State_District("Bihar", "MADHEPURA"))

TypeError: 'NoneType' object is not iterable

ArPr_Crops

lstArPrCrops = []
for value in ArPr_Crops:
    lstArPrCrops.append(value)

getCropNames_for_State_District("Arunachal Pradesh", 'DIBANG VALLEY')

['Oilseeds total' 'Groundnut' 'Sesamum' 'Small millets' 'Dry chillies'
 'Maize' 'Rice' 'Dry ginger' 'Pulses total' 'Potato' 'Soyabean' 'Turmeric'
 'Wheat' 'Rapeseed &Mustard' 'Sugarcane']

df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].values[:]

array(['KRISHNA', 'GUNTUR', 'KRISHNA', ..., 'CHITTOOR', 'VISAKHAPATANAM',
       'PRAKASAM'], dtype=object)

Conditionally retrieving data from a dataframe with FILTER¶

DataFrame.filter(items=None, like=None, regex=None, axis=None)

df.filter(["State_Name", "Production", "Area"])

df.filter(["Crop_Year", "State_Name","Area"]).groupby('Crop_Year').sum()

Use filter() to retrieve all columns in a dataframe, which has, for example, the letter ‘a’ or ‘A’ in its name.¶

df.filter(regex = '[Aa]')

Names dataset - Open DataSource¶

url = "https://data.chhs.ca.gov/dataset/4a8cb74f-c4fa-458a-8ab1-5f2c0b2e22e3/resource/f3fe42ed-4441-4fd8-bf53-92fb80a246da/download/2021-06-18_topbabynames_1960-2019.csv"
ndf = pd.read_csv(url)

ndf.head()

ndf.describe()

ndf[ndf['Name'].str.startswith('AL')]

ndf[ndf["Name"].str.endswith('RA')]

ndf[ndf['Name'].str.contains("AS")]

ndf[ndf['Name'].str.contains("AS", case = False)]

ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = True)].head()

ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = False)]

ndf[((ndf['Name'] == 'SPPHIA') | (ndf["Name"] == "PAUL"))].head()

ndf[ndf["Name"].isin(["SOPHIA", "PAUL"])].head(4)

ndf.filter(items=['Year', 'Name'], axis=1)

ndf.filter(like='17', axis = 0)

ndf.filter(regex = '^5|8|3', axis=0)

ndf.isnull().count()

Year     3053
Sex      3053
Rank     3053
Name     3053
Count    3053
dtype: int64

	Unnamed: 0	Crop_Year	Area	Production	Len State	Len Dist	Len Season	Len Crop
count	32262.000000	32262.000000	32261.000000	3.226100e+04	32261.000000	32261.000000	32261.000000	32261.000000
mean	19226.226985	2005.848583	7280.181729	5.956365e+05	8.148260	8.589070	6.458076	8.312855
std	11030.538816	5.121896	28244.194598	1.521592e+07	4.721486	2.571346	2.305782	4.267502
min	0.000000	1997.000000	1.000000	2.000000e+00	5.000000	4.000000	4.000000	4.000000
25%	9763.250000	2002.000000	92.000000	1.430000e+02	5.000000	7.000000	4.000000	5.000000
50%	19224.000000	2006.000000	515.000000	7.290000e+02	5.000000	8.000000	6.000000	7.000000
75%	28813.750000	2010.000000	2800.000000	6.211000e+03	14.000000	10.000000	10.000000	10.000000
max	38262.000000	2014.000000	877029.000000	7.801620e+08	27.000000	24.000000	10.000000	25.000000

	Area	Production
count	32261.000000	3.226100e+04
mean	7280.181729	5.956365e+05
std	28244.194598	1.521592e+07
min	1.000000	2.000000e+00
25%	92.000000	1.430000e+02
50%	515.000000	7.290000e+02
75%	2800.000000	6.211000e+03
max	877029.000000	7.801620e+08

	State_Name	Production	Area
0	Andhra Pradesh	9991.0	12334.0
1	Assam	9991.0	571.0
2	Andhra Pradesh	99900.0	29100.0
3	Andaman and Nicobar Islands	999.5	53.5
4	Arunachal Pradesh	999.0	107.0
...	...	...	...
32257	Arunachal Pradesh	1364.0	82.0
32258	Assam	1364.0	1705.0
32259	Andhra Pradesh	13639.0	1098.0
32260	Arunachal Pradesh	13630.0	13980.0
32261	Assam	NaN	NaN

	Area
Crop_Year
1997	12002440.00
1998	12403629.00
1999	10535575.00
2000	13728871.00
2001	12461665.00
2002	12500128.10
2003	13252396.20
2004	12560392.77
2005	12620591.50
2006	11881158.66
2007	12249331.00
2008	12992633.00
2009	12030716.00
2010	13253392.52
2011	14586127.00
2012	14726307.00
2013	15105014.00
2014	15975575.00

	Year	Sex	Rank	Name	Count
0	1960	Female	1	SUSAN	3299
1	1960	Female	2	MARY	3248
2	1960	Female	3	KAREN	3156
3	1960	Female	4	CYNTHIA	2982
4	1960	Female	5	LISA	2839

Consulting - Quality

QUERYING A DATAFRA,E Practice session

Conditionally retrieving data from a dataframe with QUERY¶

examples¶

Conditionally retrieving data from a dataframe with FILTER¶

Use filter() to retrieve all columns in a dataframe, which has, for example, the letter ‘a’ or ‘A’ in its name.¶

Names dataset - Open DataSource¶

	Unnamed: 0	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production	Len State	Len Dist	Len Season	Len Crop
0	4351	Andhra Pradesh	KRISHNA	2005	Kharif	Moong(Green Gram)	12334.0	9991.0	14.0	7.0	6.0	17.0
1	21402	Assam	KARBI ANGLONG	2013	Kharif	Papaya	571.0	9991.0	5.0	13.0	6.0	6.0

	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production
ColIndex
4351	Andhra Pradesh	KRISHNA	2005	Kharif	Moong(Green Gram)	12334.0	9991.0
21402	Assam	KARBI ANGLONG	2013	Kharif	Papaya	571.0	9991.0
2592	Andhra Pradesh	GUNTUR	1998	Rabi	Rice	29100.0	99900.0
181	Andaman and Nicobar Islands	SOUTH ANDAMANS	2006	Whole Year	Sugarcane	53.5	999.5
11244	Arunachal Pradesh	PAPUM PARE	1998	Whole Year	Sugarcane	107.0	999.0
...	...	...	...	...	...	...	...
10646	Arunachal Pradesh	KURUNG KUMEY	2004	Whole Year	Sugarcane	82.0	1364.0
12893	Assam	BARPETA	2002	Whole Year	Dry chillies	1705.0	1364.0
6230	Andhra Pradesh	PRAKASAM	2013	Kharif	Tomato	1098.0	13639.0
10754	Arunachal Pradesh	LOHIT	2000	Kharif	Rice	13980.0	13630.0
25952	Assam	SONITPUR	2011	Kharif	Tap	NaN	NaN

	ColIndex	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production
15	9231	Andhra Pradesh	VIZIANAGARAM	2014	Rabi	Cabbage	81.0	998.0
30	9118	Andhra Pradesh	VIZIANAGARAM	2012	Kharif	Sugarcane	16021.0	996737.0
124	9249	Andhra Pradesh	VIZIANAGARAM	2014	Rabi	Urad	17755.0	9890.0
149	9176	Andhra Pradesh	VIZIANAGARAM	2013	Rabi	Dry chillies	848.0	986.0
322	8620	Andhra Pradesh	VIZIANAGARAM	2000	Rabi	Maize	1810.0	9745.0
...	...	...	...	...	...	...	...	...
32163	8930	Andhra Pradesh	VIZIANAGARAM	2008	Kharif	Bajra	1351.0	1374.0
32179	9137	Andhra Pradesh	VIZIANAGARAM	2012	Rabi	Sesamum	4883.0	1372.0
32229	8816	Andhra Pradesh	VIZIANAGARAM	2004	Whole Year	Banana	5591.0	136806.0
32230	8725	Andhra Pradesh	VIZIANAGARAM	2002	Whole Year	Tapioca	226.0	1368.0
32234	9051	Andhra Pradesh	VIZIANAGARAM	2010	Whole Year	Turmeric	224.0	1368.0

	ColIndex	State_Name	District_Name	Crop_Year	Season	Crop	Area	Production
4640	9082	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Urad	13409.0	7455.0
5797	9076	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Moong(Green Gram)	14019.0	6995.0
6060	9079	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Ragi	601.0	686.0
7145	9075	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Maize	10199.0	63448.0
7742	9074	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Jowar	13.0	61.0
8308	9077	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Onion	233.0	5965.0
15605	9071	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Gram	250.0	379.0
16580	9072	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Groundnut	2387.0	3535.0
18295	9073	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Horse-gram	9814.0	3150.0
21324	9080	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Rice	1078.0	2639.0
27458	9081	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Sesamum	6885.0	1838.0
28974	9070	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	Dry chillies	1077.0	1670.0
30679	9078	Andhra Pradesh	VIZIANAGARAM	2011	Rabi	other oilseeds	43.0	15.0

	Year	Rank	Count
count	3053.000000	3053.000000	3053.000000
mean	1990.013757	13.005568	2346.005568
std	17.612114	7.213803	1226.429522
min	1960.000000	1.000000	745.000000
25%	1975.000000	7.000000	1507.000000
50%	1990.000000	13.000000	2073.000000
75%	2005.000000	19.000000	2777.000000
max	2020.000000	25.000000	9444.000000

	Year	Sex	Rank	Name	Count
1448	1988	Male	23	ALEXANDER	2266
1497	1989	Male	22	ALEXANDER	2304
1520	1990	Female	20	ALYSSA	1630
1548	1990	Male	23	ALEXANDER	2536
1575	1991	Female	25	ALEXANDRA	1457
...	...	...	...	...	...
2834	2016	Male	7	ALEXANDER	2218
2886	2017	Male	9	ALEXANDER	1998
2935	2018	Male	8	ALEXANDER	1908
2984	2019	Male	7	ALEXANDER	1838
3036	2020	Male	9	ALEXANDER	1591

	Year	Sex	Rank	Name	Count
6	1960	Female	7	DEBRA	2692
9	1960	Female	10	SANDRA	2404
10	1960	Female	11	LAURA	2032
20	1960	Female	21	BARBARA	1605
58	1961	Female	9	SANDRA	2483
...	...	...	...	...	...
1572	1991	Female	22	LAURA	1492
1575	1991	Female	25	ALEXANDRA	1457
1623	1992	Female	23	ALEXANDRA	1463
1669	1993	Female	19	ALEXANDRA	1559
1774	1995	Female	24	ALEXANDRA	1343

	Year	Sex	Rank	Name	Count
35	1960	Male	11	THOMAS	2978
86	1961	Male	12	THOMAS	2991
136	1962	Male	12	THOMAS	2902
186	1963	Male	12	THOMAS	2850
236	1964	Male	12	THOMAS	2733
...	...	...	...	...	...
2981	2019	Male	4	SEBASTIAN	2154
2987	2019	Male	10	LUCAS	1724
3000	2019	Male	23	MASON	1233
3031	2020	Male	4	SEBASTIAN	2000
3039	2020	Male	12	LUCAS	1355

	Year	Sex	Rank	Name	Count
46	1960	Male	22	BRIAN	1968
98	1961	Male	24	BRIAN	2026
147	1962	Male	23	BRIAN	2061
196	1963	Male	22	BRIAN	2264
246	1964	Male	22	BRIAN	2083

	Year	Sex	Rank	Name	Count
45	1960	Male	21	PAUL	2136
95	1961	Male	21	PAUL	2162
145	1962	Male	21	PAUL	2157
194	1963	Male	20	PAUL	2312
240	1964	Male	16	PAUL	2408

	Year	Sex	Rank	Name	Count
17	1960	Female	18	TERESA	1709
117	1962	Female	18	KATHLEEN*	1701
170	1963	Female	21	MARIA	1513
171	1963	Female	22	DONNA	1510
172	1963	Female	23	DENISE	1487
...	...	...	...	...	...
2617	2012	Female	17	GENESIS	1029
2717	2014	Female	16	NATALIE	1066
2817	2016	Female	15	PENELOPE	1059
2917	2018	Female	15	AMELIA	1207
3017	2020	Female	15	VICTORIA	1041