QUERYING A DATAFRA,E Practice session
Querying_dataframe
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
df = pd.read_csv('/content/clean_agri_ds.csv')
In [5]:
df.head(2)
Out[5]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production Len State Len Dist Len Season Len Crop
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0 14.0 7.0 6.0 17.0
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0 5.0 13.0 6.0 6.0
In [7]:
df.describe()
Out[7]:
Unnamed: 0 Crop_Year Area Production Len State Len Dist Len Season Len Crop
count 32262.000000 32262.000000 32261.000000 3.226100e+04 32261.000000 32261.000000 32261.000000 32261.000000
mean 19226.226985 2005.848583 7280.181729 5.956365e+05 8.148260 8.589070 6.458076 8.312855
std 11030.538816 5.121896 28244.194598 1.521592e+07 4.721486 2.571346 2.305782 4.267502
min 0.000000 1997.000000 1.000000 2.000000e+00 5.000000 4.000000 4.000000 4.000000
25% 9763.250000 2002.000000 92.000000 1.430000e+02 5.000000 7.000000 4.000000 5.000000
50% 19224.000000 2006.000000 515.000000 7.290000e+02 5.000000 8.000000 6.000000 7.000000
75% 28813.750000 2010.000000 2800.000000 6.211000e+03 14.000000 10.000000 10.000000 10.000000
max 38262.000000 2014.000000 877029.000000 7.801620e+08 27.000000 24.000000 10.000000 25.000000
In [8]:
df.drop(['Len State', 'Len Dist', 'Len Season', 'Len Crop'], inplace = True, axis=1)
In [9]:
df.head(2)
Out[9]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
In [10]:
df.set_index(['Unnamed: 0'])
Out[10]:
State_Name District_Name Crop_Year Season Crop Area Production
Unnamed: 0
4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0
181 Andaman and Nicobar Islands SOUTH ANDAMANS 2006 Whole Year Sugarcane 53.5 999.5
11244 Arunachal Pradesh PAPUM PARE 1998 Whole Year Sugarcane 107.0 999.0
... ... ... ... ... ... ... ...
10646 Arunachal Pradesh KURUNG KUMEY 2004 Whole Year Sugarcane 82.0 1364.0
12893 Assam BARPETA 2002 Whole Year Dry chillies 1705.0 1364.0
6230 Andhra Pradesh PRAKASAM 2013 Kharif Tomato 1098.0 13639.0
10754 Arunachal Pradesh LOHIT 2000 Kharif Rice 13980.0 13630.0
25952 Assam SONITPUR 2011 Kharif Tap NaN NaN

32262 rows × 7 columns

In [11]:
df.rename(columns = {'Unnamed: 0':"ColIndex"}, inplace = True)
In [12]:
df.set_index(['ColIndex'])
Out[12]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0
181 Andaman and Nicobar Islands SOUTH ANDAMANS 2006 Whole Year Sugarcane 53.5 999.5
11244 Arunachal Pradesh PAPUM PARE 1998 Whole Year Sugarcane 107.0 999.0
... ... ... ... ... ... ... ...
10646 Arunachal Pradesh KURUNG KUMEY 2004 Whole Year Sugarcane 82.0 1364.0
12893 Assam BARPETA 2002 Whole Year Dry chillies 1705.0 1364.0
6230 Andhra Pradesh PRAKASAM 2013 Kharif Tomato 1098.0 13639.0
10754 Arunachal Pradesh LOHIT 2000 Kharif Rice 13980.0 13630.0
25952 Assam SONITPUR 2011 Kharif Tap NaN NaN

32262 rows × 7 columns

In [13]:
df['Crop_Year'] = df['Crop_Year'].astype(str)
In [14]:
df['ColIndex'] = df['ColIndex'].astype(str)
In [15]:
df.describe()
Out[15]:
Area Production
count 32261.000000 3.226100e+04
mean 7280.181729 5.956365e+05
std 28244.194598 1.521592e+07
min 1.000000 2.000000e+00
25% 92.000000 1.430000e+02
50% 515.000000 7.290000e+02
75% 2800.000000 6.211000e+03
max 877029.000000 7.801620e+08

Conditionally retrieving data from a dataframe with QUERY

DataFrame.query(expr, inplace=False, **kwargs)

In [16]:
df.loc[df.State_Name == "Andhra Pradesh", 'District_Name'].unique()
Out[16]:
array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)
In [17]:
df.loc[df['State_Name'] == 'Andhra Pradesh']['District_Name'].unique()
Out[17]:
array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)
In [18]:
df.query('State_Name == "Andhra Pradesh"')['District_Name'].unique()
Out[18]:
array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)
In [19]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')
Out[19]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
15 9231 Andhra Pradesh VIZIANAGARAM 2014 Rabi Cabbage 81.0 998.0
30 9118 Andhra Pradesh VIZIANAGARAM 2012 Kharif Sugarcane 16021.0 996737.0
124 9249 Andhra Pradesh VIZIANAGARAM 2014 Rabi Urad 17755.0 9890.0
149 9176 Andhra Pradesh VIZIANAGARAM 2013 Rabi Dry chillies 848.0 986.0
322 8620 Andhra Pradesh VIZIANAGARAM 2000 Rabi Maize 1810.0 9745.0
... ... ... ... ... ... ... ... ...
32163 8930 Andhra Pradesh VIZIANAGARAM 2008 Kharif Bajra 1351.0 1374.0
32179 9137 Andhra Pradesh VIZIANAGARAM 2012 Rabi Sesamum 4883.0 1372.0
32229 8816 Andhra Pradesh VIZIANAGARAM 2004 Whole Year Banana 5591.0 136806.0
32230 8725 Andhra Pradesh VIZIANAGARAM 2002 Whole Year Tapioca 226.0 1368.0
32234 9051 Andhra Pradesh VIZIANAGARAM 2010 Whole Year Turmeric 224.0 1368.0

618 rows × 8 columns

In [20]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi"')
Out[20]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
15 9231 Andhra Pradesh VIZIANAGARAM 2014 Rabi Cabbage 81.0 998.0
124 9249 Andhra Pradesh VIZIANAGARAM 2014 Rabi Urad 17755.0 9890.0
149 9176 Andhra Pradesh VIZIANAGARAM 2013 Rabi Dry chillies 848.0 986.0
322 8620 Andhra Pradesh VIZIANAGARAM 2000 Rabi Maize 1810.0 9745.0
349 8885 Andhra Pradesh VIZIANAGARAM 2006 Rabi Horse-gram 19884.0 9723.0
... ... ... ... ... ... ... ... ...
31401 9136 Andhra Pradesh VIZIANAGARAM 2012 Rabi Rice 6690.0 14310.0
31660 8665 Andhra Pradesh VIZIANAGARAM 2001 Rabi Sesamum 5277.0 1409.0
31811 8584 Andhra Pradesh VIZIANAGARAM 1999 Rabi Jowar 34.0 14.0
32040 8852 Andhra Pradesh VIZIANAGARAM 2005 Rabi Sesamum 4274.0 1385.0
32179 9137 Andhra Pradesh VIZIANAGARAM 2012 Rabi Sesamum 4883.0 1372.0

209 rows × 8 columns

In [21]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Season == "Rabi" and Crop_Year == "2011"' )
Out[21]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
4640 9082 Andhra Pradesh VIZIANAGARAM 2011 Rabi Urad 13409.0 7455.0
5797 9076 Andhra Pradesh VIZIANAGARAM 2011 Rabi Moong(Green Gram) 14019.0 6995.0
6060 9079 Andhra Pradesh VIZIANAGARAM 2011 Rabi Ragi 601.0 686.0
7145 9075 Andhra Pradesh VIZIANAGARAM 2011 Rabi Maize 10199.0 63448.0
7742 9074 Andhra Pradesh VIZIANAGARAM 2011 Rabi Jowar 13.0 61.0
8308 9077 Andhra Pradesh VIZIANAGARAM 2011 Rabi Onion 233.0 5965.0
15605 9071 Andhra Pradesh VIZIANAGARAM 2011 Rabi Gram 250.0 379.0
16580 9072 Andhra Pradesh VIZIANAGARAM 2011 Rabi Groundnut 2387.0 3535.0
18295 9073 Andhra Pradesh VIZIANAGARAM 2011 Rabi Horse-gram 9814.0 3150.0
21324 9080 Andhra Pradesh VIZIANAGARAM 2011 Rabi Rice 1078.0 2639.0
27458 9081 Andhra Pradesh VIZIANAGARAM 2011 Rabi Sesamum 6885.0 1838.0
28974 9070 Andhra Pradesh VIZIANAGARAM 2011 Rabi Dry chillies 1077.0 1670.0
30679 9078 Andhra Pradesh VIZIANAGARAM 2011 Rabi other oilseeds 43.0 15.0
In [22]:
df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM" and Crop_Year == "2011" and Season == "Rabi" and Crop == "Onion"')
Out[22]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
8308 9077 Andhra Pradesh VIZIANAGARAM 2011 Rabi Onion 233.0 5965.0
In [23]:
df_dist = df.query('State_Name == "Andhra Pradesh" and District_Name == "VIZIANAGARAM"')
In [24]:
df_dist.head(1)
Out[24]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
15 9231 Andhra Pradesh VIZIANAGARAM 2014 Rabi Cabbage 81.0 998.0
In [25]:
def getDistNames_forState(strStateName):
    print(df.query('State_Name == "' + strStateName + '"')['Crop'].unique())
In [26]:
getDistNames_forState('Andhra Pradesh')
['Moong(Green Gram)' 'Rice' 'Coconut ' 'Cabbage' 'Banana' 'Jowar' 'Gram'
 'Sugarcane' 'Urad' 'Dry chillies' 'Sesamum' 'Groundnut' 'Ragi' 'Bajra'
 'Horse-gram' 'Sweet potato' 'Tapioca' 'Cotton(lint)' 'Tobacco'
 'Small millets' 'Onion' 'Mesta' 'Maize' 'Turmeric' 'Rapeseed &Mustard'
 'Other Kharif pulses' 'Citrus Fruit' 'Castor seed' 'Potato'
 'other oilseeds' 'Cashewnut' 'Tomato' 'Arhar/Tur' 'Brinjal'
 'Cowpea(Lobia)' 'Niger seed' 'Wheat' 'Coriander' 'Other Fresh Fruits'
 'Dry ginger' 'Orange' 'Papaya' 'Sunflower' 'Korra' 'Lemon' 'Bhindi'
 'Mango' 'Grapes' 'Safflower' 'Beans & Mutter(Vegetable)' 'Pome Granet'
 'Arecanut' 'Other  Rabi pulses' 'Samai' 'Soyabean' 'Masoor' 'Pome Fruit'
 'Sapota' 'other misc. pulses' 'Linseed' 'Ginger' 'Varagu' 'Garlic']
In [27]:
display(df['State_Name'].unique())
array(['Andhra Pradesh', 'Assam', 'Andaman and Nicobar Islands',
       'Arunachal Pradesh', 'Bihar'], dtype=object)
In [28]:
def getDistrict_Names_for_State(strStateName):
    print(df.loc[df['State_Name'] == strStateName, "District_Name"].iloc[:].unique())
In [29]:
getDistrict_Names_for_State("Arunachal Pradesh")
['PAPUM PARE' 'DIBANG VALLEY' 'LOWER DIBANG VALLEY' 'WEST KAMENG'
 'UPPER SIANG' 'LOHIT' 'CHANGLANG' 'EAST KAMENG' 'UPPER SUBANSIRI' 'ANJAW'
 'TAWANG' 'LOWER SUBANSIRI' 'WEST SIANG' 'EAST SIANG' 'KURUNG KUMEY'
 'TIRAP' 'LONGDING' 'NAMSAI']
In [30]:
getDistrict_Names_for_State("Andhra Pradesh")
['KRISHNA' 'GUNTUR' 'VISAKHAPATANAM' 'VIZIANAGARAM' 'WEST GODAVARI'
 'ANANTAPUR' 'KADAPA' 'SRIKAKULAM' 'EAST GODAVARI' 'PRAKASAM' 'KURNOOL'
 'CHITTOOR' 'SPSR NELLORE']

examples

  • df.loc[(df["B"] > 50) & (df["C"] == 900), "A"].values
  • df.loc[(df["B"] > 50) & (df["C"] == 900), "A"]
In [31]:
def getCropNames_for_State_District(strStateName, strDistName):
    print(df.loc[(df['State_Name'] == strStateName) & (df['District_Name'] == strDistName), "Crop"].unique())
In [32]:
getCropNames_for_State_District("Arunachal Pradesh", "WEST KAMENG")
['Small millets' 'Rapeseed &Mustard' 'Rice' 'Wheat' 'Dry ginger' 'Maize'
 'Soyabean' 'Oilseeds total' 'Dry chillies' 'Potato' 'Pulses total'
 'Groundnut' 'Sunflower']
In [35]:
getCropNames_for_State_District("Assam","KARBI ANGLONG" )
['Papaya' 'Rice' 'Paddy' 'Other  Rabi pulses' 'Arecanut' 'Sweet potato'
 'Masoor' 'Mesta' 'Rapeseed &Mustard' 'Potato' 'Niger seed' 'Arhar/Tur'
 'Maize' 'Orange' 'Moong(Green Gram)' 'Turmeric' 'Small millets' 'Urad'
 'Onion' 'Coconut ' 'Cotton(lint)' 'Linseed' 'Peas & beans (Pulses)'
 'Black pepper' 'Gram' 'Dry chillies' 'Tapioca' 'Blackgram' 'Tobacco'
 'Pineapple' 'Sugarcane' 'Banana' 'Jute' 'Wheat' 'Sesamum' 'Castor seed'
 'Ginger' 'Dry ginger']
In [36]:
df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].iloc[:].unique()
Out[36]:
array(['KRISHNA', 'GUNTUR', 'VISAKHAPATANAM', 'VIZIANAGARAM',
       'WEST GODAVARI', 'ANANTAPUR', 'KADAPA', 'SRIKAKULAM',
       'EAST GODAVARI', 'PRAKASAM', 'KURNOOL', 'CHITTOOR', 'SPSR NELLORE'],
      dtype=object)
In [37]:
getDistrict_Names_for_State("Arunachal Pradesh")
['PAPUM PARE' 'DIBANG VALLEY' 'LOWER DIBANG VALLEY' 'WEST KAMENG'
 'UPPER SIANG' 'LOHIT' 'CHANGLANG' 'EAST KAMENG' 'UPPER SUBANSIRI' 'ANJAW'
 'TAWANG' 'LOWER SUBANSIRI' 'WEST SIANG' 'EAST SIANG' 'KURUNG KUMEY'
 'TIRAP' 'LONGDING' 'NAMSAI']
In [38]:
getDistrict_Names_for_State("Bihar")
['BANKA' 'KAIMUR (BHABUA)' 'KHAGARIA' 'MADHEPURA' 'BEGUSARAI' 'JAMUI'
 'AURANGABAD' 'KISHANGANJ' 'DARBHANGA' 'GAYA' 'BUXAR' 'BHAGALPUR'
 'MADHUBANI' 'BHOJPUR' 'GOPALGANJ' 'JEHANABAD' 'NALANDA' 'ARARIA'
 'KATIHAR' 'LAKHISARAI' 'MUNGER' 'ARWAL' 'MUZAFFARPUR' 'NAWADA']
In [39]:
print(getCropNames_for_State_District("Bihar", "MADHEPURA"))
['Rapeseed &Mustard' 'Khesari' 'Rice' 'Wheat' 'Maize' 'Linseed' 'Mesta'
 'Peas & beans (Pulses)' 'Potato' 'Turmeric' 'Jute' 'Moong(Green Gram)'
 'Sunflower' 'Masoor' 'Banana' 'Sweet potato' 'Urad' 'Garlic' 'Ragi'
 'Sannhamp' 'Onion' 'Jowar' 'Arhar/Tur' 'Dry ginger' 'Tobacco' 'Sugarcane'
 'Barley' 'Safflower' 'Gram' 'Small millets' 'Coriander' 'Horse-gram'
 'Sesamum' 'Other Kharif pulses' 'Bajra' 'Dry chillies' 'Castor seed'
 'Blackgram' 'Groundnut' 'Other  Rabi pulses']
None
In [64]:
ArPr_Crops = list(getCropNames_for_State_District("Bihar", "MADHEPURA"))
['Rapeseed &Mustard' 'Khesari' 'Rice' 'Wheat' 'Maize' 'Linseed' 'Mesta'
 'Peas & beans (Pulses)' 'Potato' 'Turmeric' 'Jute' 'Moong(Green Gram)'
 'Sunflower' 'Masoor' 'Banana' 'Sweet potato' 'Urad' 'Garlic' 'Ragi'
 'Sannhamp' 'Onion' 'Jowar' 'Arhar/Tur' 'Dry ginger' 'Tobacco' 'Sugarcane'
 'Barley' 'Safflower' 'Gram' 'Small millets' 'Coriander' 'Horse-gram'
 'Sesamum' 'Other Kharif pulses' 'Bajra' 'Dry chillies' 'Castor seed'
 'Blackgram' 'Groundnut' 'Other  Rabi pulses']
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-64-5d1d5cf6e6ad> in <module>()
----> 1 ArPr_Crops = list(getCropNames_for_State_District("Bihar", "MADHEPURA"))

TypeError: 'NoneType' object is not iterable
In [ ]:
ArPr_Crops
In [ ]:
lstArPrCrops = []
for value in ArPr_Crops:
    lstArPrCrops.append(value)
In [62]:
getCropNames_for_State_District("Arunachal Pradesh", 'DIBANG VALLEY')
['Oilseeds total' 'Groundnut' 'Sesamum' 'Small millets' 'Dry chillies'
 'Maize' 'Rice' 'Dry ginger' 'Pulses total' 'Potato' 'Soyabean' 'Turmeric'
 'Wheat' 'Rapeseed &Mustard' 'Sugarcane']
In [63]:
df.loc[df['State_Name'] == "Andhra Pradesh", "District_Name"].values[:]
Out[63]:
array(['KRISHNA', 'GUNTUR', 'KRISHNA', ..., 'CHITTOOR', 'VISAKHAPATANAM',
       'PRAKASAM'], dtype=object)

Conditionally retrieving data from a dataframe with FILTER

DataFrame.filter(items=None, like=None, regex=None, axis=None)

In [41]:
df.filter(["State_Name", "Production", "Area"])
Out[41]:
State_Name Production Area
0 Andhra Pradesh 9991.0 12334.0
1 Assam 9991.0 571.0
2 Andhra Pradesh 99900.0 29100.0
3 Andaman and Nicobar Islands 999.5 53.5
4 Arunachal Pradesh 999.0 107.0
... ... ... ...
32257 Arunachal Pradesh 1364.0 82.0
32258 Assam 1364.0 1705.0
32259 Andhra Pradesh 13639.0 1098.0
32260 Arunachal Pradesh 13630.0 13980.0
32261 Assam NaN NaN

32262 rows × 3 columns

In [42]:
df.filter(["Crop_Year", "State_Name","Area"]).groupby('Crop_Year').sum()
Out[42]:
Area
Crop_Year
1997 12002440.00
1998 12403629.00
1999 10535575.00
2000 13728871.00
2001 12461665.00
2002 12500128.10
2003 13252396.20
2004 12560392.77
2005 12620591.50
2006 11881158.66
2007 12249331.00
2008 12992633.00
2009 12030716.00
2010 13253392.52
2011 14586127.00
2012 14726307.00
2013 15105014.00
2014 15975575.00

Use filter() to retrieve all columns in a dataframe, which has, for example, the letter ‘a’ or ‘A’ in its name.

In [43]:
df.filter(regex = '[Aa]')
Out[43]:
State_Name District_Name Crop_Year Season Area
0 Andhra Pradesh KRISHNA 2005 Kharif 12334.0
1 Assam KARBI ANGLONG 2013 Kharif 571.0
2 Andhra Pradesh GUNTUR 1998 Rabi 29100.0
3 Andaman and Nicobar Islands SOUTH ANDAMANS 2006 Whole Year 53.5
4 Arunachal Pradesh PAPUM PARE 1998 Whole Year 107.0
... ... ... ... ... ...
32257 Arunachal Pradesh KURUNG KUMEY 2004 Whole Year 82.0
32258 Assam BARPETA 2002 Whole Year 1705.0
32259 Andhra Pradesh PRAKASAM 2013 Kharif 1098.0
32260 Arunachal Pradesh LOHIT 2000 Kharif 13980.0
32261 Assam SONITPUR 2011 Kharif NaN

32262 rows × 5 columns

Names dataset - Open DataSource

In [44]:
url = "https://data.chhs.ca.gov/dataset/4a8cb74f-c4fa-458a-8ab1-5f2c0b2e22e3/resource/f3fe42ed-4441-4fd8-bf53-92fb80a246da/download/2021-06-18_topbabynames_1960-2019.csv"
ndf = pd.read_csv(url)
In [45]:
ndf.head()
Out[45]:
Year Sex Rank Name Count
0 1960 Female 1 SUSAN 3299
1 1960 Female 2 MARY 3248
2 1960 Female 3 KAREN 3156
3 1960 Female 4 CYNTHIA 2982
4 1960 Female 5 LISA 2839
In [46]:
ndf.describe()
Out[46]:
Year Rank Count
count 3053.000000 3053.000000 3053.000000
mean 1990.013757 13.005568 2346.005568
std 17.612114 7.213803 1226.429522
min 1960.000000 1.000000 745.000000
25% 1975.000000 7.000000 1507.000000
50% 1990.000000 13.000000 2073.000000
75% 2005.000000 19.000000 2777.000000
max 2020.000000 25.000000 9444.000000
In [47]:
ndf[ndf['Name'].str.startswith('AL')]
Out[47]:
Year Sex Rank Name Count
1448 1988 Male 23 ALEXANDER 2266
1497 1989 Male 22 ALEXANDER 2304
1520 1990 Female 20 ALYSSA 1630
1548 1990 Male 23 ALEXANDER 2536
1575 1991 Female 25 ALEXANDRA 1457
... ... ... ... ... ...
2834 2016 Male 7 ALEXANDER 2218
2886 2017 Male 9 ALEXANDER 1998
2935 2018 Male 8 ALEXANDER 1908
2984 2019 Male 7 ALEXANDER 1838
3036 2020 Male 9 ALEXANDER 1591

80 rows × 5 columns

In [48]:
ndf[ndf["Name"].str.endswith('RA')]
Out[48]:
Year Sex Rank Name Count
6 1960 Female 7 DEBRA 2692
9 1960 Female 10 SANDRA 2404
10 1960 Female 11 LAURA 2032
20 1960 Female 21 BARBARA 1605
58 1961 Female 9 SANDRA 2483
... ... ... ... ... ...
1572 1991 Female 22 LAURA 1492
1575 1991 Female 25 ALEXANDRA 1457
1623 1992 Female 23 ALEXANDRA 1463
1669 1993 Female 19 ALEXANDRA 1559
1774 1995 Female 24 ALEXANDRA 1343

61 rows × 5 columns

In [49]:
ndf[ndf['Name'].str.contains("AS")]
Out[49]:
Year Sex Rank Name Count
35 1960 Male 11 THOMAS 2978
86 1961 Male 12 THOMAS 2991
136 1962 Male 12 THOMAS 2902
186 1963 Male 12 THOMAS 2850
236 1964 Male 12 THOMAS 2733
... ... ... ... ... ...
2981 2019 Male 4 SEBASTIAN 2154
2987 2019 Male 10 LUCAS 1724
3000 2019 Male 23 MASON 1233
3031 2020 Male 4 SEBASTIAN 2000
3039 2020 Male 12 LUCAS 1355

128 rows × 5 columns

In [50]:
ndf[ndf['Name'].str.contains("AS", case = False)]
Out[50]:
Year Sex Rank Name Count
35 1960 Male 11 THOMAS 2978
86 1961 Male 12 THOMAS 2991
136 1962 Male 12 THOMAS 2902
186 1963 Male 12 THOMAS 2850
236 1964 Male 12 THOMAS 2733
... ... ... ... ... ...
2981 2019 Male 4 SEBASTIAN 2154
2987 2019 Male 10 LUCAS 1724
3000 2019 Male 23 MASON 1233
3031 2020 Male 4 SEBASTIAN 2000
3039 2020 Male 12 LUCAS 1355

128 rows × 5 columns

In [61]:
ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = True)].head()
Out[61]:
Year Sex Rank Name Count
46 1960 Male 22 BRIAN 1968
98 1961 Male 24 BRIAN 2026
147 1962 Male 23 BRIAN 2061
196 1963 Male 22 BRIAN 2264
246 1964 Male 22 BRIAN 2083
In [52]:
ndf[ndf['Name'].str.contains("IAN$|INE$", case = False, regex = False)]
Out[52]:
Year Sex Rank Name Count
In [60]:
ndf[((ndf['Name'] == 'SPPHIA') | (ndf["Name"] == "PAUL"))].head()
Out[60]:
Year Sex Rank Name Count
45 1960 Male 21 PAUL 2136
95 1961 Male 21 PAUL 2162
145 1962 Male 21 PAUL 2157
194 1963 Male 20 PAUL 2312
240 1964 Male 16 PAUL 2408
In [59]:
ndf[ndf["Name"].isin(["SOPHIA", "PAUL"])].head(4)
Out[59]:
Year Sex Rank Name Count
45 1960 Male 21 PAUL 2136
95 1961 Male 21 PAUL 2162
145 1962 Male 21 PAUL 2157
194 1963 Male 20 PAUL 2312
In [55]:
ndf.filter(items=['Year', 'Name'], axis=1)
Out[55]:
Year Name
0 1960 SUSAN
1 1960 MARY
2 1960 KAREN
3 1960 CYNTHIA
4 1960 LISA
... ... ...
3048 2020 EZEKIEL
3049 2020 NATHAN
3050 2020 ADRIAN
3051 2020 DYLAN
3052 2020 MICHAEL

3053 rows × 2 columns

In [56]:
ndf.filter(like='17', axis = 0)
Out[56]:
Year Sex Rank Name Count
17 1960 Female 18 TERESA 1709
117 1962 Female 18 KATHLEEN* 1701
170 1963 Female 21 MARIA 1513
171 1963 Female 22 DONNA 1510
172 1963 Female 23 DENISE 1487
... ... ... ... ... ...
2617 2012 Female 17 GENESIS 1029
2717 2014 Female 16 NATALIE 1066
2817 2016 Female 15 PENELOPE 1059
2917 2018 Female 15 AMELIA 1207
3017 2020 Female 15 VICTORIA 1041

160 rows × 5 columns

In [57]:
ndf.filter(regex = '^5|8|3', axis=0)
Out[57]:
Year Sex Rank Name Count
3 1960 Female 4 CYNTHIA 2982
5 1960 Female 6 LINDA 2807
8 1960 Female 9 DEBORAH 2490
13 1960 Female 14 DONNA 1820
18 1960 Female 19 JULIE 1665
... ... ... ... ... ...
3048 2020 Male 21 EZEKIEL 1141
3049 2020 Male 22 NATHAN 1099
3050 2020 Male 23 ADRIAN 1093
3051 2020 Male 24 DYLAN 1085
3052 2020 Male 25 MICHAEL 1077

1590 rows × 5 columns

In [58]:
ndf.isnull().count()
Out[58]:
Year     3053
Sex      3053
Rank     3053
Name     3053
Count    3053
dtype: int64
In [58]: