Agriculture DataSet
Bars, Boxes and Functions
work_with_clean_Agri_ds
In [1]:
!pip3 install plotly_express
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Requirement already satisfied: patsy>=0.5 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.5.2)
Requirement already satisfied: scipy>=0.18 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.4.1)
Requirement already satisfied: pandas>=0.20.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.3.5)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.21.6)
Requirement already satisfied: plotly>=4.1.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (5.5.0)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.10.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2022.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2.8.2)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.5->plotly_express) (1.15.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly>=4.1.0->plotly_express) (8.0.1)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly_express as px
In [4]:
df = pd.read_csv('/content/clean_agri_ds.csv')
In [5]:
df.State_Name.unique()
Out[5]:
array(['Andhra Pradesh', 'Assam', 'Andaman and Nicobar Islands',
       'Arunachal Pradesh', 'Bihar'], dtype=object)
In [6]:
df.dtypes
Out[6]:
Unnamed: 0         int64
State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production       float64
Len State          int64
Len Dist           int64
Len Season         int64
Len Crop           int64
dtype: object
In [7]:
df.head(3)
Out[7]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production Len State Len Dist Len Season Len Crop
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0 14 7 6 17
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0 5 13 6 6
2 2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0 14 6 4 4
In [8]:
df[df.isna().any(axis=1)]
Out[8]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production Len State Len Dist Len Season Len Crop
In [9]:
df[df['Production'] == "="]
Out[9]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production Len State Len Dist Len Season Len Crop
In [10]:
df.drop(['Len State', 'Len Dist', 'Len Season', 'Len Crop'], inplace = True, axis=1)
In [11]:
df.head(3)
Out[11]:
Unnamed: 0 State_Name District_Name Crop_Year Season Crop Area Production
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
2 2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0
In [12]:
df.describe()
Out[12]:
Unnamed: 0 Crop_Year Area Production
count 38181.000000 38181.000000 38181.000000 3.818100e+04
mean 19153.164794 2005.863440 7285.304866 5.326191e+05
std 11039.090664 5.127556 27716.183577 1.407250e+07
min 0.000000 1997.000000 0.200000 0.000000e+00
25% 9610.000000 2002.000000 88.000000 1.070000e+02
50% 19160.000000 2006.000000 498.000000 7.810000e+02
75% 28710.000000 2010.000000 2698.000000 6.515000e+03
max 38262.000000 2014.000000 877029.000000 7.801620e+08
In [13]:
df.set_index(['Unnamed: 0'])
Out[13]:
State_Name District_Name Crop_Year Season Crop Area Production
Unnamed: 0
4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0
181 Andaman and Nicobar Islands SOUTH ANDAMANS 2006 Whole Year Sugarcane 53.5 999.5
11244 Arunachal Pradesh PAPUM PARE 1998 Whole Year Sugarcane 107.0 999.0
... ... ... ... ... ... ... ...
6528 Andhra Pradesh SPSR NELLORE 2002 Whole Year Cucumber 85.0 0.0
6532 Andhra Pradesh SPSR NELLORE 2002 Whole Year Other Vegetables 525.0 0.0
7202 Andhra Pradesh SRIKAKULAM 2002 Whole Year Bottle Gourd 45.0 0.0
7204 Andhra Pradesh SRIKAKULAM 2002 Whole Year Cabbage 242.0 0.0
5818 Andhra Pradesh PRAKASAM 2003 Whole Year Cucumber 331.0 0.0

38181 rows × 7 columns

In [14]:
df.rename(columns = {'Unnamed: 0':"ColIndex"}, inplace = True)
In [15]:
df.head(2)
Out[15]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
In [16]:
df.set_index(['ColIndex'])
Out[16]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
2592 Andhra Pradesh GUNTUR 1998 Rabi Rice 29100.0 99900.0
181 Andaman and Nicobar Islands SOUTH ANDAMANS 2006 Whole Year Sugarcane 53.5 999.5
11244 Arunachal Pradesh PAPUM PARE 1998 Whole Year Sugarcane 107.0 999.0
... ... ... ... ... ... ... ...
6528 Andhra Pradesh SPSR NELLORE 2002 Whole Year Cucumber 85.0 0.0
6532 Andhra Pradesh SPSR NELLORE 2002 Whole Year Other Vegetables 525.0 0.0
7202 Andhra Pradesh SRIKAKULAM 2002 Whole Year Bottle Gourd 45.0 0.0
7204 Andhra Pradesh SRIKAKULAM 2002 Whole Year Cabbage 242.0 0.0
5818 Andhra Pradesh PRAKASAM 2003 Whole Year Cucumber 331.0 0.0

38181 rows × 7 columns

In [17]:
df.head(2)
Out[17]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
0 4351 Andhra Pradesh KRISHNA 2005 Kharif Moong(Green Gram) 12334.0 9991.0
1 21402 Assam KARBI ANGLONG 2013 Kharif Papaya 571.0 9991.0
In [18]:
df.sort_values(['Production'], inplace = True, ascending = False)
In [19]:
df.head(2)
Out[19]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
3939 2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0

Create subset for AP from India dataset

In [20]:
dfap = df[df['State_Name'] == "Andhra Pradesh"]
In [21]:
dfap.head(3)
Out[21]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
3939 2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
5092 2488 Andhra Pradesh EAST GODAVARI 2013 Whole Year Coconut 49114.0 720895000.0
In [22]:
dfap_group = df[df['State_Name'] == "Andhra Pradesh"].groupby('District_Name').sum()
In [23]:
dfap_group
Out[23]:
ColIndex Crop_Year Area Production
District_Name
ANANTAPUR 467368 1572950 18471177.0 8.150159e+07
CHITTOOR 1082883 1568880 6102142.0 3.741366e+08
EAST GODAVARI 1633239 1516502 11593557.0 8.271057e+09
GUNTUR 1975435 1372144 13739734.0 8.479591e+07
KADAPA 2976731 1639233 7219613.0 2.566223e+07
KRISHNA 2763748 1267735 11134515.0 3.694278e+08
KURNOOL 4207466 1653083 16968338.0 3.494962e+07
PRAKASAM 4750501 1608867 9611563.0 3.770760e+07
SPSR NELLORE 4521747 1360080 6198964.0 1.314234e+08
SRIKAKULAM 5030712 1372127 7102905.0 2.141621e+09
VISAKHAPATANAM 6556844 1622967 5645769.0 1.010412e+09
VIZIANAGARAM 6526938 1474488 6516585.0 4.741696e+08
WEST GODAVARI 5476505 1151411 11202457.0 4.287727e+09
In [24]:
dfap.head(2)
Out[24]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
3939 2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
In [25]:
dfap.rename(columns = {"Unnamed: 0":"ColIndex"}, inplace=True)
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:5047: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [26]:
dfap.set_index('ColIndex')
Out[26]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
2488 Andhra Pradesh EAST GODAVARI 2013 Whole Year Coconut 49114.0 720895000.0
2378 Andhra Pradesh EAST GODAVARI 2011 Whole Year Coconut 50741.0 719961050.0
9829 Andhra Pradesh WEST GODAVARI 2014 Whole Year Coconut 21729.0 718991000.0
... ... ... ... ... ... ... ...
8711 Andhra Pradesh VIZIANAGARAM 2002 Whole Year Cabbage 147.0 0.0
8709 Andhra Pradesh VIZIANAGARAM 2002 Whole Year Bottle Gourd 102.0 0.0
1203 Andhra Pradesh CHITTOOR 2002 Whole Year Cabbage 4.0 0.0
5811 Andhra Pradesh PRAKASAM 2003 Whole Year Bottle Gourd 46.0 0.0
5818 Andhra Pradesh PRAKASAM 2003 Whole Year Cucumber 331.0 0.0

9561 rows × 7 columns

In [27]:
dfap.describe()
Out[27]:
ColIndex Crop_Year Area Production
count 9561.000000 9561.000000 9561.000000 9.561000e+03
mean 5017.269846 2006.115155 13754.556950 1.812006e+06
std 2779.343152 5.175459 47304.631455 2.794296e+07
min 203.000000 1997.000000 1.000000 0.000000e+00
25% 2613.000000 2002.000000 176.000000 2.140000e+02
50% 5020.000000 2006.000000 1133.000000 1.913000e+03
75% 7426.000000 2011.000000 6631.000000 1.396500e+04
max 9830.000000 2014.000000 877029.000000 7.801620e+08
In [28]:
dfap['Crop_Year'] = df['Crop_Year'].astype(str)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [29]:
dfap['ColIndex'] = dfap['ColIndex'].astype(str)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [30]:
dfap.describe()
Out[30]:
Area Production
count 9561.000000 9.561000e+03
mean 13754.556950 1.812006e+06
std 47304.631455 2.794296e+07
min 1.000000 0.000000e+00
25% 176.000000 2.140000e+02
50% 1133.000000 1.913000e+03
75% 6631.000000 1.396500e+04
max 877029.000000 7.801620e+08
In [31]:
sns.lineplot(x="District_Name", y="Area", hue='Crop_Year', data = dfap)
plt.xticks(rotation = 90)
plt.legend(bbox_to_anchor = [1,1])
Out[31]:
<matplotlib.legend.Legend at 0x7f0ba344a650>
In [32]:
sns.lineplot(x="District_Name", y="Production", hue='Crop_Year', data = dfap)
plt.xticks(rotation = 90)
plt.legend(bbox_to_anchor = [1,1])
Out[32]:
<matplotlib.legend.Legend at 0x7f0ba335c3d0>
In [33]:
sns.barplot(x="District_Name", y="Area", data = dfap)
plt.xticks(rotation = 90)
Out[33]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 <a list of 13 Text major ticklabel objects>)
In [34]:
sns.barplot(x="District_Name", y="Production", data = dfap)
plt.xticks(rotation = 90)
Out[34]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 <a list of 13 Text major ticklabel objects>)
In [35]:
dfap['District_Name'].unique()
Out[35]:
array(['EAST GODAVARI', 'WEST GODAVARI', 'SRIKAKULAM', 'VISAKHAPATANAM',
       'VIZIANAGARAM', 'KRISHNA', 'CHITTOOR', 'SPSR NELLORE', 'ANANTAPUR',
       'GUNTUR', 'PRAKASAM', 'KADAPA', 'KURNOOL'], dtype=object)
In [36]:
dfap.describe()
Out[36]:
Area Production
count 9561.000000 9.561000e+03
mean 13754.556950 1.812006e+06
std 47304.631455 2.794296e+07
min 1.000000 0.000000e+00
25% 176.000000 2.140000e+02
50% 1133.000000 1.913000e+03
75% 6631.000000 1.396500e+04
max 877029.000000 7.801620e+08
In [37]:
dfap.dtypes
Out[37]:
ColIndex          object
State_Name        object
District_Name     object
Crop_Year         object
Season            object
Crop              object
Area             float64
Production       float64
dtype: object

Working with Plot features in Pandas

  • The .plot.* methods are applicable on both Series and DataFrames
  • By default, each of the columns is plotted as a different element (line, boxplot,…)
  • Any plot created by pandas is a Matplotlib object.

source: https://pandas.pydata.org/docs/getting_started/intro_tutorials/04_plotting.html

In [38]:
dfap.plot(figsize = (8,4))
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ba34f0310>
In [39]:
dfap.groupby('District_Name')['Area'].sum().plot.bar()
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ba3391a90>
In [40]:
dfap.groupby('District_Name').sum().plot(kind='line')
plt.xticks(rotation=90)
Out[40]:
(array([-2.,  0.,  2.,  4.,  6.,  8., 10., 12., 14.]),
 <a list of 9 Text major ticklabel objects>)
In [41]:
dfap.sort_values(by="District_Name", inplace=True)
/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
In [42]:
dfap.plot.scatter(x='Area', y='Production')
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ba33f1190>
In [43]:
dfap.plot.box()
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ba33e5890>
In [44]:
fig, axs = plt.subplots(figsize = (12, 6));
# dfap.plot.area(figsize = (12, 6), subplots = True)
dfap.plot.line(ax=axs, subplots = True, marker=".");
/usr/local/lib/python3.7/dist-packages/pandas/plotting/_matplotlib/__init__.py:71: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  plot_obj.generate()
In [45]:
fig.savefig('Area_Production.png')
In [46]:
dfap
Out[46]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
28468 817 Andhra Pradesh ANANTAPUR 2012 Kharif Papaya 447.0 17197.0
24059 277 Andhra Pradesh ANANTAPUR 1999 Rabi Maize 455.0 2203.0
33419 896 Andhra Pradesh ANANTAPUR 2013 Rabi Cotton(lint) 62.0 127.0
16971 838 Andhra Pradesh ANANTAPUR 2012 Rabi Groundnut 20987.0 34419.0
33214 323 Andhra Pradesh ANANTAPUR 2000 Rabi Safflower 703.0 129.0
... ... ... ... ... ... ... ... ...
1615 9796 Andhra Pradesh WEST GODAVARI 2014 Kharif Cowpea(Lobia) 21.0 9.0
31245 9354 Andhra Pradesh WEST GODAVARI 2001 Kharif Dry chillies 40.0 145.0
1658 9671 Andhra Pradesh WEST GODAVARI 2010 Whole Year Arecanut 20.0 9.0
15576 9806 Andhra Pradesh WEST GODAVARI 2014 Kharif Sapota 500.0 3798.0
32297 9713 Andhra Pradesh WEST GODAVARI 2012 Kharif Arhar/Tur 173.0 136.0

9561 rows × 8 columns

In [47]:
dfap.sort_values("Production", ascending = True, inplace = True)
/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
In [48]:
dfaps = dfap['Production'].cumsum()
In [49]:
dfaps.plot()
plt.xticks(rotation = 90)
Out[49]:
(array([-5000.,     0.,  5000., 10000., 15000., 20000., 25000., 30000.,
        35000., 40000., 45000.]), <a list of 11 Text major ticklabel objects>)
In [50]:
dfapa = dfap['Area'].cumsum()
In [51]:
dfapa.plot()
plt.xticks(rotation = 90)
Out[51]:
(array([-5000.,     0.,  5000., 10000., 15000., 20000., 25000., 30000.,
        35000., 40000., 45000.]), <a list of 11 Text major ticklabel objects>)
In [52]:
plt.figure()
dfap.Production.plot(kind='density')
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ba357a810>
In [53]:
df2 = pd.DataFrame(np.random.rand(10, 4), columns = ['a', 'b', 'c', 'd'])
df2.plot.bar();
In [54]:
df2 = pd.DataFrame(np.random.rand(10, 4), columns = ['a', 'b', 'c', 'd'])
df2.plot.bar(stacked = True);
In [55]:
df2 = pd.DataFrame(np.random.rand(10, 4), columns = ['a', 'b', 'c', 'd'])
df2.plot.bar(stacked = True);
plt.legend(bbox_to_anchor = [1,1])      # to place legend outside the graph, towards the right top
Out[55]:
<matplotlib.legend.Legend at 0x7f0ba33e7350>
In [56]:
df2 = pd.DataFrame(np.random.rand(10, 4), columns = ['a', 'b', 'c', 'd'])
df2.plot.barh(stacked = True);
In [57]:
dfap['Production'].diff().hist()
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e8d0610>
In [58]:
plt.figure()
dfap['Area'].diff().hist(color = 'k', bins=50)
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e8f7ed0>
In [59]:
plt.figure()
dfap['Area'].diff().hist(color = 'k', bins=50)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e76c890>
In [60]:
dfapb = dfap.sample(1000).groupby('Season').sum()
dfapb.plot.box()
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e5fe390>
In [61]:
dfapb
Out[61]:
Area Production
Season
Kharif 7861992.0 1.995853e+07
Rabi 4472667.0 8.505345e+06
Whole Year 1334646.0 1.113622e+09
In [62]:
dfapb = dfap.sample(10)
dfapb.boxplot()
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e57d950>
In [63]:
dfap.sample(10).boxplot(by='Crop_Year', figsize=(10, 3))
plt.xticks(rotation=90)
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
Out[63]:
(array([1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]),
 <a list of 16 Text major ticklabel objects>)
In [64]:
dfap.sample(10).boxplot(by='Season', figsize=(10, 3))
plt.xticks(rotation=90)
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
Out[64]:
(array([1, 2, 3, 1, 2, 3]), <a list of 6 Text major ticklabel objects>)
In [65]:
dfap.sample(10).boxplot(by='Crop', figsize=(10, 3))
plt.xticks(rotation=90)
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
Out[65]:
(array([1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 <a list of 18 Text major ticklabel objects>)
In [66]:
dfap.head(2)
Out[66]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
38171 7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
38165 1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
In [67]:
dfap.sample(10).boxplot(by="Season");
plt.xticks(rotation=90);
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
In [68]:
dfap.sample(10).boxplot(by=["Season", 'Crop'], 
                        figsize = (8,6), layout=(2, 1));
plt.xticks(rotation=90);
In [69]:
dfap.plot.scatter(x="Production", y='Area')
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9de77410>
In [70]:
dfap.plot.scatter(x="Production", y='Area', s=5)
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9dcbd150>
In [71]:
dfap.plot.scatter(x="Area", y='Production', s=2)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9dc3e550>
In [72]:
dfap.sample(20).plot.hexbin('Area', 'Production', gridsize=15)
Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9dc1b6d0>
In [73]:
dfap.sample(30).Production.plot();
dfap.sample(30).Area.plot(secondary_y = True, style = "o")
plt.tight_layout()
In [74]:
dfap.sample(100).plot(subplots = True, figsize = (6, 6))
Out[74]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f0b9b25f2d0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f0b9ea9c2d0>],
      dtype=object)

How to calculate summary statistics?¶

In [75]:
dfap.Production.mean()
Out[75]:
1812006.09727016
In [76]:
dfap.Area.mean()
Out[76]:
13754.556950109822
In [77]:
dfap[['Production', 'Area']].mean()
Out[77]:
Production    1.812006e+06
Area          1.375456e+04
dtype: float64
In [78]:
dfap[['Production', 'Area']].describe()
Out[78]:
Production Area
count 9.561000e+03 9561.000000
mean 1.812006e+06 13754.556950
std 2.794296e+07 47304.631455
min 0.000000e+00 1.000000
25% 2.140000e+02 176.000000
50% 1.913000e+03 1133.000000
75% 1.396500e+04 6631.000000
max 7.801620e+08 877029.000000
In [79]:
dfap.agg(
    {
        "Production": ["min", "max", "median", "skew"],
        "Area": ["min", "max", "median", "mean"],
    }
)
Out[79]:
Production Area
min 0.000000e+00 1.00000
max 7.801620e+08 877029.00000
median 1.913000e+03 1133.00000
skew 2.095303e+01 NaN
mean NaN 13754.55695
In [80]:
dfap.head(2)
Out[80]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
38171 7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
38165 1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
  • Aggregation statistics can be calculated on entire columns or rows
  • groupby provides the power of the split-apply-combine pattern
  • value_counts is a convenient shortcut to count the number of entries in each category of a variable.

Source: https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html#min-tut-06-stats

In [81]:
dfap[['Season', 'Area']].groupby('Season').mean()
Out[81]:
Area
Season
Kharif 18574.629101
Rabi 11415.132644
Whole Year 7277.522436
In [82]:
dfap[['Crop', "Area"]].groupby('Crop').mean()
Out[82]:
Area
Crop
Arecanut 109.414634
Arhar/Tur 9942.347059
Bajra 3259.521036
Banana 3767.365385
Beans & Mutter(Vegetable) 494.920000
... ...
Varagu 716.076923
Wheat 279.448276
other fibres 3200.000000
other misc. pulses 406.363636
other oilseeds 4920.617021

68 rows × 1 columns

In [83]:
dfap.groupby('District_Name').mean()
Out[83]:
Area Production
District_Name
ANANTAPUR 23560.174745 1.039561e+05
CHITTOOR 7803.250639 4.784355e+05
EAST GODAVARI 15335.392857 1.094055e+07
GUNTUR 20087.330409 1.239706e+05
KADAPA 8836.735618 3.141032e+04
KRISHNA 17617.903481 5.845377e+05
KURNOOL 20592.643204 4.241458e+04
PRAKASAM 11984.492519 4.701696e+04
SPSR NELLORE 9143.014749 1.938399e+05
SRIKAKULAM 10384.364035 3.131024e+06
VISAKHAPATANAM 6978.700865 1.248964e+06
VIZIANAGARAM 8866.102041 6.451287e+05
WEST GODAVARI 19516.475610 7.469907e+06
In [84]:
dfap.groupby('District_Name').mean().sort_values('Area', ascending = False)
Out[84]:
Area Production
District_Name
ANANTAPUR 23560.174745 1.039561e+05
KURNOOL 20592.643204 4.241458e+04
GUNTUR 20087.330409 1.239706e+05
WEST GODAVARI 19516.475610 7.469907e+06
KRISHNA 17617.903481 5.845377e+05
EAST GODAVARI 15335.392857 1.094055e+07
PRAKASAM 11984.492519 4.701696e+04
SRIKAKULAM 10384.364035 3.131024e+06
SPSR NELLORE 9143.014749 1.938399e+05
VIZIANAGARAM 8866.102041 6.451287e+05
KADAPA 8836.735618 3.141032e+04
CHITTOOR 7803.250639 4.784355e+05
VISAKHAPATANAM 6978.700865 1.248964e+06
In [85]:
dfap.groupby('District_Name').mean().sort_values('Production', ascending = False)
Out[85]:
Area Production
District_Name
EAST GODAVARI 15335.392857 1.094055e+07
WEST GODAVARI 19516.475610 7.469907e+06
SRIKAKULAM 10384.364035 3.131024e+06
VISAKHAPATANAM 6978.700865 1.248964e+06
VIZIANAGARAM 8866.102041 6.451287e+05
KRISHNA 17617.903481 5.845377e+05
CHITTOOR 7803.250639 4.784355e+05
SPSR NELLORE 9143.014749 1.938399e+05
GUNTUR 20087.330409 1.239706e+05
ANANTAPUR 23560.174745 1.039561e+05
PRAKASAM 11984.492519 4.701696e+04
KURNOOL 20592.643204 4.241458e+04
KADAPA 8836.735618 3.141032e+04
In [86]:
dfaps = dfap.groupby('District_Name').mean().sort_values('Production', ascending = False)
In [87]:
dfaps.plot(figsize=(10, 5))
plt.xticks(rotation=90)
Out[87]:
(array([-2.,  0.,  2.,  4.,  6.,  8., 10., 12., 14.]),
 <a list of 9 Text major ticklabel objects>)
In [88]:
dfap_season = dfap.groupby('Season')['Area'].mean()
In [89]:
dfap_season
Out[89]:
Season
Kharif        18574.629101
Rabi          11415.132644
Whole Year     7277.522436
Name: Area, dtype: float64
In [90]:
dfap_season.plot.box()
Out[90]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9ddd5390>
In [91]:
dfap_season.plot.bar()
Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9eb23350>
In [92]:
dfTest = dfap.groupby('Season')['Area'].median()
ax = dfTest.plot.bar(figsize = (10, 4))

for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals = 2), (p.get_x() * 1.05, p.get_height() * 1.05))
In [93]:
dfap.groupby('Season')['Area'].mean().plot.bar()
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9de90210>
In [94]:
dfap.groupby("Crop")['Area'].mean().plot.bar(figsize = (10, 3))
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9dff3b50>
In [95]:
dfap.groupby("Crop")['Area'].median().plot.bar(figsize = (10, 3))
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e0ce510>
In [96]:
dfTest = dfap.groupby('District_Name')['Area'].mean()
ax = dfTest.plot.bar(figsize = (10, 4))

for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals = 2), (p.get_x() * 1.05, p.get_height() * 1.05))
In [97]:
dfap.groupby('District_Name')['Area'].median().plot.bar(figsize = (10, 3))
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9afe6190>
In [98]:
dfap.groupby(["District_Name", 'Season'])['Area'].mean().plot.bar(figsize = (10, 3))
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9af59110>
In [99]:
dfap.groupby(["Season", 'District_Name'])['Area'].mean().plot.bar(figsize = (10, 3))
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9ae837d0>

Placing values over bars in bar plot

In [100]:
dfTest = dfap.groupby(["Season", 'District_Name'])['Area'].mean()
ax = dfTest.plot.bar(figsize = (10, 4))

for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.05, p.get_height() * 1.05))
In [101]:
dfTest = dfap.groupby(["Season", 'District_Name'])['Area'].mean()
ax = dfTest.plot.bar(figsize = (12, 4))

for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals = 2), (p.get_x() * 1.05, p.get_height() * 1.05))
In [102]:
dfTest = dfap.groupby(["Season", 'District_Name'])['Area'].mean()
ax = dfTest.plot.barh(figsize = (4, 12))

#for p in ax.patches:
#    ax.annotate(np.round(p.get_width(), decimals = 2), (p.get_y() * 1.05, p.get_width() * 1.05))
for i in ax.patches:
    plt.text(i.get_width(), i.get_y()+.05,
             str(round((i.get_width()), 2)),
             fontsize = 10, fontweight ='bold',
             color ='grey')
In [103]:
dfap.head()
Out[103]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
38171 7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
38165 1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
38095 1268 Andhra Pradesh CHITTOOR 2003 Whole Year Cucumber 1.0 0.0
38126 1208 Andhra Pradesh CHITTOOR 2002 Whole Year Cucumber 8.0 0.0
38087 7986 Andhra Pradesh VISAKHAPATANAM 2003 Whole Year Cucumber 19.0 0.0

Bar plot with MatPlotLib

In [104]:
df.head(2)
Out[104]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
3939 2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
In [105]:
dfap.head(2)
Out[105]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
38171 7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
38165 1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
In [106]:
dfap.dtypes
Out[106]:
ColIndex          object
State_Name        object
District_Name     object
Crop_Year         object
Season            object
Crop              object
Area             float64
Production       float64
dtype: object
In [107]:
dfap.set_index("ColIndex", inplace = True)
In [108]:
dfap.head(2)
Out[108]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
In [109]:
dfapTest = dfap.groupby('District_Name').sum()
dfapTest.reset_index(inplace = True)
dfapTest
Out[109]:
District_Name Area Production
0 ANANTAPUR 18471177.0 8.150159e+07
1 CHITTOOR 6102142.0 3.741366e+08
2 EAST GODAVARI 11593557.0 8.271057e+09
3 GUNTUR 13739734.0 8.479591e+07
4 KADAPA 7219613.0 2.566223e+07
5 KRISHNA 11134515.0 3.694278e+08
6 KURNOOL 16968338.0 3.494962e+07
7 PRAKASAM 9611563.0 3.770760e+07
8 SPSR NELLORE 6198964.0 1.314234e+08
9 SRIKAKULAM 7102905.0 2.141621e+09
10 VISAKHAPATANAM 5645769.0 1.010412e+09
11 VIZIANAGARAM 6516585.0 4.741696e+08
12 WEST GODAVARI 11202457.0 4.287727e+09
In [110]:
fig = plt.figure(figsize = (10, 4))
dfapTest = dfap.groupby('District_Name').sum()
dfapTest.reset_index(inplace=True)
plt.bar(dfapTest['District_Name'], dfapTest['Production'])
plt.xticks(rotation = 90)
Out[110]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 <a list of 13 Text major ticklabel objects>)
In [111]:
fig = plt.figure(figsize = (10, 4))
dfapTest = dfap.groupby('District_Name').sum()
dfapTest.reset_index(inplace=True)
plt.barh(dfapTest['District_Name'], dfapTest['Production'])
plt.xticks(rotation = 90)
Out[111]:
(array([0.e+00, 1.e+09, 2.e+09, 3.e+09, 4.e+09, 5.e+09, 6.e+09, 7.e+09,
        8.e+09, 9.e+09]), <a list of 10 Text major ticklabel objects>)
In [112]:
dfapTest = dfap.groupby('District_Name').sum()
dfapTest.reset_index(inplace=True)

plt.figure(figsize = (15, 3))
plt.subplot(131)
plt.bar(dfapTest['District_Name'], dfapTest['Production'])
plt.xticks(rotation = 90)

plt.subplot(132)
plt.scatter(dfapTest['District_Name'], dfapTest['Production'])
plt.xticks(rotation = 90)

plt.subplot(133)
plt.plot(dfapTest['District_Name'], dfapTest['Production'], linewidth=4)
plt.xticks(rotation = 90)
Out[112]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 <a list of 13 Text major ticklabel objects>)
In [113]:
dfapTest = dfap.groupby('District_Name').sum()
dfapTest.reset_index(inplace=True)

plt.figure(figsize = (15, 3))
plt.subplot(131)
plt.bar(dfapTest['District_Name'], dfapTest['Area'])
plt.xticks(rotation = 90)

plt.subplot(132)
plt.scatter(dfapTest['District_Name'], dfapTest['Area'])
plt.xticks(rotation = 90)

plt.subplot(133)
plt.plot(dfapTest['District_Name'], dfapTest['Area'], linewidth=4)
plt.xticks(rotation = 90)
Out[113]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 <a list of 13 Text major ticklabel objects>)
In [114]:
dfapTest = dfap.groupby('Crop').sum()
dfapTest.reset_index(inplace = True)
In [115]:
dfapTest.sort_values('Production', inplace = True, ascending=False)
In [116]:
plt.figure(figsize = (12, 4))
plt.bar(dfapTest['Crop'], dfapTest["Production"])
plt.xticks(rotation=90);

dataframe dfapNC to exclude Coconut Crop - as it is dominant crop

In [117]:
# dfapTestNoCoconut = dfapNC
dfapNC = dfap.loc[~dfap['Crop'].str.contains('Coconut', case = False)]
In [118]:
dfapNC
Out[118]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
1268 Andhra Pradesh CHITTOOR 2003 Whole Year Cucumber 1.0 0.0
1208 Andhra Pradesh CHITTOOR 2002 Whole Year Cucumber 8.0 0.0
7986 Andhra Pradesh VISAKHAPATANAM 2003 Whole Year Cucumber 19.0 0.0
... ... ... ... ... ... ... ...
1159 Andhra Pradesh CHITTOOR 2001 Whole Year Sugarcane 38789.0 3337250.0
1037 Andhra Pradesh CHITTOOR 1998 Whole Year Sugarcane 34500.0 3377000.0
1407 Andhra Pradesh CHITTOOR 2006 Whole Year Sugarcane 37965.0 3496956.0
1113 Andhra Pradesh CHITTOOR 2000 Whole Year Sugarcane 40539.0 3697441.0
1066 Andhra Pradesh CHITTOOR 1999 Whole Year Sugarcane 44929.0 4097998.0

9392 rows × 7 columns

In [119]:
dfapNC.groupby('Crop').sum()
dfapNC.reset_index(inplace = True)

plot excluding Coconut as it is the most dominating crop

In [120]:
plt.figure(figsize = (10, 4));
plt.bar(dfapNC["Crop"], dfapNC['Production']);
plt.xticks(rotation = 90);

excluding Sugarcane as well as it the next biggest crop after Coconut

In [121]:
dfapNcSc = dfapNC[~ dfapNC['Crop'].str.contains('Sugarcane', case = False)]
In [122]:
dfapNcSc = dfapNcSc[~ dfapNcSc['Crop'].str.contains('Rice', case = False)]
In [123]:
dfapNcSc
Out[123]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
0 7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
1 1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
2 1268 Andhra Pradesh CHITTOOR 2003 Whole Year Cucumber 1.0 0.0
3 1208 Andhra Pradesh CHITTOOR 2002 Whole Year Cucumber 8.0 0.0
4 7986 Andhra Pradesh VISAKHAPATANAM 2003 Whole Year Cucumber 19.0 0.0
... ... ... ... ... ... ... ... ...
9262 294 Andhra Pradesh ANANTAPUR 2000 Kharif Groundnut 791559.0 884963.0
9268 3028 Andhra Pradesh GUNTUR 2010 Rabi Maize 80083.0 904377.0
9277 2955 Andhra Pradesh GUNTUR 2008 Rabi Maize 95547.0 950884.0
9298 3187 Andhra Pradesh GUNTUR 2014 Kharif Cotton(lint) 206374.0 1075856.0
9301 620 Andhra Pradesh ANANTAPUR 2007 Kharif Groundnut 876000.0 1102000.0

8679 rows × 8 columns

In [124]:
dfapNcSc.reset_index(inplace = True)
In [125]:
dfapncsc = dfapNcSc.groupby('Crop').sum()
In [126]:
dfapncsc.reset_index(inplace = True)
In [127]:
plt.figure(figsize = (12, 4));
plt.bar(dfapncsc['Crop'], dfapncsc['Production']);
plt.xticks(rotation = 90);

# plt.xlabel("Crop Names")
#plt.ylabel("Production")
# plt.title(label="excludes Coconut, Sugarcane, Rice", fontsize=20, color='green')
In [128]:
plt.figure(figsize = (12, 4));
plt.bar(dfapncsc['Crop'], dfapncsc['Area']);
plt.xticks(rotation = 90);
In [129]:
dfapncsc
Out[129]:
Crop index Area Production
0 Arecanut 47029 4486.0 3784.0
1 Arhar/Tur 1269496 3380398.0 1549955.0
2 Bajra 1281345 1007192.0 1265709.0
3 Banana 1228289 587709.0 15041631.0
4 Beans & Mutter(Vegetable) 115351 12373.0 70041.0
... ... ... ... ...
60 Varagu 31092 9309.0 6355.0
61 Wheat 103829 16208.0 10595.0
62 other fibres 128 6400.0 0.0
63 other misc. pulses 13069 4470.0 1426.0
64 other oilseeds 257512 231269.0 3595365.0

65 rows × 4 columns

In [130]:
dfap_season
Out[130]:
Season
Kharif        18574.629101
Rabi          11415.132644
Whole Year     7277.522436
Name: Area, dtype: float64
In [131]:
dfapseason = dfap.groupby('Season').sum()
In [132]:
dfapseason
Out[132]:
Area Production
Season
Kharif 78124890.0 2.003361e+08
Rabi 39758907.0 9.815619e+07
Whole Year 13623522.0 1.702610e+10
In [133]:
dfapseason.reset_index(inplace = True)
In [134]:
plt.bar(dfapseason['Season'], dfapseason['Production'])
Out[134]:
<BarContainer object of 3 artists>
In [135]:
plt.bar(dfapseason['Season'], dfapseason['Area'])
Out[135]:
<BarContainer object of 3 artists>
In [136]:
dfapds = dfap.groupby(['District_Name', 'Season']).sum()
In [137]:
dfapds
Out[137]:
Area Production
District_Name Season
ANANTAPUR Kharif 15772031.0 1.110110e+07
Rabi 2251451.0 3.112690e+06
Whole Year 447695.0 6.728780e+07
CHITTOOR Kharif 4132190.0 1.584805e+07
Rabi 1235928.0 3.721102e+06
Whole Year 734024.0 3.545674e+08
EAST GODAVARI Kharif 4762584.0 1.717548e+07
Rabi 4693116.0 1.436639e+07
Whole Year 2137857.0 8.239515e+09
GUNTUR Kharif 9444869.0 2.977378e+07
Rabi 3688520.0 1.132108e+07
Whole Year 606345.0 4.370105e+07
KADAPA Kharif 4215587.0 6.152978e+06
Rabi 2195094.0 3.161947e+06
Whole Year 808932.0 1.634731e+07
KRISHNA Kharif 6365638.0 2.522153e+07
Rabi 3966161.0 9.034844e+06
Whole Year 802716.0 3.351714e+08
KURNOOL Kharif 9942501.0 1.801736e+07
Rabi 5695330.0 9.021489e+06
Whole Year 1330507.0 7.910767e+06
PRAKASAM Kharif 4352171.0 7.485288e+06
Rabi 3966730.0 9.517836e+06
Whole Year 1292662.0 2.070448e+07
SPSR NELLORE Kharif 1408113.0 6.691477e+06
Rabi 4143036.0 1.259368e+07
Whole Year 647815.0 1.121383e+08
SRIKAKULAM Kharif 4488675.0 1.121324e+07
Rabi 1803887.0 1.791155e+06
Whole Year 810343.0 2.128616e+09
VISAKHAPATANAM Kharif 3857648.0 1.232187e+07
Rabi 516937.0 7.105650e+05
Whole Year 1271184.0 9.973791e+08
VIZIANAGARAM Kharif 4478037.0 1.571793e+07
Rabi 1334481.0 1.489938e+06
Whole Year 704067.0 4.569617e+08
WEST GODAVARI Kharif 4904846.0 2.361597e+07
Rabi 4268236.0 1.831347e+07
Whole Year 2029375.0 4.245797e+09
In [138]:
dfapds.reset_index(inplace = True)
In [139]:
dfapds.to_excel('dfapds_for_graph.xlsx')
In [140]:
df.head()
Out[140]:
ColIndex State_Name District_Name Crop_Year Season Crop Area Production
3939 2543 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 2432 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
5092 2488 Andhra Pradesh EAST GODAVARI 2013 Whole Year Coconut 49114.0 720895000.0
5159 2378 Andhra Pradesh EAST GODAVARI 2011 Whole Year Coconut 50741.0 719961050.0
5175 9829 Andhra Pradesh WEST GODAVARI 2014 Whole Year Coconut 21729.0 718991000.0
In [141]:
df.drop('ColIndex', inplace = True, axis=1)
In [142]:
df.head()
Out[142]:
State_Name District_Name Crop_Year Season Crop Area Production
3939 Andhra Pradesh EAST GODAVARI 2014 Whole Year Coconut 46865.0 780162000.0
4954 Andhra Pradesh EAST GODAVARI 2012 Whole Year Coconut 49516.0 729965000.0
5092 Andhra Pradesh EAST GODAVARI 2013 Whole Year Coconut 49114.0 720895000.0
5159 Andhra Pradesh EAST GODAVARI 2011 Whole Year Coconut 50741.0 719961050.0
5175 Andhra Pradesh WEST GODAVARI 2014 Whole Year Coconut 21729.0 718991000.0
In [143]:
dfap.head(2)
Out[143]:
State_Name District_Name Crop_Year Season Crop Area Production
ColIndex
7919 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
1263 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
In [144]:
dfap.reset_index(inplace = True)
In [145]:
dfap.dtypes
Out[145]:
ColIndex          object
State_Name        object
District_Name     object
Crop_Year         object
Season            object
Crop              object
Area             float64
Production       float64
dtype: object
In [146]:
dfap.drop('ColIndex', inplace = True, axis=1)
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4913: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [147]:
dfap.head(2)
Out[147]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andhra Pradesh VISAKHAPATANAM 2002 Whole Year Bottle Gourd 37.0 0.0
1 Andhra Pradesh CHITTOOR 2003 Whole Year Cabbage 73.0 0.0
In [148]:
dfapgrp = dfap.groupby(['Crop_Year', 'District_Name', 'Season', 'Crop']).sum()
In [149]:
dfapgrp.head(2)
Out[149]:
Area Production
Crop_Year District_Name Season Crop
1997 ANANTAPUR Kharif Arhar/Tur 21400.0 2600.0
Bajra 1400.0 500.0
In [150]:
dfapgrp.reset_index(inplace = True)
In [151]:
dfapgrp.shape
Out[151]:
(9561, 6)
In [152]:
dfapgrp.head(2)
Out[152]:
Crop_Year District_Name Season Crop Area Production
0 1997 ANANTAPUR Kharif Arhar/Tur 21400.0 2600.0
1 1997 ANANTAPUR Kharif Bajra 1400.0 500.0
In [153]:
dfapgrp.to_excel('dfapgrp.xlsx')
In [155]:
dfapgrp.head(2)
Out[155]:
Crop_Year District_Name Season Crop Area Production
0 1997 ANANTAPUR Kharif Arhar/Tur 21400.0 2600.0
1 1997 ANANTAPUR Kharif Bajra 1400.0 500.0
In [157]:
plt.bar(dfapgrp['Crop_Year'], dfapgrp['Area'])
plt.xticks(rotation = 90)
Out[157]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 <a list of 18 Text major ticklabel objects>)
In [168]:
dfaptest = dfapgrp.groupby(['Crop_Year', 'Season'])['Area'].sum()
ax = dfaptest.plot.bar(figsize = (15, 4), fontsize=10)

for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals = 2), (p.get_x() * 1.05, p.get_height() * 1.05))
In [177]:
dfaptest = dfapgrp.groupby(['Crop_Year', 'Season'])['Area'].sum()
ax = dfaptest.plot.barh(figsize = (15, 20), fontsize=10)
for i in ax.patches:
    plt.text(i.get_width(), i.get_y()+.05,
             str(round((i.get_width()), 2)),
             fontsize = 10, fontweight ='bold',
             color ='grey')
In [190]:
dftest = dfapgrp.loc[(dfapgrp['District_Name'] == 'GUNTUR')];
dftest.groupby(['Crop_Year', 'Season'])['Area'].sum().plot.bar(figsize = (10, 4));
plt.xticks(rotation = 90);

Function to plot graph

In [191]:
def plotDistGraph(distName):
    dftest = dfapgrp.loc[(dfapgrp['District_Name'] == distName)];
    dftest.groupby(['Crop_Year', 'Season'])['Area'].sum().plot.bar(figsize = (10, 4));
    plt.xticks(rotation = 90);
In [192]:
dfapgrp.District_Name.unique()
Out[192]:
array(['ANANTAPUR', 'CHITTOOR', 'EAST GODAVARI', 'GUNTUR', 'KADAPA',
       'KRISHNA', 'KURNOOL', 'PRAKASAM', 'SPSR NELLORE', 'SRIKAKULAM',
       'VISAKHAPATANAM', 'VIZIANAGARAM', 'WEST GODAVARI'], dtype=object)
In [194]:
plotDistGraph('VIZIANAGARAM')
In [195]:
plotDistGraph('SRIKAKULAM')
In [200]:
print(dfapgrp.min())
Crop_Year             1997
District_Name    ANANTAPUR
Season              Kharif
Crop              Arecanut
Area                   1.0
Production             0.0
dtype: object
In [201]:
print(dfapgrp.max())
Crop_Year                  2014
District_Name     WEST GODAVARI
Season               Whole Year
Crop             other oilseeds
Area                   877029.0
Production          780162000.0
dtype: object
In [220]:
dfTest = dfapgrp[(dfapgrp['District_Name'] == "ANANTAPUR") & 
                 (dfapgrp['Season'] == "Rabi")]
In [221]:
dfTest
Out[221]:
Crop_Year District_Name Season Crop Area Production
17 1997 ANANTAPUR Rabi Dry chillies 100.0 100.0
18 1997 ANANTAPUR Rabi Gram 28000.0 14800.0
19 1997 ANANTAPUR Rabi Groundnut 20200.0 21700.0
20 1997 ANANTAPUR Rabi Horse-gram 600.0 200.0
21 1997 ANANTAPUR Rabi Jowar 18800.0 9400.0
... ... ... ... ... ... ...
8923 2014 ANANTAPUR Rabi Sunflower 3386.0 1642.0
8924 2014 ANANTAPUR Rabi Sweet potato 4.0 44.0
8925 2014 ANANTAPUR Rabi Tobacco 341.0 635.0
8926 2014 ANANTAPUR Rabi Tomato 4191.0 87055.0
8927 2014 ANANTAPUR Rabi Urad 42.0 61.0

287 rows × 6 columns

In [231]:
def plotDistGraph(distName, season):
    dfTest = dfapgrp[(dfapgrp['District_Name'] == distName) & 
                 (dfapgrp['Season'] == season)]
    dfTest.groupby(['Crop_Year', 'Season'])['Production'].sum().plot.bar(figsize = (10, 4));
    plt.xticks(rotation = 90);
In [232]:
plotDistGraph('ANANTAPUR', 'Rabi')
In [224]:
plotDistGraph('ANANTAPUR', 'Kharif')
In [225]:
dfapgrp.District_Name.unique()
Out[225]:
array(['ANANTAPUR', 'CHITTOOR', 'EAST GODAVARI', 'GUNTUR', 'KADAPA',
       'KRISHNA', 'KURNOOL', 'PRAKASAM', 'SPSR NELLORE', 'SRIKAKULAM',
       'VISAKHAPATANAM', 'VIZIANAGARAM', 'WEST GODAVARI'], dtype=object)
In [227]:
plotDistGraph('VIZIANAGARAM', 'Rabi')
In [228]:
plotDistGraph('VIZIANAGARAM', 'Whole Year')
In [233]:
def plotDistGraph(distName, season):
    dfTest = dfapgrp[(dfapgrp['District_Name'] == distName) & 
                 (dfapgrp['Season'] == season)]
    dfTest.groupby(['Crop'])['Production'].sum().plot.bar(figsize = (10, 4));
    plt.xticks(rotation = 90);
In [236]:
dfapgrp.District_Name.unique()
Out[236]:
array(['ANANTAPUR', 'CHITTOOR', 'EAST GODAVARI', 'GUNTUR', 'KADAPA',
       'KRISHNA', 'KURNOOL', 'PRAKASAM', 'SPSR NELLORE', 'SRIKAKULAM',
       'VISAKHAPATANAM', 'VIZIANAGARAM', 'WEST GODAVARI'], dtype=object)
In [238]:
plotDistGraph('PRAKASAM', 'Whole Year')
In [239]:
plotDistGraph('GUNTUR', 'Whole Year')
In [240]:
plotDistGraph('KURNOOL', 'Whole Year')
In [245]:
def plotGraph(stateName, distName, season):
    dfTest = df[(df['State_Name'] == stateName) &
                     (df['District_Name'] == distName) & 
                 (df['Season'] == season)]
    dfTest.groupby(['Crop'])['Production'].sum().plot.bar(figsize = (10, 4));
    plt.xticks(rotation = 90);
In [243]:
df.State_Name.unique()
Out[243]:
array(['Andhra Pradesh', 'Andaman and Nicobar Islands', 'Assam', 'Bihar',
       'Arunachal Pradesh'], dtype=object)
In [247]:
plotGraph('Andhra Pradesh', 'GUNTUR', 'Rabi')
In [248]:
plotGraph('Andhra Pradesh', 'GUNTUR', 'Kharif')
In [249]:
plotGraph('Andhra Pradesh', 'GUNTUR', 'Whole Year')
In [255]:
dfTest = df[(df['State_Name'] == 'Andhra Pradesh') &
                     (df['District_Name'] == 'GUNTUR') & 
                 (df['Season'] == 'Whole Year')].groupby('Crop').sum()
In [256]:
dfTest
Out[256]:
Crop_Year Area Production
Crop
Banana 24072 73146.0 1936195.0
Beans & Mutter(Vegetable) 4005 978.0 5853.0
Bhindi 4005 3952.0 24420.0
Bottle Gourd 4005 550.0 0.0
Brinjal 4005 2400.0 40733.0
Cabbage 4005 445.0 0.0
Cashewnut 24072 1824.0 463.0
Castor seed 2013 933.0 1313.0
Citrus Fruit 4005 4027.0 19594.0
Coconut 26088 3198.0 38200935.0
Coriander 14029 472.0 348.0
Cucumber 4005 3443.0 0.0
Dry chillies 1999 59256.0 111260.0
Mango 4005 1897.0 12738.0
Onion 1999 1593.0 23110.0
Orange 4005 1247.0 19673.0
Other Fresh Fruits 4005 2709.0 12637.0
Other Vegetables 4005 10437.0 0.0
Papaya 4005 175.0 19340.0
Pome Fruit 4005 592.0 4799.0
Rice 2014 285302.0 1111932.0
Sugarcane 28063 21702.0 1652095.0
Sunflower 12049 554.0 596.0
Sweet potato 22059 426.0 8028.0
Tapioca 2004 5.0 36.0
Tobacco 26063 62659.0 130793.0
Tomato 4005 2545.0 21452.0
Turmeric 22059 53610.0 342706.0
other fibres 2002 6268.0 0.0
In [ ]: