Agriculture OpenSource DataSet
 Visualisation with Plotly Express
agriculture_ds
In [ ]:
! pip3 install plotly==5.7.0
! pip3 install chart_studio
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly
import plotly.io as pio
In [3]:
import cufflinks as cf
from plotly.offline import download_plotlyjs, iplot, plot, init_notebook_mode
init_notebook_mode(connected=True)
cf.go_offline()
In [4]:
# ~~~~~~~~~~~~~~~~~~ Code for Plotly and CuffLinks ~~~~~~~~~~~~~~~~~~~~~
# reference: https://www.repath.in/Plotly-and-Cuff-Links/

#~~~~~~~~~~~~~~~~ the following function helps Google Colab to display graphs
# and should be used in every cell, where ever a graph needs to be plotted ~~~~~~~~~~~~~~~~~~~

def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))
  
# configure_plotly_browser_state()
# df.plot()
In [6]:
df = pd.read_csv('/content/agriculture_ds.csv')
In [7]:
df.Production.isnull().sum()
Out[7]:
1
In [ ]:
df.shape()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-0e566b70f572> in <module>()
----> 1 df.shape()

TypeError: 'tuple' object is not callable
In [8]:
df.shape
Out[8]:
(38264, 7)
In [9]:
df.head()
Out[9]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andaman and Nicobar Islands NICOBARS 2000 Kharif Arecanut 1254.0 2000
1 Andaman and Nicobar Islands NICOBARS 2000 Kharif Other Kharif pulses 2.0 1
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.0 321
3 Andaman and Nicobar Islands NICOBARS 2000 Whole Year Banana 176.0 641
4 Andaman and Nicobar Islands NICOBARS 2000 Whole Year Cashewnut 720.0 165
In [10]:
sns.heatmap(df.isnull(), cmap = 'Blues_r')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc0fab3ed0>
In [12]:
df.head(1)
Out[12]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andaman and Nicobar Islands NICOBARS 2000 Kharif Arecanut 1254.0 2000
In [13]:
df.State_Name.unique()
Out[13]:
array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar'], dtype=object)
In [ ]:
df.Crop_Year.unique()
Out[ ]:
array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2010, 1997, 1998, 1999,
       2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015])
In [ ]:
df.Season.unique()
Out[ ]:
array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)
In [ ]:
df.Crop.unique()
Out[ ]:
array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Rapeseed &Mustard', 'Mesta',
       'Cowpea(Lobia)', 'Lemon', 'Pome Granet', 'Sapota', 'Cabbage',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Sannhamp',
       'Varagu', 'Garlic', 'Ginger', 'Oilseeds total', 'Pulses total',
       'Jute', 'Peas & beans (Pulses)', 'Blackgram', 'Paddy', 'Pineapple',
       'Barley', 'Khesari', 'Guar seed', 'Moth',
       'Other Cereals & Millets', 'Cond-spcs other', 'Turnip', 'Carrot',
       'Redish', 'Arcanut (Processed)', 'Atcanut (Raw)',
       'Cashewnut Processed', 'Cashewnut Raw', 'Cardamom', 'Rubber',
       'Bitter Gourd', 'Drum Stick', 'Jack Fruit', 'Snak Guard',
       'Pump Kin', 'Tea', 'Coffee', 'Cauliflower', 'Other Citrus Fruit',
       'Water Melon', 'Total foodgrain', 'Kapas', 'Colocosia', 'Lentil',
       'Bean', 'Jobster', 'Perilla', 'Rajmash Kholar',
       'Ricebean (nagadal)', 'Ash Gourd', 'Beet Root', 'Lab-Lab',
       'Ribed Guard', 'Yam', 'Apple', 'Peach', 'Pear', 'Plums', 'Litchi',
       'Ber', 'Other Dry Fruit', 'Jute & mesta'], dtype=object)
In [14]:
df.dtypes
Out[14]:
State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production        object
dtype: object
In [15]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38264 entries, 0 to 38263
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State_Name     38264 non-null  object 
 1   District_Name  38264 non-null  object 
 2   Crop_Year      38264 non-null  int64  
 3   Season         38264 non-null  object 
 4   Crop           38263 non-null  object 
 5   Area           38263 non-null  float64
 6   Production     38263 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB
In [16]:
df = pd.read_csv('/content/agriculture_ds.csv', na_values = "=")
In [17]:
df.Production.isnull().sum()
Out[17]:
2567

the following heatmap helps to identify the null values in the dataframa

  • all the white horizontal bars, which are white in color are null values
  • color bar cbar is not requred though
In [19]:
sns.heatmap(df.isnull(), cmap = 'Blues_r')
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc0cbf11d0>
In [ ]:
df.dtypes
Out[ ]:
State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production       float64
dtype: object
In [ ]:
df.shape
Out[ ]:
(246091, 7)
In [20]:
df.Production.isnull().count()
Out[20]:
131297
In [ ]:
df.Production.isnull().sum()
In [ ]:
print ("Null values percentage : " , (3727/246091)*100)
Null values percentage :  1.5144804157811542

dropna()

  • deletes all rows, even if the row contains atleast a single null value (ie., across any column)
  • Threshold is another parameter that can check the number of null values
  • for example if there be a need to delete rows that has more than 4 null values
In [21]:
df.dropna(inplace = True)
In [22]:
df.shape
Out[22]:
(128730, 7)

Question: Which state has done well and which are not?

watch - Hans Roslng video from Ted on Data Visualisation

In [23]:
sns.kdeplot(df.Production)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc0c9da650>
In [24]:
sns.boxplot(df.Production)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc0a95cb50>
In [25]:
sns.boxplot(df.Production, hue=df.State_Name)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc0c9daa90>
In [ ]:
sns.boxplot(df.Area)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dadb3b50>
In [ ]:
sns.kdeplot(df.Area)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dad9bad0>
In [ ]:
df[df.State_Name == 'Karnataka']
Out[ ]:
State_Name District_Name Crop_Year Season Crop Area Production
76865 Karnataka BAGALKOT 1998 Kharif Arhar/Tur 6154.0 2602.0
76866 Karnataka BAGALKOT 1998 Kharif Bajra 48855.0 52375.0
76867 Karnataka BAGALKOT 1998 Kharif Castor seed 71.0 61.0
76868 Karnataka BAGALKOT 1998 Kharif Cotton(lint) 15225.0 22129.0
76869 Karnataka BAGALKOT 1998 Kharif Groundnut 16368.0 7734.0
... ... ... ... ... ... ... ...
97981 Karnataka YADGIR 2014 Summer Onion 7.0 34.0
97982 Karnataka YADGIR 2014 Summer Rice 15205.0 35029.0
97983 Karnataka YADGIR 2014 Summer Sunflower 14.0 5.0
97984 Karnataka YADGIR 2014 Whole Year Coconut 168.0 1499.0
97986 Karnataka YADGIR 2014 Whole Year Sugarcane 862.0 78614.0

21079 rows × 7 columns

In [ ]:
df[df.State_Name == "Karnataka"]['District_Name'].unique()
Out[ ]:
array(['BAGALKOT', 'BANGALORE RURAL', 'BELGAUM', 'BELLARY',
       'BENGALURU URBAN', 'BIDAR', 'BIJAPUR', 'CHAMARAJANAGAR',
       'CHIKBALLAPUR', 'CHIKMAGALUR', 'CHITRADURGA', 'DAKSHIN KANNAD',
       'DAVANGERE', 'DHARWAD', 'GADAG', 'GULBARGA', 'HASSAN', 'HAVERI',
       'KODAGU', 'KOLAR', 'KOPPAL', 'MANDYA', 'MYSORE', 'RAICHUR',
       'RAMANAGARA', 'SHIMOGA', 'TUMKUR', 'UDUPI', 'UTTAR KANNAD',
       'YADGIR'], dtype=object)
In [ ]:
df_total = df.loc[df['District_Name'].str.contains('Total', case = False)]
In [ ]:
df_total.count()
Out[ ]:
State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

to create a subset with crop type to be only Rice Crop

In [ ]:
df_total = df.loc[df['State_Name'].str.contains('Total', case = False)]
In [ ]:
df_total
Out[ ]:
State_Name District_Name Crop_Year Season Crop Area Production
In [ ]:
df_rice = df.loc[df['Crop'].str.contains('rice', case = False)]
In [ ]:
df_rice
Out[ ]:
State_Name District_Name Crop_Year Season Crop Area Production
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.00 321.00
12 Andaman and Nicobar Islands NICOBARS 2001 Kharif Rice 83.00 300.00
18 Andaman and Nicobar Islands NICOBARS 2002 Kharif Rice 189.20 510.84
27 Andaman and Nicobar Islands NICOBARS 2003 Kharif Rice 52.00 90.17
36 Andaman and Nicobar Islands NICOBARS 2004 Kharif Rice 52.94 72.57
... ... ... ... ... ... ... ...
246049 West Bengal PURULIA 2013 Summer Rice 516.00 1274.00
246052 West Bengal PURULIA 2013 Winter Rice 302274.00 730136.00
246058 West Bengal PURULIA 2014 Autumn Rice 264.00 721.00
246086 West Bengal PURULIA 2014 Summer Rice 306.00 801.00
246089 West Bengal PURULIA 2014 Winter Rice 279151.00 597899.00

15092 rows × 7 columns

to create a subset with only Rice and Wheat as crops

In [ ]:
df_rice_wheat = df.loc[df['Crop'].str.contains("wheat", case = False) |
                       df['Crop'].str.contains('rice', case = False)]
In [ ]:
df_rice_wheat
Out[ ]:
State_Name District_Name Crop_Year Season Crop Area Production
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.00 321.00
12 Andaman and Nicobar Islands NICOBARS 2001 Kharif Rice 83.00 300.00
18 Andaman and Nicobar Islands NICOBARS 2002 Kharif Rice 189.20 510.84
27 Andaman and Nicobar Islands NICOBARS 2003 Kharif Rice 52.00 90.17
36 Andaman and Nicobar Islands NICOBARS 2004 Kharif Rice 52.94 72.57
... ... ... ... ... ... ... ...
246052 West Bengal PURULIA 2013 Winter Rice 302274.00 730136.00
246058 West Bengal PURULIA 2014 Autumn Rice 264.00 721.00
246084 West Bengal PURULIA 2014 Rabi Wheat 1622.00 3663.00
246086 West Bengal PURULIA 2014 Summer Rice 306.00 801.00
246089 West Bengal PURULIA 2014 Winter Rice 279151.00 597899.00

22970 rows × 7 columns

In [26]:
df.groupby(['State_Name', 'Crop', 'Crop_Year']).sum()
Out[26]:
Area Production
State_Name Crop Crop_Year
Andaman and Nicobar Islands Arecanut 2000.0 4354.00 7200.00
2001.0 4354.00 7300.00
2002.0 4363.00 7350.00
2003.0 4379.00 6707.00
2004.0 4425.37 4781.05
... ... ... ... ...
Maharashtra Wheat 2012.0 335600.00 508000.00
2013.0 481800.00 692000.00
2014.0 492900.00 598200.00
other oilseeds 2003.0 5500.00 1400.00
2004.0 6100.00 2100.00

6763 rows × 2 columns

In [27]:
df[df.State_Name == "West Bengal"]['Crop'].unique()
Out[27]:
array([], dtype=object)
In [ ]:
df.groupby(['State_Name', 'Crop_Year']).sum()
Out[ ]:
Area Production
State_Name Crop_Year
Andaman and Nicobar Islands 2000 44518.00 89060914.00
2001 41163.00 89718700.00
2002 45231.40 94387137.67
2003 44799.40 95296454.67
2004 45308.77 87186497.63
... ... ... ...
West Bengal 2010 7246875.00 38308645.00
2011 7755360.00 36777774.00
2012 7850936.00 38918275.00
2013 7999815.00 37901281.00
2014 8058390.00 43584403.00

519 rows × 2 columns

In [28]:
df_ = df.groupby(['State_Name', 'Crop_Year']).sum()
In [29]:
df_.head()
Out[29]:
Area Production
State_Name Crop_Year
Andaman and Nicobar Islands 2000.0 44518.00 89060914.00
2001.0 41163.00 89718700.00
2002.0 45231.40 94387137.67
2003.0 44799.40 95296454.67
2004.0 45308.77 87186497.63
In [30]:
df_.reset_index(inplace = True)

to get the Crop_Year Count in the dataset

In [31]:
df_[['State_Name', 'Crop_Year']].groupby('State_Name').count()
Out[31]:
Crop_Year
State_Name
Andaman and Nicobar Islands 8
Andhra Pradesh 18
Arunachal Pradesh 18
Assam 18
Bihar 18
Chandigarh 13
Chhattisgarh 15
Dadra and Nagar Haveli 17
Goa 12
Gujarat 16
Haryana 16
Himachal Pradesh 12
Jammu and Kashmir 12
Jharkhand 7
Karnataka 18
Kerala 18
Madhya Pradesh 17
Maharashtra 18
In [ ]:
df[['State_Name', 'Crop']].groupby('State_Name').count()
Out[ ]:
Crop
State_Name
Andaman and Nicobar Islands 203
Andhra Pradesh 9561
Arunachal Pradesh 2545
Assam 14622
Bihar 18874
Chandigarh 89
Chhattisgarh 10368
Dadra and Nagar Haveli 263
Goa 207
Gujarat 8365
Haryana 4540
Himachal Pradesh 2456
Jammu and Kashmir 1632
Jharkhand 1266
Karnataka 21079
Kerala 4003
Madhya Pradesh 22605
Maharashtra 12496
Manipur 1266
Meghalaya 2867
Mizoram 954
Nagaland 3904
Odisha 13524
Puducherry 872
Punjab 3143
Rajasthan 12066
Sikkim 714
Tamil Nadu 13266
Telangana 5591
Tripura 1412
Uttar Pradesh 33189
Uttarakhand 4825
West Bengal 9597
In [ ]:
df_ani = df.loc[df['State_Name'].str.contains('Andaman and Nicobar Islands', case = False)]
In [ ]:
df_ani
Out[ ]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andaman and Nicobar Islands NICOBARS 2000 Kharif Arecanut 1254.0 2000.00
1 Andaman and Nicobar Islands NICOBARS 2000 Kharif Other Kharif pulses 2.0 1.00
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.0 321.00
3 Andaman and Nicobar Islands NICOBARS 2000 Whole Year Banana 176.0 641.00
4 Andaman and Nicobar Islands NICOBARS 2000 Whole Year Cashewnut 720.0 165.00
... ... ... ... ... ... ... ...
198 Andaman and Nicobar Islands SOUTH ANDAMANS 2010 Rabi Turmeric 10.0 105.00
199 Andaman and Nicobar Islands SOUTH ANDAMANS 2010 Rabi Urad 34.0 15.05
200 Andaman and Nicobar Islands SOUTH ANDAMANS 2010 Whole Year Banana 360.0 5517.00
201 Andaman and Nicobar Islands SOUTH ANDAMANS 2010 Whole Year Coconut 3540.0 11000000.00
202 Andaman and Nicobar Islands SOUTH ANDAMANS 2010 Whole Year Tapioca 22.5 220.00

203 rows × 7 columns

In [ ]:
sns.lineplot(x='Crop_Year', y='Area', data = df[df.State_Name == 'Andaman and Nicobar Islands'])
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dad06b10>
In [ ]:
sns.lineplot(x='Crop_Year', y='Area', data = df[df.State_Name == 'Odisha'])
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dabede90>
In [ ]:
sns.lineplot(x='Crop_Year', y='Area', data = df[df.State_Name == 'Andhra Pradesh'])
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dabe2750>
In [ ]:
sns.lineplot(x='Crop_Year', y='Production', data = df[df.State_Name == 'Andhra Pradesh'])
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dab96bd0>
In [ ]:
sns.lineplot(x = 'Crop_Year', y = 'Production', data = df_, hue = 'State_Name')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe5dcfd0250>

place legend beside the graph so that they may not overlap

In [ ]:
import matplotlib.pyplot as plt
sns.lineplot(x = 'Crop_Year', y = 'Production', data = df_, hue = 'State_Name')
plt.legend(bbox_to_anchor = [1,1])
Out[ ]:
<matplotlib.legend.Legend at 0x7fe5c23d2e90>
In [ ]:
# from matplotlib.cbook import boxplot_stats
sns.lineplot(x = 'Crop_Year', y = 'Production', data = df_, hue = 'State_Name')
plt.legend(bbox_to_anchor = (1, 1))
Out[ ]:
<matplotlib.legend.Legend at 0x7fe5c0514e50>

source for the following code:

https://stackoverflow.com/questions/47230817/plotly-notebook-mode-with-google-colaboratory

  • simply pass "colab" as the value for the parameter renderer in fig.show(renderer="colab")
In [ ]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Bar(y=[2, 1, 3, 5, 8, 9, 12, 11, 2, 4, 2, 3])],
    layout_title_text="A Figure Displayed with the 'colab' Renderer")
fig.show(renderer="colab")
In [32]:
!pip3 install plotly_express
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Requirement already satisfied: plotly>=4.1.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (5.7.0)
Requirement already satisfied: patsy>=0.5 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.5.2)
Requirement already satisfied: pandas>=0.20.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.3.5)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.21.5)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.10.2)
Requirement already satisfied: scipy>=0.18 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.4.1)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2018.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2.8.2)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.5->plotly_express) (1.15.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly>=4.1.0->plotly_express) (8.0.1)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1
In [33]:
import plotly_express as px
In [34]:
df.head(1)
Out[34]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andaman and Nicobar Islands NICOBARS 2000.0 Kharif Arecanut 1254.0 2000.0
In [35]:
df_ = df.groupby(['State_Name', 'Crop_Year']).sum()
In [36]:
df_.head()
Out[36]:
Area Production
State_Name Crop_Year
Andaman and Nicobar Islands 2000.0 44518.00 89060914.00
2001.0 41163.00 89718700.00
2002.0 45231.40 94387137.67
2003.0 44799.40 95296454.67
2004.0 45308.77 87186497.63
In [37]:
df_.reset_index(inplace = True)
In [38]:
df_.head()
Out[38]:
State_Name Crop_Year Area Production
0 Andaman and Nicobar Islands 2000.0 44518.00 89060914.00
1 Andaman and Nicobar Islands 2001.0 41163.00 89718700.00
2 Andaman and Nicobar Islands 2002.0 45231.40 94387137.67
3 Andaman and Nicobar Islands 2003.0 44799.40 95296454.67
4 Andaman and Nicobar Islands 2004.0 45308.77 87186497.63
In [ ]:
df_[['State_Name', 'Crop_Year']].groupby('State_Name').count()
Out[ ]:
Crop_Year
State_Name
Andaman and Nicobar Islands 203
Andhra Pradesh 9191
Arunachal Pradesh 2536
Assam 13532
Bihar 17762
Chandigarh 88
Chhattisgarh 9838
Dadra and Nagar Haveli 262
Goa 206
Gujarat 6641
Haryana 4197
Himachal Pradesh 2320
Jammu and Kashmir 1596
Jharkhand 1265
Karnataka 20398
Kerala 3914
Madhya Pradesh 20995
Maharashtra 9665
Manipur 1138
Meghalaya 2838
Mizoram 941
Nagaland 3279
Odisha 12785
Puducherry 797
Punjab 2192
Rajasthan 11058
Sikkim 707
Tamil Nadu 12131
Telangana 5405
Tripura 1401
Uttar Pradesh 29966
Uttarakhand 4559
West Bengal 9466

plotly scatter Graph

In [47]:
df_.head(2)
Out[47]:
State_Name Crop_Year Area Production
137 Gujarat 1997.0 9661100.0 27243227.0
253 Maharashtra 1997.0 6461787.0 6492161.0
In [55]:
df_.sort_values('Crop_Year', inplace = True)

plotly graph by Area

In [56]:
# px.scatter(df_, x = "Area", y='Production', animation_frame = 'Crop_Year',
#           animation_group = "State_Name", color = "State_Name")

import plotly.graph_objects as go
fig = go.Figure(px.scatter(df_, x = "Area", y = 'Production', 
                           animation_frame = 'Crop_Year',animation_group = "State_Name", 
                           color = "State_Name"))
# layout_title_text="Area on X and Production on Y, Animation frame as Crop Year grouped by State Name, Colors indicates states")
fig.show(renderer="colab")
In [57]:
import plotly.graph_objects as go
fig = go.Figure(px.bar(df_, x = "Area", y='Production', 
                           animation_frame = 'Crop_Year',animation_group = "State_Name", 
                           color = "State_Name"),
    layout_title_text="Area on X and Production on Y, Animation frame as Crop Year grouped by State Name, Colors indicates states"
)
fig.show(renderer="colab")
In [58]:
import plotly.graph_objects as go
fig = go.Figure(px.bar(df_, x = "Crop_Year", y='Area', color = "Crop_Year"),
    layout_title_text="Crop year on X and Area on Y, Crop Year as Colors indicates states"
)
fig.show(renderer="colab")
In [59]:
import plotly.graph_objects as go
fig = go.Figure(px.funnel(df_, x = "State_Name", y='Area', color = "State_Name"),
    layout_title_text="State on X and Area on Y, Crop Year as Colors indicates states"
)
fig.show(renderer="colab")
In [60]:
import plotly.graph_objects as go
fig = go.Figure(px.funnel(df_, x = "State_Name", y='Area', color = "Crop_Year"),
    layout_title_text="State on X and Area on Y, Crop Year as Colors indicates states"
)
fig.show(renderer="colab")
In [61]:
import plotly.graph_objects as go
fig = go.Figure(px.funnel(df_, x = "State_Name", y='Area'),
    layout_title_text="State on X and Area on Y"
)
fig.show(renderer="colab")

Exercise from Plotly Webinar - Training session

In [62]:
df = px.data.gapminder()
In [63]:
df
Out[63]:
country continent year lifeExp pop gdpPercap iso_alpha iso_num
0 Afghanistan Asia 1952 28.801 8425333 779.445314 AFG 4
1 Afghanistan Asia 1957 30.332 9240934 820.853030 AFG 4
2 Afghanistan Asia 1962 31.997 10267083 853.100710 AFG 4
3 Afghanistan Asia 1967 34.020 11537966 836.197138 AFG 4
4 Afghanistan Asia 1972 36.088 13079460 739.981106 AFG 4
... ... ... ... ... ... ... ... ...
1699 Zimbabwe Africa 1987 62.351 9216418 706.157306 ZWE 716
1700 Zimbabwe Africa 1992 60.377 10704340 693.420786 ZWE 716
1701 Zimbabwe Africa 1997 46.809 11404948 792.449960 ZWE 716
1702 Zimbabwe Africa 2002 39.989 11926563 672.038623 ZWE 716
1703 Zimbabwe Africa 2007 43.487 12311143 469.709298 ZWE 716

1704 rows × 8 columns

In [64]:
df = px.data.gapminder().query("year == 2007")
In [65]:
df
Out[65]:
country continent year lifeExp pop gdpPercap iso_alpha iso_num
11 Afghanistan Asia 2007 43.828 31889923 974.580338 AFG 4
23 Albania Europe 2007 76.423 3600523 5937.029526 ALB 8
35 Algeria Africa 2007 72.301 33333216 6223.367465 DZA 12
47 Angola Africa 2007 42.731 12420476 4797.231267 AGO 24
59 Argentina Americas 2007 75.320 40301927 12779.379640 ARG 32
... ... ... ... ... ... ... ... ...
1655 Vietnam Asia 2007 74.249 85262356 2441.576404 VNM 704
1667 West Bank and Gaza Asia 2007 73.422 4018332 3025.349798 PSE 275
1679 Yemen, Rep. Asia 2007 62.698 22211743 2280.769906 YEM 887
1691 Zambia Africa 2007 42.384 11746035 1271.211593 ZMB 894
1703 Zimbabwe Africa 2007 43.487 12311143 469.709298 ZWE 716

142 rows × 8 columns

In [66]:
df.head()
Out[66]:
country continent year lifeExp pop gdpPercap iso_alpha iso_num
11 Afghanistan Asia 2007 43.828 31889923 974.580338 AFG 4
23 Albania Europe 2007 76.423 3600523 5937.029526 ALB 8
35 Algeria Africa 2007 72.301 33333216 6223.367465 DZA 12
47 Angola Africa 2007 42.731 12420476 4797.231267 AGO 24
59 Argentina Americas 2007 75.320 40301927 12779.379640 ARG 32
In [67]:
configure_plotly_browser_state
import plotly.graph_objects as go
fig = go.Figure(px.strip(df, x = "lifeExp", hover_name = 'country', color = 'continent'))
fig.show(renderer="colab")
In [68]:
df = pd.read_csv('/content/agriculture_ds.csv', na_values = "=")
In [69]:
df.head(1)
Out[69]:
State_Name District_Name Crop_Year Season Crop Area Production
0 Andaman and Nicobar Islands NICOBARS 2000 Kharif Arecanut 1254.0 2000.0
In [70]:
df.dropna(inplace = True)
In [71]:
df_[(df_.State_Name == 'Kerala') & (df_.Crop_Year == 2000)]
Out[71]:
State_Name Crop_Year Area Production
221 Kerala 2000.0 1887696.0 5.540247e+09
In [72]:
df[(df.State_Name == 'Kerala') & (df.Crop_Year == 2000)].sort_values('Production')
Out[72]:
State_Name District_Name Crop_Year Season Crop Area Production
99868 Kerala KOTTAYAM 2000 Kharif Sesamum 6.0 1.0
100437 Kerala MALAPPURAM 2000 Kharif Ragi 4.0 3.0
99567 Kerala KOLLAM 2000 Summer Rice 4.0 4.0
98946 Kerala KANNUR 2000 Kharif Sesamum 10.0 6.0
98604 Kerala IDUKKI 2000 Kharif Ragi 8.0 7.0
... ... ... ... ... ... ... ...
101702 Kerala THRISSUR 2000 Whole Year Coconut 89472.0 540000000.0
98953 Kerala KANNUR 2000 Whole Year Coconut 96975.0 621000000.0
100445 Kerala MALAPPURAM 2000 Whole Year Coconut 110378.0 626000000.0
101425 Kerala THIRUVANANTHAPURAM 2000 Whole Year Coconut 88663.0 635000000.0
100162 Kerala KOZHIKODE 2000 Whole Year Coconut 128739.0 903000000.0

203 rows × 7 columns

  • as the weight of the coconut will be very heavier than when compared to others, this turns out to be outlie.
  • so total production alone should not be considered but should be sliced into crops of similar nature
  • eg: rice, wheat, maize, ragi, etc...
In [73]:
df_ = df[df.Crop.isin(['Rice', 'Wheat', 'Maize', "Ragi"])]
In [74]:
df_
Out[74]:
State_Name District_Name Crop_Year Season Crop Area Production
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.00 321.00
12 Andaman and Nicobar Islands NICOBARS 2001 Kharif Rice 83.00 300.00
18 Andaman and Nicobar Islands NICOBARS 2002 Kharif Rice 189.20 510.84
27 Andaman and Nicobar Islands NICOBARS 2003 Kharif Rice 52.00 90.17
36 Andaman and Nicobar Islands NICOBARS 2004 Kharif Rice 52.94 72.57
... ... ... ... ... ... ... ...
246058 West Bengal PURULIA 2014 Autumn Rice 264.00 721.00
246084 West Bengal PURULIA 2014 Rabi Wheat 1622.00 3663.00
246085 West Bengal PURULIA 2014 Summer Maize 325.00 2039.00
246086 West Bengal PURULIA 2014 Summer Rice 306.00 801.00
246089 West Bengal PURULIA 2014 Winter Rice 279151.00 597899.00

40863 rows × 7 columns

In [75]:
df_ = df[df.Crop.isin(['Rice', 'Wheat', 'Maize', "Ragi"])].groupby(['State_Name', 'Crop_Year']).sum()
In [76]:
df_
Out[76]:
Area Production
State_Name Crop_Year
Andaman and Nicobar Islands 2000 10881.00 32184.00
2001 9801.00 27333.00
2002 10885.00 32111.66
2003 10561.37 30850.87
2004 10734.92 29192.23
... ... ... ...
West Bengal 2010 5361309.00 14630175.00
2011 5855620.00 15851031.00
2012 5898997.00 16315137.00
2013 5999179.00 16936255.00
2014 6023075.00 17527428.00

518 rows × 2 columns

In [77]:
df_.reset_index(inplace = True)
In [78]:
df_.head(1)
Out[78]:
State_Name Crop_Year Area Production
0 Andaman and Nicobar Islands 2000 10881.0 32184.0
In [79]:
df_.sort_values('Crop_Year', inplace = True)
In [80]:
import plotly.graph_objects as go
fig = go.Figure(px.scatter(df_, x='Area', y = 'Production',
           animation_frame = 'Crop_Year',
           animation_group = 'State_Name', color = 'State_Name'))
fig.show(renderer="colab")
In [81]:
df_['Efficiency'] = df_['Production'] / df_['Area']
In [82]:
df_.head(2)
Out[82]:
State_Name Crop_Year Area Production Efficiency
153 Haryana 1997 2996000.0 10134000.0 3.382510
330 Odisha 1997 137600.0 117812.0 0.856192
In [83]:
import plotly.graph_objects as go
fig = go.Figure(px.scatter(df_, x = 'Area', y = 'Efficiency', size = 'Production',
           animation_frame = "Crop_Year", animation_group = "State_Name",
           color = 'State_Name'))
fig.show(renderer="colab")
In [ ]:
df_.head(2)

fixing the points that are going out side the plot with range parameter

In [84]:
import plotly.graph_objects as go
fig = go.Figure(px.scatter(df_, x = 'Area', y = 'Efficiency', size = 'Production',
           animation_frame = "Crop_Year", animation_group = "State_Name", range_y = [0.75, 5], range_x = [-1E6, 20E6],
           color = 'State_Name'))
fig.show(renderer="colab")
In [85]:
df[(df.State_Name == 'Kerala') & (df.Crop_Year == 2000)].sort_values('Production')
Out[85]:
State_Name District_Name Crop_Year Season Crop Area Production
99868 Kerala KOTTAYAM 2000 Kharif Sesamum 6.0 1.0
100437 Kerala MALAPPURAM 2000 Kharif Ragi 4.0 3.0
99567 Kerala KOLLAM 2000 Summer Rice 4.0 4.0
98946 Kerala KANNUR 2000 Kharif Sesamum 10.0 6.0
98604 Kerala IDUKKI 2000 Kharif Ragi 8.0 7.0
... ... ... ... ... ... ... ...
101702 Kerala THRISSUR 2000 Whole Year Coconut 89472.0 540000000.0
98953 Kerala KANNUR 2000 Whole Year Coconut 96975.0 621000000.0
100445 Kerala MALAPPURAM 2000 Whole Year Coconut 110378.0 626000000.0
101425 Kerala THIRUVANANTHAPURAM 2000 Whole Year Coconut 88663.0 635000000.0
100162 Kerala KOZHIKODE 2000 Whole Year Coconut 128739.0 903000000.0

203 rows × 7 columns

In [86]:
df[df.Crop.isin(['Rice', "Wheat", "Ragi", "Maize"])]
Out[86]:
State_Name District_Name Crop_Year Season Crop Area Production
2 Andaman and Nicobar Islands NICOBARS 2000 Kharif Rice 102.00 321.00
12 Andaman and Nicobar Islands NICOBARS 2001 Kharif Rice 83.00 300.00
18 Andaman and Nicobar Islands NICOBARS 2002 Kharif Rice 189.20 510.84
27 Andaman and Nicobar Islands NICOBARS 2003 Kharif Rice 52.00 90.17
36 Andaman and Nicobar Islands NICOBARS 2004 Kharif Rice 52.94 72.57
... ... ... ... ... ... ... ...
246058 West Bengal PURULIA 2014 Autumn Rice 264.00 721.00
246084 West Bengal PURULIA 2014 Rabi Wheat 1622.00 3663.00
246085 West Bengal PURULIA 2014 Summer Maize 325.00 2039.00
246086 West Bengal PURULIA 2014 Summer Rice 306.00 801.00
246089 West Bengal PURULIA 2014 Winter Rice 279151.00 597899.00

40863 rows × 7 columns

In [96]:
df_ = df[df.Crop.isin(['Rice', "Wheat", "Ragi", "Maize"])].groupby(['State_Name', 'Crop_Year']).sum()
In [97]:
df_.head(2)
Out[97]:
Area Production
State_Name Crop_Year
Andaman and Nicobar Islands 2000 10881.0 32184.0
2001 9801.0 27333.0
In [99]:
df_.reset_index(inplace=True)
In [100]:
df_.sort_values('Crop_Year', inplace=True)
In [101]:
df_.head(2)
Out[101]:
State_Name Crop_Year Area Production
153 Haryana 1997 2996000.0 10134000.0
330 Odisha 1997 137600.0 117812.0
In [105]:
fig = go.Figure(px.scatter(df_, x="Area", y='Production', 
                           animation_frame = "Crop_Year", 
                           animation_group='State_Name', color = "State_Name"))
fig.show(renderer="colab")

New Col by name Efficiency is created, and added to the parameter Size

In [107]:
df_['Efficiency'] = df_['Production'] / df_['Area']
In [109]:
fig = go.Figure(px.scatter(df_, x="Area", y='Efficiency', size='Production',
                           animation_frame = "Crop_Year", 
                           animation_group='State_Name', color = "State_Name"))
fig.show(renderer="colab")

address the values that are going ## beyong the graph area

In [110]:
fig = go.Figure(px.scatter(df_, x="Area", y='Efficiency', size='Production',
                           animation_frame = "Crop_Year", 
                           animation_group='State_Name', color = "State_Name",
                           range_y = [0.75, 5], range_x = [-1E6, 20E6]))
fig.show(renderer="colab")

some other exercises like:

  • single state analysis
  • two states analysis for comparative view - on a single crop or couple of more crops
  • by the price values - as the cost of rice and cost of oil seeds is not the same
  • only years for comparision
  • with too much time, experiment with a comparative view of crop types and
  • seaasons and their types.
In [ ]: