Data Visualisations - 2
Week 12 - Class Demo
FDSWeek12
In [ ]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use(['dark_background'])

import urllib.request
import json

import seaborn as sns
sns.set(color_codes=True)

Plotting the composition of data

Static composition

Pie chart

In [ ]:
p = sns.load_dataset('penguins')
In [ ]:
p.head()
Out[ ]:
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE
In [ ]:
p.groupby('species')['species'].count()
Out[ ]:
species
Adelie       152
Chinstrap     68
Gentoo       124
Name: species, dtype: int64
In [ ]:
c = p.groupby('species')['species'].count()
In [ ]:
plt.pie(c);
plt.show()
In [ ]:
plt.pie(c, labels=c.index);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%");
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0]);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0], startangle=180);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0], startangle=180, shadow=True);
plt.show()
In [ ]:
plt.pie(np.random.randint(0, 10, 10));
plt.show()

Donut chart

In [ ]:
plt.pie(np.random.randint(0, 10, 10), wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
cmap = plt.get_cmap('Accent')
my_colours = cmap(np.arange(10))
In [ ]:
plt.pie(np.random.randint(0, 10, 10), 
        wedgeprops=dict(width=0.3),
        colors=my_colours);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%", wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
c_i = p.groupby('island')['island'].count()
In [ ]:
plt.pie(c_i, labels=c_i.index, autopct="%.2f%%", wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
c = pd.crosstab(p.species, p.island)
In [ ]:
c = c.T
In [ ]:
c
Out[ ]:
species Adelie Chinstrap Gentoo
island
Biscoe 44 0 124
Dream 56 68 0
Torgersen 52 0 0
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3));
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3));
plt.pie(c.values.flatten(), radius=0.7, 
        wedgeprops=dict(width=0.3));
In [ ]:
cmap = plt.get_cmap('tab20c')
outer_colors = cmap(np.array([0, 4, 8]))
inner_colors = cmap(np.array([1, 2, 3, 5, 6, 7, 9, 10, 11]))
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        wedgeprops=dict(width=0.3),
        colors=inner_colors);
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors);
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors,
        labeldistance=0.75);
In [ ]:
cmap = plt.get_cmap('tab20b')
outer_colors = cmap(np.array([0, 4, 8]))
inner_colors = cmap(np.array([1, 2, 3, 5, 6, 7, 9, 10, 11]))
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors,
        labeldistance=0.75, textprops=dict(color='w'));

Stacked bar plot

In [ ]:
url = 'https://api.covid19india.org/states_daily.json'
urllib.request.urlretrieve(url, 'data.json')

with open('data.json') as f:
  data = json.load(f)
data = data['states_daily']
df = pd.json_normalize(data)
In [ ]:
df.head()
Out[ ]:
an ap ar as br ch ct date dd dl dn ga gj hp hr jh jk ka kl la ld mh ml mn mp mz nl or pb py rj sk status tg tn tr tt un up ut wb
0 0 1 0 0 0 0 0 14-Mar-20 0 7 0 0 0 0 14 0 2 6 19 0 0 14 0 0 0 0 0 0 1 0 3 0 Confirmed 1 1 0 81 0 12 0 0
1 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 Recovered 0 0 0 9 0 4 0 0
2 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Deceased 0 0 0 2 0 0 0 0
3 0 0 0 0 0 0 0 15-Mar-20 0 0 0 0 0 0 0 0 0 0 5 0 0 18 0 0 0 0 0 0 0 0 1 0 Confirmed 2 0 0 27 0 1 0 0
4 0 0 0 0 0 0 0 15-Mar-20 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 Recovered 1 0 0 4 0 0 0 0
In [ ]:
df_ = df.tail(3)
df_.drop('date', axis=1, inplace=True)
df_.set_index('status', inplace=True)
df_ = df_.T
df_ = df_.apply(pd.to_numeric)
df_.drop('tt', inplace=True)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py:3997: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased
an 3 12 0
ap 765 376 12
ar 7 1 0
as 1202 416 0
br 349 277 4
In [ ]:
plt.bar(df_.index, df_.Confirmed);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.xticks(rotation=90);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.xticks(rotation=90);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered);
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered);
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val].sum() + 100
    x = i
    plt.text(x, y, str(y), ha="center");
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val].sum() + 100
    if y > 1000:
        x = i
        plt.text(x, y, str(y), ha="center");

Relative stacked bar plots

In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased
an 3 12 0
ap 765 376 12
ar 7 1 0
as 1202 416 0
br 349 277 4
In [ ]:
df_['Total'] = 0
In [ ]:
df_['Total'] = df_.sum(axis = 1)
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased Total
an 3 12 0 15
ap 765 376 12 1153
ar 7 1 0 8
as 1202 416 0 1618
br 349 277 4 630
In [ ]:
df_['ConfirmedFraction'] = df_['Confirmed'] / df_['Total']
df_['RecoveredFraction'] = df_['Recovered'] / df_['Total']
df_['DeceasedFraction'] = df_['Deceased'] / df_['Total']
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased Total ConfirmedFraction RecoveredFraction DeceasedFraction
an 3 12 0 15 0.200000 0.800000 0.000000
ap 765 376 12 1153 0.663487 0.326106 0.010408
ar 7 1 0 8 0.875000 0.125000 0.000000
as 1202 416 0 1618 0.742892 0.257108 0.000000
br 349 277 4 630 0.553968 0.439683 0.006349
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.ConfirmedFraction, color='Orange');
plt.bar(df_.index, df_.RecoveredFraction, bottom=df_.ConfirmedFraction, color='Green');
plt.bar(df_.index, df_.DeceasedFraction, bottom=df_.ConfirmedFraction + df_.RecoveredFraction, color='Red');
plt.xticks(rotation=90);
In [ ]:
df_ = df_.sort_values('ConfirmedFraction', ascending=False)
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.ConfirmedFraction, color='Orange');
plt.bar(df_.index, df_.RecoveredFraction, bottom=df_.ConfirmedFraction, color='Green');
plt.bar(df_.index, df_.DeceasedFraction, bottom=df_.ConfirmedFraction + df_.RecoveredFraction, color='Red');
plt.xticks(rotation=90);
In [ ]:
df_ = df_.sort_values('Total', ascending=False)

fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val, 'Total'] + 100
    if y > 1000:
        x = i
        plt.text(x, y, str(y), ha="center");
In [ ]:
df_ = df_.sort_values('Total', ascending=False)

fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.barh(df_.index, df_.Confirmed, color='Orange');
plt.barh(df_.index, df_.Recovered, left=df_.Confirmed, color='Green');
plt.barh(df_.index, df_.Deceased, left=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

Time-varying composition of data

Stacked area plots

In [ ]:
 df.head()
Out[ ]:
an ap ar as br ch ct date dd dl dn ga gj hp hr jh jk ka kl la ld mh ml mn mp mz nl or pb py rj sk status tg tn tr tt un up ut wb
0 0 1 0 0 0 0 0 14-Mar-20 0 7 0 0 0 0 14 0 2 6 19 0 0 14 0 0 0 0 0 0 1 0 3 0 Confirmed 1 1 0 81 0 12 0 0
1 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 Recovered 0 0 0 9 0 4 0 0
2 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Deceased 0 0 0 2 0 0 0 0
3 0 0 0 0 0 0 0 15-Mar-20 0 0 0 0 0 0 0 0 0 0 5 0 0 18 0 0 0 0 0 0 0 0 1 0 Confirmed 2 0 0 27 0 1 0 0
4 0 0 0 0 0 0 0 15-Mar-20 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 Recovered 1 0 0 4 0 0 0 0
In [ ]:
df_ = df[['mh', 'date', 'status']]
In [ ]:
df_.head()
Out[ ]:
mh date status
0 14 14-Mar-20 Confirmed
1 0 14-Mar-20 Recovered
2 0 14-Mar-20 Deceased
3 18 15-Mar-20 Confirmed
4 0 15-Mar-20 Recovered
In [ ]:
df_['mh'] = pd.to_numeric(df_['mh'])
df_['date'] = pd.to_datetime(df_['date'])
In [ ]:
df_.head()
Out[ ]:
mh date status
0 14 2020-03-14 Confirmed
1 0 2020-03-14 Recovered
2 0 2020-03-14 Deceased
3 18 2020-03-15 Confirmed
4 0 2020-03-15 Recovered

date | confirmed | recoverd | deceased |

2020-03-14 | 14 | 0 | 0

2020-03-14 | 18 | 0 | 0

pivot

In [ ]:
df_ = df_.pivot_table(values="mh", columns="status", index="date")
In [ ]:
df_.head()
Out[ ]:
status Confirmed Deceased Recovered
date
2020-03-14 14 0 0
2020-03-15 18 0 0
2020-03-16 6 0 0
2020-03-17 3 1 0
2020-03-18 3 0 0
In [ ]:
df_.plot.area();
In [ ]:
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              colors=['orange', 'green', 'red']);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
plt.legend();

Relative stacked area plot

In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed/df_.sum(axis=1), 
              df_.Recovered/df_.sum(axis=1), 
              df_.Deceased/df_.sum(axis=1),
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
plt.legend();
In [ ]:
def plot_stacked_area_by_state(state):
    df_ = df[[state, 'date', 'status']]
    df_[state] = pd.to_numeric(df_[state])
    df_['date'] = pd.to_datetime(df_['date'])
    df_ = df_.pivot_table(values=state, columns="status", index="date")
    fig = plt.gcf();
    fig.set_size_inches(15, 6);
    plt.stackplot(df_.index, df_.Confirmed/df_.sum(axis=1), 
                df_.Recovered/df_.sum(axis=1), 
                df_.Deceased/df_.sum(axis=1),
                labels=['Confirmed', 'Recovered', 'Deceased'],
                colors=['orange', 'green', 'red']);
    plt.legend();
In [ ]:
plot_stacked_area_by_state('tn')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [ ]:
plot_stacked_area_by_state('wb')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.