Data Visualisations - 2
Week 12 - Class Demo
FDSWeek12
In [ ]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use(['dark_background'])

import urllib.request
import json

import seaborn as sns
sns.set(color_codes=True)

Plotting the composition of data

Static composition

Pie chart

In [ ]:
p = sns.load_dataset('penguins')
In [ ]:
p.head()
Out[ ]:
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE
In [ ]:
p.groupby('species')['species'].count()
Out[ ]:
species
Adelie       152
Chinstrap     68
Gentoo       124
Name: species, dtype: int64
In [ ]:
c = p.groupby('species')['species'].count()
In [ ]:
plt.pie(c);
plt.show()
In [ ]:
plt.pie(c, labels=c.index);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%");
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0]);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0], startangle=180);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%",
        explode=[0, 1, 0], startangle=180, shadow=True);
plt.show()
In [ ]:
plt.pie(np.random.randint(0, 10, 10));
plt.show()

Donut chart

In [ ]:
plt.pie(np.random.randint(0, 10, 10), wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
cmap = plt.get_cmap('Accent')
my_colours = cmap(np.arange(10))
In [ ]:
plt.pie(np.random.randint(0, 10, 10), 
        wedgeprops=dict(width=0.3),
        colors=my_colours);
plt.show()
In [ ]:
plt.pie(c, labels=c.index, autopct="%.2f%%", wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
c_i = p.groupby('island')['island'].count()
In [ ]:
plt.pie(c_i, labels=c_i.index, autopct="%.2f%%", wedgeprops=dict(width=0.3));
plt.show()
In [ ]:
c = pd.crosstab(p.species, p.island)
In [ ]:
c = c.T
In [ ]:
c
Out[ ]:
species Adelie Chinstrap Gentoo
island
Biscoe 44 0 124
Dream 56 68 0
Torgersen 52 0 0
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3));
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3));
plt.pie(c.values.flatten(), radius=0.7, 
        wedgeprops=dict(width=0.3));
In [ ]:
cmap = plt.get_cmap('tab20c')
outer_colors = cmap(np.array([0, 4, 8]))
inner_colors = cmap(np.array([1, 2, 3, 5, 6, 7, 9, 10, 11]))
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        wedgeprops=dict(width=0.3),
        colors=inner_colors);
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors);
In [ ]:
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors,
        labeldistance=0.75);
In [ ]:
cmap = plt.get_cmap('tab20b')
outer_colors = cmap(np.array([0, 4, 8]))
inner_colors = cmap(np.array([1, 2, 3, 5, 6, 7, 9, 10, 11]))
plt.pie(c.sum(axis=1), labels=c.index, 
        radius = 1, wedgeprops=dict(width=0.3), 
        colors=outer_colors);
plt.pie(c.values.flatten(), radius=0.7, 
        labels = ['A', '', 'G', 'A', 'C', '', 'A', '', ''],
        wedgeprops=dict(width=0.3),
        colors=inner_colors,
        labeldistance=0.75, textprops=dict(color='w'));

Stacked bar plot

In [ ]:
url = 'https://api.covid19india.org/states_daily.json'
urllib.request.urlretrieve(url, 'data.json')

with open('data.json') as f:
  data = json.load(f)
data = data['states_daily']
df = pd.json_normalize(data)
In [ ]:
df.head()
Out[ ]:
an ap ar as br ch ct date dd dl dn ga gj hp hr jh jk ka kl la ld mh ml mn mp mz nl or pb py rj sk status tg tn tr tt un up ut wb
0 0 1 0 0 0 0 0 14-Mar-20 0 7 0 0 0 0 14 0 2 6 19 0 0 14 0 0 0 0 0 0 1 0 3 0 Confirmed 1 1 0 81 0 12 0 0
1 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 Recovered 0 0 0 9 0 4 0 0
2 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Deceased 0 0 0 2 0 0 0 0
3 0 0 0 0 0 0 0 15-Mar-20 0 0 0 0 0 0 0 0 0 0 5 0 0 18 0 0 0 0 0 0 0 0 1 0 Confirmed 2 0 0 27 0 1 0 0
4 0 0 0 0 0 0 0 15-Mar-20 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 Recovered 1 0 0 4 0 0 0 0
In [ ]:
df_ = df.tail(3)
df_.drop('date', axis=1, inplace=True)
df_.set_index('status', inplace=True)
df_ = df_.T
df_ = df_.apply(pd.to_numeric)
df_.drop('tt', inplace=True)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py:3997: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased
an 3 12 0
ap 765 376 12
ar 7 1 0
as 1202 416 0
br 349 277 4
In [ ]:
plt.bar(df_.index, df_.Confirmed);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.xticks(rotation=90);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.xticks(rotation=90);
In [ ]:
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered);
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed);
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed);
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered);
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val].sum() + 100
    x = i
    plt.text(x, y, str(y), ha="center");
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val].sum() + 100
    if y > 1000:
        x = i
        plt.text(x, y, str(y), ha="center");

Relative stacked bar plots

In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased
an 3 12 0
ap 765 376 12
ar 7 1 0
as 1202 416 0
br 349 277 4
In [ ]:
df_['Total'] = 0
In [ ]:
df_['Total'] = df_.sum(axis = 1)
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased Total
an 3 12 0 15
ap 765 376 12 1153
ar 7 1 0 8
as 1202 416 0 1618
br 349 277 4 630
In [ ]:
df_['ConfirmedFraction'] = df_['Confirmed'] / df_['Total']
df_['RecoveredFraction'] = df_['Recovered'] / df_['Total']
df_['DeceasedFraction'] = df_['Deceased'] / df_['Total']
In [ ]:
df_.head()
Out[ ]:
status Confirmed Recovered Deceased Total ConfirmedFraction RecoveredFraction DeceasedFraction
an 3 12 0 15 0.200000 0.800000 0.000000
ap 765 376 12 1153 0.663487 0.326106 0.010408
ar 7 1 0 8 0.875000 0.125000 0.000000
as 1202 416 0 1618 0.742892 0.257108 0.000000
br 349 277 4 630 0.553968 0.439683 0.006349
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.ConfirmedFraction, color='Orange');
plt.bar(df_.index, df_.RecoveredFraction, bottom=df_.ConfirmedFraction, color='Green');
plt.bar(df_.index, df_.DeceasedFraction, bottom=df_.ConfirmedFraction + df_.RecoveredFraction, color='Red');
plt.xticks(rotation=90);
In [ ]:
df_ = df_.sort_values('ConfirmedFraction', ascending=False)
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.ConfirmedFraction, color='Orange');
plt.bar(df_.index, df_.RecoveredFraction, bottom=df_.ConfirmedFraction, color='Green');
plt.bar(df_.index, df_.DeceasedFraction, bottom=df_.ConfirmedFraction + df_.RecoveredFraction, color='Red');
plt.xticks(rotation=90);
In [ ]:
df_ = df_.sort_values('Total', ascending=False)

fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.bar(df_.index, df_.Confirmed, color='Orange');
plt.bar(df_.index, df_.Recovered, bottom=df_.Confirmed, color='Green');
plt.bar(df_.index, df_.Deceased, bottom=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

for i, val in enumerate(df_.index):
    y = df_.loc[val, 'Total'] + 100
    if y > 1000:
        x = i
        plt.text(x, y, str(y), ha="center");
In [ ]:
df_ = df_.sort_values('Total', ascending=False)

fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.barh(df_.index, df_.Confirmed, color='Orange');
plt.barh(df_.index, df_.Recovered, left=df_.Confirmed, color='Green');
plt.barh(df_.index, df_.Deceased, left=df_.Confirmed + df_.Recovered, color='Red');
plt.xticks(rotation=90);

Time-varying composition of data

Stacked area plots

In [ ]:
 df.head()
Out[ ]:
an ap ar as br ch ct date dd dl dn ga gj hp hr jh jk ka kl la ld mh ml mn mp mz nl or pb py rj sk status tg tn tr tt un up ut wb
0 0 1 0 0 0 0 0 14-Mar-20 0 7 0 0 0 0 14 0 2 6 19 0 0 14 0 0 0 0 0 0 1 0 3 0 Confirmed 1 1 0 81 0 12 0 0
1 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 Recovered 0 0 0 9 0 4 0 0
2 0 0 0 0 0 0 0 14-Mar-20 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Deceased 0 0 0 2 0 0 0 0
3 0 0 0 0 0 0 0 15-Mar-20 0 0 0 0 0 0 0 0 0 0 5 0 0 18 0 0 0 0 0 0 0 0 1 0 Confirmed 2 0 0 27 0 1 0 0
4 0 0 0 0 0 0 0 15-Mar-20 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 Recovered 1 0 0 4 0 0 0 0
In [ ]:
df_ = df[['mh', 'date', 'status']]
In [ ]:
df_.head()
Out[ ]:
mh date status
0 14 14-Mar-20 Confirmed
1 0 14-Mar-20 Recovered
2 0 14-Mar-20 Deceased
3 18 15-Mar-20 Confirmed
4 0 15-Mar-20 Recovered
In [ ]:
df_['mh'] = pd.to_numeric(df_['mh'])
df_['date'] = pd.to_datetime(df_['date'])
In [ ]:
df_.head()
Out[ ]:
mh date status
0 14 2020-03-14 Confirmed
1 0 2020-03-14 Recovered
2 0 2020-03-14 Deceased
3 18 2020-03-15 Confirmed
4 0 2020-03-15 Recovered

date | confirmed | recoverd | deceased |

2020-03-14 | 14 | 0 | 0

2020-03-14 | 18 | 0 | 0

pivot

In [ ]:
df_ = df_.pivot_table(values="mh", columns="status", index="date")
In [ ]:
df_.head()
Out[ ]:
status Confirmed Deceased Recovered
date
2020-03-14 14 0 0
2020-03-15 18 0 0
2020-03-16 6 0 0
2020-03-17 3 1 0
2020-03-18 3 0 0
In [ ]:
df_.plot.area();
In [ ]:
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              colors=['orange', 'green', 'red']);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed, df_.Recovered, df_.Deceased,
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
plt.legend();

Relative stacked area plot

In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
plt.stackplot(df_.index, df_.Confirmed/df_.sum(axis=1), 
              df_.Recovered/df_.sum(axis=1), 
              df_.Deceased/df_.sum(axis=1),
              labels=['Confirmed', 'Recovered', 'Deceased'],
              colors=['orange', 'green', 'red']);
plt.legend();
In [ ]:
def plot_stacked_area_by_state(state):
    df_ = df[[state, 'date', 'status']]
    df_[state] = pd.to_numeric(df_[state])
    df_['date'] = pd.to_datetime(df_['date'])
    df_ = df_.pivot_table(values=state, columns="status", index="date")
    fig = plt.gcf();
    fig.set_size_inches(15, 6);
    plt.stackplot(df_.index, df_.Confirmed/df_.sum(axis=1), 
                df_.Recovered/df_.sum(axis=1), 
                df_.Deceased/df_.sum(axis=1),
                labels=['Confirmed', 'Recovered', 'Deceased'],
                colors=['orange', 'green', 'red']);
    plt.legend();
In [ ]:
plot_stacked_area_by_state('tn')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [ ]:
plot_stacked_area_by_state('wb')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [ ]:
plot_stacked_area_by_state('dl')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.

Plotting relationships between data

Scatter plot

In [ ]:
t = sns.load_dataset('tips')
In [ ]:
t.head()
Out[ ]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t);
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9b9dc550>
In [ ]:
t['tip_fraction'] = t['tip']/t['total_bill']
In [ ]:
sns.scatterplot(x='total_bill', y='tip_fraction', data=t);
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='time');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='sex');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='smoker');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='day');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='size');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='size', style='sex');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='time', style='sex', size='size');
In [ ]:
sns.scatterplot(x='total_bill', y='tip', data=t,
                hue='time', style='sex', size='size');
plt.legend(bbox_to_anchor=(1.05, 1));
In [ ]:
sns.regplot(x='total_bill', y='tip', data=t);
In [ ]:
sns.regplot(x='total_bill', y='tip_fraction', data=t);
In [ ]:
sns.regplot(x='total_bill', y='tip_fraction', data=t, marker="+");
In [ ]:
d = sns.load_dataset('diamonds')
In [ ]:
d.head()
Out[ ]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [ ]:
sns.scatterplot('x', 'price', data=d.sample(1000));
In [ ]:
sns.regplot('x', 'price', data=d.sample(1000));
In [ ]:
sns.regplot('x', 'price', data=d.sample(1000), order=2, marker="+");

Bar plots

In [ ]:
sns.barplot(x="day", y ="tip", data=t);
In [ ]:
sns.barplot(x="day", y ="tip_fraction", data=t);
In [ ]:
sns.barplot(x="day", y ="tip", data=t);
In [ ]:
sns.barplot(x="day", y ="tip", data=t, estimator=np.median);
In [ ]:
def my_estimate(v):
    return np.quantile(v, 0.25)
In [ ]:
sns.barplot(x="day", y ="tip", data=t, estimator=my_estimate);
In [ ]:
sns.barplot(x="day", y ="tip", hue="sex", data=t, estimator=np.median);
In [ ]:
sns.barplot(x="day", y ="tip", hue="smoker", data=t, estimator=np.median);
In [ ]:
sns.barplot(x="day", y ="tip", hue="time", data=t, estimator=np.median);
In [ ]:
sns.barplot(x="day", y ="tip_fraction", hue="time", data=t, estimator=np.median);
In [ ]:
d.head()
Out[ ]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [ ]:
sns.scatterplot('x', 'price', data=d.sample(1000));
In [ ]:
sns.barplot('x', 'price', data=d.sample(1000));
In [ ]:
d['x_q'] = pd.cut(d['x'], bins=15);
In [ ]:
d.head()
Out[ ]:
carat cut color clarity depth table price x y z x_q
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 (3.58, 4.296]
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 (3.58, 4.296]
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 (3.58, 4.296]
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 (3.58, 4.296]
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 (4.296, 5.012]
In [ ]:
sns.barplot('x_q', 'price', data=d.sample(1000));
In [ ]:
d['x_q'] = pd.cut(d['x'], bins=15, labels=False);
In [ ]:
d.head()
Out[ ]:
carat cut color clarity depth table price x y z x_q
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 5
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 5
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 5
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 5
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 6
In [ ]:
sns.barplot('x_q', 'price', data=d.sample(1000));

Line plot

In [ ]:
f = sns.load_dataset('fmri')
In [ ]:
f.head()
Out[ ]:
subject timepoint event region signal
0 s13 18 stim parietal -0.017552
1 s5 14 stim parietal -0.080883
2 s12 18 stim parietal -0.081033
3 s11 18 stim parietal -0.046134
4 s10 18 stim parietal -0.037970
In [ ]:
sns.lineplot('timepoint', 'signal', data=f);
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, hue="region");
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, hue="event");
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, hue="event", style="region");
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, marker=True);
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, marker=True, estimator=np.median);
In [ ]:
sns.lineplot('timepoint', 'signal', data=f, units='subject', estimator=None);
In [ ]:
f_ = f[(f.region == "parietal") & (f.event == "cue")]
In [ ]:
f_.head()
Out[ ]:
subject timepoint event region signal
532 s3 4 cue parietal 0.058219
533 s6 5 cue parietal 0.038145
534 s7 5 cue parietal -0.008158
535 s8 5 cue parietal 0.047136
536 s9 5 cue parietal 0.055847
In [ ]:
sns.lineplot('timepoint', 'signal', data=f_, hue='subject', estimator=None);
In [ ]:
x = np.array([-3, -2, -1, 0, 1, 2, 3])
In [ ]:
y = x * x
In [ ]:
sns.lineplot(x, y);
In [ ]:
with open('data.json') as f:
  data = json.load(f)
data = data['states_daily']
df = pd.json_normalize(data)
df['date'] = pd.to_datetime(df['date']) 
df.drop('tt', axis=1, inplace=True)
df.set_index('date', inplace=True)
df = df[df['status'] == 'Confirmed']
df.drop('status', axis=1, inplace=True)
df = df.apply(pd.to_numeric)
df = df.rolling(7).mean()
df.reset_index(inplace=True)
In [ ]:
df.head()
Out[ ]:
date an ap ar as br ch ct dd dl dn ga gj hp hr jh jk ka kl la ld mh ml mn mp mz nl or pb py rj sk tg tn tr un up ut wb
0 2020-03-14 0 1 0 0 0 0 0 0 7 0 0 0 0 14 0 2 6 19 0 0 14 0 0 0 0 0 0 1 0 3 0 1 1 0 0 12 0 0
1 2020-03-15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 18 0 0 0 0 0 0 0 0 1 0 2 0 0 0 1 0 0
2 2020-03-16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 0 0 6 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0
3 2020-03-17 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 0 1
4 2020-03-18 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 1 5 0 8 0 3 0 0 0 0 0 1 1 0 3 0 8 1 0 0 2 1 0

date | state | confirmed

2020-03-14 | an | 0

2020-03-14 | ap | 1

In [ ]:
df_ = pd.melt(df, id_vars="date", 
              value_vars = list(df.columns).remove("date"),
              var_name = "state", value_name = "confirmed")
In [ ]:
df_.head()
Out[ ]:
date state confirmed
0 2020-03-14 an 0
1 2020-03-15 an 0
2 2020-03-16 an 0
3 2020-03-17 an 0
4 2020-03-18 an 0
In [ ]:
sns.lineplot('date', 'confirmed', data=df_);
In [ ]:
sns.lineplot('date', 'confirmed', hue="state", data=df_);
In [ ]:
states = ['mh', 'tn', 'dl', 'wb', 'ka', 'gj']
In [ ]:
df_ = df_[df_.state.isin(states)]
In [ ]:
sns.lineplot('date', 'confirmed', hue="state", data=df_);
In [ ]:
sns.lineplot('date', 'confirmed', hue="state", data=df_,
             palette='Reds');
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
sns.lineplot('date', 'confirmed', hue="state", data=df_,
             palette='Reds');
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 6);
sns.lineplot('date', 'confirmed', hue="state", data=df_,
             palette='Reds', hue_order = ['wb', 'gj', 'ka', 'dl', 'tn', 'mh']);

Heatmap

X [10 x 10]

X(i, j) -> value

In [ ]:
x = np.random.rand(10, 10)
In [ ]:
x
Out[ ]:
array([[0.44147103, 0.86844671, 0.97218962, 0.6935189 , 0.8069725 ,
        0.5578699 , 0.09532397, 0.78801904, 0.32624519, 0.03927953],
       [0.07426231, 0.14167535, 0.4004343 , 0.89695182, 0.24108825,
        0.2987024 , 0.452844  , 0.79566103, 0.44566233, 0.93156571],
       [0.29919083, 0.14043667, 0.82908301, 0.18849677, 0.92083531,
        0.72199571, 0.23918676, 0.11534281, 0.71111679, 0.59196739],
       [0.35287507, 0.93144292, 0.09984819, 0.06610241, 0.08692311,
        0.97267655, 0.76874109, 0.24189725, 0.24577086, 0.88816116],
       [0.57603578, 0.17206053, 0.03900838, 0.235467  , 0.72562216,
        0.71056661, 0.77237925, 0.92932866, 0.29885186, 0.07685464],
       [0.15220609, 0.0629219 , 0.61211092, 0.24377965, 0.3031736 ,
        0.2807313 , 0.56958555, 0.41459585, 0.51102999, 0.78950917],
       [0.31998926, 0.72196797, 0.25143335, 0.05519168, 0.58413424,
        0.89281645, 0.96354152, 0.67733497, 0.41681635, 0.62967471],
       [0.80862028, 0.26639888, 0.09040902, 0.9266112 , 0.87010557,
        0.52039608, 0.66714736, 0.70316625, 0.7854821 , 0.16920598],
       [0.94350783, 0.36418735, 0.77478057, 0.6535443 , 0.16562476,
        0.50576092, 0.65717018, 0.5047263 , 0.68163753, 0.72439885],
       [0.23512086, 0.86044499, 0.51851838, 0.6618975 , 0.19513392,
        0.91105141, 0.79445037, 0.34064833, 0.97714602, 0.66702951]])
In [ ]:
sns.heatmap(x)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9b9edef0>
In [ ]:
fl = sns.load_dataset('flights')
In [ ]:
fl.head()
Out[ ]:
year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
In [ ]:
fl.sample(10)
Out[ ]:
year month passengers
12 1950 January 115
64 1954 May 234
102 1957 July 465
22 1950 November 114
129 1959 October 407
32 1951 September 184
124 1959 May 420
77 1955 June 315
62 1954 March 235
95 1956 December 306
year    month   passengers

0 1949 January 112

1 1949 February 118

2 1949 March 132

year January February March

1949 112 118 132

1950

1951

In [ ]:
fl_ = fl.pivot(index='year', columns='month', values='passengers');
In [ ]:
fl_.head()
Out[ ]:
month January February March April May June July August September October November December
year
1949 112 118 132 129 121 135 148 148 136 119 104 118
1950 115 126 141 135 125 149 170 170 158 133 114 140
1951 145 150 178 163 172 178 199 199 184 162 146 166
1952 171 180 193 181 183 218 230 242 209 191 172 194
1953 196 196 236 235 229 243 264 272 237 211 180 201
In [ ]:
sns.heatmap(fl_.T)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9baf99e8>
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 10)
sns.heatmap(fl_.T, annot=True, fmt="d");
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 10)
sns.heatmap(fl_.T, annot=True, fmt="d", cmap="YlGnBu");
In [ ]:
fig = plt.gcf();
fig.set_size_inches(15, 10)
sns.heatmap(fl_.T, annot=True, fmt="d", 
            cmap=sns.diverging_palette(50, 200, n=45));
In [282]:
fig = plt.gcf();
fig.set_size_inches(15, 10)
sns.heatmap(fl_.T, annot=True, fmt="d", 
            cmap=sns.diverging_palette(250, 10, n=45),
            center=fl_.loc[1954, 'January']);

Task on open-ended visualisation

In [285]:
df = pd.read_excel('ameo_2015.xlsx')
In [286]:
df.head()
Out[286]:
ID Salary DOJ DOL Designation JobCity Gender DOB 10percentage 10board 12graduation 12percentage 12board CollegeID CollegeTier Degree Specialization collegeGPA CollegeCityID CollegeCityTier CollegeState GraduationYear English Logical Quant Domain ComputerProgramming ElectronicsAndSemicon ComputerScience MechanicalEngg ElectricalEngg TelecomEngg CivilEngg conscientiousness agreeableness extraversion nueroticism openess_to_experience
0 203097 420000 2012-06-01 present senior quality engineer Bangalore f 1990-02-19 84.3 board ofsecondary education,ap 2007 95.8 board of intermediate education,ap 1141 2 B.Tech/B.E. computer engineering 78.00 1141 0 Andhra Pradesh 2011 515 585 525 0.635979 445 -1 -1 -1 -1 -1 -1 0.9737 0.8128 0.5269 1.35490 -0.4455
1 579905 500000 2013-09-01 present assistant manager Indore m 1989-10-04 85.4 cbse 2007 85.0 cbse 5807 2 B.Tech/B.E. electronics and communication engineering 70.06 5807 0 Madhya Pradesh 2012 695 610 780 0.960603 -1 466 -1 -1 -1 -1 -1 -0.7335 0.3789 1.2396 -0.10760 0.8637
2 810601 325000 2014-06-01 present systems engineer Chennai f 1992-08-03 85.0 cbse 2010 68.2 cbse 64 2 B.Tech/B.E. information technology 70.00 64 0 Uttar Pradesh 2014 615 545 370 0.450877 395 -1 -1 -1 -1 -1 -1 0.2718 1.7109 0.1637 -0.86820 0.6721
3 267447 1100000 2011-07-01 present senior software engineer Gurgaon m 1989-12-05 85.6 cbse 2007 83.6 cbse 6920 1 B.Tech/B.E. computer engineering 74.64 6920 1 Delhi 2011 635 585 625 0.974396 615 -1 -1 -1 -1 -1 -1 0.0464 0.3448 -0.3440 -0.40780 -0.9194
4 343523 200000 2014-03-01 2015-03-01 00:00:00 get Manesar m 1991-02-27 78.0 cbse 2008 76.8 cbse 11368 2 B.Tech/B.E. electronics and communication engineering 73.90 11368 0 Uttar Pradesh 2012 545 625 465 0.124502 -1 233 -1 -1 -1 -1 -1 -0.8810 -0.2793 -1.0697 0.09163 -0.1295
In [ ]: