In the Asynchronous Lecture
matplotlib and seaborn
plotnine
In the Synchronous Lecture
If you have any questions while watching the pre-recorded material, be sure to write them down and to bring them up during the synchronous portion of the lecture.
The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.
Total time: Approx. 1 hour and 22 minutes
Download the gapminder.csv dataset used in the asynchronous videos.
matplotlib + seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %% -----------------------------------------
# Data
# Read data in and transform initial variables: the gapminder CSV is read from
# the working directory and two log-transformed columns (GDP per capita and
# population) are added with DataFrame.eval.
dat = (
    pd.read_csv("gapminder.csv")
    .eval('lngdpPercap = log(gdpPercap)')
    .eval('lnpop = log(pop)')
)
dat.head()
# %% -----------------------------------------
# Matplotlib
# Scatter plot of life expectancy against log GDP per capita, drawn with the
# pyplot interface on a 12x7 figure.
plt.figure(figsize=(12, 7))
plt.scatter(x=dat.lngdpPercap, y=dat.lifeExp, c="green", alpha=.5, s=100)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
# %% -----------------------------------------
# Matplotlib library is built into pandas
# Same scatter plot as above, produced through the DataFrame's own plotting
# accessor; the axis labels are still set with pyplot.
dat.plot.scatter(x="lngdpPercap", y="lifeExp", figsize=(12, 7),
                 alpha=0.5, c="green", s=100)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
# %% -----------------------------------------
# Seaborn
# Same scatter plot drawn with seaborn, which takes column names plus the
# data frame rather than the column values themselves.
plt.figure(figsize=(12, 7))
sns.scatterplot(x="lngdpPercap", y="lifeExp",
                alpha=.5, color="green", s=100,
                data=dat)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
plotnine/ggplot2
import pandas as pd
from plotnine import *
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings

# If you run into issues with plotnine, here are some resolutions
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc

# Read in data and add the log-transformed GDP per capita and population
# columns used by the plots below.
dat = (
    pd.read_csv("gapminder.csv")
    .eval('lngdpPercap = log(gdpPercap)')
    .eval('lnpop = log(pop)')
)
dat.head()
# %% -----------------------------------------
# Building a plotnine plot
# Layers are added with `+`: points colored by continent, a black-and-white
# theme, axis/title labels, a manual color scale, one facet per continent
# stacked in a single column, and figure-level theme settings.
(
    ggplot(dat, aes(x="lngdpPercap", y="lifeExp", color="continent")) +
    geom_point(size=3, alpha=.5, show_legend=False) +
    theme_bw() +
    labs(x="Log GDP Per Capita", y="Life Expectancy", color="",
         title="Life Expectancy on GDP") +
    scale_color_manual(values=["blue", "steelblue", "black", "gold", "pink"]) +
    facet_wrap("continent", scales="free", ncol=1) +
    theme(figure_size=(5, 12),
          legend_position="top")
)
import pandas as pd
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings

# If you run into issues with plotnine, here are some resolutions
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc

# Read in data and add the log-transformed GDP per capita and population
# columns used by the plots below.
dat = (
    pd.read_csv("gapminder.csv")
    .eval('lngdpPercap = log(gdpPercap)')
    .eval('lnpop = log(pop)')
)
dat.head()
############ Continuous Univariate ############

# %% -----------------------------------------
# Histogram

# plotnine/ggplot2
(
    ggplot(dat, aes(x='lifeExp')) +
    geom_histogram()
)

# Seaborn
# NOTE: `distplot` is deprecated as of seaborn 0.11; `sns.histplot(..., kde=True)`
# is the suggested replacement on newer versions.
sns.distplot(dat.lifeExp, hist=True, kde=True)

# %% -----------------------------------------
# Density Plot

# plotnine/ggplot2
(
    ggplot(dat, aes(x='lifeExp')) +
    geom_density(fill="blue", color="black", alpha=.5) +
    xlim(0, 100)
)

# Seaborn
# NOTE: the `shade` argument is deprecated as of seaborn 0.11 in favor of `fill`.
sns.kdeplot(dat.lifeExp, shade=True)
# %% -----------------------------------------
############ Continuous Bivariate ############

# %% -----------------------------------------
# Scatter Plot

# plotnine/ggplot2
(
    ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
    geom_point(alpha=.5)
)

# Seaborn
sns.scatterplot(x='lngdpPercap', y='lifeExp', data=dat)

# %% -----------------------------------------
# Line Plot

# plotnine/ggplot2
(
    ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
    geom_line()
)

# Restrict to a single country so the line traces one series over time.
nig = dat.query("country == 'Nigeria'")
(
    ggplot(nig, aes(x='year', y='lifeExp')) +
    geom_line()
)

# Seaborn
sns.lineplot(x='lngdpPercap', y='lifeExp', data=dat)

# %% -----------------------------------------
# Binned Plot (density/histogram)

# plotnine/ggplot2
(
    ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
    geom_bin2d()
)

# Seaborn
sns.jointplot(data=dat, x="lngdpPercap", y="lifeExp", kind="hex")
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings

# If you run into issues with plotnine, here are some resolutions
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc

# Read in data and add the log-transformed GDP per capita and population
# columns used by the plots below.
dat = (
    pd.read_csv("gapminder.csv")
    .eval('lngdpPercap = log(gdpPercap)')
    .eval('lnpop = log(pop)')
)

# Create an extra categorical variable: "yes" where log GDP per capita
# exceeds 9, "no" otherwise.
dat['wealthy'] = np.where(dat.lngdpPercap > 9, "yes", "no")

dat.head()
############### Univariate Categorical ###############

# %% -----------------------------------------
# Bar Plot
(
    ggplot(dat, aes(x='continent')) +
    geom_bar()
)

# Ordering Bar Plot by Frequency
# value_counts() sorts by count (descending), so its index gives the
# continents in frequency order for the x-axis limits.
cont_order = dat.continent.value_counts().index.tolist()
(
    ggplot(dat, aes(x='continent')) +
    geom_bar() +
    scale_x_discrete(limits=cont_order)
)

# Adding in more categorical data
(
    ggplot(dat, aes(x='continent', fill='wealthy')) +
    geom_bar() +
    scale_x_discrete(limits=cont_order)
)

# Dodge
(
    ggplot(dat, aes(x='continent', fill='wealthy')) +
    geom_bar(position="dodge") +
    scale_x_discrete(limits=cont_order)
)

# Seaborn
sns.catplot(x="continent", hue="wealthy", data=dat, kind="count")
# Bivariate: category on continuous -----------------------------------------

# %% -----------------------------------------
# Box plot

# plotnine
(
    ggplot(dat, aes(x='continent', y='lifeExp')) +
    geom_boxplot()
)

# Flip the axis
(
    ggplot(dat, aes(x='continent', y='lifeExp')) +
    geom_boxplot() +
    coord_flip()
)

# Seaborn
sns.boxplot(x='continent', y='lifeExp', data=dat)

# Swapping x and y gives the horizontal version in seaborn.
sns.boxplot(x='lifeExp', y='continent', data=dat)

# %% -----------------------------------------
# Violin plot

# ggplot
(
    ggplot(dat, aes(x='continent', y='lifeExp')) +
    geom_violin()
)

# Seaborn
sns.violinplot(x='continent', y='lifeExp', data=dat)

# %% -----------------------------------------
# Jitter plot

# Plain points for comparison (no jitter applied).
(ggplot(dat, aes(x='continent', y='lifeExp')) + geom_point())

# ggplot
(
    ggplot(dat, aes(x='continent', y='lifeExp', color="continent")) +
    geom_jitter(width=.25, alpha=.5, show_legend=False)
)

# Layer the representations
(
    ggplot(dat, aes(x='continent', y='lifeExp', color="continent")) +
    geom_jitter(width=.1, alpha=.1, show_legend=False) +
    geom_boxplot(alpha=.5, show_legend=False)
)

# Seaborn
sns.stripplot(x='continent', y='lifeExp', data=dat)

# %% -----------------------------------------
# Heatmap: category on category on continuous

# ggplot
(
    ggplot(dat, aes(x='continent', y='wealthy', fill='lifeExp')) +
    geom_tile()
)

# Seaborn
# Pivot to a wealthy-by-continent table of lifeExp values (pivot_table's
# default aggregation) so it can be passed to heatmap.
M = dat.pivot_table('lifeExp', 'wealthy', 'continent')
M

sns.heatmap(M)

# %% -----------------------------------------
# Fitting categories into continuous on continuous
(
    ggplot(dat, aes(x='lngdpPercap', y='lifeExp', color="continent")) +
    geom_point()
)
These exercises are designed to help you reinforce your grasp of the concepts covered in the asynchronous lecture material.
For the following visualizations, please use the `palmerpenguins` dataset.
# Data can be imported via the palmerpenguins module
# !pip install palmerpenguins
import pandas as pd
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
from palmerpenguins import load_penguins
# Load the penguins data into a data frame and preview the first rows.
dat = load_penguins()
dat.head()
Plot the body mass in grams (`body_mass_g`) as a boxplot. Make sure the following is true:
- `body_mass_g` is on the x-axis with a label that reads “Body Mass (Grams)”; and the y-axis is `species` and has no label.
- Fill the boxes by `species`, but make sure there is no legend.
# plotnine solution
# Boxplot of body mass by species. The plot is built with species on x and
# then flipped, so body mass ends up on the horizontal axis; that is why the
# empty label is assigned to `x` and the body-mass label to `y`.
p = (
    ggplot(dat, aes(x="species", y="body_mass_g", fill='species')) +
    geom_boxplot(alpha=.6, show_legend=False) +  # added some additional transparency
    coord_flip() +
    labs(x="", y="Body Mass (Grams)", title="Penguin Body Mass")
)
p
# %% -----------------------------------------
# seaborn solution
# Putting body mass on x and species on y yields the horizontal boxplot
# directly; labels and title are set through pyplot.
sns.boxplot(x="body_mass_g", y="species", data=dat)
plt.xlabel("Body Mass (Grams)")
plt.ylabel("")
plt.title("Penguin Body Mass")
plt.show()
# %% -----------------------------------------
# You can also set titles using the seaborn object directly
# sns.boxplot returns the matplotlib Axes it drew on, so the title and axis
# labels can be set on that object instead of through pyplot.
ax = sns.boxplot(x="body_mass_g", y="species", data=dat)
ax.set_title("Penguin Body Mass")
ax.set_ylabel("")
ax.set_xlabel("Body Mass (Grams)")
Using `plotnine`, generate a scatter plot where flipper length (`flipper_length_mm`) is on the x-axis and bill length (`bill_length_mm`) is on the y-axis. The points should be a different color given the `species` of the penguin. Also, the points should be sized differently given the body mass (`body_mass_g`) of the penguin.
In addition to the above:
- Make the points semi-transparent (`alpha=.5`).
- Use the `minimal` theme.
# Scatter plot of bill length against flipper length, with point color mapped
# to species and point size mapped to body mass.
(
    ggplot(dat, aes(x="flipper_length_mm",
                    y="bill_length_mm",
                    color="species",
                    size="body_mass_g")) +
    geom_point(alpha=.5) +
    labs(x="Flipper Length (MM)", y="Bill Length (MM)",
         color="Penguin Species", size="Body Mass (Grams)") +
    theme_minimal()
)
Using `plotnine`, generate a bar plot that contains the penguin `species` on the x-axis and a count of the number of observations for each category on the y-axis. You’ll need to drop all missing observations first.
dat.dropna()
From there, do the following:
- Place the bars side by side (`position = "dodge"`).
- Use the black-and-white theme (`theme_bw`).
# Bar plot of observation counts per species, split by sex with the bars
# placed side by side. Rows with missing values are dropped first, and the
# x-axis order and fill colors are set explicitly.
(
    ggplot(dat.dropna(), aes(x="species", fill="sex")) +
    geom_bar(position="dodge") +
    scale_x_discrete(limits=["Adelie", "Gentoo", "Chinstrap"]) +
    scale_fill_manual(values=["darkorange", "black"]) +
    labs(x="Penguin Species", y="Number of Observations", fill="",
         title="Distribution of Sex by Species") +
    theme_bw()
)
The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com