In the Asynchronous Lecture
matplotlib and seaborn
plotnine
In the Synchronous Lecture
If you have any questions while watching the pre-recorded material, be sure to write them down and to bring them up during the synchronous portion of the lecture.
The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.
Total time: Approx. 1 hour and 22 minutes
Download the gapminder.csv dataset used in the asynchronous videos.
matplotlib + seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %% -----------------------------------------
# Data
# Load the gapminder data, then add log-transformed GDP per capita and
# population as new columns.
dat = pd.read_csv("gapminder.csv")
dat = dat.eval('lngdpPercap = log(gdpPercap)')
dat = dat.eval('lnpop = log(pop)')
dat.head()
# %% -----------------------------------------
# Matplotlib
# Pass the columns directly to pyplot; axis labels are set on the current figure.
plt.figure(figsize=(12, 7))
plt.scatter(x=dat["lngdpPercap"], y=dat["lifeExp"], s=100, c="green", alpha=0.5)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
# %% -----------------------------------------
# The same plot through the pandas plotting interface (column names instead of arrays)
dat.plot(kind="scatter", x="lngdpPercap", y="lifeExp",
         figsize=(12, 7), s=100, c="green", alpha=0.5)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
# %% -----------------------------------------
# Seaborn
# Seaborn draws onto the current matplotlib figure, so the figure is sized first.
plt.figure(figsize=(12, 7))
sns.scatterplot(data=dat, x="lngdpPercap", y="lifeExp",
                s=100, color="green", alpha=0.5)
plt.xlabel("Log GDP perCapita")
plt.ylabel("Life Expectancy")
plt.show()
plotnine/ggplot2
import pandas as pd
from plotnine import *
import warnings
warnings.filterwarnings('ignore') # Ignore warnings
# If you run into issues with plotnine, these installs may resolve them:
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc
# Read in data and add log-transformed GDP per capita and population
dat = (pd.read_csv("gapminder.csv")
       .eval('lngdpPercap = log(gdpPercap)')
       .eval('lnpop = log(pop)'))
dat.head()
# %% -----------------------------------------
# Building a plotnine plot
# Layers are added with `+`: points coloured by continent, a theme, labels,
# a manual colour scale (five values, one per continent level -- assumes the
# data has five continents), and one facet per continent stacked in a single
# column, with scales="free" on the facets.
(ggplot(dat, aes(x="lngdpPercap", y="lifeExp", color="continent")) +
 geom_point(size=3, alpha=.5, show_legend=False) +
 theme_bw() +
 labs(x="Log GDP Per Capita", y="Life Expectancy", color="", title="Life Expectancy on GDP") +
 scale_color_manual(values=["blue", "steelblue", "black", "gold", "pink"]) +
 facet_wrap("continent", scales="free", ncol=1) +
 theme(figure_size=(5, 12),
       legend_position="top"))

import pandas as pd
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # Ignore warnings
# If you run into issues with plotnine, these installs may resolve them:
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc
# Read in data and add log-transformed GDP per capita and population
dat = (pd.read_csv("gapminder.csv")
       .eval('lngdpPercap = log(gdpPercap)')
       .eval('lnpop = log(pop)'))
dat.head()
############ Continuous Univariate ############
# %% -----------------------------------------
# Histogram
# plotnine/ggplot2
(ggplot(dat, aes(x='lifeExp')) +
 geom_histogram())
# Seaborn
# `distplot` is deprecated; `histplot` with kde=True is its replacement.
# stat="density" puts the bars on the same scale as the KDE curve, and
# cut=3 lets the curve extend past the data range as distplot's did.
sns.histplot(dat.lifeExp, kde=True, stat="density", kde_kws=dict(cut=3))
# %% -----------------------------------------
# Density Plot
# plotnine/ggplot2
(ggplot(dat, aes(x='lifeExp')) +
 geom_density(fill="blue", color="black", alpha=.5) +
 xlim(0, 100))
# Seaborn
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(dat.lifeExp, fill=True)
# %% -----------------------------------------
############ Continuous Bivariate ############
# %% -----------------------------------------
# Scatter Plot
# plotnine/ggplot2
(ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
 geom_point(alpha=.5))
# Seaborn
sns.scatterplot(x='lngdpPercap', y='lifeExp', data=dat)
# %% -----------------------------------------
# Line Plot
# plotnine/ggplot2
# First on the full data set (every country pooled together) ...
(ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
 geom_line())
# ... then on a single country over time.
nig = dat.query("country == 'Nigeria'")
(ggplot(nig, aes(x='year', y='lifeExp')) +
 geom_line())
# Seaborn
sns.lineplot(x='lngdpPercap', y='lifeExp', data=dat)
# %% -----------------------------------------
# Binned Plot (density/histogram)
# plotnine/ggplot2
(ggplot(dat, aes(x='lngdpPercap', y='lifeExp')) +
 geom_bin2d())
# Seaborn
sns.jointplot(data=dat, x="lngdpPercap", y="lifeExp", kind="hex")

import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # Ignore warnings
# If you run into issues with plotnine, these installs may resolve them:
#!pip install scipy==1.2 --upgrade
#!pip install scikit-misc
# Read in data, adding the log-transformed columns one step at a time
dat = pd.read_csv("gapminder.csv")
dat = dat.eval('lngdpPercap = log(gdpPercap)')
dat = dat.eval('lnpop = log(pop)')
# Extra categorical variable: "yes" where log GDP per capita exceeds 9, else "no"
dat = dat.assign(wealthy=np.where(dat["lngdpPercap"] > 9, "yes", "no"))
dat.head()
############### Univariate Categorical ###############
# %% -----------------------------------------
# Bar Plot: number of rows per continent
(
    ggplot(data=dat, mapping=aes(x='continent'))
    + geom_bar()
)
# Ordering Bar Plot by Frequency
# value_counts() sorts by count (descending); its index gives the axis order.
cont_order = list(dat["continent"].value_counts().index)
(
    ggplot(data=dat, mapping=aes(x='continent'))
    + geom_bar()
    + scale_x_discrete(limits=cont_order)
)
# Adding in more categorical data: fill the bars by the `wealthy` flag
(
    ggplot(data=dat, mapping=aes(x='continent', fill='wealthy'))
    + geom_bar()
    + scale_x_discrete(limits=cont_order)
)
# Dodge: same plot with position="dodge"
(
    ggplot(data=dat, mapping=aes(x='continent', fill='wealthy'))
    + geom_bar(position="dodge")
    + scale_x_discrete(limits=cont_order)
)
# Seaborn
sns.catplot(data=dat, x="continent", hue="wealthy", kind="count")
# Bivariate: category on continuous -----------------------------------------
# %% -----------------------------------------
# Box plot
# plotnine
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp'))
    + geom_boxplot()
)
# Flip the axis
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp'))
    + geom_boxplot()
    + coord_flip()
)
# Seaborn: the orientation is chosen by which variable goes on which axis
sns.boxplot(data=dat, x='continent', y='lifeExp')
sns.boxplot(data=dat, x='lifeExp', y='continent')
# %% -----------------------------------------
# Violin plot
# ggplot
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp'))
    + geom_violin()
)
# Seaborn
sns.violinplot(data=dat, x='continent', y='lifeExp')
# %% -----------------------------------------
# Jitter plot
# Plain points first, for comparison with the jittered version below
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp'))
    + geom_point()
)
# ggplot: jittered points, coloured by continent, legend hidden
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp', color="continent"))
    + geom_jitter(width=.25, alpha=.5, show_legend=False)
)
# Layer the representations: faint jittered points under a box plot
(
    ggplot(data=dat, mapping=aes(x='continent', y='lifeExp', color="continent"))
    + geom_jitter(width=.1, alpha=.1, show_legend=False)
    + geom_boxplot(alpha=.5, show_legend=False)
)
# Seaborn
sns.stripplot(data=dat, x='continent', y='lifeExp')
# %% -----------------------------------------
# Heatmap: category on category on continuous
# ggplot
(
    ggplot(data=dat, mapping=aes(x='continent', y='wealthy', fill='lifeExp'))
    + geom_tile()
)
# Seaborn
# Seaborn's heatmap is given a matrix here: pivot lifeExp into
# wealthy (rows) by continent (columns), using pivot_table's default aggregation.
M = dat.pivot_table(values='lifeExp', index='wealthy', columns='continent')
M
sns.heatmap(M)
# %% -----------------------------------------
# Fitting categories into continuous on continuous
(ggplot(dat, aes(x='lngdpPercap',y = 'lifeExp',color="continent")) +
geom_point())
These exercises are designed to help you reinforce your grasp of the concepts covered in the asynchronous lecture material.
For the following visualizations, please use the palmerpenguins dataset.
# Data can be imported via the palmerpenguins module
# !pip install palmerpenguins
import pandas as pd
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
from palmerpenguins import load_penguins
dat = load_penguins()
dat.head()
Plot the body mass in grams (body_mass_g) as a boxplot. Make sure the following is true:
body_mass_g is on the x-axis with a label that reads “Body Mass (Grams)”, and species is on the y-axis with no label.
Fill the boxes by species, but make sure there is no legend.
# plotnine solution
# Build with species on x and body mass on y, then flip the coordinates so
# body mass ends up on the horizontal axis.
p = (
    ggplot(data=dat, mapping=aes(x="species", y="body_mass_g", fill='species'))
    + geom_boxplot(alpha=.6, show_legend=False)  # some extra transparency, no legend
    + coord_flip()
    + labs(x="", y="Body Mass (Grams)", title="Penguin Body Mass")
)
p
# %% -----------------------------------------
# seaborn solution
# Labels and title are set through pyplot on the current axes.
sns.boxplot(data=dat, x="body_mass_g", y="species")
plt.title("Penguin Body Mass")
plt.xlabel("Body Mass (Grams)")
plt.ylabel("")
plt.show()
# %% -----------------------------------------
# Alternatively, set the title and labels on the Axes object that seaborn returns
ax = sns.boxplot(data=dat, x="body_mass_g", y="species")
ax.set(title="Penguin Body Mass", ylabel="")
ax.set_xlabel("Body Mass (Grams)")
Using plotnine, generate a scatter plot where flipper length (flipper_length_mm) is on the x-axis and bill length (bill_length_mm) is on the y-axis. The points should be a different color given the species of the penguin. Also, the points should be sized differently given the body mass (body_mass_g) of the penguin.
In addition to the above:
Make the points semi-transparent (alpha=.5).
Set the theme to minimal.
(
ggplot(dat,aes(x="flipper_length_mm",
y="bill_length_mm",
color="species",
size="body_mass_g")) +
geom_point(alpha=.5) +
labs(x = "Flipper Length (MM)",y="Bill Length (MM)",
color="Penguin Species",size = "Body Mass (Grams)") +
theme_minimal()
)
Using plotnine, generate a bar plot that contains the penguin species on the x-axis and a count of the number of observations for each category on the y-axis. You’ll need to drop all missing observations first.
dat.dropna()
From there, do the following:
Fill the bars by sex and place them side by side (position = "dodge").
Use the black-and-white theme (theme_bw).
(
ggplot(dat.dropna(),aes(x="species",fill="sex")) +
geom_bar(position="dodge") +
scale_x_discrete(limits = ["Adelie","Gentoo","Chinstrap"]) +
scale_fill_manual(values=["darkorange","black"]) +
labs(x="Penguin Species",y="Number of Observations", fill="",
title = "Distribution of Sex by Species") +
theme_bw()
)
The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com