import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting
import scipy.stats as stats # for calculating the quantiles for a QQ plot
import requests
# Print all columns from the Pandas DataFrame
pd.set_option('display.max_columns', None)
# Ignore future update warnings (e.g. those raised by Seaborn). Only the
# FutureWarning category is silenced so that any other warning still surfaces
# instead of being hidden by a blanket "ignore everything" filter.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
def download_data(git_loc, dest_name):
    '''
    Download data from Github and save to the notebook's working directory.

    Parameters
    ----------
    git_loc : str
        URL of the file to fetch.
    dest_name : str
        Path of the local file to write; an existing file is overwritten.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within 30 seconds.
    '''
    # A timeout keeps the call from hanging indefinitely on a stalled server.
    req = requests.get(git_loc, timeout=30)
    # Fail loudly rather than saving an HTTP error page as if it were data.
    req.raise_for_status()
    # Write the decoded body in a single call (iterating over `req.text`
    # yields one character at a time) and pin the encoding so non-ASCII text
    # does not depend on the platform's default codec.
    with open(dest_name, "w", encoding="utf-8") as file:
        file.write(req.text)
# Fetch the CSV from GitHub into the working directory.
download_data(
    git_loc='https://raw.githubusercontent.com/edunford/ppol564/master/lectures/lecture_10/country_data.csv',
    dest_name="country_data.csv",
)
# Read in Data
dat = pd.read_csv(filepath_or_buffer="country_data.csv")
# Structural overview of the frame: dimensions, column labels, row index,
# the first few rows, and the dtype pandas inferred for each column.
dat.shape
dat.columns
dat.index
dat.head()
dat.dtypes
Convert the following columns to more appropriate types:
- `year` to integer type
- `continent` to categorical type
- `regime_type` to categorical type
dat.year = dat.year.astype("int")
# Convert these columns to the pandas "category" dtype.
dat["country"] = dat["country"].astype("category")
dat["continent"] = dat["continent"].astype("category")
dat["regime_type"] = dat["regime_type"].astype("category")
# Inspect the column dtypes after the conversion.
dat.dtypes
Categorical variables in pandas are similar to factor variables in R.
# Distinct continent labels present in the data.
dat["continent"].unique()
# Distinct integer codes backing the categorical continent column.
dat["continent"].cat.codes.unique()
Depending on the unit of analysis:
# Smallest and largest values of the year column.
min_year, max_year = dat["year"].min(), dat["year"].max()
print(f"The data ranges from {min_year} to {max_year}.")
# Distinct countries present in the data.
dat["country"].unique()
There are 122 countries in the data.
Are all countries in the data for the same years? A simple way to explore this is to plot the spatial unit (if it is fixed and not too large) against the temporal unit.
# Scale fonts up for the plot below, which is drawn at a large height.
sns.set_context("notebook", font_scale=2)
# Scatter one point per row of `dat`: year on the x-axis, country on the
# y-axis, colored by continent. Rows are sorted by continent first
# (presumably so countries of the same continent appear together on the axis).
# `x` and `y` are passed by keyword because seaborn 0.12+ no longer accepts
# them positionally.
g = sns.relplot(
    x="year",
    y="country",
    hue="continent",
    kind="scatter",
    height=30,
    s=200,
    data=dat.sort_values('continent'),
)