import pandas as pd
from dfply import *
Operations:
These are powerful operations in isolation, but when combined, what results is a streamlined way to convert raw data into a useful data construct. We can do this because pandas was built fully utilizing a fluid programming framework (that is, the class always returns itself after every call).
This is easier to show in practice. Let's consider different ways to perform the same sequence of operations.
Let's do the following: Aggregate the data to the country level for all countries in Asia
dat = pd.read_csv("gapminder.csv") # Read the data in
dat = dat.query("continent == 'Asia'")
dat = dat.filter(['country','lifeExp','gdpPercap','population'])
dat = dat.groupby(['country'])
dat = dat.agg(mean)
dat = dat.reset_index()
dat.round(1).head()
dat = pd.read_csv("gapminder.csv")
dat = dat.query("continent == 'Asia'").filter(['country','lifeExp','gdpPercap','population']).groupby(['country']).agg(mean).reset_index()
dat.round(1).head()
dat = pd.read_csv("gapminder.csv")
# (A) use the back slash
dat = dat.\
query("continent == 'Asia'").\
filter(['country','lifeExp','gdpPercap','population']).\
groupby(['country']).\
agg(mean).\
reset_index()
dat.round(1).head()
dat = pd.read_csv("gapminder.csv")
# (B) house in parentheses
dat = (dat
.query("continent == 'Asia'")
.filter(['country','lifeExp','gdpPercap','population'])
.groupby(['country'])
.agg(mean)
.reset_index())
dat.round(1).head()
dfply
¶We've already seen this in play throughout the lecture notes.
dat = pd.read_csv("gapminder.csv")
dat = (dat >>
mask(X.continent == "Asia") >>
select(X.country,columns_between(X.lifeExp,X.gdpPercap)) >>
group_by(X.country) >>
summarize_each([np.mean],X.lifeExp,X.gdpPercap,X.population))
dat.round(1).head()
Note that similar to numpy
, we can manipulate a "view" of a pandas.DataFrame
.
# Create a fake data frame
D = pd.DataFrame(dict(A=np.arange(5),
B=np.arange(5)*-1))
D
# Create and Manipulate a subset (view)
b = D.iloc[:2,:2]
b.iloc[:2,:2] = 8
b
# Changes carry over to the original object
D
dat = pd.read_csv("gapminder.csv")
# Pandas approach
# Country-Years
a = \
(dat
.filter(["country"])
.groupby(['country'])
.size() # We can count the number of observations using the size method
.reset_index()
.rename(columns={0:"n_country_years"})
)
# Countries by continent
b = \
(dat
.filter(["continent","country"])
.drop_duplicates()
.groupby(['continent'])
.size() # We can count the number of observations using the size method
.reset_index()
.rename(columns={0:"n_countries"})
.head(5)
)
# total countries
c = \
(dat
.filter(["country"])
.drop_duplicates()
.shape[0]
)
# Print
display(a.head(3))
display(b)
print(f'There are {c} total countries in the data.')
# dfply approach
# Country-Years
aa = \
(dat >>
group_by(X.country) >>
summarize(n_country_years = n(X.year)) >>
head(3))
# Countries by continent
bb = \
(dat >>
distinct(X.continent,X.country) >>
group_by(X.continent) >>
summarize(n_countries = n(X.country)) >>
head(3))
# total countries
cc = dat >> summarize(n_total_countries = n_distinct(X.country))
# Print
display(aa.head(3))
display(bb)
display(cc)
In statistics and machine learning, we often need to convert a categorical variable into a dummy feature set (i.e. when the variable is "on" it takes the value of 1, 0 otherwise). In statistics, we'll use this type of conversion to generate fixed effects.
pandas
makes this type of manipulation easy to do. with the
d = pd.DataFrame(dict(country = ["Nigeria","Nigeria","United States","United States","Russia","Russia"]))
d
dummies = pd.get_dummies(d.country)
pd.concat([d,dummies],sort=False,axis=1)
dfply
¶For the sake of completeness, let's demonstrate what joining looks like with dfply
. Like dplyr
(which the module is based off of), dfply
uses SQl
syntax to keep track of the type of merge we are performing.
# Same fake data construct as the prior lecture
data_A = pd.DataFrame(dict(country = ["Nigeria","England","Botswana"],
var1 = [4,3,6]))
data_B = pd.DataFrame(dict(country = ["Nigeria","United States","Botswana"],
var2 = ["low","high","medium"]))
display(data_A)
display(data_B)
# Left Join
data_A >> left_join(data_B, by='country')
# right Join
data_A >> right_join(data_B, by='country')
# Inner Join
data_A >> inner_join(data_B, by='country')
# Full Join
data_A >> full_join(data_B, by='country')
# Or Outer Join (same as Full Join)
data_A >> outer_join(data_B, by='country')
# Anti Join
data_A >> anti_join(data_B, by='country')
# Bind Rows
data_A >> bind_rows(data_B)
# Bind columns
data_A >> bind_cols(data_B)