inaug_dat <- read_csv("inaug_speeches.csv")
head(inaug_dat)
## # A tibble: 6 x 5
## X1 Name `Inaugural Address` Date text
## <dbl> <chr> <chr> <chr> <chr>
## 1 4 George Wa… First Inaugural Ad… Thursday, … "Fellow-Citizens of the Sena…
## 2 5 George Wa… Second Inaugural A… Monday, Ma… "Fellow Citizens: \xa0\xa0I…
## 3 6 John Adams Inaugural Address Saturday, … "\xa0\xa0WHEN it was first p…
## 4 7 Thomas Je… First Inaugural Ad… Wednesday,… "Friends and Fellow-Citizens…
## 5 8 Thomas Je… Second Inaugural A… Monday, Ma… "\xa0\xa0PROCEEDING, fellow-…
## 6 9 James Mad… First Inaugural Ad… Saturday, … "\xa0\xa0UNWILLING to depart…
Clean the data.
dat <-
inaug_dat %>%
transmute(president = str_to_lower(Name) %>% str_replace_all(.," ","_"),
address = case_when(
str_detect(`Inaugural Address`,"First") ~ "first",
str_detect(`Inaugural Address`,"Second") ~ "second",
str_detect(`Inaugural Address`,"Third") ~ "third",
str_detect(`Inaugural Address`,"Fourth") ~ "fourth",
T ~ "first"),
date = as.Date(Date,"%A, %B %d, %Y"),
year = lubridate::year(date),
length = str_count(text),
text = text)
# Adjust for one problematic date
dat[dat$president=="bill_clinton" & dat$address=="second",]$date = as.Date("1997-01-20")
dat[dat$president=="bill_clinton" & dat$address=="second",]$year = 1997
head(dat)
## # A tibble: 6 x 6
## president address date year length text
## <chr> <chr> <date> <dbl> <int> <chr>
## 1 george_washi… first 1789-04-30 1789 8641 "Fellow-Citizens of the Senate …
## 2 george_washi… second 1793-03-04 1793 805 "Fellow Citizens: \xa0\xa0I AM…
## 3 john_adams first 1797-03-04 1797 13914 "\xa0\xa0WHEN it was first perc…
## 4 thomas_jeffe… first 1801-03-04 1801 10170 "Friends and Fellow-Citizens: …
## 5 thomas_jeffe… second 1805-03-04 1805 12947 "\xa0\xa0PROCEEDING, fellow-cit…
## 6 james_madison first 1809-03-04 1809 7031 "\xa0\xa0UNWILLING to depart fr…
Answer: Not really. Though the variation in speech length varies less after 1950.
dat %>%
ggplot(aes(year,length)) +
geom_line() +
geom_point() +
geom_smooth(method="loess",se=F)
Most Verbose!
dat %>%
filter(max(length)==length) %>%
select(president,length)
## # A tibble: 1 x 2
## president length
## <chr> <int>
## 1 william_henry_harrison 49871
Least Verbose
dat %>%
filter(min(length)==length) %>%
select(president,length)
## # A tibble: 1 x 2
## president length
## <chr> <int>
## 1 george_washington 805
text_dat <-
dat %>%
unnest_tokens(word,text) %>%
anti_join(stop_words,by="word") %>%
filter(!str_detect(word,"\\d"))
head(text_dat)
## # A tibble: 6 x 6
## president address date year length word
## <chr> <chr> <date> <dbl> <int> <chr>
## 1 george_washington first 1789-04-30 1789 8641 fellow
## 2 george_washington first 1789-04-30 1789 8641 citizens
## 3 george_washington first 1789-04-30 1789 8641 senate
## 4 george_washington first 1789-04-30 1789 8641 house
## 5 george_washington first 1789-04-30 1789 8641 representatives
## 6 george_washington first 1789-04-30 1789 8641 vicissitudes
text_dat %>%
count(word,sort = T) %>%
slice(1:30) %>%
ggplot(aes(x=n,y=fct_reorder(word,n))) +
geom_col() +
labs(x="Term Frequency",y="")
Treat these as stop words that are particular to inaugural speeches (i.e. every president uses these words).
# Flag the relevant stop words
inaug_stop_words <-
text_dat %>%
count(word,sort = T) %>%
slice(1:39) %>%
select(word)
# And drop
text_dat2 <-
text_dat %>%
anti_join(inaug_stop_words,by = "word")
To answer this we need to know the inverse document term frequency, so we need word usage counts by president.
pres_tf <-
text_dat2 %>%
filter(address == "first") %>%
group_by(president,year) %>%
count(word) %>%
# let's bind on the inverse document term frequency.
bind_tf_idf(term = word,
document = president,
n = n )
pres_tf
## # A tibble: 23,604 x 7
## # Groups: president, year [39]
## president year word n tf idf tf_idf
## <chr> <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 abraham_lincoln 1861 abide 2 0.00195 3.66 0.00714
## 2 abraham_lincoln 1861 ability 1 0.000975 1.10 0.00107
## 3 abraham_lincoln 1861 accept 1 0.000975 1.02 0.000999
## 4 abraham_lincoln 1861 acceptance 1 0.000975 2.28 0.00222
## 5 abraham_lincoln 1861 accession 1 0.000975 2.28 0.00222
## 6 abraham_lincoln 1861 acquiesce 2 0.00195 2.97 0.00579
## 7 abraham_lincoln 1861 acquiescence 1 0.000975 2.05 0.00200
## 8 abraham_lincoln 1861 act 1 0.000975 0.719 0.000701
## 9 abraham_lincoln 1861 action 2 0.00195 0.528 0.00103
## 10 abraham_lincoln 1861 actions 1 0.000975 1.58 0.00154
## # … with 23,594 more rows
Now to select the top 5 words unique to each president.
five_unique <-
pres_tf %>%
group_by(president,year) %>%
arrange(desc(tf_idf)) %>%
slice(1:5) %>%
ungroup
five_unique
## # A tibble: 195 x 7
## president year word n tf idf tf_idf
## <chr> <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 abraham_lincoln 1861 clause 5 0.00487 2.97 0.0145
## 2 abraham_lincoln 1861 secede 4 0.00390 3.66 0.0143
## 3 abraham_lincoln 1861 minority 6 0.00585 2.28 0.0133
## 4 abraham_lincoln 1861 plainly 5 0.00487 2.56 0.0125
## 5 abraham_lincoln 1861 lawfully 4 0.00390 2.97 0.0116
## 6 andrew_jackson 1829 defending 2 0.00524 2.56 0.0134
## 7 andrew_jackson 1829 diffidence 2 0.00524 2.56 0.0134
## 8 andrew_jackson 1829 worth 2 0.00524 2.28 0.0119
## 9 andrew_jackson 1829 accountability 2 0.00524 1.87 0.00980
## 10 andrew_jackson 1829 admonishes 1 0.00262 3.66 0.00959
## # … with 185 more rows
Finally, let’s plot as a bar plot.
five_unique %>%
# This line orders the presidents temporally rather than alphabetically
mutate(president = fct_reorder(president,year)) %>%
ggplot(aes(x=tf_idf,y=fct_reorder(word,tf_idf))) +
geom_col() +
facet_wrap(~president,scales="free_y",ncol=3) +
labs(y="Inverse Document Term Frequency",x="") +
theme(text = element_text(size=18))
Let’s examine the sentiment of the speeches.
text_dat_sent <-
text_dat2 %>%
inner_join(get_sentiments("afinn"),by='word')
text_dat_sent
## # A tibble: 6,972 x 7
## president address date year length word value
## <chr> <chr> <date> <dbl> <int> <chr> <dbl>
## 1 george_washington first 1789-04-30 1789 8641 love 3
## 2 george_washington first 1789-04-30 1789 8641 retreat -1
## 3 george_washington first 1789-04-30 1789 8641 hopes 2
## 4 george_washington first 1789-04-30 1789 8641 retreat -1
## 5 george_washington first 1789-04-30 1789 8641 dear 2
## 6 george_washington first 1789-04-30 1789 8641 waste -1
## 7 george_washington first 1789-04-30 1789 8641 committed 1
## 8 george_washington first 1789-04-30 1789 8641 trust 1
## 9 george_washington first 1789-04-30 1789 8641 distrustful -3
## 10 george_washington first 1789-04-30 1789 8641 inferior -2
## # … with 6,962 more rows
Most positive.
text_dat_sent %>%
group_by(president,year,address) %>%
summarize(ave_score = mean(value)) %>%
ungroup %>%
filter(ave_score == max(ave_score))
## # A tibble: 1 x 4
## president year address ave_score
## <chr> <dbl> <chr> <dbl>
## 1 george_bush 1989 first 1.19
Most negative.
text_dat_sent %>%
group_by(president,year,address) %>%
summarize(ave_score = mean(value)) %>%
ungroup %>%
filter(ave_score == min(ave_score))
## # A tibble: 1 x 4
## president year address ave_score
## <chr> <dbl> <chr> <dbl>
## 1 abraham_lincoln 1861 first -0.296
Note: Don’t consider FDR’s third and fourth inaugural speeches.
# Only consider presidents who made it to a second term.
two_timers =
text_dat_sent %>%
filter(address == "second") %>%
distinct(president)
# Filter the text data and aggregate
sent_first_second <-
text_dat_sent %>%
inner_join(two_timers,by="president") %>%
filter(!address %in% c("third","fourth")) %>%
group_by(president,address) %>%
summarize(score = mean(value))
Now plot.
# Plot as a bar graph
sent_first_second %>%
ggplot(aes(x=score,y=president,fill=address)) +
geom_col(position="dodge") +
geom_vline(xintercept = 0) +
labs(x="Sentiment Score",y="",fill="Term in Office") +
ggthemes::scale_fill_colorblind() +
ggthemes::theme_hc() +
theme(legend.position = "top")
Let’s present this same information as a difference.
# Reorganize the data
sent_first_second %>%
pivot_wider(names_from = address,values_from = score) %>%
mutate(diff = second - first) %>%
ggplot(aes(x=diff,y=president,fill=diff>0)) +
geom_col(show.legend = F) +
geom_vline(xintercept = 0) +
scale_fill_manual(values=c("darkred","steelblue")) +
labs(x="Difference in the Sentiment Score from First to Second Term",
y="") +
ggthemes::theme_hc() +
theme(legend.position = "top")
k
(the number of topics you’re looking for to 5
). Try and interpret the output.Let’s subset the text and convert to a document term matrix.
dtm <-
text_dat2 %>%
mutate(id = str_glue("{president} ({address})")) %>%
group_by(id) %>%
# Dropping some additional words
filter(!str_detect(word,"american")) %>%
filter(!str_detect(word,"president")) %>%
# Stem
# mutate(word = SnowballC::wordStem(word)) %>%
count(word) %>%
ungroup %>%
cast_dtm(document = id,term = word,value = n)
dtm
## <<DocumentTermMatrix (documents: 58, terms: 8583)>>
## Non-/sparse entries: 31414/466400
## Sparsity : 94%
## Maximal term length: 17
## Weighting : term frequency (tf)
Let’s run a topic model with 5 possible topics.
inaug_lda <- LDA(dtm,k=5,control = list(seed = 1234))
Let’s explore the topic output.
What are the top ten words most associated with each topic?
# Extract the term to topic associations
inaug_lda %>%
tidy(metric="beta") %>%
# Convert topics metrics to proportions
group_by(topic) %>%
mutate(prop = beta/sum(beta)) %>%
# Grab the top 10
arrange(desc(prop)) %>%
slice(1:5) %>%
ungroup %>%
# Rename topic label
mutate(topic_label = str_glue("Topic {topic}")) %>%
# Plot as word cloud
ggplot(aes(y=term,x=prop,fill=prop)) +
geom_col(show.legend = F) +
facet_wrap(~topic_label,scales="free",ncol=1)
Can we associate meaning to these topics?