Data

inaug_dat <- read_csv("inaug_speeches.csv") 
head(inaug_dat)
## # A tibble: 6 x 5
##      X1 Name       `Inaugural Address` Date        text                         
##   <dbl> <chr>      <chr>               <chr>       <chr>                        
## 1     4 George Wa… First Inaugural Ad… Thursday, … "Fellow-Citizens of the Sena…
## 2     5 George Wa… Second Inaugural A… Monday, Ma… "Fellow Citizens:  \xa0\xa0I…
## 3     6 John Adams Inaugural Address   Saturday, … "\xa0\xa0WHEN it was first p…
## 4     7 Thomas Je… First Inaugural Ad… Wednesday,… "Friends and Fellow-Citizens…
## 5     8 Thomas Je… Second Inaugural A… Monday, Ma… "\xa0\xa0PROCEEDING, fellow-…
## 6     9 James Mad… First Inaugural Ad… Saturday, … "\xa0\xa0UNWILLING to depart…

Clean the data.

dat <- 
  inaug_dat %>% 
  transmute(president = str_to_lower(Name) %>% str_replace_all(.," ","_"),
            address = case_when(
                str_detect(`Inaugural Address`,"First") ~ "first",
                str_detect(`Inaugural Address`,"Second") ~ "second",
                str_detect(`Inaugural Address`,"Third") ~ "third",
                str_detect(`Inaugural Address`,"Fourth") ~ "fourth",
                T ~ "first"),
            date = as.Date(Date,"%A, %B %d, %Y"),
            year = lubridate::year(date),
            length = str_count(text),
            text = text)

# Adjust for one problematic date
dat[dat$president=="bill_clinton" & dat$address=="second",]$date = as.Date("1997-01-20")
dat[dat$president=="bill_clinton" & dat$address=="second",]$year = 1997

head(dat)
## # A tibble: 6 x 6
##   president     address date        year length text                            
##   <chr>         <chr>   <date>     <dbl>  <int> <chr>                           
## 1 george_washi… first   1789-04-30  1789   8641 "Fellow-Citizens of the Senate …
## 2 george_washi… second  1793-03-04  1793    805 "Fellow Citizens:  \xa0\xa0I AM…
## 3 john_adams    first   1797-03-04  1797  13914 "\xa0\xa0WHEN it was first perc…
## 4 thomas_jeffe… first   1801-03-04  1801  10170 "Friends and Fellow-Citizens:  …
## 5 thomas_jeffe… second  1805-03-04  1805  12947 "\xa0\xa0PROCEEDING, fellow-cit…
## 6 james_madison first   1809-03-04  1809   7031 "\xa0\xa0UNWILLING to depart fr…

Questions

(1) Do speeches get longer over time?

Answer: Not really. Though the variation in speech length varies less after 1950.

dat %>% 
  ggplot(aes(year,length)) +
  geom_line() +
  geom_point() +
  geom_smooth(method="loess",se=F)

(2) Who was the most/least verbose?

Most Verbose!

dat %>% 
  filter(max(length)==length) %>% 
  select(president,length)
## # A tibble: 1 x 2
##   president              length
##   <chr>                   <int>
## 1 william_henry_harrison  49871

Least Verbose

dat %>% 
  filter(min(length)==length) %>% 
  select(president,length)
## # A tibble: 1 x 2
##   president         length
##   <chr>              <int>
## 1 george_washington    805

(3) Convert text to tidy format.

  • tokenize using words as the fundamental unit.
  • Remove all stopwords
  • Remove all digits.
text_dat <- 
  dat %>% 
  unnest_tokens(word,text) %>% 
   anti_join(stop_words,by="word") %>% 
  filter(!str_detect(word,"\\d")) 

head(text_dat)
## # A tibble: 6 x 6
##   president         address date        year length word           
##   <chr>             <chr>   <date>     <dbl>  <int> <chr>          
## 1 george_washington first   1789-04-30  1789   8641 fellow         
## 2 george_washington first   1789-04-30  1789   8641 citizens       
## 3 george_washington first   1789-04-30  1789   8641 senate         
## 4 george_washington first   1789-04-30  1789   8641 house          
## 5 george_washington first   1789-04-30  1789   8641 representatives
## 6 george_washington first   1789-04-30  1789   8641 vicissitudes

(4) What are the 30 most frequent words used across all inaugural speeches? Please present this information as a bar plot.

text_dat %>% 
  count(word,sort = T) %>% 
  slice(1:30) %>% 
  ggplot(aes(x=n,y=fct_reorder(word,n))) +
  geom_col() +
  labs(x="Term Frequency",y="")

(5) Remove the top 30 most common words from the inaugural speech data.

Treat these as stop words that are particular to inaugural speeches (i.e. every president uses these words).

# Flag the relevant stop words
inaug_stop_words <-
  text_dat %>% 
  count(word,sort = T) %>% 
  slice(1:39) %>% 
  select(word)

# And drop
text_dat2 <- 
  text_dat %>% 
  anti_join(inaug_stop_words,by = "word")

(6) What are the top five words that are most unique to each president’s inaugural speech in their first term? Please present this information as a faceted bar graph.

To answer this we need to know the inverse document term frequency, so we need word usage counts by president.

pres_tf <- 
  text_dat2 %>% 
  filter(address == "first") %>% 
  group_by(president,year) %>% 
  count(word) %>% 
  
  # let's bind on the inverse document term frequency. 
  bind_tf_idf(term = word,
              document = president,
              n = n )

pres_tf
## # A tibble: 23,604 x 7
## # Groups:   president, year [39]
##    president        year word             n       tf   idf   tf_idf
##    <chr>           <dbl> <chr>        <int>    <dbl> <dbl>    <dbl>
##  1 abraham_lincoln  1861 abide            2 0.00195  3.66  0.00714 
##  2 abraham_lincoln  1861 ability          1 0.000975 1.10  0.00107 
##  3 abraham_lincoln  1861 accept           1 0.000975 1.02  0.000999
##  4 abraham_lincoln  1861 acceptance       1 0.000975 2.28  0.00222 
##  5 abraham_lincoln  1861 accession        1 0.000975 2.28  0.00222 
##  6 abraham_lincoln  1861 acquiesce        2 0.00195  2.97  0.00579 
##  7 abraham_lincoln  1861 acquiescence     1 0.000975 2.05  0.00200 
##  8 abraham_lincoln  1861 act              1 0.000975 0.719 0.000701
##  9 abraham_lincoln  1861 action           2 0.00195  0.528 0.00103 
## 10 abraham_lincoln  1861 actions          1 0.000975 1.58  0.00154 
## # … with 23,594 more rows

Now to select the top 5 words unique to each president.

five_unique <-
  pres_tf %>% 
  group_by(president,year) %>% 
  arrange(desc(tf_idf)) %>% 
  slice(1:5)  %>% 
  ungroup

five_unique
## # A tibble: 195 x 7
##    president        year word               n      tf   idf  tf_idf
##    <chr>           <dbl> <chr>          <int>   <dbl> <dbl>   <dbl>
##  1 abraham_lincoln  1861 clause             5 0.00487  2.97 0.0145 
##  2 abraham_lincoln  1861 secede             4 0.00390  3.66 0.0143 
##  3 abraham_lincoln  1861 minority           6 0.00585  2.28 0.0133 
##  4 abraham_lincoln  1861 plainly            5 0.00487  2.56 0.0125 
##  5 abraham_lincoln  1861 lawfully           4 0.00390  2.97 0.0116 
##  6 andrew_jackson   1829 defending          2 0.00524  2.56 0.0134 
##  7 andrew_jackson   1829 diffidence         2 0.00524  2.56 0.0134 
##  8 andrew_jackson   1829 worth              2 0.00524  2.28 0.0119 
##  9 andrew_jackson   1829 accountability     2 0.00524  1.87 0.00980
## 10 andrew_jackson   1829 admonishes         1 0.00262  3.66 0.00959
## # … with 185 more rows

Finally, let’s plot as a bar plot.

five_unique %>% 
  
  # This line orders the presidents temporally rather than alphabetically
  mutate(president = fct_reorder(president,year)) %>% 
  
  ggplot(aes(x=tf_idf,y=fct_reorder(word,tf_idf))) +
  geom_col() +
  facet_wrap(~president,scales="free_y",ncol=3) +
  labs(y="Inverse Document Term Frequency",x="") +
  theme(text = element_text(size=18))

(7) On Average, which president’s inaugural speech is most “positive”? Which is most “negative”?

Let’s examine the sentiment of the speeches.

text_dat_sent <- 
  text_dat2 %>% 
  inner_join(get_sentiments("afinn"),by='word')

text_dat_sent
## # A tibble: 6,972 x 7
##    president         address date        year length word        value
##    <chr>             <chr>   <date>     <dbl>  <int> <chr>       <dbl>
##  1 george_washington first   1789-04-30  1789   8641 love            3
##  2 george_washington first   1789-04-30  1789   8641 retreat        -1
##  3 george_washington first   1789-04-30  1789   8641 hopes           2
##  4 george_washington first   1789-04-30  1789   8641 retreat        -1
##  5 george_washington first   1789-04-30  1789   8641 dear            2
##  6 george_washington first   1789-04-30  1789   8641 waste          -1
##  7 george_washington first   1789-04-30  1789   8641 committed       1
##  8 george_washington first   1789-04-30  1789   8641 trust           1
##  9 george_washington first   1789-04-30  1789   8641 distrustful    -3
## 10 george_washington first   1789-04-30  1789   8641 inferior       -2
## # … with 6,962 more rows

Most positive.

text_dat_sent %>% 
  group_by(president,year,address) %>% 
  summarize(ave_score = mean(value)) %>% 
  ungroup %>% 
  filter(ave_score == max(ave_score))
## # A tibble: 1 x 4
##   president    year address ave_score
##   <chr>       <dbl> <chr>       <dbl>
## 1 george_bush  1989 first        1.19

Most negative.

text_dat_sent %>% 
  group_by(president,year,address) %>% 
  summarize(ave_score = mean(value)) %>% 
  ungroup %>% 
  filter(ave_score == min(ave_score))
## # A tibble: 1 x 4
##   president        year address ave_score
##   <chr>           <dbl> <chr>       <dbl>
## 1 abraham_lincoln  1861 first      -0.296

(8) Of presidents elected into a second term of office, are they more positive in their second inaugural speech vis-a-vis their first on average? Plot this information as a graph of your choosing.

Note: Don’t consider FDR’s third and fourth inaugural speeches.

# Only consider presidents who made it to a second term.
two_timers = 
  text_dat_sent %>% 
  filter(address == "second") %>% 
  distinct(president)

# Filter the text data and aggregate
sent_first_second <- 
  text_dat_sent %>% 
  inner_join(two_timers,by="president") %>% 
  filter(!address %in% c("third","fourth")) %>% 
  group_by(president,address) %>% 
  summarize(score = mean(value))

Now plot.

# Plot as a bar graph
sent_first_second %>% 
  ggplot(aes(x=score,y=president,fill=address)) +
  geom_col(position="dodge") +
  geom_vline(xintercept = 0) +
  labs(x="Sentiment Score",y="",fill="Term in Office") +
  ggthemes::scale_fill_colorblind() +
  ggthemes::theme_hc() +
  theme(legend.position = "top")

Let’s present this same information as a difference.

# Reorganize the data 
sent_first_second %>% 
  pivot_wider(names_from = address,values_from = score) %>% 
  mutate(diff = second - first) %>% 
  
  ggplot(aes(x=diff,y=president,fill=diff>0)) +
  geom_col(show.legend = F) +
  geom_vline(xintercept = 0) +
  scale_fill_manual(values=c("darkred","steelblue")) +
  labs(x="Difference in the Sentiment Score from First to Second Term",
       y="") +
  ggthemes::theme_hc() +
  theme(legend.position = "top")

(9) Run a topic model on the inaugural speeches setting k (the number of topics you’re looking for to 5). Try and interpret the output.

Let’s subset the text and convert to a document term matrix.

dtm <- 
  text_dat2 %>% 
  mutate(id = str_glue("{president} ({address})")) %>% 
  group_by(id) %>% 
  
  # Dropping some additional words
  filter(!str_detect(word,"american")) %>% 
  filter(!str_detect(word,"president")) %>% 
  
  # Stem
  # mutate(word = SnowballC::wordStem(word)) %>%
  
  count(word) %>% 
  ungroup %>% 
  cast_dtm(document = id,term = word,value = n) 

dtm
## <<DocumentTermMatrix (documents: 58, terms: 8583)>>
## Non-/sparse entries: 31414/466400
## Sparsity           : 94%
## Maximal term length: 17
## Weighting          : term frequency (tf)

Let’s run a topic model with 5 possible topics.

inaug_lda <- LDA(dtm,k=5,control = list(seed = 1234))

Let’s explore the topic output.

What are the top ten words most associated with each topic?

# Extract the term to topic associations
inaug_lda %>% 
  tidy(metric="beta") %>% 
  
  # Convert topics metrics to proportions
  group_by(topic) %>% 
  mutate(prop = beta/sum(beta)) %>% 
  
  # Grab the top 10
  arrange(desc(prop)) %>% 
  slice(1:5) %>% 
  ungroup %>% 
  
  # Rename topic label
  mutate(topic_label = str_glue("Topic {topic}")) %>% 
  
  # Plot as word cloud
  ggplot(aes(y=term,x=prop,fill=prop)) +
  geom_col(show.legend = F) +
  facet_wrap(~topic_label,scales="free",ncol=1) 

Can we associate meaning to these topics?