String Manipulation

Relevant Slides

Regex Description
+ match 1 or more of the previous character
* match 0 or more of the previous character
? the preceding item is optional (i.e., match 0 or 1 of the previous character).
[ ] match 1 of the set of things inside the bracket
\\w match a “word” character (i.e., letters, digits, and the underscore).
\\d match digits
\\s match a whitespace character (space, tab, or newline)
\\t match a “tab” character
\\n match a “newline” character
^ the “beginning edge” of a string
$ the “ending edge” of a string
{n} the preceding item is matched exactly n times
[:punct:] find punctuation in the text

Code from the video

#' **String Manipulation with `stringr`**
#' > `stringr` package is part of the tidyverse.


# How stringr works -----------------------------------------------------------

#' stringr functions are named `str_<verb>()`; the calls below pass the
#' string first and the pattern second.

    # Example: strip every punctuation character from a string
    str_remove_all(string = "What the %&*!", pattern = "[:punct:]")

    # Example: check whether a string contains the pattern
    str_detect(string = "There is a cat in the street", pattern = "cat")

# String Views: Understanding the pattern you're targeting -----------------

    text <- "There were 5 cats!"

    # Literal pattern: highlight the word "cats"
    str_view(string = text, pattern = "cats")

    # Regular-expression pattern: highlight a digit
    str_view(string = text, pattern = "\\d")

    # Highlight every whitespace character
    # NOTE(review): str_view_all() is deprecated in stringr >= 1.5.0, where
    # str_view() already shows all matches -- confirm the installed version.
    str_view_all(string = text, pattern = "\\s")

    # One or more digits, then whitespace, then one or more word characters
    str_view(string = text, pattern = "\\d+\\s+\\w+")

# String Editing  ---------------------------------------------------------

    text # print the original string for reference

  # Replacement: swap the matched pattern for new text

    str_replace(text, pattern = "cats", replacement = "dogs")

  # Removal: delete the matched pattern

    str_remove(text, pattern = "[:punct:]")
    # Removal is a special case of replacement (replace with an empty string)
    str_replace(text, pattern = "[:punct:]", replacement = "")

  # Extraction: pull the matched pattern out of the string

    str_extract(string = text, pattern = "\\d")

  # Inserting data into a string

      x <- 10 # the value we want to embed in a string

      # (1) Pasting: concatenate the pieces

        str_c("The answer is ", x, "%")

            # Base R counterpart: paste0()
            paste0("The answer is ", x, "%")

      # (2) Gluing: expressions inside {} are evaluated and inserted

        str_glue("The answer is {x}%")

        str_glue("The answer is {x + 10}%")

  #' **SIDE NOTE**: `str_remove()` vs. `str_remove_all()`

    text2 <- "Hello! What is your name?"

    # Removes only the first instance of the pattern
    str_remove(text2, pattern = "[:punct:]")

    # Removes all instances of the pattern
    str_remove_all(text2, pattern = "[:punct:]")

    #' _Note_: many `str_` functions have an `_all` variant,
    #' e.g. `str_replace_all()` and `str_extract_all()`.

# Pattern Detection -------------------------------------------------------

    texts <- c("The man drank 5 beers.",
               "Obama was president.",
               "I think we should walk 2 blocks.")

    # Logical: one TRUE/FALSE per element of `texts`
    str_detect(texts, pattern = "\\d")

    # Index location: positions of the elements that match
    str_which(texts, pattern = "\\d")

    #' **NOTE**: This works really well with `filter()` when dealing with tidy
    #' data structures

    example_data <- tibble(text = texts)

    # Keep only the rows whose text contains a digit. (The pipe was previously
    # left dangling, so it swallowed the first statement of the next section.)
    example_data %>%
      filter(str_detect(text, pattern = "\\d"))

# Splitting and Trimming --------------------------------------------------

    # Split a string by some pattern (here: at every whitespace character)
    str_split(texts[1], pattern = "\\s")

    # Cleaning whitespace: drop the leading and trailing padding
    white_text <- "      Hello         "
    str_trim(white_text)

# Capitalization ----------------------------------------------------------

    text3 <- "TeXt MininG iN r"

    # Lower case
    str_to_lower(text3)

    # Upper case
    str_to_upper(text3)

    # Title case
    str_to_title(text3)

    # "Sentence case"
    str_to_sentence(text3)


Code from the video


# Text to process ---------------------------------------------------------

# A single news sentence used as the running example below.
text <- "US opposition politicians and aid agencies have questioned a decision by President Donald Trump to cut off aid to three Central American states --- or so the story reports!"


# Tidy text lives in a data frame, so wrap the string in a one-column tibble.

text_data <- tibble(text = text)

# Tokenization ------------------------------------------------------------

  # Tokenization breaks a body of text up into the fundamental units that we
  # wish to analyze.

  # One token per word
  text_data %>%
    unnest_tokens(output = word, input = text, token = "words")

  # One token per character
  text_data %>%
    unnest_tokens(output = word, input = text, token = "characters")

  # One token per cluster of two consecutive words (bigrams)
  text_data %>%
    unnest_tokens(output = word, input = text, token = "ngrams", n = 2)

  # Special cases: text containing @mentions and #hashtags
  tweet <- "Hey @professor, this assignment doesn't make sense. #BadTeacher"

  tibble(text = tweet) %>%
    unnest_tokens(output = word, input = text, token = "words")

  # NOTE(review): recent tidytext releases appear to have dropped support for
  # token = "tweets" -- confirm against the installed version.
  tibble(text = tweet) %>%
    unnest_tokens(output = word, input = text, token = "tweets")

# Controlling the Tokenization Process ------------------------------------

  # Keep the original capitalization instead of lower-casing the tokens
  text_data %>%
    unnest_tokens(
      output = word,
      input = text,
      token = "words",
      to_lower = FALSE
    )

  # Additionally keep the input text column alongside the tokens
  text_data %>%
    unnest_tokens(
      output = word,
      input = text,
      token = "words",
      to_lower = FALSE,
      drop = FALSE
    )

# Text to Data ------------------------------------------------------------

  # How do we turn text into data? We count!

  # Tokenize into words, then tally how often each word occurs (most frequent
  # first). The pipe previously ended without a final step.
  text_data %>%
    unnest_tokens(output = word, input = text, token = "words") %>%
    count(word, sort = TRUE)

Processing Text

Code from the video

library(rvest) # To download some stories

# Get Data with our BBC Scraper -------------------------------------------------------------

    # Recall the BBC Scraper we built on the webscraping week.

    #' Scrape the headline, date, and story text from one BBC article page.
    #'
    #' @param url URL of the article to download.
    #' @return A tibble with columns `headline`, `date`, and `story`; the
    #'   story paragraphs are collapsed into one space-separated string.
    bbc_scraper <- function(url) {
      # Download website
      raw <- read_html(url)
      # Extract headline
      headline <- raw %>%
        html_nodes(xpath = '//*[@id="page"]/div[1]/div[2]/div/div[1]/div[1]/h1') %>%
        html_text()
      # Extract date
      date <- raw %>%
        html_nodes(xpath = '//*[@id="page"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/div/div[1]/div[1]/ul/li/div') %>%
        html_text()
      # Extract story: every paragraph node, pasted into a single string
      story <- raw %>%
        html_nodes(xpath = '//*[@id="page"]/div[1]/div[2]/div/div[1]/div[1]/div[2]/p') %>%
        html_text() %>%
        paste0(., collapse = " ")
      # Output as data frame and return
      tibble(headline, date, story)
    }

    # Let's use it to scrape some news data on Trump having COVID
    # NOTE(review): this URL list looks truncated in this copy of the notes
    # (the vector was never closed); append the remaining story URLs here.
    urls <- c("https://www.bbc.com/news/election-us-2020-54437852")

    # Scrape every URL and stack the per-story results into one data frame.
    # lapply() plus a single bind_rows() avoids growing the result inside a
    # loop and behaves sensibly when `urls` is empty (1:length(urls) does not).
    news_data <- bind_rows(lapply(urls, bbc_scraper))

# Text to Data ----------------------------------------------------------------

    # Tokenize: tag each story with a unique id, then split its text into words
    news_with_id <- mutate(news_data, story_id = row_number())
    news_tokens <- unnest_tokens(
      news_with_id,
      output = word, input = story, token = "words"
    )

    # Convert text to data: term frequency, most common words first
    count(news_tokens, word, sort = TRUE)

# Removing Stopwords ------------------------------------------------------

    # Tidytext provides a dictionary of stop words to remove (`stop_words`)

    # We can use this dictionary to clean our text: anti_join() keeps only the
    # tokens that do NOT appear in the dictionary.
    news_tokens2 <-
      news_tokens %>%
      anti_join(stop_words, by = "word")

    # Now count again
    news_tokens2 %>% count(word, sort = TRUE)

    # we can also build our own dictionaries to clean out specific words
    # relevant to the text documents we're processing.
    my_stop_words <- tibble(word = c("president", "white", "house", "presidential"))

    # Then anti join in the same way we did above
    news_tokens2 <-
      news_tokens %>%
      anti_join(stop_words, by = "word") %>%
      anti_join(my_stop_words, by = "word")

    # Now count again
    news_tokens2 %>% count(word, sort = TRUE)

# Removing digits ---------------------------------------------------------

    # tidytext doesn't remove digits, so we might want to get rid of those too

    # Let's first detect all the tokens that contain a digit.
    # (A trailing pipe here previously ran into the assignment below.)
    news_tokens2 %>%
      filter(str_detect(word, "\\d"))

    # Let's then remove them
    news_tokens3 <-
      news_tokens2 %>%
      filter(!str_detect(word, "\\d")) # note the `!` to negate

# Stemming ----------------------------------------------------------------

    # Often words are the same but appear different because of their tense.

    txt <- "cleaned cleaning cleaner beauty beautiful killing killed killer"

    # Tokens before stemming
    tibble(text = txt) %>%
      unnest_tokens(word, text)

    #' **Stemming** allows us to reduce a word down to its fundamental root.
    #' (Note: the `SnowballC` package must be installed.)
    tibble(text = txt) %>%
      unnest_tokens(word, text) %>%
      mutate(word = SnowballC::wordStem(word))

    # Let's stem the news articles
    news_tokens4 <-
      news_tokens3 %>%
      mutate(word = SnowballC::wordStem(word))

# Text to Data (using Group by) -------------------------------------------

    # Now convert to numerical data by counting
    text_data <-

      news_tokens4 %>%

      # We want to group by story_id b/c we're interested in how the words are
      # used within each story (headline is carried along for plotting later)
      group_by(story_id, headline) %>%

      # Count each word within its story
      count(word, sort = TRUE) %>%

      # Drop the grouping so later steps start from an ungrouped table
      ungroup()



# Inverse Document Frequency ----------------------------------------------

    # Measures how important a word is given all words in the text: words used
    # by few documents are up-weighted, and words used often by all the
    # documents are down-weighted.

    text_data2 <- bind_tf_idf(
      text_data,
      term = word,         # tokens
      document = story_id, # id that tells the documents apart
      n = n                # word counts
    )


# Visualize ---------------------------------------------------------------

    # Pull the top five words that are unique to each story
    top_5idf <-

      text_data2 %>%

      # For each story
      group_by(story_id) %>%
      # Sort by tf-idf (highest first) and keep the first five rows per story
      arrange(desc(tf_idf)) %>%
      slice(1:5) %>%
      ungroup()

    # Visualize as a bar graph, one panel per headline.
    # (Dangling `%>%` and `+` operators previously fused these statements.)
    top_5idf %>%
      ggplot(aes(fct_reorder(word, tf_idf), tf_idf, fill = headline)) +
      geom_col(show.legend = FALSE) +
      xlab(NULL) +
      coord_flip() +
      facet_wrap(~headline, ncol = 1, scales = "free")

    #' Visualize as a **word cloud**
    #' (the geom comes from the `ggwordcloud` package, which must be installed)
    text_data2 %>%
      # Only plot words used more than once, and ignore the military leaders
      # story
      filter(n > 1, !str_detect(headline, "military leaders")) %>%
      # Word size and color both encode tf-idf
      ggplot(aes(label = word, size = tf_idf, color = tf_idf)) +
      ggwordcloud::geom_text_wordcloud_area() +
      scale_color_gradient(low = "darkred", high = "steelblue") +
      facet_wrap(~headline, scales = "free") +
      theme(text = element_text(size = 12))


These exercises are designed to help you reinforce your grasp of the concepts covered in the asynchronous lecture material.


Question 1

Clean the following string so that it reads as: "Clean this string".

dirty_string <- "22clean 5674.5This $%&*_@string!"



# Strip the unwanted characters in three passes:
#   1. digits
#   2. everything matched by [:punct:]
#   3. the "$" that the [:punct:] pass leaves behind
dirty_string <- dirty_string %>%
  str_remove_all("\\d") %>%
  str_remove_all("[:punct:]") %>%
  str_remove_all("\\$")

# Finally convert to sentence case
clean_string <- str_to_sentence(dirty_string)


Question 2

Clean the below text as follows:

  • Remove excess white space
  • Remove the spacing in the middle of the text (\n is computer speak for “new line”)
  • Remove the subordinate clauses (i.e. all text in-between the dashes (-).)
dirty_text <-"

    A comedian with no political experience has won the most votes in the first round of Ukraine's presidential elections, according to exit polls.

They say Volodymyr Zelenskiy - who played the president on TV - received 30.4% of the vote, with current leader Petro Poroshenko second on 17.8%.

The two - who have expressed largely pro-EU opinions - are set to take part in a run-off election next month.


"

# Clean the leading and trailing white space
dirty_text <- str_trim(dirty_text)

# Remove the line breaks in the middle of the text
dirty_text <- str_remove_all(dirty_text, "\\n")

# Remove the subordinate clauses.

    # (1) Split the string at the spaced dashes that delimit the clauses
    text_list <- str_split(dirty_text, " - ")
    # (2) str_split() returns a list; our single string is its only entry
    text_list <- text_list[[1]]
    # (3) The clauses sit between dashes, i.e. at the even positions, so keep
    #     only the odd-positioned pieces (works for any number of clauses).
    text_list <- text_list[seq_along(text_list) %% 2 == 1]
    # (4) Paste the string back together.
    dirty_text <- str_c(text_list, collapse = " ")
    # (5) Add a space after each sentence-ending period. The lookahead skips
    #     decimal points (period followed by a digit) and the final period,
    #     so no trailing space is added.
    dirty_text <- str_replace_all(dirty_text, "\\.(?!\\d|$)", ". ")

