words

references:
- ISI, exploration 2.1a, p.109
- String functions in R, Christopher Bare
- Useful Functions in R for Manipulating Text Data, Eric Cai
- Built-in Functions in R, Quick-R
- Handling and Processing Strings in R, Gaston Sanchez
- R String Functions
- stringr: modern, consistent string processing, Hadley Wickham

data

import the data

# Load the word list (the CSV has no header row) and label its column "word".
words <- setNames(
  read.csv("GettysburgAddress.csv", header = FALSE),
  "word"
)
# Preview the first rows of the imported data.
head(words)
##    word
## 1  Four
## 2 score
## 3   and
## 4 seven
## 5 years
## 6   ago
# Display the structure of the imported data frame (dimensions and column types).
str(words)
## 'data.frame':    268 obs. of  1 variable:
##  $ word: Factor w/ 143 levels "a","above","add",..: 52 109 8 111 143 5 94 44 17 50 ...

Convert the words to a lowercase character vector

# Add a `str` column: each word coerced to character, then lower-cased.
words <- mutate(words, str = tolower(as.character(word)))
# Confirm the new column is present and of type character.
str(words)
## 'data.frame':    268 obs. of  2 variables:
##  $ word: Factor w/ 143 levels "a","above","add",..: 52 109 8 111 143 5 94 44 17 50 ...
##  $ str : chr  "four" "score" "and" "seven" ...

word length

# Add a `length` column holding the number of characters in each word.
# Inside mutate(), `str` refers to the data frame column, not utils::str().
words <- mutate(words, length = nchar(str))
# Confirm the new column is present.
str(words)
## 'data.frame':    268 obs. of  3 variables:
##  $ word  : Factor w/ 143 levels "a","above","add",..: 52 109 8 111 143 5 94 44 17 50 ...
##  $ str   : chr  "four" "score" "and" "seven" ...
##  $ length: int  4 5 3 5 5 3 3 7 7 5 ...
# Frequency of each word length.
table(words[["length"]])
## 
##  1  2  3  4  5  6  7  8  9 10 11 
##  7 50 54 58 34 27 15  6 10  4  3
# Bar chart of the word-length distribution.
# `length` is an integer (continuous) variable, so a continuous x scale is
# used with one break per observed length (1 through 11); passing numeric
# limits to scale_x_discrete() triggers a "Continuous limits supplied to
# discrete scale" warning in current ggplot2.
ggplot(words, aes(x = length)) +
  geom_bar(fill = "turquoise") +
  scale_x_continuous(breaks = 1:11) +
  labs(title = "Distribution of length")