1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
#!/usr/bin/env Rscript
# Install (if missing) and attach every package this script depends on.
# requireNamespace() only checks availability (no attach side effect),
# unlike require(), which attaches even when used as a test. # nolint
pkgs <- c("magrittr", "tm", "wordcloud", "SnowballC")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.r-project.org/", Ncpus = 16)
  }
  library(pkg, character.only = TRUE)
}
data("crude")
# Read the raw stop-word file; entries may carry trailing annotations and the
# file mixes in comment lines (starting with "|") and blank lines.
stop_raw <- readLines("stop.txt", encoding = "UTF-8")
is_comment <- grepl("^\\s*\\|", stop_raw)
is_blank <- !nzchar(trimws(stop_raw))
stop_nocomm <- stop_raw[!is_comment & !is_blank]
# Keep only the leading run of lowercase letters / apostrophes on each line.
# NOTE: this intentionally shadows tm::stopwords(); later removeWords() calls
# use this character vector.
stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)
stopwords
# Sample marketing text used to contrast a raw vs. cleaned term-document matrix.
text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"
corpus <- VCorpus(VectorSource(text))
# Cleaning pipeline, one transformation per step. Lowercasing comes first so
# stop-word removal matches the lowercased tokens.
corpus_clean <- tm_map(corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
tdm <- TermDocumentMatrix(corpus)  # raw (uncleaned) term counts, for comparison
tdm
tdm_clean <- TermDocumentMatrix(corpus_clean)
tdm_clean
# Stem the cleaned corpus and build its term-document matrix.
corpus_clean_stem <- tm_map(corpus_clean, stemDocument, language = "english")
tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)

# Collapse a single-document TDM into a frequency vector, most frequent first.
freq_vector <- function(m) sort(as.matrix(m)[, 1], decreasing = TRUE)

print("dirty")
bow <- freq_vector(tdm)
bow
print("clean")
bow_clean <- freq_vector(tdm_clean)
bow_clean
print("clean stem")
bow_clean_stem <- freq_vector(tdm_clean_stem)
bow_clean_stem

wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)
wordcloud(words = names(bow_clean_stem), freq = bow_clean_stem, min.freq = 1)

# NOTE(review): with a one-document corpus, tf-idf weights come out all zero
# (idf = log2(n_docs / doc_freq) = log2(1/1) = 0) — tm also warns about this.
tdm_tfidf <- TermDocumentMatrix(corpus_clean_stem, control = list(weighting = weightTfIdf))
tdm_tfidf
tfidf_matrix <- as.matrix(tdm_tfidf)
tfidf_matrix
# Rebuild the working corpus from the second article of tm's "crude" dataset.
corpus <- VCorpus(VectorSource(content(crude[[2]])))

# Plot a word cloud of the most frequent terms in a term-document matrix.
#
# @param tdm   A TermDocumentMatrix (count- or tf-idf-weighted).
# @param limit Maximum number of terms to keep and plot.
create_wc <- function(tdm, limit = 20) {
  # Sum each term's frequency across documents and keep the top `limit`.
  # (Bug fix: this previously hard-coded 20, silently ignoring `limit`.)
  word_freq <- head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), limit)
  # tf-idf weighting can produce zero weights; drop them before plotting.
  word_freq <- word_freq[word_freq > 0]
  if (length(word_freq) > 0) {
    wordcloud(names(word_freq), freq = word_freq, max.words = limit, min.freq = 1)
  } else {
    message("No words with positive frequency to plot")
  }
}
# b: word cloud of the raw (uncleaned) corpus, by term count
create_wc(TermDocumentMatrix(corpus))

# c: same cleaning pipeline as before, applied to the crude article
corpus_clean <- tm_map(corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
create_wc(TermDocumentMatrix(corpus_clean))

# d: tf-idf-weighted cloud of the raw corpus
create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))

# e: tf-idf-weighted cloud of the cleaned corpus
create_wc(TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf)))
|