aboutsummaryrefslogtreecommitdiff
path: root/R_NLP/main.r
blob: f3aa7f2ed4c1715098cbf1bda0e25515ea151275 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env Rscript
# Install (only if missing) and attach every package this script needs.
# requireNamespace() is the correct availability check: unlike require(),
# it does not attach the package as a side effect, and the single loop
# replaces four copy-pasted install/library stanzas.
pkgs <- c("magrittr", "tm", "wordcloud", "SnowballC")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
  }
  library(pkg, character.only = TRUE)
}

# Load the sample "crude" news corpus bundled with tm (used further below).
data("crude")

# Build the stopword vector from stop.txt. The file appears to be a
# SMART-style list where `|` introduces comments — TODO confirm format.
stop_lines <- readLines("stop.txt", encoding = "UTF-8")
is_comment <- grepl("^\\s*\\|", stop_lines)
is_blank <- !nzchar(trimws(stop_lines))
stop_entries <- stop_lines[!is_comment & !is_blank]
# Keep only the leading lowercase word of each entry, dropping anything
# that follows it (e.g. trailing inline comments).
# NOTE(review): this variable shadows tm::stopwords(); kept because the
# rest of the script refers to it by this name.
stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_entries)
stopwords

# A small hand-written advertisement used as a toy document.
text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"

# Wrap the single string as a one-document corpus.
corpus <- VCorpus(VectorSource(text))

# Standard cleanup, applied step by step: lowercase, strip punctuation
# and digits, drop stopwords, then collapse runs of whitespace.
corpus_clean <- tm_map(corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Term-document matrix of the raw text, printed for comparison.
tdm <- TermDocumentMatrix(corpus)
tdm

# Term-document matrix after cleanup.
tdm_clean <- TermDocumentMatrix(corpus_clean)
tdm_clean

# Additionally stem the cleaned corpus (Porter stemmer via SnowballC).
corpus_clean_stem <- tm_map(corpus_clean, stemDocument, language = "english")

tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)


# Extract the (single) document column as a named frequency vector,
# sorted most-frequent-first. Shared by the three reports below.
doc_freq <- function(m) {
  sort(as.matrix(m)[, 1], decreasing = TRUE)
}

print("dirty")

bow <- doc_freq(tdm)
bow

print("clean")

bow_clean <- doc_freq(tdm_clean)
bow_clean

print("clean stem")

bow_clean_stem <- doc_freq(tdm_clean_stem)
bow_clean_stem

# Word clouds of the cleaned and cleaned+stemmed vocabularies.
wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)

wordcloud(words = names(bow_clean_stem), freq = bow_clean_stem, min.freq = 1)

# NOTE(review): with a one-document corpus every idf factor is
# log2(1/1) = 0, so these tf-idf weights come out all zero and tm
# warns about empty documents — expected for this demo, but worth knowing.
tdm_tfidf <- TermDocumentMatrix(corpus_clean_stem, control = list(weighting = weightTfIdf))
tdm_tfidf

tfidf_matrix <- as.matrix(tdm_tfidf)
tfidf_matrix


# Rebuild `corpus` from the second article of the crude dataset.
article_text <- content(crude[[2]])
corpus <- VCorpus(VectorSource(article_text))

create_wc <- function(tdm, limit = 20) {
  # Plot a word cloud of the most frequent terms in a TermDocumentMatrix.
  #
  # tdm   : a tm TermDocumentMatrix (any weighting, incl. tf-idf).
  # limit : maximum number of words considered and shown.
  #
  # Invisibly returns the named frequency vector that was plotted
  # (empty when nothing had positive weight).
  #
  # Fixes: the original hard-coded `20` in head(), silently ignoring the
  # `limit` argument; it also built as.matrix(tdm) twice, once into a
  # dead local that was never used.
  word_freq <- head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), limit)
  word_freq <- word_freq[word_freq > 0] # tf-idf weighting can yield zeros
  if (length(word_freq) > 0) {
    wordcloud(names(word_freq), freq = word_freq, max.words = limit, min.freq = 1)
  } else {
    message("No words with positive frequency to plot")
  }
  invisible(word_freq)
}

# b — word cloud of the raw, uncleaned article
create_wc(TermDocumentMatrix(corpus))

# c — same article after the standard cleanup steps
corpus_clean <- tm_map(corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

create_wc(TermDocumentMatrix(corpus_clean))

# d — raw article, tf-idf weighted
create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))

# e — cleaned article, tf-idf weighted
create_wc(TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf)))