Diffstat (limited to 'R_NLP/main.r')
| -rwxr-xr-x | R_NLP/main.r | 115 |
1 file changed, 115 insertions, 0 deletions
diff --git a/R_NLP/main.r b/R_NLP/main.r
new file mode 100755
index 0000000..f3aa7f2
--- /dev/null
+++ b/R_NLP/main.r
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+if (!require(magrittr)) install.packages("magrittr", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
+library(magrittr)
+
+
+if (!require(tm)) install.packages("tm", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
+library(tm)
+
+if (!require(wordcloud)) install.packages("wordcloud", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
+library(wordcloud)
+
+if (!require(SnowballC)) install.packages("SnowballC", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
+library(SnowballC)
+
+data("crude")
+
+stop_raw <- readLines("stop.txt", encoding = "UTF-8")
+stop_nocomm <- stop_raw[
+  !grepl("^\\s*\\|", stop_raw) & # remove comment lines
+    nzchar(trimws(stop_raw)) # remove empty lines
+]
+stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)
+stopwords
+
+text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"
+
+
+corpus <- VCorpus(VectorSource(text))
+
+
+corpus_clean <- corpus %>%
+  tm_map(content_transformer(tolower)) %>%
+  tm_map(removePunctuation) %>%
+  tm_map(removeNumbers) %>%
+  tm_map(removeWords, stopwords) %>%
+  tm_map(stripWhitespace)
+
+tdm <- TermDocumentMatrix(corpus)
+tdm
+
+tdm_clean <- TermDocumentMatrix(corpus_clean)
+tdm_clean
+
+corpus_clean_stem <- corpus_clean %>%
+  tm_map(stemDocument, language = "english")
+
+tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)
+
+
+print("dirty")
+
+bow <- as.matrix(tdm)
+bow <- sort(bow[, 1], decreasing = TRUE)
+bow
+
+print("clean")
+
+bow_clean <- as.matrix(tdm_clean)
+bow_clean <- sort(bow_clean[, 1], decreasing = TRUE)
+bow_clean
+
+
+print("clean stem")
+
+bow_clean_stem <- as.matrix(tdm_clean_stem)
+bow_clean_stem <- sort(bow_clean_stem[, 1], decreasing = TRUE)
+bow_clean_stem
+
+wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)
+
+
+wordcloud(words = names(bow_clean_stem), freq = bow_clean_stem, min.freq = 1)
+
+
+tdm_tfidf <- TermDocumentMatrix(corpus_clean_stem, control = list(weighting = weightTfIdf))
+tdm_tfidf
+
+tfidf_matrix <- as.matrix(tdm_tfidf)
+tfidf_matrix
+
+
+corpus <- VCorpus(VectorSource(content(crude[[2]])))
+
+create_wc <- function(tdm, limit = 20) {
+  bow <- as.matrix(tdm)
+  bow
+
+  word_freq <- head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), limit)
+  # word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
+  word_freq <- word_freq[word_freq > 0]
+  if (length(word_freq) > 0) {
+    wordcloud(names(word_freq), freq = word_freq, max.words = limit, min.freq = 1)
+  } else {
+    message("No words with positive frequency to plot")
+  }
+}
+
+# b
+create_wc(TermDocumentMatrix(corpus))
+
+# c
+corpus_clean <- corpus %>%
+  tm_map(content_transformer(tolower)) %>%
+  tm_map(removePunctuation) %>%
+  tm_map(removeNumbers) %>%
+  tm_map(removeWords, stopwords) %>%
+  tm_map(stripWhitespace)
+
+create_wc(TermDocumentMatrix(corpus_clean))
+
+# d
+create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))
+
+# e
+create_wc(TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf)))
