#!/usr/bin/env Rscript

# Setup for the text-mining exercise: make sure the required packages are
# installed and attached, load the `crude` example corpus, parse the stop-word
# list from "stop.txt", and define the sample text analysed further below.

# Dependencies ----

# requireNamespace() only checks that a package is installed; unlike require()
# it does not attach it, so attaching is left to the explicit library() call
# (which fails loudly if the installation did not succeed).
cran_repo <- "https://cran.r-project.org/"

if (!requireNamespace("magrittr", quietly = TRUE)) {
  install.packages("magrittr", repos = cran_repo, Ncpus = 16)
}
library(magrittr)

if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm", repos = cran_repo, Ncpus = 16)
}
library(tm)

if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = cran_repo, Ncpus = 16)
}
library(wordcloud)

if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC", repos = cran_repo, Ncpus = 16)
}
library(SnowballC)

# Data ----

# Loads the `crude` corpus into the workspace; it is used by the word-cloud
# part of the script.
data("crude")

# Stop words ----

# "stop.txt" is read relative to the current working directory.
stop_raw <- readLines("stop.txt", encoding = "UTF-8")

# Keep only lines that carry an entry: drop lines whose first non-blank
# character is "|" (comment lines) and lines that are empty or whitespace-only.
stop_nocomm <- stop_raw[
  !grepl("^\\s*\\|", stop_raw) &
    nzchar(trimws(stop_raw))
]

# Keep the leading run of lower-case letters / apostrophes on each line and
# discard everything after it (e.g. a trailing "| comment").
# NOTE: this variable masks tm::stopwords() when referenced as a value; the
# name is kept because the rest of the script refers to it.
stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)

# A line that does not start with [a-z'] yields an empty string; drop those so
# no empty entry is passed on to removeWords().
stopwords <- stopwords[nzchar(stopwords)]
print(stopwords)

# Sample text ----

# Pieces are joined with single spaces, giving one string.
text <- paste(
  "Sneaky Fees! Do you carefully read every word on your statement and",
  "every notice your bank every sent you? If you've missed one, Yoyodyne",
  "Bank is NOT the bank for you! Close all your accounts especially if",
  "you're going overseas!!"
)
# Bag-of-words comparison for the sample `text` (raw vs. cleaned vs. stemmed),
# followed by word clouds for one document of the `crude` corpus under
# term-frequency and TF-IDF weighting.
# Relies on `text`, `stopwords` and `crude` being defined earlier in the script.

# Helpers ----

# Normalise a corpus: lower-case, strip punctuation and digits, remove the
# given stop words, then collapse the whitespace left behind.
#   corpus      a tm corpus
#   stop_terms  character vector of words to remove
# Returns the transformed corpus.
clean_corpus <- function(corpus, stop_terms) {
  corpus %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stop_terms) %>%
    tm_map(stripWhitespace)
}

# Term weights of the first document of a term-document matrix, as a named
# vector sorted from most to least frequent.
sorted_term_counts <- function(tdm) {
  sort(as.matrix(tdm)[, 1], decreasing = TRUE)
}

# Draw a word cloud of the highest-weighted terms of a term-document matrix.
#   tdm    a term-document matrix (anything as.matrix() can convert, with
#          terms as row names)
#   limit  maximum number of terms to plot (default 20)
# Weights are summed across documents. Terms whose total is not positive are
# dropped; if nothing is left, a message is emitted instead of plotting.
# Returns the plotted weights invisibly.
create_wc <- function(tdm, limit = 20) {
  totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  # Honour `limit` here as well; the cut-off used to be hard-coded to 20, so a
  # larger `limit` had no effect.
  word_freq <- head(totals, limit)
  word_freq <- word_freq[word_freq > 0]

  if (length(word_freq) > 0) {
    wordcloud(
      names(word_freq),
      freq = word_freq,
      max.words = limit,
      min.freq = 1
    )
  } else {
    message("No words with positive frequency to plot")
  }
  invisible(word_freq)
}

# Sample text: raw vs. cleaned vs. stemmed ----

corpus <- VCorpus(VectorSource(text))
corpus_clean <- clean_corpus(corpus, stopwords)

tdm <- TermDocumentMatrix(corpus)
print(tdm)

tdm_clean <- TermDocumentMatrix(corpus_clean)
print(tdm_clean)

corpus_clean_stem <- corpus_clean %>%
  tm_map(stemDocument, language = "english")
tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)

print("dirty")
bow <- sorted_term_counts(tdm)
print(bow)

print("clean")
bow_clean <- sorted_term_counts(tdm_clean)
print(bow_clean)

print("clean stem")
bow_clean_stem <- sorted_term_counts(tdm_clean_stem)
print(bow_clean_stem)

wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)
wordcloud(words = names(bow_clean_stem), freq = bow_clean_stem, min.freq = 1)

# TF-IDF weighting of the stemmed sample text.
# NOTE(review): this corpus holds a single document, so the inverse document
# frequency is presumably 0 for every term and all weights come out as zero;
# confirm this is the intended demonstration.
tdm_tfidf <- TermDocumentMatrix(
  corpus_clean_stem,
  control = list(weighting = weightTfIdf)
)
print(tdm_tfidf)

tfidf_matrix <- as.matrix(tdm_tfidf)
print(tfidf_matrix)

# Word clouds for the second `crude` document ----

# Each element of the document's content becomes one document of the corpus.
corpus <- VCorpus(VectorSource(content(crude[[2]])))

# b: raw text, term-frequency weighting
create_wc(TermDocumentMatrix(corpus))

# c: cleaned text, term-frequency weighting
corpus_clean <- clean_corpus(corpus, stopwords)
create_wc(TermDocumentMatrix(corpus_clean))

# d: raw text, TF-IDF weighting
create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))

# e: cleaned text, TF-IDF weighting
create_wc(
  TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf))
)