#!/usr/bin/env Rscript
if (!require(magrittr)) install.packages("magrittr", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(magrittr)


if (!require(tm)) install.packages("tm", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(tm)

if (!require(wordcloud)) install.packages("wordcloud", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(wordcloud)

if (!require(SnowballC)) install.packages("SnowballC", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(SnowballC)

# Load the `crude` example corpus; it is used further down via crude[[2]].
# NOTE(review): presumably the dataset shipped with tm — confirm.
data("crude")

# Read the stop-word list from stop.txt in the working directory.
# Lines whose first non-blank character is `|` are comments, and blank lines
# carry no word, so both kinds are dropped.
stop_raw <- readLines("stop.txt", encoding = "UTF-8")
drop_line <- grepl("^\\s*\\|", stop_raw) | !nzchar(trimws(stop_raw))
stop_nocomm <- stop_raw[!drop_line]

# Keep only the leading run of lower-case letters / apostrophes on each line;
# whatever follows the word (e.g. a trailing comment) is discarded.
# NOTE: this variable shares its name with tm's stopwords() function.
stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)
print(stopwords)

# Small example document for the walkthrough below.
text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"

# One-document corpus built from the example text.
corpus <- VCorpus(VectorSource(text))

# Cleaning pipeline. Order matters: stop words must be removed *before*
# punctuation is stripped. The stop-word list keeps apostrophes (e.g.
# "you've", "you're"); stripping punctuation first would turn those tokens
# into "youve"/"youre", which no longer match any stop word and would survive
# the cleaning. removeWords matches on word boundaries, so adjacent
# punctuation such as "you!" does not prevent a match.
corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

# Term-document matrix of the untouched corpus (printed for inspection).
tdm <- TermDocumentMatrix(corpus)
tdm

# Term-document matrix of the cleaned corpus (printed for inspection).
tdm_clean <- TermDocumentMatrix(corpus_clean)
tdm_clean

# Stem the cleaned corpus and build its term-document matrix.
corpus_clean_stem <- corpus_clean %>%
  tm_map(stemDocument, language = "english")

tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)


# For each variant of the corpus, take the term counts of the first (only)
# document column and print them from most to least frequent.

# Raw corpus.
print("dirty")
bow <- sort(as.matrix(tdm)[, 1], decreasing = TRUE)
print(bow)

# Cleaned corpus.
print("clean")
bow_clean <- sort(as.matrix(tdm_clean)[, 1], decreasing = TRUE)
print(bow_clean)

# Cleaned and stemmed corpus.
print("clean stem")
bow_clean_stem <- sort(as.matrix(tdm_clean_stem)[, 1], decreasing = TRUE)
print(bow_clean_stem)

# Word cloud of the cleaned corpus; min.freq = 1 keeps every counted term.
wordcloud(
  names(bow_clean),
  bow_clean,
  min.freq = 1
)

# Same plot for the cleaned and stemmed corpus.
wordcloud(
  names(bow_clean_stem),
  bow_clean_stem,
  min.freq = 1
)


# TF-IDF weighted term-document matrix of the cleaned, stemmed corpus.
# NOTE(review): the corpus holds a single document, so the inverse document
# frequency of every term is presumably log(1/1) = 0 and all weights come out
# as zero — confirm this is the intended demonstration.
tdm_tfidf <- corpus_clean_stem %>%
  TermDocumentMatrix(control = list(weighting = weightTfIdf))
print(tdm_tfidf)

# Dense view of the weights, printed for inspection.
tfidf_matrix <- as.matrix(tdm_tfidf)
print(tfidf_matrix)


# Replace the example corpus with one built from the text of the second
# document in `crude`; the statements after create_wc() operate on it.
corpus <- crude[[2]] %>%
  content() %>%
  VectorSource() %>%
  VCorpus()

# Draw a word cloud of the highest-weighted terms of a term-document matrix.
#
# Args:
#   tdm:   A term-document matrix, or anything as.matrix() can convert to a
#          numeric matrix with terms as row names. Weights are summed across
#          all document columns.
#   limit: Maximum number of terms to plot (single number >= 1, default 20).
#
# Returns:
#   Invisibly, the named vector of weights that was plotted, sorted in
#   decreasing order. The vector is empty when no term has a positive weight;
#   in that case nothing is drawn and a message is emitted instead.
create_wc <- function(tdm, limit = 20) {
  stopifnot(
    is.numeric(limit),
    length(limit) == 1L,
    !is.na(limit),
    limit >= 1
  )

  term_weights <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  positive <- term_weights[term_weights > 0]

  # Honour `limit` (previously a hard-coded 20 silently capped larger limits).
  n_keep <- min(floor(limit), length(positive))
  word_freq <- positive[seq_len(n_keep)]

  if (length(word_freq) > 0) {
    # min.freq = 0: non-positive weights are already filtered out above, and a
    # threshold of 1 would discard fractional weights such as tf-idf scores.
    wordcloud(
      names(word_freq),
      freq = word_freq,
      max.words = limit,
      min.freq = 0
    )
  } else {
    message("No words with positive frequency to plot")
  }

  invisible(word_freq)
}

# b: word cloud of the raw corpus, plain term counts
create_wc(TermDocumentMatrix(corpus))

# c: word cloud of the cleaned corpus, plain term counts
# Stop words are removed before punctuation is stripped: the stop-word list
# keeps apostrophes (e.g. "you've"), so stripping punctuation first would turn
# such tokens into forms ("youve") that no longer match and would be kept.
corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

create_wc(TermDocumentMatrix(corpus_clean))

# d: word cloud of the raw corpus, tf-idf weights
# NOTE(review): if `corpus` holds a single document, every tf-idf weight is
# presumably zero and create_wc() only reports that nothing can be plotted.
create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))

# e: word cloud of the cleaned corpus, tf-idf weights
create_wc(TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf)))