#!/usr/bin/env Rscript
# NLP exercise: bag-of-words, stemming, TF-IDF and word clouds using the `tm`
# package on a custom text and on the bundled Reuters "crude" corpus.
# (Reconstructed from patch 53f20d58, "finish", leshe4ka46, 2025-12-25;
# original file: R_NLP/main.r.)

# Install-on-demand, then attach. NOTE(review): `require()` is used here only
# as an availability probe; the `library()` call that follows still fails
# loudly if installation did not succeed.
if (!require(magrittr)) install.packages("magrittr", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(magrittr)

if (!require(tm)) install.packages("tm", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(tm)

if (!require(wordcloud)) install.packages("wordcloud", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(wordcloud)

if (!require(SnowballC)) install.packages("SnowballC", repos = "https://cran.r-project.org/", Ncpus = 16) # nolint
library(SnowballC)

# Reuters "crude" example corpus shipped with `tm`; used again further below.
data("crude")

# Build a custom stop-word list from stop.txt (SMART-style list):
#   1. drop comment lines (those starting with '|') and blank lines,
#   2. keep only the leading run of lowercase letters / apostrophes on each
#      remaining line (strips trailing annotations).
stop_raw <- readLines("stop.txt", encoding = "UTF-8")
stop_nocomm <- stop_raw[
  !grepl("^\\s*\\|", stop_raw) & # remove comment lines
    nzchar(trimws(stop_raw)) # remove empty lines
]
# NOTE(review): this name shadows tm::stopwords(); later code passes the
# character vector explicitly, so behavior is unaffected, but renaming
# (e.g. to `my_stopwords`) would be safer.
stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)
stopwords

# Sample document for the first part of the analysis.
text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"
# ---- Part 1: custom text -------------------------------------------------

corpus <- VCorpus(VectorSource(text))

# Standard cleaning pipeline: lowercase, strip punctuation and digits,
# remove the custom stop words, collapse runs of whitespace.
corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords) %>%
  tm_map(stripWhitespace)

tdm <- TermDocumentMatrix(corpus)
tdm

tdm_clean <- TermDocumentMatrix(corpus_clean)
tdm_clean

# Porter stemming on top of the cleaned corpus.
corpus_clean_stem <- corpus_clean %>%
  tm_map(stemDocument, language = "english")

tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)

# Bag-of-words term counts, sorted by frequency, for each variant.
print("dirty")
bow <- as.matrix(tdm)
bow <- sort(bow[, 1], decreasing = TRUE)
bow

print("clean")
bow_clean <- as.matrix(tdm_clean)
bow_clean <- sort(bow_clean[, 1], decreasing = TRUE)
bow_clean

print("clean stem")
bow_clean_stem <- as.matrix(tdm_clean_stem)
bow_clean_stem <- sort(bow_clean_stem[, 1], decreasing = TRUE)
bow_clean_stem

wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)

wordcloud(words = names(bow_clean_stem), freq = bow_clean_stem, min.freq = 1)

# TF-IDF weighting on the cleaned + stemmed corpus.
tdm_tfidf <- TermDocumentMatrix(corpus_clean_stem, control = list(weighting = weightTfIdf))
tdm_tfidf

tfidf_matrix <- as.matrix(tdm_tfidf)
tfidf_matrix

# ---- Part 2: Reuters "crude" document #2 ---------------------------------

corpus <- VCorpus(VectorSource(content(crude[[2]])))

# Plot a word cloud of the most frequent terms in a term-document matrix.
#
# tdm   : a tm TermDocumentMatrix.
# limit : maximum number of terms to consider/plot (default 20).
#
# Called for its plotting side effect; emits a message instead when no term
# has positive frequency (possible under TF-IDF weighting).
create_wc <- function(tdm, limit = 20) {
  # BUG FIX: the original hard-coded 20 here, silently ignoring `limit`
  # (only max.words used it). Also removed a dead `bow <- as.matrix(tdm)`
  # whose result was never used.
  word_freq <- head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), limit)
  word_freq <- word_freq[word_freq > 0]
  if (length(word_freq) > 0) {
    wordcloud(names(word_freq), freq = word_freq, max.words = limit, min.freq = 1)
  } else {
    message("No words with positive frequency to plot")
  }
}

# b: raw term frequencies
create_wc(TermDocumentMatrix(corpus))

# c: cleaned corpus
corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords) %>%
  tm_map(stripWhitespace)

create_wc(TermDocumentMatrix(corpus_clean))

# d: raw corpus with TF-IDF weighting
# NOTE: the trailing "-- cgit v1.2.3" web-interface footer from the patch
# was not valid R and has been commented out below.
create_wc(TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf)))

# e: cleaned corpus with TF-IDF weighting
create_wc(TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf)))

# -- cgit v1.2.3