aboutsummaryrefslogtreecommitdiff
path: root/R_NLP
diff options
context:
space:
mode:
authorleshe4ka46 <alex9102naid1@ya.ru>2025-12-25 21:28:30 +0300
committerleshe4ka46 <alex9102naid1@ya.ru>2025-12-25 21:28:30 +0300
commit53f20d58628171934c097dff5602fe17765eae99 (patch)
tree83f7344f76924ffd0aa81c2fdc4ee09fa3de9459 /R_NLP
parent175ac10904d0f31c3ffeeeed507c8914f13d0b15 (diff)
finishHEADmain
Diffstat (limited to 'R_NLP')
-rw-r--r--R_NLP/Rplots.pdfbin0 -> 6414 bytes
-rwxr-xr-xR_NLP/main.r115
-rw-r--r--R_NLP/stop.txt312
3 files changed, 427 insertions, 0 deletions
diff --git a/R_NLP/Rplots.pdf b/R_NLP/Rplots.pdf
new file mode 100644
index 0000000..518e93b
--- /dev/null
+++ b/R_NLP/Rplots.pdf
Binary files differ
diff --git a/R_NLP/main.r b/R_NLP/main.r
new file mode 100755
index 0000000..f3aa7f2
--- /dev/null
+++ b/R_NLP/main.r
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+# Attach every package the script depends on, installing from CRAN first when
+# one is missing. `requireNamespace()` is used for the check because it only
+# tests availability; `library()` then attaches the package and fails loudly
+# if the installation did not succeed.
+required_pkgs <- c("magrittr", "tm", "wordcloud", "SnowballC")
+for (pkg in required_pkgs) {
+  if (!requireNamespace(pkg, quietly = TRUE)) {
+    install.packages(
+      pkg,
+      repos = "https://cran.r-project.org/",
+      Ncpus = 16
+    )
+  }
+  library(pkg, character.only = TRUE)
+}
+
+# Make the `crude` dataset available; it is used later in the script.
+data("crude")
+
+# Read the stop list. In that file a vertical bar starts a comment and each
+# stop word sits at the very beginning of its line.
+stop_raw <- readLines("stop.txt", encoding = "UTF-8")
+
+# Drop pure comment lines first, then lines that are empty once trimmed.
+stop_nocomm <- grep("^\\s*\\|", stop_raw, invert = TRUE, value = TRUE)
+stop_nocomm <- stop_nocomm[trimws(stop_nocomm) != ""]
+
+# Keep only the leading run of lowercase letters/apostrophes on each line,
+# discarding any trailing " | comment" text.
+stopwords <- sub("^([a-z']*)(.|\\s)*", "\\1", stop_nocomm)
+print(stopwords)
+
+# ---- Bag-of-words walkthrough on a short sample text ----
+text <- "Sneaky Fees! Do you carefully read every word on your statement and every notice your bank every sent you? If you've missed one, Yoyodyne Bank is NOT the bank for you! Close all your accounts especially if you're going overseas!!"
+
+# Corpus built from the sample string.
+corpus <- VCorpus(VectorSource(text))
+
+# Normalise the text. Stop words are removed BEFORE punctuation is stripped:
+# the stop list contains contractions such as "you've" and "you're", and once
+# the apostrophe is gone they become "youve"/"youre", which no longer match
+# any entry in the list and would survive the filter.
+corpus_clean <- corpus %>%
+  tm_map(content_transformer(tolower)) %>%
+  tm_map(removeWords, stopwords) %>%
+  tm_map(removePunctuation) %>%
+  tm_map(removeNumbers) %>%
+  tm_map(stripWhitespace)
+
+# Term-document matrices for the raw and the cleaned corpus.
+tdm <- TermDocumentMatrix(corpus)
+tdm
+
+tdm_clean <- TermDocumentMatrix(corpus_clean)
+tdm_clean
+
+# Stemmed variant of the cleaned corpus.
+corpus_clean_stem <- corpus_clean %>%
+  tm_map(stemDocument, language = "english")
+
+tdm_clean_stem <- TermDocumentMatrix(corpus_clean_stem)
+
+# Term counts of the first document, most frequent first, for each variant.
+print("dirty")
+bow <- sort(as.matrix(tdm)[, 1], decreasing = TRUE)
+bow
+
+print("clean")
+bow_clean <- sort(as.matrix(tdm_clean)[, 1], decreasing = TRUE)
+bow_clean
+
+print("clean stem")
+bow_clean_stem <- sort(as.matrix(tdm_clean_stem)[, 1], decreasing = TRUE)
+bow_clean_stem
+
+# Word clouds for the cleaned and the cleaned + stemmed term counts.
+wordcloud(words = names(bow_clean), freq = bow_clean, min.freq = 1)
+
+wordcloud(
+  words = names(bow_clean_stem),
+  freq = bow_clean_stem,
+  min.freq = 1
+)
+
+# TF-IDF weighting of the stemmed corpus.
+# NOTE(review): this corpus appears to hold a single document, in which case
+# the IDF factor is log2(1/1) = 0 and every weight comes out as zero. Confirm
+# that this is the intended demonstration.
+tdm_tfidf <- TermDocumentMatrix(
+  corpus_clean_stem,
+  control = list(weighting = weightTfIdf)
+)
+tdm_tfidf
+
+tfidf_matrix <- as.matrix(tdm_tfidf)
+tfidf_matrix
+
+
+# Replace the working corpus with one built from the text content of the
+# second document in `crude`.
+corpus <- crude[[2]] %>%
+  content() %>%
+  VectorSource() %>%
+  VCorpus()
+
+#' Draw a word cloud of the highest-weighted terms of a term-document matrix.
+#'
+#' Term weights are summed across all documents (matrix rows are terms), the
+#' `limit` largest are kept, and terms whose total is not positive are dropped
+#' before plotting.
+#'
+#' @param tdm A term-document matrix, or anything `as.matrix()` converts to a
+#'   numeric matrix with terms as row names.
+#' @param limit Maximum number of terms to plot. Defaults to 20.
+#' @return The named vector of plotted term weights, invisibly. It is empty
+#'   when no term has a positive weight, in which case only a message is
+#'   emitted and nothing is drawn.
+create_wc <- function(tdm, limit = 20) {
+  # Total weight per term, largest first.
+  term_weights <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
+
+  # Truncate with `limit` rather than a fixed 20 so the parameter is honoured
+  # here as well as in `max.words` below.
+  word_freq <- head(term_weights, limit)
+
+  # Keep only terms with a positive total weight.
+  word_freq <- word_freq[word_freq > 0]
+
+  if (length(word_freq) > 0) {
+    wordcloud(
+      names(word_freq),
+      freq = word_freq,
+      max.words = limit,
+      min.freq = 1
+    )
+  } else {
+    message("No words with positive frequency to plot")
+  }
+  invisible(word_freq)
+}
+
+# b: word cloud of the corpus as-is
+create_wc(TermDocumentMatrix(corpus))
+
+# c: word cloud after cleaning.
+# Stop words are removed before punctuation is stripped so that contraction
+# entries in the stop list (e.g. "it's", "don't") still match; once the
+# apostrophe is gone they would no longer equal any list entry.
+corpus_clean <- corpus %>%
+  tm_map(content_transformer(tolower)) %>%
+  tm_map(removeWords, stopwords) %>%
+  tm_map(removePunctuation) %>%
+  tm_map(removeNumbers) %>%
+  tm_map(stripWhitespace)
+
+create_wc(TermDocumentMatrix(corpus_clean))
+
+# d: TF-IDF weighted word cloud of the uncleaned corpus.
+# NOTE(review): if `corpus` holds a single document, the IDF factor is
+# log2(1/1) = 0, so all weights are zero and create_wc() only emits its
+# "No words ..." message. Confirm this is intended.
+create_wc(
+  TermDocumentMatrix(corpus, control = list(weighting = weightTfIdf))
+)
+
+# e: TF-IDF weighted word cloud of the cleaned corpus (same caveat as d).
+create_wc(
+  TermDocumentMatrix(corpus_clean, control = list(weighting = weightTfIdf))
+)
diff --git a/R_NLP/stop.txt b/R_NLP/stop.txt
new file mode 100644
index 0000000..aee35c5
--- /dev/null
+++ b/R_NLP/stop.txt
@@ -0,0 +1,312 @@
+
+ | An English stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | Many of the forms below are quite rare (e.g. "yourselves") but included for
+ | completeness.
+
+ | PRONOUNS FORMS
+ | 1st person sing
+
+i | subject, always in upper case of course
+
+me | object
+my | possessive adjective
+ | the possessive pronoun `mine' is best suppressed, because of the
+ | sense of coal-mine etc.
+myself | reflexive
+ | 1st person plural
+we | subject
+
+| us | object
+ | care is required here because US = United States. It is usually
+ | safe to remove it if it is in lower case.
+our | possessive adjective
+ours | possessive pronoun
+ourselves | reflexive
+ | second person (archaic `thou' forms not included)
+you | subject and object
+your | possessive adjective
+yours | possessive pronoun
+yourself | reflexive (singular)
+yourselves | reflexive (plural)
+ | third person singular
+he | subject
+him | object
+his | possessive adjective and pronoun
+himself | reflexive
+
+she | subject
+her | object and possessive adjective
+hers | possessive pronoun
+herself | reflexive
+
+it | subject and object
+its | possessive adjective
+itself | reflexive
+ | third person plural
+they | subject
+them | object
+their | possessive adjective
+theirs | possessive pronoun
+themselves | reflexive
+ | other forms (demonstratives, interrogatives)
+what
+which
+who
+whom
+this
+that
+these
+those
+
+ | VERB FORMS (using F.R. Palmer's nomenclature)
+ | BE
+am | 1st person, present
+is | -s form (3rd person, present)
+are | present
+was | 1st person, past
+were | past
+be | infinitive
+been | past participle
+being | -ing form
+ | HAVE
+have | simple
+has | -s form
+had | past
+having | -ing form
+ | DO
+do | simple
+does | -s form
+did | past
+doing | -ing form
+
+ | The forms below are, I believe, best omitted, because of the significant
+ | homonym forms:
+
+ | He made a WILL
+ | old tin CAN
+ | merry month of MAY
+ | a smell of MUST
+ | fight the good fight with all thy MIGHT
+
+ | would, could, should, ought might however be included
+
+ | | AUXILIARIES
+ | | WILL
+ |will
+
+would
+
+ | | SHALL
+ |shall
+
+should
+
+ | | CAN
+ |can
+
+could
+
+ | | MAY
+ |may
+ |might
+ | | MUST
+ |must
+ | | OUGHT
+
+ought
+
+ | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
+ | pronoun + verb
+
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+
+ | verb + negation
+
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+
+ | auxiliary + negation
+
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+
+ | miscellaneous forms
+
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+
+ | rarer forms
+
+ | daren't needn't
+
+ | doubtful forms
+
+ | oughtn't mightn't
+
+ | ARTICLES
+a
+an
+the
+
+ | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
+ | high, that classification is pointless.)
+and
+but
+if
+or
+because
+as
+until
+while
+
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+
+again
+further
+then
+once
+
+here
+there
+when
+where
+why
+how
+
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+
+ | Just for the record, the following words are among the commonest in English
+
+ | one
+ | every
+ | least
+ | less
+ | many
+ | now
+ | ever
+ | never
+ | say
+ | says
+ | said
+ | also
+ | get
+ | go
+ | goes
+ | just
+ | made
+ | make
+ | put
+ | see
+ | seen
+ | whether
+ | like
+ | well
+ | back
+ | even
+ | still
+ | way
+ | take
+ | since
+ | another
+ | however
+ | two
+ | three
+ | four
+ | five
+ | first
+ | second
+ | new
+ | old
+ | high
+ | long
+