aboutsummaryrefslogtreecommitdiff
path: root/RCluster
diff options
context:
space:
mode:
Diffstat (limited to 'RCluster')
-rw-r--r--RCluster/Rplots.pdfbin0 -> 5625 bytes
-rwxr-xr-xRCluster/main.r93
-rwxr-xr-xRCluster/main2.r21
3 files changed, 114 insertions, 0 deletions
diff --git a/RCluster/Rplots.pdf b/RCluster/Rplots.pdf
new file mode 100644
index 0000000..ef25128
--- /dev/null
+++ b/RCluster/Rplots.pdf
Binary files differ
diff --git a/RCluster/main.r b/RCluster/main.r
new file mode 100755
index 0000000..541078f
--- /dev/null
+++ b/RCluster/main.r
@@ -0,0 +1,93 @@
+#!/usr/bin/env Rscript
+
+# Load the state data; the file is expected to define `income_elec_state`.
+load("income_elec_state.rdata")
+head(income_elec_state)
+# Work on a log10 scale, then keep rows whose log10(elec) is above 2.83.
+income_elec_state <- log10(income_elec_state)
+keep_row <- income_elec_state[["elec"]] > 2.83
+income_elec_state <- income_elec_state[keep_row, ]
+
+# k-means with 4 centers and 1000 random starts; the centers are also kept
+# as a data frame of their own.
+km <- kmeans(income_elec_state, centers = 4, nstart = 1000)
+km_centers <- data.frame(km[["centers"]])
+head(km_centers)
+# Install ggplot2 on first use; requireNamespace() tests without attaching.
+if (!requireNamespace("ggplot2", quietly = TRUE)) {
+  install.packages("ggplot2", repos = "https://cran.r-project.org/")
+}
+library(ggplot2)
+
+# Observations (shape 1) and the k-means centers (shape 13, larger) on the
+# same axes; both layers map colour to the cluster id.
+ggplot(
+  data = income_elec_state,
+  mapping = aes(x = income, y = elec, color = factor(km$cluster))
+) +
+  labs(x = "income", y = "electricity usage", color = "cluster") +
+  geom_point(shape = 1) +
+  geom_point(
+    data = km_centers,
+    mapping = aes(x = income, y = elec, color = factor(rownames(km_centers))),
+    shape = 13,
+    size = 4
+  )
+
+
+# Elbow plot of total within-cluster SS for k = 1..10. vapply() avoids growing
+# `wss` with c(), and k is a data column so aes() needs no global lookup.
+k_values <- 1:10
+wss <- vapply(k_values, function(n_clusters) {
+  kmeans(income_elec_state, n_clusters, nstart = 10)$tot.withinss
+}, numeric(1))
+wss_df <- data.frame(k = k_values, wss = wss)
+ggplot(wss_df, aes(x = k, y = wss)) +
+  geom_path() +
+  geom_point() +
+  scale_x_continuous(breaks = k_values)
+
+# Tukey fences on the (log10) elec column: 1.5 * IQR beyond the quartiles.
+elec_quartiles <- quantile(income_elec_state$elec, probs = c(0.25, 0.75))
+Q1 <- elec_quartiles[1]
+Q3 <- elec_quartiles[2]
+IQR_value <- Q3 - Q1
+lower_bound <- Q1 - 1.5 * IQR_value
+upper_bound <- Q3 + 1.5 * IQR_value
+is_outlier <- with(income_elec_state, elec < lower_bound | elec > upper_bound)
+outliers <- income_elec_state[is_outlier, ]
+cat("Lower bound:", lower_bound, "\n")
+cat("Upper bound:", upper_bound, "\n")
+cat("Number of outliers:", nrow(outliers), "\n")
+head(outliers)
+# Ad-hoc looks at both extremes; `outliers` ends as the income > 4.85 rows.
+outliers <- income_elec_state[income_elec_state$elec < 2.84, ]
+head(outliers)
+outliers <- income_elec_state[income_elec_state$income > 4.85, ]
+head(outliers)
+if (!requireNamespace("maps", quietly = TRUE)) install.packages("maps", repos = "https://cran.r-project.org/")
+library(maps)
+head(km$cluster)
+# map() pairs `col` with its own polygon list (some states span several
+# polygons), so look each polygon's state up in the clustering by name.
+# NOTE(review): assumes names(km$cluster) are state abbreviations or names.
+abb_idx <- match(names(km$cluster), state.abb)
+state_key <- tolower(ifelse(is.na(abb_idx), names(km$cluster), state.name[abb_idx]))
+poly_state <- sub(":.*", "", map("state", fill = TRUE, plot = FALSE, namesonly = TRUE))
+map_color <- unname(km$cluster[match(poly_state, state_key)])
+head(map_color)
+map("state", fill = TRUE, col = map_color)
+# Install ggdendro on first use; requireNamespace() tests without attaching.
+if (!requireNamespace("ggdendro", quietly = TRUE)) {
+  install.packages("ggdendro", repos = "https://cran.r-project.org/")
+}
+library(ggdendro)
+# Hierarchical clustering on pairwise Euclidean distances. Linkage choices:
+#   single   - closest pair of points
+#   complete - farthest pair of points
+#   average  - average distance between all pairs
+distance <- dist(income_elec_state, method = "euclidean")
+h_clust <- hclust(distance, method = "complete")
+h_clust
+plot(ggdendrogram(h_clust))
+cutree(h_clust, k = 2) # cluster membership when the tree is cut into 2 groups
diff --git a/RCluster/main2.r b/RCluster/main2.r
new file mode 100755
index 0000000..82bcf7e
--- /dev/null
+++ b/RCluster/main2.r
@@ -0,0 +1,21 @@
+#!/usr/bin/env Rscript
+
+
+# Small hand-made two-column data set for trying out hierarchical clustering.
+income_elec_state <- data.frame(
+  a = c(1, 3:6, 8, 10, 12, 13),
+  b = c(1:5, seq(6, 12, by = 2))
+)
+plot(income_elec_state)
+
+if (!requireNamespace("ggdendro", quietly = TRUE)) {
+  install.packages("ggdendro", repos = "https://cran.r-project.org/")
+}
+library(ggdendro)
+distance <- dist(income_elec_state, method = "euclidean")
+# single - closest pair, complete - farthest pair, average - mean over pairs.
+# hclust() stops on an unrecognised method name; single linkage is "single".
+h_clust <- hclust(distance, method = "single")
+h_clust
+plot(ggdendrogram(h_clust))
+cutree(h_clust, k = 2) \ No newline at end of file