#!/usr/bin/env Rscript load("income_elec_state.rdata") head(income_elec_state) income_elec_state <- log10(income_elec_state) income_elec_state <- income_elec_state[income_elec_state$elec > 2.83, ] k <- 4 km <- kmeans(income_elec_state, k, nstart = 1000) km_centers <- data.frame(km$centers) head(km_centers) if (!require(ggplot2)) install.packages("ggplot2", repos = "https://cran.r-project.org/") library(ggplot2) ggplot( data = income_elec_state, mapping = aes(x = income, y = elec, color = factor(km$cluster)), ) + labs(x = "income", y = "electricity usage") + geom_point(shape = 1) + geom_point( data = km_centers, mapping = aes( x = income, y = elec, color = factor(rownames(km_centers)), label = NULL ), shape = 13, size = 4 ) wss <- NULL range <- 1:10 for (i in range) { res <- kmeans(income_elec_state, i, nstart = 10) wss <- c(wss, res$tot.withinss) } wss_df <- data.frame(wss) ggplot(wss_df, aes(x = range, y = wss)) + geom_path() + geom_point() + scale_x_continuous(breaks = range) Q1 <- quantile(income_elec_state$elec, 0.25) Q3 <- quantile(income_elec_state$elec, 0.75) IQR_value <- Q3 - Q1 lower_bound <- Q1 - 1.5 * IQR_value upper_bound <- Q3 + 1.5 * IQR_value outliers <- income_elec_state[income_elec_state$elec < lower_bound | income_elec_state$elec > upper_bound, ] cat("Lower bound:", lower_bound, "\n") cat("Upper bound:", upper_bound, "\n") cat("Number of outliers:", nrow(outliers), "\n") head(outliers) head(outliers <- income_elec_state[income_elec_state$elec < 2.84, ]) head(outliers <- income_elec_state[income_elec_state$income > 4.85, ]) if (!require(maps)) install.packages("maps", repos = "https://cran.r-project.org/") library(maps) head(km$cluster) map_color <- km$cluster[order(names(km$cluster))] head(map_color) map("state", fill = TRUE, col = map_color) if (!require(ggdendro)) install.packages("ggdendro", repos = "https://cran.r-project.org/") library(ggdendro) distance <- dist(income_elec_state, method = "euclidean") # single - closest pair of points # complete - farthest pair of points # average - average distance between all pairs h_clust <- hclust(distance, method = "complete") h_clust plot(ggdendrogram(h_clust)) cutree(h_clust, k = 2)