1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/env Rscript
load("income_elec_state.rdata")
head(income_elec_state)
income_elec_state <- log10(income_elec_state)
income_elec_state <- income_elec_state[income_elec_state$elec > 2.83, ]
k <- 4
km <- kmeans(income_elec_state, k, nstart = 1000)
km_centers <- data.frame(km$centers)
head(km_centers)
if (!require(ggplot2)) install.packages("ggplot2", repos = "https://cran.r-project.org/")
library(ggplot2)
ggplot(
data = income_elec_state,
mapping = aes(x = income, y = elec, color = factor(km$cluster)),
) +
labs(x = "income", y = "electricity usage") +
geom_point(shape = 1) +
geom_point(
data = km_centers,
mapping = aes(
x = income,
y = elec,
color = factor(rownames(km_centers)),
label = NULL
),
shape = 13,
size = 4
)
wss <- NULL
range <- 1:10
for (i in range) {
res <- kmeans(income_elec_state, i, nstart = 10)
wss <- c(wss, res$tot.withinss)
}
wss_df <- data.frame(wss)
ggplot(wss_df, aes(x = range, y = wss)) +
geom_path() +
geom_point() +
scale_x_continuous(breaks = range)
Q1 <- quantile(income_elec_state$elec, 0.25)
Q3 <- quantile(income_elec_state$elec, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- income_elec_state[income_elec_state$elec < lower_bound | income_elec_state$elec > upper_bound, ]
cat("Lower bound:", lower_bound, "\n")
cat("Upper bound:", upper_bound, "\n")
cat("Number of outliers:", nrow(outliers), "\n")
head(outliers)
head(outliers <- income_elec_state[income_elec_state$elec < 2.84, ])
head(outliers <- income_elec_state[income_elec_state$income > 4.85, ])
if (!require(maps)) install.packages("maps", repos = "https://cran.r-project.org/")
library(maps)
head(km$cluster)
map_color <- km$cluster[order(names(km$cluster))]
head(map_color)
map("state", fill = TRUE, col = map_color)
if (!require(ggdendro)) install.packages("ggdendro", repos = "https://cran.r-project.org/")
library(ggdendro)
distance <- dist(income_elec_state, method = "euclidean")
# single - closest pair of points
# complete - farthest pair of points
# average - average distance between all pairs
h_clust <- hclust(distance, method = "complete")
h_clust
plot(ggdendrogram(h_clust))
cutree(h_clust, k = 2)
|