1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/env Rscript
# Load the income data: a "|"-delimited text file whose first line is a
# header. The path is relative, so it is resolved against the current working
# directory.
zip_income <- read.table("zipIncome.txt", header = TRUE, sep = "|")

# Preview the first rows under the column names taken from the file header.
# print() is explicit so the preview is also shown when the script is run
# through source(), which does not auto-print top-level values by default.
print(head(zip_income))

# Rename the two columns to the identifiers used by the rest of the script
# (first column -> zipCode, second column -> income), then preview again.
names(zip_income) <- c("zipCode", "income")
print(head(zip_income))
# Mean and median of income over the whole, unfiltered data set.
overall_mean <- mean(zip_income$income)
overall_median <- median(zip_income$income)

# Per-column summary of the full data frame. Printed explicitly so the table
# is shown regardless of how the script is executed (Rscript or source()).
print(summary(zip_income))

cat("Mean income:", overall_mean, "\n")
cat("Median income:", overall_median, "\n")
# Flag income outliers with the 1.5 * IQR rule: a row is an outlier when its
# income lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
Q1 <- quantile(zip_income$income, 0.25)
Q3 <- quantile(zip_income$income, 0.75)
IQR_value <- IQR(zip_income$income)

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Row mask: TRUE where income falls outside [lower_bound, upper_bound].
is_outlier <- zip_income$income < lower_bound |
  zip_income$income > upper_bound
outliers <- zip_income[is_outlier, ]

cat("Lower bound:", lower_bound, "\n")
cat("Upper bound:", upper_bound, "\n")
cat("Number of outliers:", nrow(outliers), "\n")

# Show the first few outlier rows. print() is explicit so they also appear
# when the script is run through source().
print(head(outliers))
# Collapse income to one value per zip code using the supplied statistic.
# Returns a data frame with columns zipCode and income.
summarise_income_by_zip <- function(stat) {
  aggregate(income ~ zipCode, data = zip_income, FUN = stat)
}

# Per-zip-code mean income.
mean_by_zip <- summarise_income_by_zip(mean)
print(mean_by_zip)

# Per-zip-code median income.
median_by_zip <- summarise_income_by_zip(median)
print(median_by_zip)
# Scatter plot of every observation: zip code on the x axis, income on the y
# axis. The argument list must not end with a trailing comma: an empty
# argument would be forwarded through `...` to the underlying plotting calls.
plot(zip_income$zipCode, zip_income$income,
  main = "income by zip code",
  xlab = "zip code",
  ylab = "income"
)
# Income cut-offs for the filtered analysis. Only rows with income strictly
# between the two values are kept; both bounds are exclusive.
income_floor <- 7000
income_ceiling <- 200000

in_range <- zip_income$income > income_floor &
  zip_income$income < income_ceiling
zip_income_filtered <- zip_income[in_range, ]

# Per-column summary of the filtered rows. Printed explicitly so the table is
# shown regardless of how the script is executed (Rscript or source()).
print(summary(zip_income_filtered))

# Mean and median income recomputed on the filtered rows only.
new_mean <- mean(zip_income_filtered$income)
new_median <- median(zip_income_filtered$income)
cat("Filtered mean income:", new_mean, "\n")
cat("Filtered median income:", new_median, "\n")
# Both box plots below share one title and draw from the filtered rows.
boxplot_title <- "income by zip code"

# Income distribution per zip code on the raw income scale.
boxplot(
  income ~ zipCode,
  data = zip_income_filtered,
  xlab = "Zip Codes",
  ylab = "Income",
  main = boxplot_title
)

# Same grouping, with income transformed by log10 before plotting.
boxplot(
  log10(income) ~ zipCode,
  data = zip_income_filtered,
  xlab = "Zip Codes",
  ylab = "log10(Income)",
  main = boxplot_title
)
# Install ggplot2 only when it is missing. requireNamespace() tests for
# availability without attaching the package; library() then attaches it and
# raises an error if the package still cannot be loaded.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.r-project.org/")
}
library(ggplot2)

# Scaffold shared by both plots: zip code as a discrete x axis, income on a
# log10-scaled y axis, common labels and title, built from the filtered rows.
income_by_zip_base <- ggplot(
  zip_income_filtered,
  aes(x = factor(zipCode), y = income)
) +
  scale_y_log10() +
  xlab("Zip Code") +
  ylab("Income (log10)") +
  ggtitle("Scatter plot of income by zip code")

# Plot 1: jittered, semi-transparent points.
# ggplot objects are drawn only when printed; print() is explicit so the
# figure is produced when the script is run through source() as well.
print(
  income_by_zip_base +
    geom_point(position = "jitter", alpha = 0.2)
)

# Plot 2: the same points coloured by zip code, with a translucent box plot
# drawn on top. outlier.shape = NA suppresses the box plot's own outlier
# markers, because every observation is already drawn by geom_point().
print(
  income_by_zip_base +
    geom_point(
      aes(colour = factor(zipCode)),
      position = "jitter",
      alpha = 0.2
    ) +
    geom_boxplot(alpha = 0.1, outlier.shape = NA)
)
|