aboutsummaryrefslogtreecommitdiff
path: root/inlab23/main.r
blob: 9a10ea3363633c31960fe8aa51021437bd7745f7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env Rscript

# Load the pipe-delimited table; the first line of the file supplies the
# initial column names.
zip_income <- read.table(
  file = "zipIncome.txt",
  sep = "|",
  header = TRUE
)

# Preview the first rows exactly as they were read.
head(zip_income)

# Replace the header names with the two names the rest of the script uses.
# NOTE(review): assumes the file has exactly two columns -- confirm.
colnames(zip_income) <- c("zipCode", "income")

# Preview again to show the renamed columns.
head(zip_income)

# Mean and median of the income column over every row of the table.
overall_mean <- with(zip_income, mean(income))
overall_median <- with(zip_income, median(income))

# Per-column summary statistics of the full table.
summary(zip_income)

# Report the two figures computed above, one per line.
cat("Mean income:", overall_mean, "\n")
cat("Median income:", overall_median, "\n")

# Flag rows whose income lies outside the 1.5 * IQR fences around the first
# and third quartiles.

# Both quartiles in a single call; each element keeps its "25%" / "75%" name.
quartiles <- quantile(zip_income[["income"]], probs = c(0.25, 0.75))
Q1 <- quartiles[1]
Q3 <- quartiles[2]
IQR_value <- IQR(zip_income[["income"]])

# Distance from each quartile to its fence.
fence_width <- 1.5 * IQR_value
lower_bound <- Q1 - fence_width
upper_bound <- Q3 + fence_width

# A row is an outlier when its income is not inside [lower_bound, upper_bound].
income_in_fences <- zip_income[["income"]] >= lower_bound &
  zip_income[["income"]] <= upper_bound
outliers <- zip_income[!income_in_fences, ]

cat("Lower bound:", lower_bound, "\n")
cat("Upper bound:", upper_bound, "\n")
cat("Number of outliers:", nrow(outliers), "\n")

# Show the first few flagged rows.
head(outliers)

# One formula shared by both aggregations: income grouped by zip code.
income_by_zip <- income ~ zipCode

# Mean income within each zip code.
mean_by_zip <- aggregate(income_by_zip, data = zip_income, FUN = mean)
print(mean_by_zip)

# Median income within each zip code.
median_by_zip <- aggregate(income_by_zip, data = zip_income, FUN = median)
print(median_by_zip)

# Scatter plot of income against zip code for every row of the unfiltered
# table. The argument list ends without a trailing comma so that no empty
# positional argument is passed on to the plotting method.
plot(zip_income$zipCode, zip_income$income,
  main = "income by zip code",
  xlab = "zip code",
  ylab = "income"
)

# Keep only rows whose income is strictly between 7000 and 200000.
income_in_window <- with(zip_income, income > 7000 & income < 200000)
zip_income_filtered <- zip_income[income_in_window, ]

# Per-column summary statistics of the filtered table.
summary(zip_income_filtered)

# Mean and median income recomputed on the filtered rows.
new_mean <- with(zip_income_filtered, mean(income))
new_median <- with(zip_income_filtered, median(income))

cat("Filtered mean income:", new_mean, "\n")
cat("Filtered median income:", new_median, "\n")

# Box plot of filtered income for each zip code, on the raw scale.
boxplot(
  income ~ zipCode,
  data = zip_income_filtered,
  ylab = "Income",
  xlab = "Zip Codes",
  main = "income by zip code"
)

# Same grouping, with income transformed by log10 before plotting.
boxplot(
  log10(income) ~ zipCode,
  data = zip_income_filtered,
  ylab = "log10(Income)",
  xlab = "Zip Codes",
  main = "income by zip code"
)

# Make sure ggplot2 is installed, then attach it.
# requireNamespace() tests whether the package can be loaded without attaching
# it and returns FALSE (rather than erroring) when it cannot; in that case the
# package is installed from CRAN. library() then attaches it and stops the
# script with an error if the package is still unavailable.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.r-project.org/")
}
library(ggplot2)

# Scaffold shared by both figures: filtered incomes on a log10 y axis, with
# zip code treated as a discrete x variable, plus the common labels.
income_by_zip_plot <- ggplot(
  zip_income_filtered,
  aes(x = factor(zipCode), y = income)
) +
  scale_y_log10() +
  labs(
    x = "Zip Code",
    y = "Income (log10)",
    title = "Scatter plot of income by zip code"
  )

# Figure 1: jittered, semi-transparent points only.
income_by_zip_plot +
  geom_point(position = "jitter", alpha = 0.2)

# Figure 2: points coloured by zip code with a faint box plot drawn on top;
# the box plot's own outlier markers are suppressed (shape NA, size -Inf).
income_by_zip_plot +
  geom_point(
    aes(colour = factor(zipCode)),
    position = "jitter",
    alpha = 0.2
  ) +
  geom_boxplot(alpha = 0.1, outlier.shape = NA, outlier.size = -Inf)