#!/usr/bin/env Rscript

# Exploratory analysis of income by zip code.
#
# Reads the pipe-delimited file "zipIncome.txt" (with a header row) from the
# working directory, prints summary statistics, flags outliers using the
# 1.5 * IQR rule, and plots income per zip code with base graphics and ggplot2.

# Load data ----
zip_income <- read.table("zipIncome.txt", header = TRUE, sep = "|")
head(zip_income)

# Replace the column names from the file with the names used below.
names(zip_income) <- c("zipCode", "income")
head(zip_income)

# Overall summary ----
overall_mean <- mean(zip_income$income)
overall_median <- median(zip_income$income)
summary(zip_income)
cat("Mean income:", overall_mean, "\n")
cat("Median income:", overall_median, "\n")

# Outlier detection (1.5 * IQR beyond the quartiles) ----
Q1 <- quantile(zip_income$income, 0.25)
Q3 <- quantile(zip_income$income, 0.75)
IQR_value <- IQR(zip_income$income)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

is_outlier <- zip_income$income < lower_bound |
  zip_income$income > upper_bound
outliers <- zip_income[is_outlier, ]

cat("Lower bound:", lower_bound, "\n")
cat("Upper bound:", upper_bound, "\n")
cat("Number of outliers:", nrow(outliers), "\n")
head(outliers)

# Per-zip-code statistics ----
mean_by_zip <- aggregate(income ~ zipCode, data = zip_income, FUN = mean)
print(mean_by_zip)
median_by_zip <- aggregate(income ~ zipCode, data = zip_income, FUN = median)
print(median_by_zip)

# Raw scatter plot of every record ----
plot(
  zip_income$zipCode, zip_income[, "income"],
  main = "income by zip code",
  xlab = "zip code",
  ylab = "income"
)

# Filtered data ----
# Keep only incomes strictly between 7,000 and 200,000.
zip_income_filtered <- zip_income[
  zip_income$income > 7000 & zip_income$income < 200000,
]
summary(zip_income_filtered)
new_mean <- mean(zip_income_filtered$income)
new_median <- median(zip_income_filtered$income)
cat("Filtered mean income:", new_mean, "\n")
cat("Filtered median income:", new_median, "\n")

# Box plots of the filtered data (linear, then log10 scale) ----
boxplot(
  income ~ zipCode,
  data = zip_income_filtered,
  main = "income by zip code",
  xlab = "Zip Codes",
  ylab = "Income"
)
boxplot(
  log10(income) ~ zipCode,
  data = zip_income_filtered,
  main = "income by zip code",
  xlab = "Zip Codes",
  ylab = "log10(Income)"
)

# ggplot2 charts ----
# Install ggplot2 only when it is missing, then attach it with library() so a
# failed install stops the script instead of being silently ignored.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.r-project.org/")
}
library(ggplot2)

# Jittered, semi-transparent points per zip code on a log10 income axis.
ggplot(zip_income_filtered, aes(x = factor(zipCode), y = income)) +
  geom_point(position = "jitter", alpha = 0.2) +
  scale_y_log10() +
  xlab("Zip Code") +
  ylab("Income (log10)") +
  ggtitle("Scatter plot of income by zip code")
# Jittered points coloured by zip code with a translucent box plot on top,
# drawn on a log10 income axis. Expects `zip_income_filtered` (columns
# `zipCode`, `income`) to exist and ggplot2 to be attached.
ggplot(
  data = zip_income_filtered,
  mapping = aes(x = factor(zipCode), y = income)
) +
  geom_jitter(aes(colour = factor(zipCode)), alpha = 0.2) +
  # Box-plot outlier markers are suppressed; the jitter layer already draws
  # every observation.
  geom_boxplot(alpha = 0.1, outlier.shape = NA, outlier.size = -Inf) +
  scale_y_log10() +
  labs(
    x = "Zip Code",
    y = "Income (log10)",
    title = "Scatter plot of income by zip code"
  )