#!/usr/bin/env Rscript

# Load the raw table from the working directory.
df <- read.csv("zeta.csv")

# Keep only the rows with sex == "F"; zcta and sex are dropped once the
# filter has used them.
df <- subset(df, sex == "F", select = -c(zcta, sex))

# Keep rows whose values fall strictly inside each range (bounds are
# exclusive). subset() treats an NA condition as FALSE, so rows with a
# missing value in any of these columns are dropped as well.
df <- subset(
  df,
  8 < meaneducation & meaneducation < 18 &
    10000 < meanhouseholdincome & meanhouseholdincome < 200000 &
    0 < meanemployment & meanemployment < 3 &
    20 < meanage & meanage < 60
)

# Base-10 logarithm of income; this is the response of every model below.
df$log_income <- log10(df$meanhouseholdincome)

# Shorten the column names. The rename is done by original name rather than
# by position, so a different column order in the CSV cannot silently attach
# the wrong label to a variable.
# NOTE(review): any remaining column keeps its name; this assumes the leading
# index column is already called "X" by read.csv -- confirm against the file.
short_names <- c(
  meanage = "age",
  meaneducation = "education",
  meanemployment = "employment",
  meanhouseholdincome = "income"
)
names(df)[match(names(short_names), names(df))] <- unname(short_names)

library(ggplot2)

# b
# Simple linear regression of log10 income on age.

model <- lm(log_income ~ age, data = df)

# a
# Scatter plot of log10 income against age with the fitted line overlaid.
# The model is fitted first because the line is drawn from its coefficients.

ggplot(df, aes(x = age, y = log_income)) +
  geom_point(alpha = 0.2) +
  geom_abline(
    # Coefficients are picked by name so the intent does not depend on order.
    intercept = coef(model)[["(Intercept)"]],
    slope = coef(model)[["age"]],
    color = "red",
    # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
    linewidth = 1
  ) +
  # The y axis shows log_income, i.e. log10 of income, not raw income.
  labs(x = "age", y = "log10(income)", title = "log_income(age)")

# bcde
# Print the fitted coefficients, then the full regression summary.
print(model)
summary(model)

# Simple linear regression of log10 income on education.
model_2 <- lm(log_income ~ education, data = df)
print(model_2)
summary(model_2)

# Scatter plot of log10 income against education with the fitted line.
ggplot(df, aes(x = education, y = log_income)) +
  geom_point(alpha = 0.2) +
  geom_abline(
    # Coefficients are picked by name so the intent does not depend on order.
    intercept = coef(model_2)[["(Intercept)"]],
    slope = coef(model_2)[["education"]],
    color = "red",
    # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
    linewidth = 1
  ) +
  # The y axis shows log_income, i.e. log10 of income, not raw income.
  labs(
    x = "education",
    y = "log10(income)",
    title = "log_income(education)"
  )


# Multiple regression of log10 income on education, age and employment.
model_3 <- lm(
  formula = log_income ~ education + age + employment,
  data = df
)
print(model_3)
summary(model_3)

# Actual vs. predicted log10 income. The red line connects the points where
# predicted equals actual, so the vertical distance of a point from it is
# that observation's residual.
ggplot(data = df, mapping = aes(x = log_income)) +
  geom_point(mapping = aes(y = fitted(model_3)), alpha = 0.2) +
  geom_line(mapping = aes(y = log_income), colour = "red") +
  labs(x = "actual", y = "predicted")