#!/usr/bin/env Rscript
# Linear regressions of log10 household income on age, education and
# employment, fitted to the female rows of "zeta.csv", with diagnostic plots.
#
# Inputs : "zeta.csv" in the working directory. The script reads the columns
#          zcta, sex, meanage, meaneducation, meanemployment and
#          meanhouseholdincome.
# Outputs: printed model fits/summaries and three ggplot2 figures (sent to
#          the active graphics device).

library(ggplot2)

# Data preparation ----

df <- read.csv("zeta.csv")

# Keep only female rows inside the accepted ranges and drop the identifier
# columns. subset() treats an NA condition as FALSE, so rows with missing
# values in any filtered column are removed as well.
df <- subset(
  df,
  sex == "F" &
    8 < meaneducation & meaneducation < 18 &
    10000 < meanhouseholdincome & meanhouseholdincome < 200000 &
    0 < meanemployment & meanemployment < 3 &
    20 < meanage & meanage < 60,
  select = -c(zcta, sex)
)

# Rename by original column name rather than by position, so a different
# column order in the CSV cannot silently mislabel the predictors. Columns
# not listed here (e.g. a leading row-index column) keep their names.
short_names <- c(
  meanage = "age",
  meaneducation = "education",
  meanemployment = "employment",
  meanhouseholdincome = "income"
)
names(df)[match(names(short_names), names(df))] <- short_names

df$log_income <- log10(df$income)

# log_income ~ age ----
# (the original markers "a"/"b" presumably refer to exercise sub-parts)

model <- lm(log_income ~ age, df)

# NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line geoms and
# warns on `size`; `size` is kept for compatibility with older releases.
ggplot(df, aes(x = age, y = log_income)) +
  geom_point(alpha = 0.2) +
  geom_abline(
    intercept = coef(model)[1],
    slope = coef(model)[2],
    color = "red",
    size = 1
  ) +
  # The y axis shows log10 of income, so label it as such.
  labs(x = "age", y = "log10(income)", title = "log_income(age)")

print(model)
summary(model)

# log_income ~ education ----

model_2 <- lm(log_income ~ education, df)
print(model_2)
summary(model_2)

ggplot(df, aes(x = education, y = log_income)) +
  geom_point(alpha = 0.2) +
  geom_abline(
    intercept = coef(model_2)[1],
    slope = coef(model_2)[2],
    color = "red",
    size = 1
  ) +
  labs(
    x = "education",
    y = "log10(income)",
    title = "log_income(education)"
  )

# log_income ~ education + age + employment ----

model_3 <- lm(log_income ~ education + age + employment, df)
print(model_3)
summary(model_3)

# Predicted vs. actual log10 income; the red y = x line marks a perfect fit.
# fitted(model_3) lines up with the rows of df because the filtering above
# already removed rows with missing values in the model variables.
ggplot(df) +
  geom_point(aes(x = log_income, y = fitted(model_3)), alpha = 0.2) +
  geom_line(aes(x = log_income, y = log_income), color = "red") +
  labs(x = "actual", y = "predicted")