# cps_1.R # looking at CPS 2013 data # uses file from dataferret download of CPS March 2013 supplement, downloaded June 12 2014 # accompanying lecture notes for KFoster class ECO B2000 at CCNY require(AER) rm(list = ls(all = TRUE)) setwd("C:\\Users\\Kevin\\Documents\\CCNY\\data for classes\\CPS_Mar2013") load("cps_mar2013.RData") attach(dat_CPSMar2013) # use prime-age,fulltime, yearround workers use_varb <- (Age >= 25) & (Age <= 55) & work_fullt & work_50wks dat_use <- subset(dat_CPSMar2013,use_varb) # 47,550 out of 202,634 obs detach(dat_CPSMar2013) attach(dat_use) # just prime-age,fulltime, yearround workers # always a good idea to get basic stats of all of the variables in your regression to see if they make sense summary(WSAL_VAL) summary(Age) summary(female) summary(AfAm) summary(Asian) summary(Amindian) summary(race_oth) summary(Hispanic) summary(educ_hs) summary(educ_smcoll) summary(educ_as) summary(educ_bach) summary(educ_adv) summary(married) summary(divwidsep) summary(union_m) summary(veteran) summary(immigrant) summary(immig2gen) model1 <- lm(WSAL_VAL ~ Age + female + AfAm + Asian + Amindian + race_oth + Hispanic + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv + married + divwidsep + union_m + veteran + immigrant + immig2gen) summary(model1) coeftest(model1) #sometimes log form is preferred dat_noZeroWage <- subset(dat_use,(WSAL_VAL > 0)) model1a <- lm(log(WSAL_VAL) ~ Age + female + AfAm + Asian + Amindian + race_oth + Hispanic + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv + married + divwidsep + union_m + veteran + immigrant + immig2gen, data = dat_noZeroWage) detach(dat_use) attach(dat_noZeroWage) log(mean(WSAL_VAL)) mean(log(WSAL_VAL)) detach(dat_noZeroWage) attach(dat_use) # ^^ yes there are more elegant ways to do that, avoiding attach/detach - find them! # for heteroskedasticity consistent errors require(sandwich) require(lmtest) coeftest(model1,vcovHC) # jam nonlinear into linear regression model2 <- lm(WSAL_VAL ~ Age + I(Age^2) + I(Age^3)+I(Age^4) + female + AfAm + Asian + Amindian + race_oth + Hispanic + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv + married + divwidsep + union_m + veteran + immigrant + immig2gen) model3 <- lm(WSAL_VAL ~ Age + I(Age^2) + female + I(female*Age) + I(female*(Age^2)) + AfAm + Asian + Amindian + race_oth + Hispanic + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv + married + divwidsep + union_m + veteran + immigrant + immig2gen) # could do this with "update" function instead summary(model2) summary(model3) # the ANOVA function is flexible - can compare nested models anova(model1,model2,model3) # Applied Econometrics in R suggests also spline and kernel estimators, we might get to that later # subset in order to plot... NNobs <- length(WSAL_VAL) set.seed(12345) # just so you can replicate and get same "random" choices graph_obs <- (runif(NNobs) < 0.1) # so something like 4000 obs dat_graph <-subset(dat_use,graph_obs) plot(WSAL_VAL ~ jitter(Age, factor = 2), pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.02), data = dat_graph) # ^^ that looks like crap since Wages are soooooooo skew! So try to find ylim = c(0, ??) plot(WSAL_VAL ~ jitter(Age, factor = 2), pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.02), ylim = c(0,150000), data = dat_graph) # to plot the predicted values might want to do something like, lines(fitted.values(model2) ~ Age) # but that will plot ALLLLL the values, which is 4500 too many and looks awful # so back to this, to_be_predicted2 <- data.frame(Age = 25:55, female = 1, AfAm = 0, Asian = 0, Amindian = 1, race_oth = 1, Hispanic = 1, educ_hs = 0, educ_smcoll = 0, educ_as = 0, educ_bach = 1, educ_adv = 0, married = 0, divwidsep =0, union_m = 0, veteran = 0, immigrant = 0, immig2gen = 1) to_be_predicted2$yhat <- predict(model2, newdata = to_be_predicted2) lines(yhat ~ Age, data = to_be_predicted2) # now compare model3 to_be_predicted3m <- data.frame(Age = 25:55, female = 0, AfAm = 0, Asian = 0, Amindian = 1, race_oth = 1, Hispanic = 1, educ_hs = 0, educ_smcoll = 0, educ_as = 0, educ_bach = 1, educ_adv = 0, married = 0, divwidsep =0, union_m = 0, veteran = 0, immigrant = 0, immig2gen = 1) to_be_predicted3m$yhat <- predict(model3, newdata = to_be_predicted3m) to_be_predicted3f <- data.frame(Age = 25:55, female = 1, AfAm = 0, Asian = 0, Amindian = 1, race_oth = 1, Hispanic = 1, educ_hs = 0, educ_smcoll = 0, educ_as = 0, educ_bach = 1, educ_adv = 0, married = 0, divwidsep =0, union_m = 0, veteran = 0, immigrant = 0, immig2gen = 1) to_be_predicted3f$yhat <- predict(model3, newdata = to_be_predicted3f) plot(WSAL_VAL ~ jitter(Age, factor = 2), pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.02), ylim = c(0,150000), xlab = "Age", data = dat_graph) lines(yhat ~ Age, data = to_be_predicted3f) lines(yhat ~ Age, data = to_be_predicted3m, lty = 2) legend("topleft", c("male", "female"), lty = c(2,1), bty = "n") # and always remember this part... detach(dat_use)