# linear_regression_with_CEXdata.R # uses CEX data, Q1 of 2012 # accompanying lecture notes for KFoster class ECO B2000 at CCNY # download the CEX data from class web page then unzip the file # that file has .Rdata and then the code for how I created that data file rm(list = ls(all = TRUE)) # this line will be different for you! setwd("C:\\Users\\Kevin\\Documents\\CCNY\\data for classes\\CEX") # install this package require(AER) load("cex_2012.RData") head(data_cex) attach(data_cex) # you should explore this data on your own # I created some basic dummy variables analogous to PUMS data # NEWID: unique per CU (Consumer Unit) # AS_COMP1 - 5: (number of members at diff ages) # AGE_REF: Age of reference person # AGE2: Age of spouse # SEX_REF: 1 male 2 female # SEX2: spouse # EDUC_REF: Educ of ref: 10 grade 1-8; 11 gr 9-12; 12 hs grad; 13 some coll; 14 assoc degree; 15 bach degree; 16 master; 17 professional/PhD # EDUCA2: educ of spouse (same code) # MARITAL1: marital status of ref; 1 marr, 2 wid, 3 div, 4 sep, 5 never # REF_RACE: race of ref; 1 wh, 2 bl, 3 native, 4 asian, 5 PacIs, 6 multi # RACE2: race of spouse # HORREF1: Hisp ref; 1 Mex, 2 Mex-Am, 3 Chicano, 4 PR, 5 Cuban, 6 other # HORREF2: Hisp spouse # REGION: 1 NE, 2 MidW, 3 S, 4 W # STATE # PSU: list of "A-sized" PSU metro areas; 1109 is NY,NY; 1110 is NY+CT; 1111 is NJ # POPSIZE: size of PSU, 1 pop>4m, ... 5 <125k # BLS_URBN: 1 Urban, 2 Rural # CUTENURE: 1 own w mortgage, 2 own no mort, 3 own ? mort, 4 rent, 5 occupy no rent, 6 student housing # EARNCOMP: compositition of earners, 1 ref only, 2 ref & spouse, ... # FAM_SIZE: # in CU # FAM_TYPE: w/wo kids, # parents # INC_RANK: % rank of CU, on pretax inc # NO_EARNR: # of earners # VEHQ: # of owned vehicles # VEHQL: # leased vehicles # SMSASTAT: is CU in MSA? 1 Y, 2 N # POV_CY: in poverty current year 1 Y, 2 N # POV_PY: in poverty past year 1 Y, 2 N # INC_HRS1: hours usually worked by ref # INC_HRS2: hours usually worked by spouse # INCWEEK1: # weeks worked by ref # INCWEEK2: # weeks worked by spouse # INCNONW1: reason reference person not work; 1-6 code # INCNONW2: reason spouse not work # OCCUCOD1: occ of ref # OCCUCOD2: occ of sp # FINCATAX: income after taxes # FINCBTAX: income before taxes # FSALARYX: wage and salary of all CU # TOTEXPPQ: total expenditure past quarter # TOTEXPCQ: tot exp current Q # FOODPQ: food past Q # FOODCQ: food curr Q # FDHOMEPQ: food at home past Q # FDHOMECQ: food at home current Q # FDAWAYPQ: food away # ALCBEVPQ: alcohol bev # HOUSPQ: housing # MRINTPQ: mortgage interest # RENDWEPQ: rent # UTILPQ: utility, fuels, public services # NATLGASPQ: natural gas # ELCTRCPQ: elec # ALLFULPQ: fuel oil and other fuels # TELEPHPQ: telephone # WATEPSPQ: water # BBYDAYPQ: babysitting child care # HOUSEQPQ: house furnishings & equip # APPARPQ: apparel & svc # MENBOYPQ: clothing for men and boys # WOMGRLPQ: clothing for women and girls # FOOTWRPQ: footwear # TRANSPQ: transport # GASMOPQ: gas and motor oil # HEALTHPQ: healthcare # HLTHINPQ: health ins # ENTERTPQ: entertainment # PETTOYPQ: pets, toys # PERSCAPQ: personal care # EDUCAPQ: education # TOBACCPQ: tobacco and smoking # BUILDING: 1 detached, 2 row inner, 3 end unit, 4 duplex, 5 bigger, 6 garden, 7 hi-rise, 8 apt # UNISTRQ: units in structure, coded # BUILT: calendar year or range when built # HEATFUEL: heating fuel 1 gas, 2 elec, 3 oil, 4 other, 5 none # WATERHT: fuel for water 1 gas, 2 elec, 3 oil, 4 other, 5 none # simple example of linear regression fraction_housing <- HOUSPQ/FINCATAX fraction_housing1 <- is.finite(fraction_housing) # summary(fraction_housing1) summary(fraction_housing[fraction_housing1]) model1 <- lm(fraction_housing[fraction_housing1] ~ AGE_REF[fraction_housing1]) summary(model1) # alt way to deal with Inf values ---------------- fraction_housing[is.infinite(fraction_housing)] <- NA model2 <- lm(fraction_housing ~ AGE_REF) summary(model2) # next look at fraction spent on food fraction_food <- FOODPQ/TOTEXPPQ fraction_food[is.infinite(fraction_food)] <- NA fraction_food[fraction_food<0] <- NA # 1 reported negative total expenditure?! summary(fraction_food) summary(fraction_food[as.logical(AfAm)]) summary(fraction_food[as.logical(Asian)]) summary(fraction_food[as.logical(Hispanic)]) summary(fraction_food[as.logical(Amindian)]) summary(fraction_food[as.logical(race_oth)]) summary(fraction_food[as.logical(white)]) summary(fraction_food[as.logical(white & !Hispanic)]) summary(fraction_food[(AGE_REF >= 20) & (AGE_REF < 30)]) summary(fraction_food[(AGE_REF >= 30) & (AGE_REF < 40)]) summary(fraction_food[(AGE_REF >= 40) & (AGE_REF < 50)]) summary(fraction_food[(AGE_REF >= 50) & (AGE_REF < 60)]) summary(fraction_food[(AGE_REF >= 60)]) summary(fraction_food[as.logical(educ_nohs)]) summary(fraction_food[as.logical(educ_hs)]) summary(fraction_food[as.logical(educ_smcoll)]) summary(fraction_food[as.logical(educ_as)]) summary(fraction_food[as.logical(educ_bach)]) summary(fraction_food[as.logical(educ_adv)]) model3 <- lm(fraction_food ~ AGE_REF + FAM_SIZE + female + AfAm + Asian + race_oth + Amindian + Hispanic + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv ) summary(model3) plot(model3) # nicer plots plot(fraction_food ~ jitter(AGE_REF, factor = 2), pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.02)) # where jitter deals with problem of one marker on top of another # nicer plot clearly shows outlier ages, remember histogram of ages for PUMS? anova(model3) coefficients(model3) vcov(model3) summary(fitted.values(model3)) to_be_predicted <- data.frame(AGE_REF = 30, FAM_SIZE = 1, female = 1, AfAm = 1, Asian = 0, race_oth = 0, Amindian =0, Hispanic = 0, educ_hs = 0, educ_smcoll = 0, educ_as = 0, educ_bach = 1, educ_adv = 0) predict(model3, newdata = to_be_predicted, interval = "confidence") # alternatly if the newdata were outside the original span of values, would use: predict(model3, newdata = to_be_predicted, interval = "prediction") require(car) restr1 <- c(0,0,0,0,0,0,0,0,0,1,0,0,0,0) # educ_hs restr2 <- c(0,0,0,0,0,0,0,0,0,0,1,0,0,0) # educ_smcoll restr3 <- c(0,0,0,0,0,0,0,0,0,0,0,1,0,0) # educ_as restr4 <- c(0,0,0,0,0,0,0,0,0,0,0,0,1,0) # educ_bach restr5 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,1) # educ_adv restr_m <- rbind(restr1,restr2,restr3,restr4,restr5) rest_eq <- c(0,0,0,0,0) linearHypothesis(model3,restr_m,rest_eq) # later add white.adjust = TRUE # alt to look at fraction of food bought away from home ---- frac_foodout <- FDAWAYPQ/FOODPQ summary(frac_foodout) summary(frac_foodout[as.logical(AfAm)]) summary(frac_foodout[as.logical(Asian)]) summary(frac_foodout[as.logical(Hispanic)]) summary(frac_foodout[as.logical(race_oth)]) summary(frac_foodout[as.logical(white)]) # ------------ summary(HEALTHPQ) summary(HLTHINPQ) summary(HEALTHPQ - HLTHINPQ) Health_expense <- HEALTHPQ - HLTHINPQ sum(Health_expense < 0) Health_expense[Health_expense<0] <- NA summary(Health_expense) ins_cost <- HLTHINPQ/TOTEXPPQ sum(ins_cost < 0) ins_cost[ins_cost < 0] <- NA summary(ins_cost) educ_indx <- educ_nohs + 2*educ_hs + 3*educ_smcoll + 4*educ_as + 5*educ_bach + 6*educ_adv table(cut(ins_cost,breaks=c(-0.01,0.1,0.2,0.3,1)),educ_indx)