# linear_regression_with_CEXdata.R
# uses CEX data, Q1 of 2012
# accompanying lecture notes for KFoster class ECO B2000 at CCNY
# download the CEX data from class web page then unzip the file
# that file has .Rdata and then the code for how I created that data file

rm(list = ls(all = TRUE))

# this line will be different for you!
setwd("C:\\Users\\Kevin\\Documents\\CCNY\\data for classes\\CEX")  

# install this package
require(AER)

load("cex_2012.RData")
head(data_cex)
attach(data_cex)

# you should explore this data on your own
# I created some basic dummy variables analogous to PUMS data 
# NEWID: unique per CU (Consumer Unit)
# AS_COMP1 - 5: (number of members at diff ages)
# AGE_REF: Age of reference person
# AGE2: Age of spouse
# SEX_REF: 1 male 2 female
# SEX2: spouse
# EDUC_REF: Educ of ref: 10 grade 1-8; 11 gr 9-12; 12 hs grad; 13 some coll; 14 assoc degree; 15 bach degree; 16 master; 17 professional/PhD
# EDUCA2: educ of spouse (same code)
# MARITAL1: marital status of ref; 1 marr, 2 wid, 3 div, 4 sep, 5 never
# REF_RACE: race of ref; 1 wh, 2 bl, 3 native, 4 asian, 5 PacIs, 6 multi
# RACE2: race of spouse
# HORREF1: Hisp ref; 1 Mex, 2 Mex-Am, 3 Chicano, 4 PR, 5 Cuban, 6 other
# HORREF2: Hisp spouse
# REGION: 1 NE, 2 MidW, 3 S, 4 W
# STATE
# PSU: list of "A-sized" PSU metro areas; 1109 is NY,NY; 1110 is NY+CT; 1111 is NJ
# POPSIZE: size of PSU, 1 pop>4m, ... 5 <125k
# BLS_URBN: 1 Urban, 2 Rural
# CUTENURE: 1 own w mortgage, 2 own no mort, 3 own ? mort, 4 rent, 5 occupy no rent, 6 student housing
# EARNCOMP: compositition of earners, 1 ref only, 2 ref & spouse, ...
# FAM_SIZE: # in CU
# FAM_TYPE: w/wo kids, # parents
# INC_RANK: % rank of CU, on pretax inc
# NO_EARNR: # of earners
# VEHQ: # of owned vehicles
# VEHQL: # leased vehicles
# SMSASTAT: is CU in MSA? 1 Y, 2 N
# POV_CY: in poverty current year 1 Y, 2 N
# POV_PY: in poverty past year 1 Y, 2 N
# INC_HRS1: hours usually worked by ref
# INC_HRS2: hours usually worked by spouse
# INCWEEK1: # weeks worked by ref
# INCWEEK2: # weeks worked by spouse
# INCNONW1: reason reference person not work; 1-6 code
# INCNONW2: reason spouse not work
# OCCUCOD1: occ of ref
# OCCUCOD2: occ of sp
# FINCATAX: income after taxes
# FINCBTAX: income before taxes
# FSALARYX: wage and salary of all CU
# TOTEXPPQ: total expenditure past quarter
# TOTEXPCQ: tot exp current Q
# FOODPQ: food past Q
# FOODCQ: food curr Q
# FDHOMEPQ: food at home past Q
# FDHOMECQ: food at home current Q
# FDAWAYPQ: food away 
# ALCBEVPQ: alcohol bev 
# HOUSPQ: housing 
# MRINTPQ: mortgage interest
# RENDWEPQ: rent
# UTILPQ: utility, fuels, public services
# NATLGASPQ: natural gas
# ELCTRCPQ: elec
# ALLFULPQ: fuel oil and other fuels
# TELEPHPQ: telephone 
# WATEPSPQ: water
# BBYDAYPQ: babysitting child care
# HOUSEQPQ: house furnishings & equip
# APPARPQ: apparel & svc
# MENBOYPQ: clothing for men and boys
# WOMGRLPQ: clothing for women and girls
# FOOTWRPQ: footwear
# TRANSPQ: transport
# GASMOPQ: gas and motor oil
# HEALTHPQ: healthcare
# HLTHINPQ: health ins
# ENTERTPQ: entertainment
# PETTOYPQ: pets, toys
# PERSCAPQ: personal care
# EDUCAPQ: education
# TOBACCPQ: tobacco and smoking
# BUILDING: 1 detached, 2 row inner, 3 end unit, 4 duplex, 5 bigger, 6 garden, 7 hi-rise, 8 apt
# UNISTRQ: units in structure, coded
# BUILT: calendar year or range when built
# HEATFUEL: heating fuel 1 gas, 2 elec, 3 oil, 4 other, 5 none
# WATERHT: fuel for water 1 gas, 2 elec, 3 oil, 4 other, 5 none



# simple example of linear regression
fraction_housing <- HOUSPQ/FINCATAX

fraction_housing1 <- is.finite(fraction_housing)
# summary(fraction_housing1)
summary(fraction_housing[fraction_housing1])

model1 <- lm(fraction_housing[fraction_housing1] ~ AGE_REF[fraction_housing1])
summary(model1)

# alt way to deal with Inf values ----------------
fraction_housing[is.infinite(fraction_housing)] <- NA

model2 <- lm(fraction_housing ~ AGE_REF)
summary(model2)

# next look at fraction spent on food
fraction_food <- FOODPQ/TOTEXPPQ
fraction_food[is.infinite(fraction_food)] <- NA
fraction_food[fraction_food<0] <- NA # 1 reported negative total expenditure?!
summary(fraction_food)


summary(fraction_food[as.logical(AfAm)])
summary(fraction_food[as.logical(Asian)])
summary(fraction_food[as.logical(Hispanic)])
summary(fraction_food[as.logical(Amindian)])
summary(fraction_food[as.logical(race_oth)])
summary(fraction_food[as.logical(white)])
summary(fraction_food[as.logical(white & !Hispanic)])

summary(fraction_food[(AGE_REF >= 20) & (AGE_REF < 30)])
summary(fraction_food[(AGE_REF >= 30) & (AGE_REF < 40)])
summary(fraction_food[(AGE_REF >= 40) & (AGE_REF < 50)])
summary(fraction_food[(AGE_REF >= 50) & (AGE_REF < 60)])
summary(fraction_food[(AGE_REF >= 60)])

summary(fraction_food[as.logical(educ_nohs)])
summary(fraction_food[as.logical(educ_hs)])
summary(fraction_food[as.logical(educ_smcoll)])
summary(fraction_food[as.logical(educ_as)])
summary(fraction_food[as.logical(educ_bach)])
summary(fraction_food[as.logical(educ_adv)])


model3 <- lm(fraction_food ~ AGE_REF + FAM_SIZE + female + AfAm + Asian + race_oth + Amindian + Hispanic 
             + educ_hs + educ_smcoll + educ_as + educ_bach + educ_adv )
summary(model3)
plot(model3)

# nicer plots
plot(fraction_food ~ jitter(AGE_REF, factor = 2), pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.02))
# where jitter deals with problem of one marker on top of another
# nicer plot clearly shows outlier ages, remember histogram of ages for PUMS?

anova(model3)
coefficients(model3)
vcov(model3)
summary(fitted.values(model3))

to_be_predicted <- data.frame(AGE_REF = 30, FAM_SIZE = 1, female = 1, 
                              AfAm = 1, Asian = 0, race_oth = 0, Amindian =0, 
                              Hispanic = 0, educ_hs = 0, educ_smcoll = 0, 
                              educ_as = 0, educ_bach = 1, educ_adv = 0)
predict(model3, newdata = to_be_predicted, interval = "confidence")
# alternatly if the newdata were outside the original span of values, would use:
predict(model3, newdata = to_be_predicted, interval = "prediction")

require(car)
restr1 <- c(0,0,0,0,0,0,0,0,0,1,0,0,0,0) # educ_hs
restr2 <- c(0,0,0,0,0,0,0,0,0,0,1,0,0,0) # educ_smcoll
restr3 <- c(0,0,0,0,0,0,0,0,0,0,0,1,0,0) # educ_as
restr4 <- c(0,0,0,0,0,0,0,0,0,0,0,0,1,0) # educ_bach
restr5 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,1) # educ_adv
restr_m <- rbind(restr1,restr2,restr3,restr4,restr5)
rest_eq <- c(0,0,0,0,0)
linearHypothesis(model3,restr_m,rest_eq)
# later add white.adjust = TRUE



# alt to look at fraction of food bought away from home ----
frac_foodout <- FDAWAYPQ/FOODPQ
summary(frac_foodout)
summary(frac_foodout[as.logical(AfAm)])
summary(frac_foodout[as.logical(Asian)])
summary(frac_foodout[as.logical(Hispanic)])
summary(frac_foodout[as.logical(race_oth)])
summary(frac_foodout[as.logical(white)])

# ------------
summary(HEALTHPQ)
summary(HLTHINPQ)
summary(HEALTHPQ - HLTHINPQ)

Health_expense <- HEALTHPQ - HLTHINPQ
sum(Health_expense < 0)
Health_expense[Health_expense<0] <- NA
summary(Health_expense)

ins_cost <- HLTHINPQ/TOTEXPPQ
sum(ins_cost < 0)
ins_cost[ins_cost < 0] <- NA
summary(ins_cost)

educ_indx <- educ_nohs + 2*educ_hs + 3*educ_smcoll + 4*educ_as + 5*educ_bach + 6*educ_adv

table(cut(ins_cost,breaks=c(-0.01,0.1,0.2,0.3,1)),educ_indx)