# lecturenotes9.r



rm(list = ls(all = TRUE))

setwd("C:\\Users\\Kevin\\Documents\\CCNY\\data for classes\\NHIS2009")

# read SPSS data
library(foreign)
data1 <-read.spss("NHIS_2009_v1.sav", use.value.labels=T, to.data.frame=T)
# obviously, change your directory as appropriate - just note double back-slash

str(data1[1:47])
#head(data1)
#summary(data1)

# fugly but does the job
dat_names1 <- names(data1)
for(indx in 1:47){ 
print(dat_names1[indx]) 
print(mean(data1[[indx]], na.rm = TRUE))
}

regn_logit1 <- glm(HI_private ~ AGE_P + I(AGE_P^2) + female + afam + Amindian + asian_in + asian_ch + asian_ph + asian_oth + race_oth + Hispanic + mexican + boricua + cuban + dominican + ed_hs + ed_collnd + ed_ASvoc + ed_ASacad + ed_coll + ed_adv + immig + marrd + divwidsp + veteran + region2 + region3 + region4, family = binomial, data = data1)
summary(regn_logit1)


regn_probit1 <- glm(HI_private ~ AGE_P + I(AGE_P^2) + female + afam + Amindian + asian_in + asian_ch + asian_ph + asian_oth + race_oth + Hispanic + mexican + boricua + cuban + dominican + ed_hs + ed_collnd + ed_ASvoc + ed_ASacad + ed_coll + ed_adv + immig + marrd + divwidsp + veteran + region2 + region3 + region4, family = binomial(link = 'probit'), data = data1)
summary(regn_probit1)


# ----------------------
# Quantile Regressions
# ----------------------

# kernel density (nonparametric estimators) ?

rm(list = ls(all = TRUE))

setwd("C:\\Users\\Kevin\\Documents\\CCNY\\data for classes\\CPS2010")

data1 <-read.spss("cps_m2010_work3.sav", use.value.labels=T, to.data.frame=T)
str(data1)

data1$WEXP <- as.numeric(data1$WEXP)

# restrict to prime-age 25-55 and IncomeWage > 0 and FT worker
restrict1 <- ((data1$A_AGE >= 25) & (data1$A_AGE <= 55) & (data1$WSAL_VAL > 0) & ((data1$WEXP == 1) | (data1$WEXP  == 2)  | (data1$WEXP  == 3) ))


data2 <- subset(data1, restrict1)
str(data2)

#Note WEXP labels:
 # 0 'Not In Universe' 
 # 1 'FT/50-52 weeks' 
 # 2 'FT/48-49 weeks' 
 # 3 'FT/40-47 weeks' 
 # 4 'FT/27-39 weeks' 
 # 5 'FT/14-26 weeks' 
 # 6 'FT/13 weeks or less' 
 # 7 'PT/50-52 weeks' 
 # 8 'PT/48-49 weeks' 
 # 9 'PT/40-47 weeks' 
 # 10 'PT/27-39 weeks' 
 # 11 'PT/14-26 weeks' 
 # 12 'PT/13 weeks or less' 
 # 13 'Nonworker' 

restrict2 <- as.logical(data2$ed_coll)
data3 <- subset(data2, restrict2)

Age25 <- data3$WSAL_VAL[data3$A_AGE == 25]
Age30 <- data3$WSAL_VAL[data3$A_AGE == 30]
Age35 <- data3$WSAL_VAL[data3$A_AGE == 35]
Age40 <- data3$WSAL_VAL[data3$A_AGE == 40]
Age45 <- data3$WSAL_VAL[data3$A_AGE == 45]
Age50 <- data3$WSAL_VAL[data3$A_AGE == 50]

p_tiles <- c(0.1, 0.25, 0.5, 0.75, 0.9)
q_25 <- quantile(Age25, p_tiles)
q_30 <- quantile(Age30, p_tiles)
q_35 <- quantile(Age35, p_tiles)
q_40 <- quantile(Age40, p_tiles)
q_45 <- quantile(Age45, p_tiles)
q_50 <- quantile(Age50, p_tiles)

q_plot <- cbind(q_25,q_30,q_35,q_40,q_45,q_50)
x_val <- as.matrix(c(25, 30, 35, 40, 45, 50))
matplot(x_val,t(q_plot), type = "l", xlab= "Age", ylab = "Wage of college grads at Percentiles 10 25 50 75 90")

# so it would be sensible to look at quantiles in regression context
regression1 <- lm(WSAL_VAL ~ A_AGE + I(A_AGE^2) + female + afam + asian+ Amindian + Hispanic + immig + imm2gen + ed_hs + ed_collnd + ed_ASvoc + ed_ASacad + ed_coll + ed_adv + union + veteran, data=data2)
summary(regression1)


regression2 <- lm(WSAL_VAL ~ A_AGE + I(A_AGE^2) + female + afam + asian+ Amindian + Hispanic + immig + imm2gen + ed_hs + ed_collnd + ed_ASvoc + ed_ASacad + ed_coll + ed_adv + union + veteran + as.factor(WEIND) + as.factor(POCCU2), data=data2)
summary(regression2)

library(quantreg)

quantreg1 <- rq(WSAL_VAL ~ A_AGE + I(A_AGE^2) + female + afam + asian+ Amindian + Hispanic + immig + imm2gen + ed_hs + ed_collnd + ed_ASvoc + ed_ASacad + ed_coll + ed_adv + union + veteran, tau=p_tiles, data=data2)
summary(quantreg1)
plot(quantreg1)