# source("load_IPUMSdata.R", echo=TRUE)
# Set R's working directory to the folder that contains the data file before running this script.
# Loads the data, creates the MENA birthplace/ancestry indicators, and runs wage regressions.

# Switch for (re)loading the CSV. When it is false the block below is skipped,
# and the rest of the script then expects dat1 to exist in the workspace already.
load_data_new <- TRUE
if (load_data_new) {
  # start from an empty workspace: removes every object, hidden ones included
  rm(list = ls(all.names = TRUE))

  dat1 <- read.csv("ACS_2008_2011_NYC_med.csv")
  # first few lines:
  # AGE,female,educ_nohs,educ_hs,educ_somecoll,educ_collassoc,educ_coll,educ_adv,educ_indx,boro_bx,boro_m,boro_si,boro_bk,boro_qns,boroughs,commute_car,commute_bus,commute_subway,commute_other,commute_type,africanamerican,nativeamerican,asianamerican,Hispanic,raceother,kids_under5,has_kids,non_citizen,veteran,own_dwelling,rent_dwelling,anc_euro,anc_centamerica,anc_southamerica,anc_spanishcarib,anc_carib,anc_MENA,anc_africa,anc_asiaindia,anc_seasia,anc_asiachinese,anc_asiajapankorea,anc_asiapacific,anc_nativeindian,YEAR,DATANUM,SERIAL,NUMPREC,HHWT,HHTYPE,REGION,PUMA,PUMARES2MIG,PUMASUPR,GQ,OWNERSHP,OWNERSHPD,MORTGAGE,MORTGAG2,MORTAMT1,MORTAMT2,OWNCOST,RENT,RENTGRS,HHINCOME,FOODSTMP,ROOMS,BUILTYR2,UNITSSTR,BEDROOMS,VEHICLES,PERNUM,PERWT,SPLOC,NCHILD,NCHLT5,SEX,MARST,RACE,RACED,BPL,BPLD,ANCESTR1,ANCESTR2,CITIZEN,YRIMMIG,LANGUAGE,HISPAN,HISPAND,EDUC,EDUCD,EMPSTAT,EMPSTATD,OCC,IND,WKSWORK2,INCTOT,FTOTINC,INCWAGE,POVERTY,MIGRATE1,MIGRATE1D,MIGMET1,MIGPUMA1,VETSTAT,VETSTATD,VETOTHER,TRANWORK,TRANTIME,DEPARTS,ARRIVES
  # 94,1,1,0,0,0,0,0,1,0,0,0,0,1,5,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2011,5,3801258,2,14,3,12,4106,41,36112,1,2,22,0,0,0,0,99999,559,559,19556,1,4,5,9,3,9,1,15,0,1,0,2,6,7,700,260,26010,275,999,2,1972,12,4,498,0,2,3,30,0,0,0,4242,19556,0,153,1,10,0,0,1,11,0,0,0,0,0
  # 54,1,1,0,0,0,0,0,1,0,0,0,0,1,5,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2011,5,3801258,2,14,3,12,4106,41,36112,1,2,22,0,0,0,0,99999,559,559,19556,1,4,5,9,3,9,2,27,0,0,0,2,3,7,700,260,26010,275,999,2,1972,12,4,498,0,2,1,10,9640,1770,6,15314,19556,15314,153,1,10,0,0,1,13,0,31,120,617,819
  # 41,1,0,0,0,0,1,0,5,0,0,0,0,1,5,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2011,5,3801260,4,36,1,12,4105,41,36113,1,1,13,3,1,2587,0,2932,0,0,391531,1,6,3,3,4,1,1,36,2,2,1,2,1,2,200,260,26057,335,999,2,1985,1,0,0,10,101,2,20,560,9590,0,517,391531,0,501,1,10,0,0,1,11,0,0,0,0,0
}

# verify that the load worked
summary(dat1$AGE)

BPLD <- as.numeric(dat1$BPLD)

# Detailed birthplace (BPLD) codes that are flagged as MENA
# (presumably Middle East / North Africa -- confirm against the IPUMS codebook).
# NOTE(review): 532000 has six digits while every other code in this list has
# five, so it looks like a typo (53000? a duplicate of 53200?). It is kept as-is
# so that the set of matched codes is unchanged until someone confirms the fix.
mena_bpld_codes <- c(
  52000, 52140, 52200, 532000, 53200, 53210, 53300, 53410, 53430, 53500,
  53600, 53700, 53800, 53900, 54000, 54100, 54200, 54210, 54220, 54300,
  54400, 54500, 54600, 54700, 54800, 54900, 60010, 60011, 60012, 60013,
  60014, 60015, 60016, 60017, 60019
)

# %in% replaces a long chain of == comparisons joined by |. Unlike ==, it
# returns FALSE (never NA) for a missing BPLD, so sum() below cannot become NA.
MENA_BPL <- BPLD %in% mena_bpld_codes
# note that this is a logical type
MENA_BPL[1:10]
# so change to zero/one
MENA_BPL <- as.numeric(MENA_BPL)
sum(MENA_BPL)

ANCESTR1 <- as.numeric(dat1$ANCESTR1)
ANCESTR2 <- as.numeric(dat1$ANCESTR2)

# Ancestry codes that are flagged as MENA. The same list is applied to both
# ancestry columns, so it is written once instead of being repeated for each.
mena_ancestry_codes <- c(
  400, 402, 404, 406, 407, 408, 411, 412, 413, 414,
  415, 416, 417, 421, 423, 425, 427, 429, 431, 434,
  435, 436, 437, 438, 439, 442, 444, 465, 466, 467,
  470, 471, 480, 482, 490, 495, 496, 600, 601, 602
)

# A person is flagged when either ancestry column matches. %in% returns FALSE
# (never NA) for a missing code, so the indicator and its sum stay free of NA.
MENA_ANC <- (ANCESTR1 %in% mena_ancestry_codes) |
  (ANCESTR2 %in% mena_ancestry_codes)
# convert the logical flag to zero/one
MENA_ANC <- as.numeric(MENA_ANC)
sum(MENA_ANC)







# the summary of wage income looks odd ...
summary(dat1$INCWAGE)

# ... and the deciles show where the problem is
quantile(dat1$INCWAGE, probs = seq(0, 1, by = 0.1))

# fix: recode the value 999999 to NA
dat1$INCWAGE[dat1$INCWAGE == 999999] <- NA
# confirm the fix; NAs now have to be dropped explicitly
quantile(dat1$INCWAGE, probs = seq(0, 1, by = 0.1), na.rm = TRUE)

# there are still many zeros, so look at the deciles of the non-zero values,
# working on a copy so that dat1$INCWAGE keeps its zeros
inc <- dat1$INCWAGE
inc[inc == 0] <- NA
quantile(inc, probs = seq(0, 1, by = 0.1), na.rm = TRUE)

# (forgotten during class) store the two MENA indicators as columns of dat1
dat1[["MENA_ANC"]] <- MENA_ANC
dat1[["MENA_BPL"]] <- MENA_BPL

# keep prime-age people (25 to 55 inclusive) with positive wage income;
# rows where the condition is NA are dropped by subset()
restrict1 <- with(dat1, AGE >= 25 & AGE <= 55 & INCWAGE > 0)

dat2 <- subset(dat1, subset = restrict1)

# some regression estimation: wage income on age, the female dummy, and the
# education dummies (educ_nohs is left out of the formula)
summary(lm(
  INCWAGE ~ AGE + female + educ_hs + educ_somecoll + educ_collassoc +
    educ_coll + educ_adv,
  data = dat2
))

# too long?  Here's a shortcut: factor() makes R build one dummy per level of
# the education index (presumably the same categories as the educ_* columns)
summary(lm(INCWAGE ~ AGE + female + factor(educ_indx), data = dat2))

# add a squared term, some others... Remember the heteroskedasticity-consistent
# errors from last time.
# NOTE(review): coeftest() and vcovHC() are not defined or loaded anywhere in
# this file; presumably they come from the lmtest and sandwich packages, which
# must already be attached in the session -- confirm.
regression1 <- lm(
  INCWAGE ~ AGE + I(AGE^2) + female + factor(educ_indx) + africanamerican +
    nativeamerican + asianamerican + Hispanic + raceother + kids_under5 +
    has_kids + non_citizen + veteran,
  data = dat2
)

summary(regression1)
# same coefficients, tested with the covariance matrix returned by vcovHC()
coeftest(regression1, df = Inf, vcov = vcovHC(regression1))

# log wage income, with the age profile allowed to differ by sex through the
# AGE*female and female*AGE^2 terms
regression2 <- lm(
  log(INCWAGE) ~ AGE + I(AGE^2) + I(AGE * female) + I(female * AGE^2) +
    female + factor(educ_indx) + africanamerican + nativeamerican +
    asianamerican + Hispanic + raceother + kids_under5 + has_kids +
    non_citizen + veteran,
  data = dat2
)

summary(regression2)
# the covariance matrix must come from the model being tested: regression1 has
# fewer coefficients than regression2, so its matrix does not fit this model
coeftest(regression2, df = Inf, vcov = vcovHC(regression2))


# what are we missing? What important variables aren't even in the data?