# source("load_IPUMSdata.R", echo=TRUE)
# You need to change the working directory of R to the appropriate location
# (the folder that holds the CSV file) before sourcing this script.
#
# Loads the ACS data and creates the derived objects used below:
#   MENA_BPL / MENA_ANC  0/1 indicators built from birthplace / ancestry codes
#   dat2                 subsample aged 25-55 with positive wage income
#   regression1/2        wage regressions with robust standard errors
#
# NOTE(review): coeftest() and vcovHC() are not defined or attached anywhere
# in this script; presumably they come from the lmtest and sandwich packages,
# which must already be attached in the session -- confirm.

# Set to FALSE to reuse a `dat1` that is already in the workspace.
load_data_new <- TRUE
if (load_data_new) {
  # The workspace is intentionally NOT cleared here: wiping every object of
  # whoever sources this file is a surprising side effect.
  dat1 <- read.csv("ACS_2008_2011_NYC_med.csv")
  # first few lines:
  # AGE,female,educ_nohs,educ_hs,educ_somecoll,educ_collassoc,educ_coll,educ_adv,educ_indx,boro_bx,boro_m,boro_si,boro_bk,boro_qns,boroughs,commute_car,commute_bus,commute_subway,commute_other,commute_type,africanamerican,nativeamerican,asianamerican,Hispanic,raceother,kids_under5,has_kids,non_citizen,veteran,own_dwelling,rent_dwelling,anc_euro,anc_centamerica,anc_southamerica,anc_spanishcarib,anc_carib,anc_MENA,anc_africa,anc_asiaindia,anc_seasia,anc_asiachinese,anc_asiajapankorea,anc_asiapacific,anc_nativeindian,YEAR,DATANUM,SERIAL,NUMPREC,HHWT,HHTYPE,REGION,PUMA,PUMARES2MIG,PUMASUPR,GQ,OWNERSHP,OWNERSHPD,MORTGAGE,MORTGAG2,MORTAMT1,MORTAMT2,OWNCOST,RENT,RENTGRS,HHINCOME,FOODSTMP,ROOMS,BUILTYR2,UNITSSTR,BEDROOMS,VEHICLES,PERNUM,PERWT,SPLOC,NCHILD,NCHLT5,SEX,MARST,RACE,RACED,BPL,BPLD,ANCESTR1,ANCESTR2,CITIZEN,YRIMMIG,LANGUAGE,HISPAN,HISPAND,EDUC,EDUCD,EMPSTAT,EMPSTATD,OCC,IND,WKSWORK2,INCTOT,FTOTINC,INCWAGE,POVERTY,MIGRATE1,MIGRATE1D,MIGMET1,MIGPUMA1,VETSTAT,VETSTATD,VETOTHER,TRANWORK,TRANTIME,DEPARTS,ARRIVES
  # 94,1,1,0,0,0,0,0,1,0,0,0,0,1,5,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2011,5,3801258,2,14,3,12,4106,41,36112,1,2,22,0,0,0,0,99999,559,559,19556,1,4,5,9,3,9,1,15,0,1,0,2,6,7,700,260,26010,275,999,2,1972,12,4,498,0,2,3,30,0,0,0,4242,19556,0,153,1,10,0,0,1,11,0,0,0,0,0
  # 54,1,1,0,0,0,0,0,1,0,0,0,0,1,5,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2011,5,3801258,2,14,3,12,4106,41,36112,1,2,22,0,0,0,0,99999,559,559,19556,1,4,5,9,3,9,2,27,0,0,0,2,3,7,700,260,26010,275,999,2,1972,12,4,498,0,2,1,10,9640,1770,6,15314,19556,15314,153,1,10,0,0,1,13,0,31,120,617,819
  # 41,1,0,0,0,0,1,0,5,0,0,0,0,1,5,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2011,5,3801260,4,36,1,12,4105,41,36113,1,1,13,3,1,2587,0,2932,0,0,391531,1,6,3,3,4,1,1,36,2,2,1,2,1,2,200,260,26057,335,999,2,1985,1,0,0,10,101,2,20,560,9590,0,517,391531,0,501,1,10,0,0,1,11,0,0,0,0,0
}

# Fail early with a clear message if the data were neither loaded above nor
# left over from an earlier run.
if (!exists("dat1")) {
  stop("`dat1` not found: set load_data_new <- TRUE or load the data first.",
       call. = FALSE)
}

# verify that it worked
summary(dat1$AGE)

# ---- MENA indicator from detailed birthplace (BPLD) ----
BPLD <- as.numeric(dat1$BPLD)
# NOTE(review): 532000 has six digits while every other code here has five;
# it looks like a typo (perhaps 53000, or a mistyped duplicate of 53200).
# It is kept so results are unchanged -- confirm against the BPLD codebook.
mena_bpld_codes <- c(
  52000, 52140, 52200, 532000, 53200, 53210, 53300, 53410, 53430, 53500,
  53600, 53700, 53800, 53900, 54000, 54100, 54200, 54210, 54220, 54300,
  54400, 54500, 54600, 54700, 54800, 54900,
  60010, 60011, 60012, 60013, 60014, 60015, 60016, 60017, 60019
)
# %in% never returns NA, so a missing BPLD counts as "not MENA" instead of
# turning the sum() below into NA.
MENA_BPL <- BPLD %in% mena_bpld_codes
# note that this is a logical type
MENA_BPL[1:10]
# so change to zero/one
MENA_BPL <- as.numeric(MENA_BPL)
sum(MENA_BPL)

# ---- MENA indicator from first or second reported ancestry ----
ANCESTR1 <- as.numeric(dat1$ANCESTR1)
ANCESTR2 <- as.numeric(dat1$ANCESTR2)
# The same set of codes is checked against both ancestry variables.
mena_ancestry_codes <- c(
  400, 402, 404, 406, 407, 408, 411, 412, 413, 414, 415, 416, 417,
  421, 423, 425, 427, 429, 431, 434, 435, 436, 437, 438, 439, 442, 444,
  465, 466, 467, 470, 471, 480, 482, 490, 495, 496, 600, 601, 602
)
MENA_ANC <- (ANCESTR1 %in% mena_ancestry_codes) |
  (ANCESTR2 %in% mena_ancestry_codes)
MENA_ANC <- as.numeric(MENA_ANC)
sum(MENA_ANC)

# ---- Wage income (INCWAGE) clean-up ----
# note the weird results from this summary
summary(dat1$INCWAGE)
# the deciles help us see the problem
quantile(dat1$INCWAGE, probs = seq(0, 1, 0.1))
# solve it by recoding the value 999999 to NA
is.na(dat1$INCWAGE) <- (dat1$INCWAGE == 999999)
# check that this does solve it
quantile(dat1$INCWAGE, probs = seq(0, 1, 0.1), na.rm = TRUE)
# that still leaves lots of zeros! Look at the deciles of a copy with the
# zeros set to NA as well (dat1$INCWAGE itself keeps its zeros).
inc <- dat1$INCWAGE
is.na(inc) <- (inc == 0)
quantile(inc, probs = seq(0, 1, 0.1), na.rm = TRUE)

# I spaced this during class ...
dat1$MENA_ANC <- MENA_ANC
dat1$MENA_BPL <- MENA_BPL

# restrict to prime-age 25-55 and IncomeWage > 0
# (subset() drops rows where the condition is NA, so the INCWAGE values
# recoded to NA above are excluded too)
restrict1 <- ((dat1$AGE >= 25) & (dat1$AGE <= 55) & (dat1$INCWAGE > 0))
dat2 <- subset(dat1, restrict1)

# some regression estimation
summary(lm(INCWAGE ~ AGE + female + educ_hs + educ_somecoll + educ_collassoc +
             educ_coll + educ_adv, data = dat2))
# too long? Here's a shortcut
summary(lm(INCWAGE ~ AGE + female + factor(educ_indx), data = dat2))

# add a squared term, some others...
# Remember the heteroskedasticity-consistent errors from last time, so
regression1 <- lm(INCWAGE ~ AGE + I(AGE^2) + female + factor(educ_indx) +
                    africanamerican + nativeamerican + asianamerican +
                    Hispanic + raceother + kids_under5 + has_kids +
                    non_citizen + veteran, data = dat2)
summary(regression1)
coeftest(regression1, df = Inf, vcov = vcovHC(regression1))

regression2 <- lm(log(INCWAGE) ~ AGE + I(AGE^2) + I(AGE*female) +
                    I(female*AGE^2) + female + factor(educ_indx) +
                    africanamerican + nativeamerican + asianamerican +
                    Hispanic + raceother + kids_under5 + has_kids +
                    non_citizen + veteran, data = dat2)
summary(regression2)
# The covariance matrix must come from the model being tested: regression1
# has a different set of coefficients, so its matrix does not fit regression2.
coeftest(regression2, df = Inf, vcov = vcovHC(regression2))

# what are we missing? What important variables aren't even in the data?