Commit

Check if file match exists before executing the match
crisjf committed May 11, 2020
1 parent b06ba3a commit a23d343
Showing 1 changed file with 46 additions and 42 deletions.
88 changes: 46 additions & 42 deletions download_shapeData.py
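
In short, the commit wraps each matching routine in an existence check so the expensive spatial overlay only runs when its output CSV is not already on disk. A minimal sketch of that guard pattern, assuming a pandas-like result object; the helper name is illustrative and not part of the repository:

import os

def build_if_missing(out_path, builder):
    # Skip the costly computation when the cached CSV already exists.
    if not os.path.isfile(out_path):
        result = builder()  # e.g. a function that returns a DataFrame
        result.to_csv(out_path, index=False)

The diff below applies the same idea inline in match_zip_bg and match_zip_msa.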
@@ -49,52 +49,56 @@ def match_zip_bg(shapesPath,year_version):
     '''
     Matches block groups to zip codes.
     '''
-    zipShapes = gpd.read_file(os.path.join(shapesPath,year_version+'_zcta5','tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-    matched = []
-    fnames = [os.path.join(shapesPath,year_version+'_bg',f,f+'.shp') for f in os.listdir(os.path.join(shapesPath,year_version+'_bg'))]
-    for f in fnames:
-        state = f.split('/')[-1].split('_')[2]
-        print(state,f)
-        bgs = gpd.read_file(f)
-
-        small = bgs[['GEOID','geometry']]
-        large = zipShapes[['ZCTA5CE10','geometry']]
-
-        small = small[['GEOID','geometry']]
-        large = large[['ZCTA5CE10','geometry']]
-        matchRaw = gpd.overlay(small,large,how='intersection')
-        match = matchRaw[matchRaw.columns]
-        match['INT_AREA'] = match.geometry.area
-        match = pd.merge(match,match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-        match['weight'] = match['INT_AREA']/match['TOT_AREA']
-
-        match = match[match['weight']>0.1]
-        match = pd.merge(match.drop('TOT_AREA',1),match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-        match['weight'] = match['INT_AREA']/match['TOT_AREA']
-
-        match = match[['GEOID','ZCTA5CE10','weight']].drop_duplicates()
-        match['STATE_FIPS'] = state
-        matched.append(match)
-    match = pd.concat(matched)
-    match.to_csv(os.path.join(shapesPath,'ZIP_BG_matched_{}.csv'.format(year_version)),index=False)
+    match_path = os.path.join(shapesPath,'ZIP_BG_matched_{}.csv'.format(year_version))
+    if not os.path.isfile(match_path):
+        zipShapes = gpd.read_file(os.path.join(shapesPath,year_version+'_zcta5','tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+        matched = []
+        fnames = [os.path.join(shapesPath,year_version+'_bg',f,f+'.shp') for f in os.listdir(os.path.join(shapesPath,year_version+'_bg'))]
+        for f in fnames:
+            state = f.split('/')[-1].split('_')[2]
+            print(state,f)
+            bgs = gpd.read_file(f)
+
+            small = bgs[['GEOID','geometry']]
+            large = zipShapes[['ZCTA5CE10','geometry']]
+
+            small = small[['GEOID','geometry']]
+            large = large[['ZCTA5CE10','geometry']]
+            matchRaw = gpd.overlay(small,large,how='intersection')
+            match = matchRaw[matchRaw.columns]
+            match['INT_AREA'] = match.geometry.area
+            match = pd.merge(match,match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+            match['weight'] = match['INT_AREA']/match['TOT_AREA']
+
+            match = match[match['weight']>0.1]
+            match = pd.merge(match.drop('TOT_AREA',1),match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+            match['weight'] = match['INT_AREA']/match['TOT_AREA']
+
+            match = match[['GEOID','ZCTA5CE10','weight']].drop_duplicates()
+            match['STATE_FIPS'] = state
+            matched.append(match)
+        match = pd.concat(matched)
+        match.to_csv(match_path,index=False)
 
 def match_zip_msa(shapesPath,year_version='2019'):
-    msas = gpd.read_file(os.path.join(SHAPES_PATH,'{}_cbsa'.format(year_version),'tl_{}_us_cbsa'.format(year_version),'tl_{}_us_cbsa.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-    msas = msas[msas['LSAD']=='M1']
-    zipShapes = gpd.read_file(os.path.join(SHAPES_PATH,'{}_zcta5'.format(year_version),'tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-
-    small = zipShapes[['ZCTA5CE10','geometry']]
-    large = msas[['CBSAFP','geometry']]
-
-    small = small[['ZCTA5CE10','geometry']]
-    large = large[['CBSAFP','geometry']]
-    matchRaw = gpd.overlay(small,large,how='intersection')
-    match = matchRaw[matchRaw.columns]
-    match['INT_AREA'] = match.geometry.area
-    match = pd.merge(match,match.groupby('ZCTA5CE10').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-    match['weight'] = match['INT_AREA']/match['TOT_AREA']
-    match = match.sort_values(by='weight',ascending=False).groupby(['ZCTA5CE10']).first().reset_index()[['ZCTA5CE10','CBSAFP']]
-    match.to_csv(os.path.join(shapesPath,'ZIP_MSA_matched_{}.csv'.format(year_version)),index=False)
+    match_path = os.path.join(shapesPath,'ZIP_MSA_matched_{}.csv'.format(year_version))
+    if not os.path.isfile(match_path):
+        msas = gpd.read_file(os.path.join(SHAPES_PATH,'{}_cbsa'.format(year_version),'tl_{}_us_cbsa'.format(year_version),'tl_{}_us_cbsa.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+        msas = msas[msas['LSAD']=='M1']
+        zipShapes = gpd.read_file(os.path.join(SHAPES_PATH,'{}_zcta5'.format(year_version),'tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+
+        small = zipShapes[['ZCTA5CE10','geometry']]
+        large = msas[['CBSAFP','geometry']]
+
+        small = small[['ZCTA5CE10','geometry']]
+        large = large[['CBSAFP','geometry']]
+        matchRaw = gpd.overlay(small,large,how='intersection')
+        match = matchRaw[matchRaw.columns]
+        match['INT_AREA'] = match.geometry.area
+        match = pd.merge(match,match.groupby('ZCTA5CE10').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+        match['weight'] = match['INT_AREA']/match['TOT_AREA']
+        match = match.sort_values(by='weight',ascending=False).groupby(['ZCTA5CE10']).first().reset_index()[['ZCTA5CE10','CBSAFP']]
+        match.to_csv(match_path,index=False)
 
 
 def main(shapesPath):

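To make the area-weighting step concrete, here is a small, self-contained sketch with toy geometries and illustrative IDs (not code from the repository; CRS handling is omitted, and the groupby/merge chain is replaced by an equivalent transform):

import geopandas as gpd
from shapely.geometry import box

# One "small" block-group square and two "large" ZCTA squares that split it 75/25.
small = gpd.GeoDataFrame({'GEOID': ['bg1']}, geometry=[box(0, 0, 4, 4)])
large = gpd.GeoDataFrame({'ZCTA5CE10': ['11111', '22222']},
                         geometry=[box(0, 0, 3, 4), box(3, 0, 8, 4)])

# Intersect the layers; each row is one block-group/ZCTA overlap polygon.
match = gpd.overlay(small, large, how='intersection')
match['INT_AREA'] = match.geometry.area

# Share of each block group's intersected area that falls in each ZCTA.
match['TOT_AREA'] = match.groupby('GEOID')['INT_AREA'].transform('sum')
match['weight'] = match['INT_AREA'] / match['TOT_AREA']

# Apply the same 10% sliver threshold used in download_shapeData.py.
match = match[match['weight'] > 0.1]

print(match[['GEOID', 'ZCTA5CE10', 'weight']])
# Expected weights: 0.75 for ZCTA 11111 and 0.25 for ZCTA 22222.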