Commit

Check if file match exists before executing the match
crisjf committed May 11, 2020
1 parent b06ba3a commit a23d343
Showing 1 changed file with 46 additions and 42 deletions.
88 changes: 46 additions & 42 deletions download_shapeData.py
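
In short, the commit wraps each matching routine in an existence check so the expensive spatial overlay only runs when its output CSV is not already on disk. A minimal sketch of that guard pattern, assuming a pandas-like result object; the helper name is illustrative and not part of the repository:

import os

def build_if_missing(out_path, builder):
    # Skip the costly computation when the cached CSV already exists.
    if not os.path.isfile(out_path):
        result = builder()  # e.g. a function that returns a DataFrame
        result.to_csv(out_path, index=False)

The diff below applies the same idea inline in match_zip_bg and match_zip_msa.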
@@ -49,52 +49,56 @@ def match_zip_bg(shapesPath,year_version):
     '''
     Matches block groups to zip codes.
     '''
-    zipShapes = gpd.read_file(os.path.join(shapesPath,year_version+'_zcta5','tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-    matched = []
-    fnames = [os.path.join(shapesPath,year_version+'_bg',f,f+'.shp') for f in os.listdir(os.path.join(shapesPath,year_version+'_bg'))]
-    for f in fnames:
-        state = f.split('/')[-1].split('_')[2]
-        print(state,f)
-        bgs = gpd.read_file(f)
-
-        small = bgs[['GEOID','geometry']]
-        large = zipShapes[['ZCTA5CE10','geometry']]
-
-        small = small[['GEOID','geometry']]
-        large = large[['ZCTA5CE10','geometry']]
-        matchRaw = gpd.overlay(small,large,how='intersection')
-        match = matchRaw[matchRaw.columns]
-        match['INT_AREA'] = match.geometry.area
-        match = pd.merge(match,match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-        match['weight'] = match['INT_AREA']/match['TOT_AREA']
-
-        match = match[match['weight']>0.1]
-        match = pd.merge(match.drop('TOT_AREA',1),match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-        match['weight'] = match['INT_AREA']/match['TOT_AREA']
-
-        match = match[['GEOID','ZCTA5CE10','weight']].drop_duplicates()
-        match['STATE_FIPS'] = state
-        matched.append(match)
-    match = pd.concat(matched)
-    match.to_csv(os.path.join(shapesPath,'ZIP_BG_matched_{}.csv'.format(year_version)),index=False)
+    match_path = os.path.join(shapesPath,'ZIP_BG_matched_{}.csv'.format(year_version))
+    if not os.path.isfile(match_path):
+        zipShapes = gpd.read_file(os.path.join(shapesPath,year_version+'_zcta5','tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+        matched = []
+        fnames = [os.path.join(shapesPath,year_version+'_bg',f,f+'.shp') for f in os.listdir(os.path.join(shapesPath,year_version+'_bg'))]
+        for f in fnames:
+            state = f.split('/')[-1].split('_')[2]
+            print(state,f)
+            bgs = gpd.read_file(f)
+
+            small = bgs[['GEOID','geometry']]
+            large = zipShapes[['ZCTA5CE10','geometry']]
+
+            small = small[['GEOID','geometry']]
+            large = large[['ZCTA5CE10','geometry']]
+            matchRaw = gpd.overlay(small,large,how='intersection')
+            match = matchRaw[matchRaw.columns]
+            match['INT_AREA'] = match.geometry.area
+            match = pd.merge(match,match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+            match['weight'] = match['INT_AREA']/match['TOT_AREA']
+
+            match = match[match['weight']>0.1]
+            match = pd.merge(match.drop('TOT_AREA',1),match.groupby('GEOID').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+            match['weight'] = match['INT_AREA']/match['TOT_AREA']
+
+            match = match[['GEOID','ZCTA5CE10','weight']].drop_duplicates()
+            match['STATE_FIPS'] = state
+            matched.append(match)
+        match = pd.concat(matched)
+        match.to_csv(match_path,index=False)
 
 def match_zip_msa(shapesPath,year_version='2019'):
-    msas = gpd.read_file(os.path.join(SHAPES_PATH,'{}_cbsa'.format(year_version),'tl_{}_us_cbsa'.format(year_version),'tl_{}_us_cbsa.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-    msas = msas[msas['LSAD']=='M1']
-    zipShapes = gpd.read_file(os.path.join(SHAPES_PATH,'{}_zcta5'.format(year_version),'tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
-
-    small = zipShapes[['ZCTA5CE10','geometry']]
-    large = msas[['CBSAFP','geometry']]
-
-    small = small[['ZCTA5CE10','geometry']]
-    large = large[['CBSAFP','geometry']]
-    matchRaw = gpd.overlay(small,large,how='intersection')
-    match = matchRaw[matchRaw.columns]
-    match['INT_AREA'] = match.geometry.area
-    match = pd.merge(match,match.groupby('ZCTA5CE10').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
-    match['weight'] = match['INT_AREA']/match['TOT_AREA']
-    match = match.sort_values(by='weight',ascending=False).groupby(['ZCTA5CE10']).first().reset_index()[['ZCTA5CE10','CBSAFP']]
-    match.to_csv(os.path.join(shapesPath,'ZIP_MSA_matched_{}.csv'.format(year_version)),index=False)
+    match_path = os.path.join(shapesPath,'ZIP_MSA_matched_{}.csv'.format(year_version))
+    if not os.path.isfile(match_path):
+        msas = gpd.read_file(os.path.join(SHAPES_PATH,'{}_cbsa'.format(year_version),'tl_{}_us_cbsa'.format(year_version),'tl_{}_us_cbsa.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+        msas = msas[msas['LSAD']=='M1']
+        zipShapes = gpd.read_file(os.path.join(SHAPES_PATH,'{}_zcta5'.format(year_version),'tl_{}_us_zcta510'.format(year_version),'tl_{}_us_zcta510.shp'.format(year_version)),dtype={'ZCTA5CE10':str})
+
+        small = zipShapes[['ZCTA5CE10','geometry']]
+        large = msas[['CBSAFP','geometry']]
+
+        small = small[['ZCTA5CE10','geometry']]
+        large = large[['CBSAFP','geometry']]
+        matchRaw = gpd.overlay(small,large,how='intersection')
+        match = matchRaw[matchRaw.columns]
+        match['INT_AREA'] = match.geometry.area
+        match = pd.merge(match,match.groupby('ZCTA5CE10').sum()[['INT_AREA']].rename(columns={'INT_AREA':'TOT_AREA'}).reset_index())
+        match['weight'] = match['INT_AREA']/match['TOT_AREA']
+        match = match.sort_values(by='weight',ascending=False).groupby(['ZCTA5CE10']).first().reset_index()[['ZCTA5CE10','CBSAFP']]
+        match.to_csv(match_path,index=False)
 
 
 def main(shapesPath):

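To make the area-weighting step concrete, here is a small, self-contained sketch with toy geometries and illustrative IDs (not code from the repository; CRS handling is omitted, and the groupby/merge chain is replaced by an equivalent transform):

import geopandas as gpd
from shapely.geometry import box

# One "small" block-group square and two "large" ZCTA squares that split it 75/25.
small = gpd.GeoDataFrame({'GEOID': ['bg1']}, geometry=[box(0, 0, 4, 4)])
large = gpd.GeoDataFrame({'ZCTA5CE10': ['11111', '22222']},
                         geometry=[box(0, 0, 3, 4), box(3, 0, 8, 4)])

# Intersect the layers; each row is one block-group/ZCTA overlap polygon.
match = gpd.overlay(small, large, how='intersection')
match['INT_AREA'] = match.geometry.area

# Share of each block group's intersected area that falls in each ZCTA.
match['TOT_AREA'] = match.groupby('GEOID')['INT_AREA'].transform('sum')
match['weight'] = match['INT_AREA'] / match['TOT_AREA']

# Apply the same 10% sliver threshold used in download_shapeData.py.
match = match[match['weight'] > 0.1]

print(match[['GEOID', 'ZCTA5CE10', 'weight']])
# Expected weights: 0.75 for ZCTA 11111 and 0.25 for ZCTA 22222.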