11import os
2+
23import pandas as pd
34from flask import current_app
4- from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data
5+ from api import admin_api
6+ from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data , log_db
57from config import CURRENT_SOURCE_FILES_PATH
68from config import engine
79from models import Base
810
911
def start_flow():
    """Run the full contact-data pipeline as a tracked job.

    Acquires a job slot via ``admin_api.start_job``, then (if source files are
    present) cleans and normalizes the input files, classifies rows against the
    current ``pdp_contacts`` snapshot, archives superseded rows, matches
    new/updated records, and copies the raw input JSON back onto
    ``pdp_contacts``. Progress is written to the execution-status log at each
    major stage.

    Returns:
        str: ``'completed'`` on a successful run, ``'nothing to do'`` when no
        source files are found, or ``'busy'`` when a job slot could not be
        acquired (another run is presumably in progress — confirm against
        ``admin_api.start_job``'s contract).
    """
    job_id = admin_api.start_job()

    if not job_id:
        current_app.logger.info('Failed to get job_id')
        job_outcome = 'busy'

    else:
        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')

        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)

        if file_path_list:
            with engine.connect() as connection:
                Base.metadata.create_all(connection)

                # Get previous version of pdp_contacts table, which is used
                # later to classify new records. Only live (non-archived) rows
                # are kept, and bookkeeping columns are dropped so the frame
                # is comparable with freshly normalized input.
                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
                pdp_contacts_df = pdp_contacts_df.drop(
                    columns=['archived_date', 'created_date', '_id', 'matching_id']
                )

                current_app.logger.info(
                    'Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0])
                )

                # Clean the input data and normalize/rename columns.
                # Populate new records in secondary tables (donations, volunteer shifts).
                # input - existing files in path
                # output - normalized object of all entries, as well as the input json rows for primary sources
                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(
                    connection, pdp_contacts_df, file_path_list
                )

                # Standardize column data types via postgres (e.g. reading a csv column as int vs. str).
                # (If additional inconsistencies are encountered, may need to enforce the schema of
                # the contacts loader by initializing it from pdp_contacts.)
                normalized_data.to_sql('_temp_pdp_contacts_loader', connection,
                                       index=False, if_exists='replace')
                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)

                # Classifies rows to old rows that haven't changed, updated rows and
                # new rows - compared to the existing state of the DB.
                log_db.log_exec_status(job_id, 'classify', 'executing', '')
                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)

                # Archives rows that were updated in the current state of the DB
                # (changes their archived_date to now).
                archive_rows.archive(connection, rows_classified["updated"])

                # Match new+updated records against previous version of pdp_contacts
                # database, and write these rows to the database.
                match_data.start(connection, rows_classified, manual_matches_df, job_id)

                # Copy raw input rows to json fields in pdp_contacts,
                # using a temporary table to simplify the update code.
                current_app.logger.info('Saving json of original rows to pdp_contacts')
                source_json.to_sql('_temp_pdp_contacts_loader', connection,
                                   index=False, if_exists='replace')
                # https://www.postgresql.org/docs/8.4/sql-update.html
                connection.execute('''
                    UPDATE pdp_contacts pdp
                    SET json = to_json(temp.json)
                    FROM _temp_pdp_contacts_loader temp
                    WHERE
                        pdp.source_type = temp.source_type AND
                        pdp.source_id = temp.source_id AND
                        pdp.archived_date IS NULL
                ''')

            current_app.logger.info('Finished flow script run')
            job_outcome = 'completed'

        else:  # No files in list
            current_app.logger.info('No files to process')
            job_outcome = 'nothing to do'

        # Only reached with a valid job_id; mark the job finished either way.
        log_db.log_exec_status(job_id, 'flow', 'complete', '')

    return job_outcome
0 commit comments