diff --git a/.gitignore b/.gitignore index 7c21aed..73442c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,18 @@ -*.DS_Store +.DS_Store *.csv +*.xlsx +settings.py +config.py + +__pycache__ +*.pyc +*.rdb +*.sqlite + +.pytest_cache/ +.coverage +htmlcov/ + +dist/ +build/ +*.egg-info/ \ No newline at end of file diff --git a/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc b/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc deleted file mode 100644 index 153d16e..0000000 Binary files a/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc and /dev/null differ diff --git a/Projects/NLP/SMS_Annotation/Code/aggregate_text_messages.py b/Projects/NLP/SMS_Annotation/Code/aggregate_text_messages.py index 8eaf051..7dcd94d 100644 --- a/Projects/NLP/SMS_Annotation/Code/aggregate_text_messages.py +++ b/Projects/NLP/SMS_Annotation/Code/aggregate_text_messages.py @@ -380,7 +380,7 @@ def main(args): ) PARSER.add_argument( "-p", "--phoneCol", - default="EndpointPhoneNumber", + default="ContactPhone", help="name of the column in input data containing the phone number. Any unique identifier for the recipient will suffice" ) PARSER.add_argument( diff --git a/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store b/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store and /dev/null differ diff --git a/Projects/NLP/SMS_Annotation/Input_Data/readme.txt b/Projects/NLP/SMS_Annotation/Input_Data/readme.txt new file mode 100644 index 0000000..770703e --- /dev/null +++ b/Projects/NLP/SMS_Annotation/Input_Data/readme.txt @@ -0,0 +1 @@ +Uploaded data will show up in this dir! 
\ No newline at end of file diff --git a/README_local.md b/README_local.md index 0f831f5..c0a6d50 100644 --- a/README_local.md +++ b/README_local.md @@ -11,6 +11,10 @@ This document describes how to use 5 versions of name extraction scripts for vot - Levenshtein `pip install python-Levenshtein` - NLTK `pip install nltk` +- Alternatively, if you are in an environment where you can't / don't want to install Anaconda, install Python 3.6.9+. Create and activate your Python 3 virtual environment (see [pipenv and virtualenv](https://docs.python-guide.org/dev/virtualenvs/)) and run `pip install -r requirements.txt` + +- You'll also need to run `spacy download en` once. + ## Getting Started Find your use case below and add your input data to the appropriate place, then run the specified python script. All of these scripts should be run out of the directory `Projects/NLP/SMS_Annotation` @@ -21,7 +25,7 @@ All output data (after running a script) will be found in `Projects/NLP/SMS_Anno **Use Case:** I need to aggregate SMS messages by conversation. This step is necessary before performing any extraction on SMS data. **Inputs:** -Add a csv added to the Input_Data folder. This csv should be raw individual SMS messages, not grouped by conversation. +Add a csv added to the Input_Data folder. This csv should be raw individual SMS messages, not grouped by conversation. **Instructions:** Open the script aggregate_text_messages.R in RStudioo and follow the instructions to aggregate messages into a single row per conversation @@ -45,16 +49,16 @@ A file (filename specified by you in the R script) with a single row representin ## SMS Conversation Categorization and Name Extraction -**Use Case:** I have SMS conversations and I need to figure out which text recipiants volunteered to triple, which chose to opt out, what names they provided, and whether they moved. 
- +**Use Case:** I have SMS conversations and I need to figure out which text recipiants volunteered to triple, which chose to opt out, what names they provided, and whether they moved. + **Inputs:** -Add a csv to the Input_Data folder. This csv file must be of the same format as the output of the aggregation in step 1. +Add a CSV to the Input_Data folder. This csv file must be of the same format as the output of the aggregation in step 1. **Instructions:** -In this directory, run `python3 Code/annotate_conversations.py -d [input_filename]`. +In this directory, run `python3 Code/annotate_conversations.py -i [input_filename]`. **Outputs:** -This script will output two files: +This script will output three files: 1. A file of triplers called `sms_triplers.csv`. For each tripler, we provide the following fields (each row represents one text message conversation): - *ConversationId* a unique identifier for the conversation - *contact_phone* the phone number of the target @@ -74,46 +78,45 @@ This script will output two files: - *wrong_number* guess for did we have the wrong number for this person (to be reviewed) - *names_extract* guess for what names (if any) were provided by this person as tripling targets (to be reviewed) - +3. A file of opt-outs + ## Text Banker Log Cleaning -**Use Case:** I have text banker logs for names provided by vote triplers. I need these logs cleaned up and standardized. - +**Use Case:** I have text banker logs for names provided by vote triplers. I need these logs cleaned up and standardized. + **Inputs:** -Add a csv to the Input_Data folder. This csv file must contain column 'names' containing the names logged by a text banker +Add a csv to the Input_Data folder. 
This csv file must contain column 'names' containing the names logged by a text banker **Instructions:** -In this directory, run `python3 Code/name_cleaning.py -d [input_filename]` +In this directory, run `python3 Code/name_cleaning.py -i [input_filename]` **Outputs:** -A File named `labeled_names_cleaned_no_response.csv` with the cleaned names in a column titles "clean_names", along with any other columns in the initial file - - +A file in `Output_Data` named `labeled_names_cleaned_no_response.csv` with the cleaned names in a column titles "clean_names", along with any other columns in the initial file + ## Text Banker Log Cleaning (utilizing text message conversation) -**Use Case:** I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized. We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages. - +**Use Case:** I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized. We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages. + **Inputs:** Add a csv to the Input_Data folder. This csv file must be of the same format as the output of the aggregation in step 1. This csv file must also contain column 'names' containing the names logged by a text banker. 
**Instructions:** -In this directory, run `python3 Code/name_cleaning_with_responses.py -d [input_filename]` +In this directory, run `python3 Code/name_cleaning_with_responses.py -i [input_filename]` **Outputs:** A File named `labeled_names_cleaned_with_response.csv` with the cleaned names in a column titles "clean_names", along with any other columns in the initial file - - + ## VAN Export Cleaning **Use Case:** I have a VAN Export and I need to extract any tripling target names from the note text. **Inputs:** Add a csv to the Input_Data folder. This csv file must contain the following columns: -- *VANID* a unique ID for this row +- *voter_file_vanid* a unique ID for this row - *ContactName* the name of the tripler - *NoteText* free text possibly including names of tripling targets **Instructions:** -In this directory, run `python3 Code/van_export_cleaning.py -d [input_filename]` +In this directory, run `python3 Code/van_export_cleaning.py -d [input_filename]` **Outputs:** This script will output two files: @@ -126,3 +129,44 @@ This script will output two files: - *ContactName* a unique identifier for the conversation - *NoteText* free text possibly including names of tripling targets - *names_extract* a guess for the extracted names (to be reviewed) + +# Running the app frontend +app.py is a Python 3.x, Flask-based frontend that provides a dedicated UI for uploading data sets and requesting that the above scripts be run with them. + +Make sure you've created and activated a virtual environment (see Requirements) and installed everything in requirements.txt. + +You'll need to [install Redis](https://redis.io/topics/quickstart). On OSX, install homebrew and then `brew install redis`. You may also need to run `pip install "celery[redis]"` + +Copy config.py from `instance/config.py.example` file and fill it in. 
+ +To run an instance of the frontend locally, from the project root directory run: +``` +export FLASK_APP=parser +export FLASK_ENV=development +flask run +``` +and access the running application at [http://localhost:5000/](http://localhost:5000/) + +## Configuring email + +Email config variables in the example config file assume you are using Gmail for testing. Two important notes: +* Gmail probably isn't adequate for production scale; you can only send about 100 emails a day. +* Gmail doesn't consider any apps that send mail using SMTP protocol secure. When you try and run the app with a Gmail account you'll get security warnings on that account unless you have enabled what Google calls ["Less Secure Apps"](https://support.google.com/accounts/answer/6010255?hl=en). + +## Running scripts async in the background vs. waiting for results + +If config.PROCESS_ASYNC is set to True, the app uses Celery workers, a Redis queue and Flask-mail to manage script jobs and email results in the background. If config.PROCESS_ASYNC is set to False, it runs script jobs synchronously and the app waits to deliver the results as linked files. + +Synchronous mode is not recommended for production if you expect lots of large files that take a while ( > 30 seconds) to process. + +If you set config.PROCESS_ASYNC to true, you'll need to run celery and redis (which celery uses to manage its queue) +* `celery -A celery_worker.celery worker --loglevel=info` will spin up a celery worker for you in a local dev environment. [More on celery workers](https://docs.celeryproject.org/en/stable/userguide/workers.html) +* Run redis in a different terminal window with `redis-server`. + +## Testing the app frontend + +`pytest` should run all the tests in the `tests` folder. + +## TODO + + A Docker container would ease of deployment. 
\ No newline at end of file diff --git a/instance/config.py.example b/instance/config.py.example new file mode 100644 index 0000000..8b558d2 --- /dev/null +++ b/instance/config.py.example @@ -0,0 +1,34 @@ +import os + +SECRET_KEY = 'dev' # change this for prod! +BASE_URL = 'http://localhost:5000/' # local dev +MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16 MB + +APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +UPLOAD_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Input_Data'), +RESULTS_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Output_Data'), +SCRIPTS_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Code'), + +# If PROCESS_ASYNC is set to True, we run scripts in the background and email +# results with celery, redis and flask-mail. +# If PROCESS_ASYNC is set to False, we run scripts synchronously and await a +# link to the results. If a script take too long to run (as they do with larger +# files), the web app may time out before the script finishes. +PROCESS_ASYNC = True + +# Show full script error output in error message to user +SHOW_SCRIPT_ERRORS = False + +CELERY_BROKER_URL = 'redis://localhost:6379/0' +CELERY_RESULT_BACKEND = 'redis://localhost:6379/1' + +MAIL_SERVER = 'smtp.gmail.com' +MAIL_PORT = 587 +MAIL_USE_TLS = True +MAIL_USERNAME = 'notarealemail@gmail.com' +MAIL_PASSWORD = 'password1234' +EMAIL_SENDER = 'Votetripling SMS Transcript Processing, notarealemail@gmail.com' +EMAIL_SUBJECT = 'SMS transcript processing' +TEST_TARGET_EMAIL = 'your.email@example.com' + +FILE_LIFE = 72 # no. 
of hrs we let uploaded and result files hang out on the server diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000..b9902c6 --- /dev/null +++ b/parser/__init__.py @@ -0,0 +1,43 @@ +# because this code gets run from several different places, update PATH +# so we can find modules from wherever we run things +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'instance')) + +from celery import Celery +from flask import Flask +import config + +celery = Celery(__name__, broker=config.CELERY_BROKER_URL, result_backend=config.CELERY_RESULT_BACKEND) + +def create_app(test_config=None): + # create and configure the app + app = Flask(__name__, instance_relative_config=True) + app.instance_path = (os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'instance')) # maybe setting this manually fixes the config problem in blueprint celery tasks + app.config.from_mapping( + SECRET_KEY='dev', + DATABASE=os.path.join(app.instance_path, 'parser.sqlite') + ) + test_config = None + if test_config is None: + # load the instance config, if it exists, when not testing + app.config.from_pyfile('config.py', silent=True) + else: + # load the test config if passed in + app.config.from_mapping(test_config) + + import main + app.register_blueprint(main.bp) + + # ensure the instance folder exists + try: + os.makedirs(app.instance_path) + except OSError: + pass + + # redis_client.init_app(app) + celery.conf.update(app.config) + + return app diff --git a/parser/celery_worker.py b/parser/celery_worker.py new file mode 100644 index 0000000..18e35bc --- /dev/null +++ b/parser/celery_worker.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +import os +from __init__ import celery, create_app + +app = create_app(os.getenv('FLASK_CONFIG') or 'default') 
+app.app_context().push() \ No newline at end of file diff --git a/parser/main.py b/parser/main.py new file mode 100644 index 0000000..9b9fa15 --- /dev/null +++ b/parser/main.py @@ -0,0 +1,351 @@ +import csv +import datetime +import functools +import os +import random +import string +import subprocess + +import config + +from __init__ import celery +from flask import (Blueprint, current_app, flash, Flask, g, Markup, + redirect, render_template, request, send_from_directory, url_for) +from flask_mail import Mail, Message + +from werkzeug.utils import secure_filename + +bp = Blueprint('main', __name__, url_prefix='') + +ALLOWED_EXTENSIONS = {'csv'} +UPLOAD_TYPES = { + 'tblc_file': { + 'name': 'Text Banker Log Cleaning', + 'required_headers': ['names'] + }, + 'tblctmc_file': { + 'name': 'Text Banker Log Cleaning (utilizing text message conversation)', + 'required_headers': ['names', 'triplemessage', 'voterresponse', + 'voterfinal', 'voterpost' + ] + }, + 'sccne_file': { + 'name': 'SMS Conversation Categorization and Name Extraction', + 'required_headers': ['noresponse', 'negresponse', 'posresponse', + 'affirmresponse', 'finalaffirmresponse', 'triplemessage', + 'voterresponse','voterfinal', 'voterpost', 'conversationid', + 'contact_phone' + ] + }, + 'smsagg_file': { + 'name': 'SMS Aggregation', + 'required_headers': ['MessageId', 'ConversationId', 'MessageDirection', 'MessageBody', 'ContactPhone'] + }, + 'vec_file': { + 'name': 'VAN Export Cleaning', + 'required_headers': ['voter_file_vanid','ContactName','NoteText'] + } +} + + +def allowed_file(file): + return ('.' 
in file.filename + and file.filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + and file.content_type == 'text/csv') + + +def unique_filename(): + return secure_filename( + '{}-{}.csv'.format( + ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(17)), + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ) + ) + + +def check_headers(file_path, upload_type): + # open file as csv and read the first line + with open(file_path, newline='') as f: + reader = csv.reader(f) + headers = next(reader) + print("headers", headers) # for debugging + for header in UPLOAD_TYPES[upload_type]['required_headers']: + if header not in headers: + f.close() + os.remove(file_path) + return False + return True + + +@celery.task(time_limit=1800) +def process_job(data): + # It seems silly to run these via the command line + # but we can easily change that later. + scripts_folder = config.SCRIPTS_FOLDER + scripts_home_dir = os.path.dirname(scripts_folder) + input_file = data['input_file'] + job_type = data['upload_type'] + output_file = unique_filename() + second_output_file = None + third_output_file = None + if job_type in ['vec_file','sccne_file']: + second_output_file = unique_filename() + if job_type == 'sccne_file': + third_output_file = unique_filename() + + cmd = None + if job_type == 'tblc_file': + cmd = 'python {}/name_cleaning.py -i {} -f {} -o {}'.format( + scripts_folder, input_file, scripts_home_dir, output_file) + elif job_type == 'vec_file': + cmd = 'python {}/van_export_cleaning.py -i {} -f {} -o {} -m {}'.format( + scripts_folder, input_file, scripts_home_dir, output_file, + second_output_file) + elif job_type == 'tblctmc_file': + cmd = 'python {}/name_cleaning_with_responses.py -i {} -f {} -o {}'.format( + scripts_folder, input_file, scripts_home_dir, output_file) + elif job_type == 'sccne_file': + cmd = 'python {}/annotate_conversations.py -i {} -f {} -o {} -n {} -m {}'.format( + scripts_folder, input_file, scripts_home_dir, 
output_file, second_output_file, + third_output_file) + elif job_type == 'smsagg_file': + cmd = 'python {}/aggregate_text_messages.py -d {} -o {}/{} -a "{}" -af "{}" -t "{}" -in "{}" -out "{}"'.format( + scripts_folder, input_file, config.RESULTS_FOLDER, + output_file, data['aff_regex'], data['aff_regex_final'], + data['init_triple_phrase'], data['inbound_text'], data['outbound_text']) + else: + err_log = "Unknown job type {}".format(job_type) + return False, err_log, None, None + + status = 'success' + print("cmd", cmd) + job_run = subprocess.run(cmd, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, shell=True) + if job_run.returncode: # exit code of 0 is success, 1 is generic error + status = 'error' + err_log = job_run.stdout.decode() + print("ERROR: Could not process file {} as type {}".format( + input_file, job_type)) + print(err_log) + else: + print("SUCCESS") + + if status == 'success': + if config.PROCESS_ASYNC: + result_files = list(filter( + None, [output_file, second_output_file, third_output_file])) + result_file = '|'.join([file for file in result_files]) + email_data = { + 'success': True, + 'result_file': result_file, + 'email': data['email'] or config.TEST_TARGET_EMAIL, + 'job_type': job_type, + 'original_file': data['original_file'], + } + res = email_results.apply_async(args=[email_data], countdown=0) + cleanup = cleanup_files.apply_async(args=[data], countdown=10) + # TODO: delete input file. 
If processing wasn't successful, leave the input file + # for troubleshooting + return True, output_file, second_output_file, third_output_file + else: # failed to process + if config.PROCESS_ASYNC: + email_data = { + 'success': False, + 'email': data['email'] or config.TEST_TARGET_EMAIL, + 'job_type': job_type, + 'original_file': data['original_file'], + 'error_log': err_log + } + res = email_results.apply_async(args=[email_data], countdown=0) + cleanup = cleanup_files.apply_async(args=[data], countdown=10) + return False, err_log, None, None + + +@celery.task +def email_results(data): + with current_app.app_context(): + # for some reason celery spins up a new instance without access to + # original config file, so we re-config here + current_app.config.from_object(config) + mail = Mail(current_app) + msg = Message(config.EMAIL_SUBJECT, + sender=config.EMAIL_SENDER, + recipients=[data['email']]) + # accommodating more than one output file + if data['success']: + output = data['result_file'].split('|') + results = ''.join(['

{}results/{}

'.format(config.BASE_URL, output_file) for output_file in output]) + msg.html = ( + "

Thank you for using the votetripling.org SMS transcript processing tool.

" + "

The {} script successfully processed your data file {}.

" + "

Link(s) to download the results:

" + "{}" + "

Your result files will be available for download for {} hours.

" + ).format(UPLOAD_TYPES[data['job_type']]['name'], data['original_file'], + results, config.FILE_LIFE) + else: # send along the error log + msg.html = ( + "

Thank you for using the votetripling.org SMS transcript processing tool.

" + "

The {} script failed to process your data file {}.

" + "

It returned the following error:

" + "
{}
" + "

Please correct the issues and try again.

" + ).format(UPLOAD_TYPES[data['job_type']]['name'], data['original_file'], + data['error_log']) + mail.send(msg) + return True + +@celery.task +def cleanup_files(data=None): + # delete input file + print('FILE CLEANUP') + if data: + input_file = data['input_file'] + if os.path.exists(input_file): + print('deleting input file {}'.format(input_file)) + os.remove(input_file) + else: + print('{} does not exist'.format(input_file)) + # delete output files that are more than config.FILE_LIFE hours old + print('deleting results files') + output_folder = config.RESULTS_FOLDER + files = os.listdir(config.RESULTS_FOLDER) + for file in files: + try: + date_time_obj = datetime.datetime.strptime( + file.split('-')[1].replace('.csv', ''), '%Y%m%d%H%M%S') + except: + continue + if date_time_obj + datetime.timedelta(hours=config.FILE_LIFE) < datetime.datetime.now(): + print(file) + os.remove('{}/{}'.format(config.RESULTS_FOLDER, file)) + return True + +# most of the business is here +@bp.route('/', methods=['GET', 'POST']) +def index(): + if request.method == 'POST': + if not (UPLOAD_TYPES.keys() & request.files.keys()): + flash('No file', 'error') + return redirect(request.url) + for upload_type in request.files: + if upload_type not in UPLOAD_TYPES: + return redirect(request.url) + file = request.files[upload_type] + if file.filename == '': + flash('No selected file', 'error') + return redirect(request.url) + if not (file and allowed_file(file)): + flash('Invalid file type. Select a CSV to upload', 'error') + return redirect(request.url) + + filename = unique_filename() + file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) + file.save(file_path) + if not check_headers(file_path, upload_type): + msg = 'Invalid CSV headers. 
Check CSV for required columns {}.'.format( + UPLOAD_TYPES[upload_type]['required_headers']) + flash(msg, 'error') + return redirect(request.url) + + email_name = '{}_email'.format(upload_type.split('_')[0]) + email = request.form[email_name] + aff_regex = None + aff_regex_final = None + init_triple_phrase = None + inbound_text = None + outbound_text = None + if upload_type == 'smsagg_file': + aff_regex = request.form['aff_regex'] + aff_regex_final = request.form['aff_regex_final'] + init_triple_phrase = request.form['init_triple_phrase'] + inbound_text = request.form['inbound_text'] + outbound_text = request.form['outbound_text'] + + outcome_msg = ('Queued file {} for processing as {}. Check your email ' + 'in a few minutes for results.').format( + file.filename, UPLOAD_TYPES[upload_type]['name']) + data = { + 'original_file': file.filename, + 'input_file': file_path, + 'upload_type': upload_type, + 'email': email, + 'aff_regex': aff_regex, + 'aff_regex_final': aff_regex_final, + 'init_triple_phrase': init_triple_phrase, + 'inbound_text': inbound_text, + 'outbound_text': outbound_text + } + + if current_app.config['PROCESS_ASYNC']: + res = process_job.apply_async(args=[data], countdown=0) + else: + # make the user wait for processing + print("Processing file {} type {}".format(file_path, upload_type)) + success, output, second_output, third_output = process_job(data) + if success: + output_files = list(filter(None, [output, second_output, third_output])) + result_links = ['File {}'.format( + x, output_files.index(x) + 1) for x in output_files] + outcome_msg = Markup( + 'Download results: {}'.format(' '.join(result_links))) + else: + outcome_msg = 'Error processing file {}'.format( + output) if current_app.config['SHOW_SCRIPT_ERRORS'] else 'Error processing file' + + flash(outcome_msg, 'info') + return redirect(request.url) + # GET + return render_template('upload_form.html') + + +@bp.route('/results/') +def results_file(filename): + return 
send_from_directory(current_app.config['RESULTS_FOLDER'], + filename) + + +# test emailing +@bp.route('/email-results/', methods=['GET']) +def email(): + if request.method == 'GET': + data = { + 'job_id': 1234, + 'upload_type': 'vec_file', + 'result_file': '123.csv|456.csv', + 'email': current_app.config['TEST_TARGET_EMAIL'], + 'original_file': 'EMAIL TEST' + } + res = email_results.apply_async(args=[data], countdown=0) + if res.failed(): + print('emailing results failed') + msg = ('Email queued for send! Check your inbox (and spam) for an email ' + 'from "{}" with the subject "{}".').format( + current_app.config['EMAIL_SENDER'], current_app.config['EMAIL_SUBJECT']) + return msg + return 'not allowed here' + + +# manually kick off processing +@bp.route('/process//', methods=['GET']) +def process(): + if request.method == 'GET': + data = { + 'input_file': '{}/{}'.format(current_app.config['UPLOAD_FOLDER'], file_name), + 'upload_type': upload_type, + 'email': None + } + res = process_job_async.apply_async(args=[data], countdown=0) + msg = ('File {} queued for processing as {}. Results will be emailed to {}.').format( + file_name, upload_type, current_app.config['TEST_TARGET_EMAIL']) + return msg + return 'not allowed here' + + +# manually kick off file cleanup +@bp.route('/cleanup/', methods=['GET']) +def cleanup(): + if request.method == 'GET': + res = cleanup_files.apply_async(countdown=0) + return ('deleting result files older than {} hours'.format(config.FILE_LIFE)) + return 'not allowed here' \ No newline at end of file diff --git a/parser/templates/upload_form.html b/parser/templates/upload_form.html new file mode 100644 index 0000000..3c36757 --- /dev/null +++ b/parser/templates/upload_form.html @@ -0,0 +1,312 @@ + + + + + Vote tripling SMS transcript parsing + + + + + + + + + +
+
+
+ Processing ... +
+ +
+ + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} +
    + {% for category, message in messages %} +
  • {{ message }}
  • + {% endfor %} +
+ {% endif %} + {% endwith %} + +
+

Vote tripling SMS transcript parsing

+ + +
+ +
+
+ +

SMS Aggregation

+
I need to aggregate SMS messages by conversation. This step is necessary before performing any extraction on SMS data.
+ +

Upload a csv file of raw individual SMS messages. The csv should contain the following columns:

+
    +
  • MessageId: Message timestamp OR an ascending unique ID for the message, used to determine message order
  • +
  • ConversationId: unique ID for each conversation
  • +
  • MessageDirection: direction (default: "Inbound" or "Outbound") of each message
  • +
  • MessageBody: the body/content of the message
  • +
  • ContactPhone: phone number or any other unique identifier for the recipient
  • +
+ +
+
+ + +
+
+ + +
+
+ +
+
+
+ +
+
+
+ +
+
+
+ +
+
+
+ +
+
+
+ +
+
+ +

If successful, the output will be a CSV with a single row representing each text message conversation, including the following fields: +

    +
  • ConversationId: a unique identifier for the conversation
  • +
  • contact_phone: the phone number of the target
  • +
  • totalMessages: the total number of messages exchanged
  • +
  • tripleMessage: initial message sent from the text banker to the target
  • +
  • voterResponse: initial response(s) by the target (generally where the target makes known if they opt out or want to triple)
  • +
  • tripleResponse: follow up message sent from the text banker to the target
  • +
  • voterFinal: the final follow up message sent by the target (generally where they provide names)
  • +
  • tripleFinal: final follow up sent by text banker
  • +
  • voterPost: post script from the target (generally a thank you or good luck)
  • +
  • noResponse: boolean for whether there was no response
  • +
  • negResponse: boolean for generally negative or discouraging terms (sorry, no, etc.)
  • +
  • posResponse: boolean for generally positive or encouraging terms
  • +
  • affirmResponse: boolean for presence of a scripted affirmation by text banker
  • +
  • finalAffirmResponse: boolean for presence of a scripted follow up affirmation by text banker
  • +
+ +
+ +
+
+

SMS Conversation Categorization and Name Extraction

+
I have SMS conversations and I need to figure out which text recipients volunteered to triple, which chose to opt out, what names they provided, and whether they moved.
+ +

Upload a csv file of the same format as the output of SMS Aggregation with a column of unique IDs labeled conversationid.

+ +
+
+ + +
+
+ + + +
+
+ +

If successful, the output will include three files: +

    +
  • A file of triplers. For each tripler, we provide the following fields (each row represents one text message conversation): +
      +
    • ConversationId: a unique identifier for the conversation
    • +
    • contact_phone: the phone number of the target
    • +
    • is_tripler: did this person agree to be a tripler ('yes' for everyone in this file)
    • +
    • opted_out: did this person opt out of future messages
    • +
    • wrong_number: did we have the wrong number for this person
    • +
    • names_extract: what names (if any) were provided by this person as tripling targets
    • +
    + +
  • A file of conversations for manual review, with the following fields: +
      +
    • ConversationId: a unique identifier for the conversation
    • +
    • contact_phone: the phone number of the target
    • +
    • voterResponse: initial response(s) by the target (generally where the target makes known if they opt out or want to triple)
    • +
    • voterFinal: the final follow up message sent by the target (generally where they provide names)
    • +
    • voterPost: post script from the target (generally a thank you or good luck)
    • +
    • is_tripler: guess for did this person agree to be a tripler (to be reviewed)
    • +
    • opted_out: guess for did this person opt out of future messages (to be reviewed)
    • +
    • wrong_number: guess for did we have the wrong number for this person (to be reviewed)
    • +
    • names_extract: guess for what names (if any) were provided by this person as tripling targets (to be reviewed)
    • +
    + +
  • A file of opt-outs, with the following fields: +
      +
    • conversationid: a unique identifier for the conversation
    • +
    • contact_phone: the phone number of the target
    • +
    • opted_out: yes/no
    • +
    • wrong_number: yes/no
    • +
    +
+ + +
+
+ +

Text Banker Log Cleaning

+ +
I have text banker logs for names provided by vote triplers. I need these logs cleaned up and standardized.
+ +

First, enter your email address to receive a link to the output file.

+

Then, upload a csv containing a column names that contains the names logged by a text banker.

+ +
+
+ + +
+
+ + + +
+
+ +

If successful, output will be a CSV with the cleaned names in a column titled clean_names, along with any other columns in the initial file.

+ +
+ +
+
+ +

Text Banker Log Cleaning (utilizing text message conversation)

+ +
I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized.
+ +

We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages.

+ +

Upload a CSV file of the same format as the output of SMS Aggregation. This CSV file must also contain a column names containing the names logged by a text banker. You can join the text banker logs to the aggregation output using the conversation ID.

+ +
+
+ + +
+
+ + + +
+ +
+ +

If successful, output will be a CSV with the cleaned names in a column titled clean_names, along with any other columns in the initial file.

+ +
+ +
+
+ +

VAN Export Cleaning

+ +
I have a VAN Export and I need to extract any tripling target names from the note text.
+ +

Upload a csv file containing the following columns:

+
    +
  • voter_file_vanid: a unique ID for this row
  • +
  • ContactName: the name of the tripler
  • +
  • NoteText: free text possibly including names of tripling targets
  • +
+ +
+
+ + +
+
+ + + +
+
+ +

This script will output two files: +

    +
  • A CSV file of triplers. For each tripler, we provide the following fields (each row represents one text message conversation): +
      +
    • VANID: a unique ID for this row
    • +
    • names_extract: the extracted names
    • +
    +
  • +
  • A CSV file of conversations for manual review, with the following fields: +
      +
    • VANID: a unique ID for this row
    • +
    • ContactName: the name of the tripler
    • +
    • NoteText: free text possibly including names of tripling targets
    • +
    • names_extract: a guess for the extracted names (to be reviewed)
    • +
    +
  • +
+ +
+ +
+ + + + + + \ No newline at end of file diff --git a/parser/tests/test_parser.py b/parser/tests/test_parser.py new file mode 100644 index 0000000..d9d8b0b --- /dev/null +++ b/parser/tests/test_parser.py @@ -0,0 +1,26 @@ +import os +import tempfile + +import pytest + +import parser + +@pytest.fixture +def client(): + db_fd, parser.app.config['DATABASE'] = tempfile.mkstemp() + parser.app.config['TESTING'] = True + + with parser.app.test_client() as client: + with parser.app.app_context(): + parser.init_db() + yield client + + os.close(db_fd) + os.unlink(parser.app.config['DATABASE']) + + +# TODO + +# test posting wrong file extension to / +# test posting files with bad headers to / +# test posting right file with right headers to / diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1ddc4a7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +celery==5.0.5 +coverage==5.4 +Flask==1.1.2 +Flask-Mail==0.9.1 +matplotlib==3.3.3 +nltk==3.5 +numpy==1.19.4 +pandas==1.1.5 +pathlib==1.0.1 +pytest==6.2.2 +python-Levenshtein==0.12.0 +redis==3.5.3 +scikit-learn==0.23.2 +scipy==1.5.4 +spacy==2.3.4