Changes from all commits (57 commits):

986f904 requirements.txt (Dec 7, 2020)
c67d4eb requirements.txt (Dec 7, 2020)
de022ee stub of a Flask frontend (Dec 18, 2020)
5cb21a9 stub of a Flask frontend (Dec 18, 2020)
3cdb61c some thoughts on the backend (Jan 3, 2021)
95c8fde some thoughts on the backend (Jan 3, 2021)
2c8ce6d updated UI with temporarily disabled forms (Jan 4, 2021)
7b8a95d updated UI with temporarily disabled forms (Jan 4, 2021)
e170abf commenting (Jan 7, 2021)
535e3df commenting (Jan 7, 2021)
c876292 added tests and checks (Feb 26, 2021)
dc2523d added tests and checks (Feb 26, 2021)
aff5d1e refactored as a factory (Mar 1, 2021)
e06da64 refactored as a factory (Mar 1, 2021)
86b9b17 queueing (Mar 3, 2021)
c1266f9 queueing (Mar 3, 2021)
a88e60b updated to start processing jobs (Mar 6, 2021)
7c06ec4 updated to start processing jobs (Mar 6, 2021)
6da4c0f simplest case works without queueing (Mar 10, 2021)
60cec14 simplest case works without queueing (Mar 10, 2021)
95252b6 template and settings updates (Mar 12, 2021)
dc3d007 template and settings updates (Mar 12, 2021)
aca08e0 whitespace and some dirname changes bc case matters in linux (Mar 25, 2021)
2c7f6a9 whitespace and some dirname changes bc case matters in linux (Mar 25, 2021)
4e3eacd refactor file naming (Apr 6, 2021)
48b63e9 refactor file naming (Apr 6, 2021)
cc9c93a fix bugs with unique filename and file stream pointer (Apr 9, 2021)
f1d5c67 fix bugs with unique filename and file stream pointer (Apr 9, 2021)
7c7d46e enable VAN export cleaning and support two output files (Apr 13, 2021)
01a84cd enable VAN export cleaning and support two output files (Apr 13, 2021)
a5971d3 added loader that you can't see (Apr 14, 2021)
8ec3c60 added loader that you can't see (Apr 14, 2021)
1b59797 in theory enabled two more upload types (Apr 14, 2021)
91a71c2 in theory enabled two more upload types (Apr 14, 2021)
236def6 Merge branch 'master' into frontend (Apr 19, 2021)
7ba3539 Merge branch 'master' into frontend (Apr 19, 2021)
ede82a3 email sending works (Apr 22, 2021)
989b591 switched to instance relative config (Apr 22, 2021)
fc755c5 moved test target email to config (Apr 23, 2021)
fbbd52d readd tasks file (Apr 23, 2021)
b481710 Merge branch 'frontend' of github.com:MoveOnOrg/votetripling into fro… (Apr 23, 2021)
fba797a Merge branch 'master' into frontend (Apr 24, 2021)
c15471b Merge branch 'frontend' into queue (Apr 24, 2021)
0d7b0e5 enabled sms aggregation script with no interactivity (Apr 27, 2021)
c444955 extraction has three outputs, not two (Apr 28, 2021)
066347a support more args for aggregation (Apr 28, 2021)
183616c removed db to simplify and all works in synchronous mode (Apr 28, 2021)
4d295ce works in async mode (Apr 29, 2021)
a34f940 file cleanup now works (May 4, 2021)
b680a0f no more db (May 4, 2021)
23800f7 using csv reader allows multiple types of csv formatting (May 18, 2021)
5f879a9 fixed position messages: (May 18, 2021)
40db4c7 specify inbound and outbound text (May 18, 2021)
47f40fc made file upload and email required in each form (May 21, 2021)
470ed68 add required headers for script 1 and make script 1 headers match eve… (May 21, 2021)
f222588 insert original filename (May 21, 2021)
3b50cac now emailing errors (May 21, 2021)
18 changes: 17 additions & 1 deletion .gitignore
@@ -1,2 +1,18 @@
*.DS_Store
.DS_Store
*.csv
*.xlsx
settings.py
config.py

__pycache__
*.pyc
*.rdb
*.sqlite

.pytest_cache/
.coverage
htmlcov/

dist/
build/
*.egg-info/
Binary file not shown.
@@ -380,7 +380,7 @@ def main(args):
)
PARSER.add_argument(
"-p", "--phoneCol",
default="EndpointPhoneNumber",
default="ContactPhone",
help="name of the column in input data containing the phone number. Any unique identifier for the recipient will suffice"
)
PARSER.add_argument(
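The diff above changes the default for `--phoneCol` from `EndpointPhoneNumber` to `ContactPhone`. A minimal reconstruction of just this argument shows the effect; only the names visible in the diff are real, the rest of the parser setup is assumed:

```python
import argparse

# Sketch of the changed argument; the surrounding parser setup is assumed.
PARSER = argparse.ArgumentParser()
PARSER.add_argument(
    "-p", "--phoneCol",
    default="ContactPhone",  # was "EndpointPhoneNumber" before this change
    help="name of the column in input data containing the phone number. "
         "Any unique identifier for the recipient will suffice"
)

print(PARSER.parse_args([]).phoneCol)                             # ContactPhone
print(PARSER.parse_args(["-p", "EndpointPhoneNumber"]).phoneCol)  # EndpointPhoneNumber
```

Callers relying on the old column name can still opt back in with `-p EndpointPhoneNumber`.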
Binary file removed Projects/NLP/SMS_Annotation/Input_Data/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions Projects/NLP/SMS_Annotation/Input_Data/readme.txt
@@ -0,0 +1 @@
Uploaded data will show up in this dir!
86 changes: 65 additions & 21 deletions README_local.md
@@ -11,6 +11,10 @@ This document describes how to use 5 versions of name extraction scripts for vot
- Levenshtein `pip install python-Levenshtein`
- NLTK `pip install nltk`

- Alternatively, if you are in an environment where you can't or don't want to install Anaconda, install Python 3.6.9+, create and activate a Python 3 virtual environment (see [pipenv and virtualenv](https://docs.python-guide.org/dev/virtualenvs/)), and run `pip install -r requirements.txt`

- You'll also need to run `spacy download en` once.

## Getting Started
Find your use case below and add your input data to the appropriate place, then run the specified python script.
All of these scripts should be run out of the directory `Projects/NLP/SMS_Annotation`
@@ -21,7 +25,7 @@ All output data (after running a script) will be found in `Projects/NLP/SMS_Anno
**Use Case:** I need to aggregate SMS messages by conversation. This step is necessary before performing any extraction on SMS data.

**Inputs:**
Add a csv to the Input_Data folder. This csv should be raw individual SMS messages, not grouped by conversation.

**Instructions:**
Open the script aggregate_text_messages.R in RStudio and follow the instructions to aggregate messages into a single row per conversation.
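The aggregation itself is done by the R script in this repo; as a sketch of what "a single row per conversation" means, grouping raw message rows might look like this (the column names here are hypothetical, the real ones live in aggregate_text_messages.R):

```python
from collections import defaultdict

def aggregate_messages(rows):
    """Collapse raw per-message rows into one row per conversation.

    `rows` is a list of dicts with hypothetical keys "conversation_id"
    and "body"; this is an illustration, not the repo's R logic.
    """
    conversations = defaultdict(list)
    for row in rows:
        conversations[row["conversation_id"]].append(row["body"])
    return [
        {"conversation_id": cid, "messages": " | ".join(bodies)}
        for cid, bodies in conversations.items()
    ]

rows = [
    {"conversation_id": 1, "body": "Will you remind 3 friends to vote?"},
    {"conversation_id": 1, "body": "Sure! Ana, Ben and Cal"},
    {"conversation_id": 2, "body": "STOP"},
]
result = aggregate_messages(rows)  # two rows: one per conversation
```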
@@ -45,16 +49,16 @@ A file (filename specified by you in the R script) with a single row representin


## SMS Conversation Categorization and Name Extraction
**Use Case:** I have SMS conversations and I need to figure out which text recipients volunteered to triple, which chose to opt out, what names they provided, and whether they moved.

**Inputs:**
Add a CSV to the Input_Data folder. This CSV file must be of the same format as the output of the aggregation in step 1.

**Instructions:**
In this directory, run `python3 Code/annotate_conversations.py -i [input_filename]`.

**Outputs:**
This script will output three files:
1. A file of triplers called `sms_triplers.csv`. For each tripler, we provide the following fields (each row represents one text message conversation):
- *ConversationId* a unique identifier for the conversation
- *contact_phone* the phone number of the target
@@ -74,46 +78,45 @@ This script will output two files:
- *wrong_number* guess for did we have the wrong number for this person (to be reviewed)
- *names_extract* guess for what names (if any) were provided by this person as tripling targets (to be reviewed)


3. A file of opt-outs

## Text Banker Log Cleaning
**Use Case:** I have text banker logs for names provided by vote triplers. I need these logs cleaned up and standardized.

**Inputs:**
Add a csv to the Input_Data folder. This csv file must contain a column 'names' containing the names logged by a text banker.

**Instructions:**
In this directory, run `python3 Code/name_cleaning.py -i [input_filename]`

**Outputs:**
A file in `Output_Data` named `labeled_names_cleaned_no_response.csv` with the cleaned names in a column titled "clean_names", along with any other columns in the initial file

## Text Banker Log Cleaning (utilizing text message conversation)
**Use Case:** I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized. We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages.

**Inputs:**
Add a csv to the Input_Data folder.
This csv file must be of the same format as the output of the aggregation in step 1.
This csv file must also contain column 'names' containing the names logged by a text banker.

**Instructions:**
In this directory, run `python3 Code/name_cleaning_with_responses.py -i [input_filename]`

**Outputs:**
A file in `Output_Data` named `labeled_names_cleaned_with_response.csv` with the cleaned names in a column titled "clean_names", along with any other columns in the initial file



## VAN Export Cleaning
**Use Case:** I have a VAN Export and I need to extract any tripling target names from the note text.

**Inputs:**
Add a csv to the Input_Data folder. This csv file must contain the following columns:
- *voter_file_vanid* a unique ID for this row
- *ContactName* the name of the tripler
- *NoteText* free text possibly including names of tripling targets

**Instructions:**
In this directory, run `python3 Code/van_export_cleaning.py -d [input_filename]`

**Outputs:**
This script will output two files:
@@ -126,3 +129,44 @@ This script will output two files:
- *ContactName* the name of the tripler
- *NoteText* free text possibly including names of tripling targets
- *names_extract* a guess for the extracted names (to be reviewed)

# Running the app frontend
app.py is a Python 3, Flask-based frontend that provides a dedicated UI for uploading data sets and running the scripts above on them.

Make sure you've created and activated a virtual environment (see Requirements) and installed everything in requirements.txt.

You'll need to [install Redis](https://redis.io/topics/quickstart). On macOS, install Homebrew and then `brew install redis`. You may also need to run `pip install "celery[redis]"`

Copy `instance/config.py.example` to `instance/config.py` and fill it in.

To run an instance of the frontend locally, from the project root directory run:
```
export FLASK_APP=parser
export FLASK_ENV=development
flask run
```
and access the running application at [http://localhost:5000/](http://localhost:5000/)

## Configuring email

Email config variables in the example config file assume you are using Gmail for testing. Two important notes:
* Gmail probably isn't adequate for production scale; you can only send about 100 emails a day.
* Gmail doesn't consider apps that send mail over the SMTP protocol secure. When you try to run the app with a Gmail account, you'll get security warnings on that account unless you have enabled what Google calls ["Less Secure Apps"](https://support.google.com/accounts/answer/6010255?hl=en).
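For reference, the sending path behind these settings boils down to standard-library SMTP over TLS. The app itself uses Flask-Mail, so this is only a sketch; the addresses and credentials below are placeholders mirroring the example config:

```python
import smtplib
from email.message import EmailMessage

def build_message(sender, recipient, subject, body):
    """Assemble a plain-text email using only the standard library."""
    msg = EmailMessage()
    msg["From"] = sender
    msg["To"] = recipient
    msg["Subject"] = subject
    msg.set_content(body)
    return msg

msg = build_message("notarealemail@gmail.com", "you@example.com",
                    "SMS transcript processing", "Your results are attached.")

# Actually sending would look like this (requires real credentials and a
# Gmail account with "Less Secure Apps" enabled):
# with smtplib.SMTP("smtp.gmail.com", 587) as server:
#     server.starttls()
#     server.login("notarealemail@gmail.com", "password1234")
#     server.send_message(msg)
```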

## Running scripts async in the background vs. waiting for results

If config.PROCESS_ASYNC is set to True, the app uses Celery workers, a Redis queue, and Flask-Mail to manage script jobs and email results in the background. If config.PROCESS_ASYNC is set to False, the app runs script jobs synchronously and waits to deliver the results as linked files.

Synchronous mode is not recommended for production if you expect lots of large files that take a while (> 30 seconds) to process.

If you set config.PROCESS_ASYNC to True, you'll need to run Celery and Redis (which Celery uses to manage its queue):
* `celery -A celery_worker.celery worker --loglevel=info` will spin up a celery worker for you in a local dev environment. [More on celery workers](https://docs.celeryproject.org/en/stable/userguide/workers.html)
* Run redis in a different terminal window with `redis-server`.
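Stripped of the Flask and Celery specifics, the PROCESS_ASYNC switch amounts to choosing between enqueueing a job and running it inline. A toy sketch of that branch (the function names here are hypothetical, not the app's actual code):

```python
def dispatch_job(run_inline, enqueue, process_async):
    """Run a job now, or hand it to a queue, depending on config.

    `run_inline` and `enqueue` stand in for the real script runner and a
    Celery task's .delay(); both names are hypothetical.
    """
    if process_async:
        enqueue()          # a worker processes it and emails results later
        return "queued"
    return run_inline()    # the request blocks and gets linked files back

# Synchronous mode: the caller waits and receives the result directly.
sync_result = dispatch_job(lambda: "results.csv", lambda: None, process_async=False)

# Async mode: the job is queued and the HTTP request returns immediately.
async_result = dispatch_job(lambda: "results.csv", lambda: None, process_async=True)
```

This is why synchronous mode risks timeouts on large files: the web request holds open for the whole `run_inline` call.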

## Testing the app frontend

`pytest` should run all the tests in the `tests` folder.

## TODO

A Docker container would ease deployment.
34 changes: 34 additions & 0 deletions instance/config.py.example
@@ -0,0 +1,34 @@
import os

SECRET_KEY = 'dev' # change this for prod!
BASE_URL = 'http://localhost:5000/' # local dev
MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16 MB

APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# note: no trailing commas here; a trailing comma would turn each path into a tuple
UPLOAD_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Input_Data')
RESULTS_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Output_Data')
SCRIPTS_FOLDER = os.path.join(os.path.dirname(APP_ROOT), 'Projects/NLP/SMS_Annotation/Code')

# If PROCESS_ASYNC is set to True, we run scripts in the background and email
# results with celery, redis and flask-mail.
# If PROCESS_ASYNC is set to False, we run scripts synchronously and await a
# link to the results. If a script takes too long to run (as it can with larger
# files), the web app may time out before the script finishes.
PROCESS_ASYNC = True

# Show full script error output in error message to user
SHOW_SCRIPT_ERRORS = False

CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/1'

MAIL_SERVER = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USE_TLS = True
MAIL_USERNAME = 'notarealemail@gmail.com'
MAIL_PASSWORD = 'password1234'
EMAIL_SENDER = 'Votetripling SMS Transcript Processing, notarealemail@gmail.com'
EMAIL_SUBJECT = 'SMS transcript processing'
TEST_TARGET_EMAIL = 'your.email@example.com'

FILE_LIFE = 72 # number of hours uploaded and result files are kept on the server
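A FILE_LIFE-style expiry check reduces to comparing file modification times against a cutoff. A sketch of the idea (the helper name is hypothetical, not the app's actual cleanup code):

```python
import os
import time

def stale_files(folder, life_hours=72, now=None):
    """Return paths in `folder` not modified within the last `life_hours`.

    Hypothetical helper illustrating the FILE_LIFE setting; 72 mirrors the
    example config value above.
    """
    now = time.time() if now is None else now
    cutoff = now - life_hours * 3600
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if os.path.getmtime(os.path.join(folder, name)) < cutoff
    )
```

A periodic task (cron or a Celery beat schedule) could then delete whatever this returns.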
43 changes: 43 additions & 0 deletions parser/__init__.py
@@ -0,0 +1,43 @@
# because this code gets run from several different places, update PATH
# so we can find modules from wherever we run things
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'instance'))

from celery import Celery
from flask import Flask
import config

celery = Celery(__name__, broker=config.CELERY_BROKER_URL, result_backend=config.CELERY_RESULT_BACKEND)

def create_app(test_config=None):
# create and configure the app
app = Flask(__name__, instance_relative_config=True)
app.instance_path = (os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'instance')) # maybe setting this manually fixes the config problem in blueprint celery tasks
app.config.from_mapping(
SECRET_KEY='dev',
DATABASE=os.path.join(app.instance_path, 'parser.sqlite')
)
if test_config is None:
# load the instance config, if it exists, when not testing
app.config.from_pyfile('config.py', silent=True)
else:
# load the test config if passed in
app.config.from_mapping(test_config)

import main
app.register_blueprint(main.bp)

# ensure the instance folder exists
try:
os.makedirs(app.instance_path)
except OSError:
pass

# redis_client.init_app(app)
celery.conf.update(app.config)

return app
6 changes: 6 additions & 0 deletions parser/celery_worker.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python
import os
from __init__ import celery, create_app

app = create_app()  # create_app takes an optional test-config mapping, not a config name
app.app_context().push()