
Commit 1f06dcb

Correct paths and add SHAP explanations
1 parent 9f20559 commit 1f06dcb

8 files changed, +537 -19 lines changed

.gitignore (+4)

@@ -7,3 +7,7 @@
 pages/
 patterns.json
 *.npy
+*.json
+*.h5
+*.png
+regex101/

Pipfile (+2)

@@ -13,6 +13,8 @@ tqdm = "*"
 pandas = "*"
 scikit-learn = "*"
 tensorflow = "*"
+shap = "*"
+matplotlib = "*"
 
 [dev-packages]

Pipfile.lock (+390 -4)

Some generated files are not rendered by default.

README.md (+38 -2)

@@ -1,21 +1,57 @@
 # Learning from Uncurated Regular Expressions
 
 Dependencies of all Python code are managed with [Pipenv](https://pipenv.pypa.io/en/latest/) and can be installed with `pipenv install`.
-Note that the dataset from the [Sherlock](https://github.com/mitmedialab/sherlock-project) project should be available in a copy of the repository in the same directory as this project.
+Note that the dataset from the [Sherlock](https://github.com/mitmedialab/sherlock-project) project should be available in a copy of the repository alongside the directory for this project.
 [`jq`](https://jqlang.github.io/jq/) is also required for some JSON processing.
 
+## Model training
+
 1. Download all regular expressions from regex101
 
 `./download_patterns.sh`
 
+This will create a directory `regex101`, which has the individual regular expressions, and `patterns.json`, which contains only the expression strings.
+
 2. Compile a database of all the downloaded regular expressions
 
-`pipenv run python compile_db.py < patterns.json`
+`pipenv run python compile_db.py < patterns.json > patterns_final.json`
+
+`patterns_final.json` is a subset of the expressions in `patterns.json` which are supported by Hyperscan.
+This step will also create `hs.db`, which contains the compiled regular expressions that can be used during preprocessing.
 
 3. Preprocess the data to generate feature vectors
 
 `pipenv run python preprocess.py train`
 
+This will generate `preprocessed_train.txt`, which contains all the feature vectors extracted using the regular expressions.
+
 4. Train the model on the extracted features
 
 `pipenv run python train.py`
+
+The model architecture will be stored in `nn_model_sherlock.json` with the weights in `nn_model_weights_sherlock.h5`.
+
+## Evaluation
+
+First, the test data must be preprocessed.
+
+`pipenv run python preprocess.py test`
+
+Then, the model can be evaluated.
+
+`pipenv run python test.py`
+
+## Model explanation
+
+Explanations of predictions for an individual class can be generated using [SHAP](https://shap.readthedocs.io/en/latest/).
+First, follow the steps for training the model above.
+The file `patterns_final.json` will be used to match the patterns back to the original regular expressions.
+
+`pipenv run python find_patterns.py > pattern_ids.txt`
+
+This file of pattern IDs will then be used to label the SHAP plot with the ID of the regular expression.
+To generate the SHAP plot in `shap.png`, run the command below, where `<class_name>` is one of the semantic types defined by Sherlock.
+
+`pipenv run python explain.py <class_name>`
+
+The IDs displayed in the SHAP plot can be used to reference the regular expressions by ID in the `regex101/regexes` directory or to view them directly on regex101 at the URL `https://regex101.com/library/<ID>`.
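For context (not part of the diff), a minimal sketch of turning an ID shown in the SHAP plot back into a regex101 link, assuming `pattern_ids.txt` holds one ID per feature column as described above; the `feature_index` value is purely illustrative:

    # Illustrative only: map a feature column index from the SHAP plot to its
    # regex101 library URL using the IDs written by find_patterns.py.
    feature_index = 42  # hypothetical index read off the SHAP plot
    pattern_ids = [line.strip() for line in open("pattern_ids.txt")]
    print("https://regex101.com/library/%s" % pattern_ids[feature_index])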

compile_db.py (+6 -5)

@@ -20,18 +20,19 @@
     except hyperscan.error as e:
         pass
 
+# Build input for the final Hyperscan database
 db = hyperscan.Database()
-num_patterns = 0
 patterns = []
 ids = []
 flags = []
-for regex in regexes:
+for (i, regex) in enumerate(regexes):
+    print(json.dumps(regex))
     patterns.append(regex.encode("utf8"))
-    ids.append(num_patterns)
+    ids.append(i)
     flags.append(hyperscan.HS_FLAG_SINGLEMATCH | hyperscan.HS_FLAG_UTF8)
-    num_patterns += 1
 
+# Compile the final database and save to file
 sys.stderr.write("Compiling %d patterns...\n" % len(patterns))
 db.compile(expressions=patterns, ids=ids, flags=flags)
 with open("hs.db", "wb") as f:
-    pickle.dump([num_patterns, hyperscan.dumpb(db)], f)
+    pickle.dump([len(patterns), hyperscan.dumpb(db)], f)
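As a rough illustration (not part of this commit), the serialized database written above could be reloaded and used to scan a single value. This sketch assumes the `loadb` and `scan` calls from the same python-hyperscan package that compile_db.py uses; the scanned string is hypothetical:

    import pickle

    import hyperscan

    # Reload the pickled [pattern count, serialized database] pair written by compile_db.py
    with open("hs.db", "rb") as f:
        num_patterns, serialized = pickle.load(f)
    db = hyperscan.loadb(serialized)

    # Collect the id of every pattern that matches one hypothetical value
    matched = []
    def on_match(pat_id, start, end, flags, context):
        matched.append(pat_id)

    db.scan("2023-01-01".encode("utf8"), match_event_handler=on_match)
    print("%d of %d patterns matched" % (len(matched), num_patterns))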

download_patterns.sh (+8 -8)

@@ -1,22 +1,22 @@
 #!/bin/bash
 
 # Download each page of search results
-mkdir -p pages/
-wget "https://regex101.com/api/library/1/?orderBy=MOST_POINTS&search=" -O pages/1.json
-PAGES=$(jq -r .pages pages/1.json)
+mkdir -p regex101/pages/
+wget "https://regex101.com/api/library/1/?orderBy=MOST_POINTS&search=" -O regex101/pages/1.json
+PAGES=$(jq -r .pages regex101/pages/1.json)
 for i in $(seq 2 $PAGES); do
     # Fetch this page of regular expressions
-    wget "https://regex101.com/api/library/$i/?orderBy=MOST_POINTS&search=" -O "pages/$i.json"
+    wget "https://regex101.com/api/library/$i/?orderBy=MOST_POINTS&search=" -O "regex101/pages/$i.json"
     sleep 1
 done
 
 # Extract all fragments from each page to get individual regexes
-mkdir -p regexes/
-jq -cr '.data[] | (.permalinkFragment + " https://regex101.com/api/regex/" + .permalinkFragment + "/" + (.version | tostring))' pages/*.json | \
+mkdir -p regex101/regexes/
+jq -cr '.data[] | (.permalinkFragment + " https://regex101.com/api/regex/" + .permalinkFragment + "/" + (.version | tostring))' regex101/pages/*.json | \
 while read -r frag url; do
     # If the regex has not already been fetched, fetch it
-    [ -f "regexes/$frag.json" ] || (wget -O "regexes/$frag.json" -nc "$url"; sleep 1)
+    [ -f "regex101/regexes/$frag.json" ] || (wget -O "regex101/regexes/$frag.json" -nc "$url"; sleep 1)
 done
 
 # Extract all PCRE regexes without newlines into a file
-jq -c 'select((.flavor == "pcre") and (.regex | contains( "\n") | not)) .regex' ../regex101/regexes/* > patterns.json
+jq -c 'select((.flavor == "pcre") and (.regex | contains( "\n") | not)) .regex' regex101/regexes/* > patterns.json
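As an aside (not part of this commit), the final jq filter above is roughly equivalent to the following Python, which keeps only PCRE-flavored expressions that contain no newline:

    import glob
    import json

    # Rough Python equivalent of the last jq command in download_patterns.sh
    for path in glob.glob("regex101/regexes/*.json"):
        try:
            obj = json.load(open(path))
        except json.decoder.JSONDecodeError:
            continue
        if obj.get("flavor") == "pcre" and "\n" not in obj.get("regex", ""):
            print(json.dumps(obj["regex"]))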

explain.py (+73)

@@ -0,0 +1,73 @@
+import argparse
+import io
+import json
+import random
+
+import matplotlib.pyplot as plt
+import numpy as np
+from pyarrow.parquet import ParquetFile
+import shap
+import tensorflow as tf
+
+BACKGROUND_SIZE = 1000
+SAMPLE_SIZE = 500
+
+
+# Implements reservoir sampling
+def update_sample(samples, N, sample):
+    if sample is None:
+        return
+
+    if len(samples) < BACKGROUND_SIZE:
+        samples.append(str(sample))
+    else:
+        s = int(random.random() * N)
+        if s < BACKGROUND_SIZE:
+            samples[s] = str(sample)
+
+
+class_names = np.load("classes.npy", allow_pickle=True)
+parser = argparse.ArgumentParser()
+parser.add_argument("class_name", choices=class_names)
+args = parser.parse_args()
+
+# Get indexes of samples matching the given
+# class in the first SAMPLE_SIZE values
+pq_labels = ParquetFile("../sherlock-project/data/data/raw/train_labels.parquet")
+class_idx = list(
+    np.where(
+        pq_labels.read(columns=["type"]).columns[0].to_numpy()[:SAMPLE_SIZE]
+        == args.class_name
+    )[0]
+)
+
+# See https://github.com/slundberg/shap/issues/1406
+shap.explainers._deep.deep_tf.op_handlers[
+    "AddV2"
+] = shap.explainers._deep.deep_tf.passthrough
+
+# Load the trained model
+model = tf.keras.models.model_from_json(open("nn_model_sherlock.json").read())
+model.load_weights("nn_model_weights_sherlock.h5")
+
+# Produce a random sample of background from the training data
+background = []
+for (i, line) in enumerate(open("preprocessed_train.txt")):
+    update_sample(background, i, line)
+
+matrix = np.loadtxt(io.StringIO("".join(background)))
+del background
+
+# Load sample values matching the given class
+sample = np.loadtxt(open("preprocessed_train.txt", "r"), max_rows=SAMPLE_SIZE)[
+    class_idx, :
+]
+
+# Use SHAP to create a summary plot
+e = shap.DeepExplainer(model, matrix)
+shap_values = e.shap_values(sample)
+feature_names = [l.strip() for l in open("pattern_ids.txt")]
+shap.summary_plot(
+    shap_values, sample, class_names=class_names, feature_names=feature_names
+)
+plt.savefig("shap.png")
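The `update_sample` routine above follows the standard reservoir sampling idea; a generic, standalone sketch of that algorithm (Algorithm R, not the exact routine committed above) looks like this:

    import random

    def reservoir_sample(stream, k):
        # Keep a uniform random sample of k items from a stream of unknown length
        reservoir = []
        for i, item in enumerate(stream):
            if len(reservoir) < k:
                reservoir.append(item)
            else:
                j = random.randint(0, i)
                if j < k:
                    reservoir[j] = item
        return reservoir

    print(reservoir_sample(range(100000), 5))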

find_patterns.py (+16)

@@ -0,0 +1,16 @@
+import glob
+import json
+
+# Create a dictionary of possible patterns
+pat_dict = {}
+for file in glob.glob("regex101/regexes/*.json"):
+    try:
+        obj = json.load(open(file))
+        pat_dict[obj["regex"]] = file.split("/")[-1].split(".")[0]
+    except json.decoder.JSONDecodeError:
+        pass
+
+# Output the ID of each pattern
+for line in open("patterns_final.json"):
+    pat = json.loads(line)
+    print(pat_dict[pat])
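Since compile_db.py assigns Hyperscan ids by enumerating the same expressions it writes to `patterns_final.json`, line i of `pattern_ids.txt` should name the regex101 entry behind Hyperscan pattern id i. A small sketch (not part of this commit) that zips the two files back together:

    import json

    # Pair each Hyperscan pattern id with its regex101 ID and original expression
    ids = [line.strip() for line in open("pattern_ids.txt")]
    patterns = [json.loads(line) for line in open("patterns_final.json")]
    for hs_id, (regex101_id, pattern) in enumerate(zip(ids, patterns)):
        print(hs_id, regex101_id, pattern)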

0 commit comments
