# Workflow file for the "Analyze" workflow (as captured from run "Analyze #10").

# Workflow header: display name, workflow-wide environment, and triggers.
# Runs on every pull request, on a weekly schedule, and on manual dispatch.
name: Analyze

env:
  # Not referenced by any step visible in this workflow.
  # NOTE(review): presumably read by tooling outside this file — confirm
  # before removing.
  TEST_PATTERN: '*justice.gov/*'

on:
  pull_request: {}

  schedule:
    # Tuesdays at 16:40 UTC (9:40am PDT / 8:40am PST).
    - cron: '40 16 * * 2'

  # Manual runs. Every string input defaults to empty, which makes the
  # analyze job fall back to its own DEFAULT_* values.
  workflow_dispatch:
    inputs:
      threshold:
        description: 'Threshold'
        required: false
        type: string
        # Explicit empty default, consistent with the other string inputs.
        default: ''
      pattern:
        description: 'Pattern'
        required: false
        type: string
        default: ''
      from:
        description: 'From Time'
        required: false
        type: string
        default: ''
      to:
        description: 'To Time'
        required: false
        type: string
        default: ''
      readability:
        description: 'Use Readability'
        required: false
        type: boolean
        default: true
      # When non-empty, the upload job also copies the results to Google Drive
      # under this folder name.
      upload_folder:
        description: 'Save to GDrive with this folder name'
        required: false
        type: string
        default: ''
jobs:
  # Runs generate_task_sheets.py and publishes its `out/` directory as the
  # `output` artifact.
  analyze:
    env:
      # Fallbacks used whenever the matching dispatch input is empty (which is
      # always the case for pull_request and schedule events). Scheduled runs
      # use a different pattern and start offset than other runs.
      DEFAULT_PATTERN: ${{ github.event_name == 'schedule' && '*' || '*justice.gov/*' }}
      DEFAULT_THRESHOLD: '0.25'
      DEFAULT_FROM: ${{ github.event_name == 'schedule' && '268' || '240' }}
      DEFAULT_TO: '0'
      # 'true' unless the `readability` dispatch input is explicitly false, so
      # non-dispatch events (where the input is unset) keep readability on.
      USE_READABILITY: ${{ inputs.readability == false && 'false' || 'true' }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read it as the float 3.1.
          python-version: '3.10'
          cache: pip

      - name: Install System Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            gcc g++ pkg-config libxml2-dev libxslt-dev libz-dev

      - name: Install Python Dependencies
        run: pip install -r requirements.txt

      - name: Download NLTK Corpora
        run: |
          python -m nltk.downloader stopwords

      # The Node.js toolchain and readability-server are only set up when
      # readability is enabled.
      - uses: actions/setup-node@v4
        if: env.USE_READABILITY == 'true'
        with:
          node-version: '22'
          cache: 'npm'
          cache-dependency-path: readability-server/package-lock.json

      - name: Install readability-server dependencies
        if: env.USE_READABILITY == 'true'
        run: |
          cd readability-server
          npm ci

      - name: Run readability-server
        if: env.USE_READABILITY == 'true'
        # Started in the background so the following steps can proceed.
        # NOTE(review): there is no readiness wait here — confirm that
        # generate_task_sheets.py tolerates the server still starting up.
        run: |
          cd readability-server
          npm start &

      - name: Analyze!
        env:
          WEB_MONITORING_DB_URL: '${{ secrets.WEB_MONITORING_DB_URL }}'
          READABILITY_OPTIONS: ${{ env.USE_READABILITY == 'false' && '--skip-readability' || '' }}
          # Dispatch inputs are handed to the shell as environment variables
          # instead of being spliced into the script text, so a value that
          # contains quotes cannot break (or inject into) the command line.
          ANALYZE_AFTER: ${{ inputs.from || env.DEFAULT_FROM }}
          ANALYZE_BEFORE: ${{ inputs.to || env.DEFAULT_TO }}
          ANALYZE_THRESHOLD: ${{ inputs.threshold || env.DEFAULT_THRESHOLD }}
          ANALYZE_PATTERN: ${{ inputs.pattern || env.DEFAULT_PATTERN }}
        # $READABILITY_OPTIONS is intentionally unquoted: when empty it must
        # expand to no argument at all rather than an empty-string argument.
        run: |
          echo "Readability options: '${READABILITY_OPTIONS}'"
          python generate_task_sheets.py \
            --output out \
            --after "${ANALYZE_AFTER}" \
            --before "${ANALYZE_BEFORE}" \
            --threshold "${ANALYZE_THRESHOLD}" \
            --pattern "${ANALYZE_PATTERN}" \
            $READABILITY_OPTIONS

      - name: Upload Results
        uses: actions/upload-artifact@v4
        with:
          name: output
          path: out
          if-no-files-found: error
          retention-days: 7

  # Copies the `output` artifact to Google Drive with rclone. Runs only for
  # scheduled runs or when a dispatch supplied `upload_folder`.
  upload:
    if: inputs.upload_folder || github.event_name == 'schedule'
    needs:
      - analyze
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: output
          # A single named artifact is extracted directly into `path`, so
          # recreate the `out/` directory that the Upload step copies from.
          path: out

      - name: Install Rclone
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends rclone

      - name: Configure Rclone
        env:
          GOOGLE_DRIVE_SERVICE_ACCOUNT: '${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT }}'
          GOOGLE_DRIVE_FOLDER_ID: '${{ secrets.GOOGLE_DRIVE_FOLDER_ID }}'
        # The service-account secret is stored base64-encoded; decode it to a
        # key file and write an rclone remote that points at that same file.
        run: |
          echo "${GOOGLE_DRIVE_SERVICE_ACCOUNT}" | base64 --decode > gdrive-service-account.json
          (
            echo '[gdrive-task-sheets]'
            echo 'type = drive'
            echo 'scope = drive.file'
            echo 'service_account_file = ./gdrive-service-account.json'
            echo "root_folder_id = ${GOOGLE_DRIVE_FOLDER_ID}"
          ) > rclone.conf

      - name: Upload
        env:
          UPLOAD_FOLDER: '${{ inputs.upload_folder }}'
        # Use the requested folder name; fall back to a dated name only when
        # none was given (e.g. scheduled runs). rclone does not look for a
        # config file in the working directory, so pass it explicitly.
        run: |
          FOLDER_NAME="${UPLOAD_FOLDER}"
          if [ -z "${FOLDER_NAME}" ]; then
            FOLDER_NAME="Scanner-sheets-$(date +'%Y-%m-%d')"
          fi
          rclone --config rclone.conf copy out "gdrive-task-sheets:${FOLDER_NAME}"