Analyze #10
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity, shared environment, and triggers.
name: Analyze

# Workflow-level environment, exported to every step of every job.
env:
  # NOTE(review): not referenced by any step in this workflow file;
  # presumably read by the scripts it runs -- confirm before removing.
  TEST_PATTERN: '*justice.gov/*'

on:
  # Run on every pull request, with no branch or path filtering.
  pull_request: {}

  schedule:
    # Every Tuesday at 16:40 UTC (09:40 PDT / 08:40 PST).
    - cron: '40 16 * * 2'

  # Manual runs. Every input is optional; the analyze job substitutes its
  # DEFAULT_* values for any of threshold/pattern/from/to left empty.
  workflow_dispatch:
    inputs:
      threshold:
        description: 'Threshold'
        required: false
        type: string

      pattern:
        description: 'Pattern'
        required: false
        type: string
        default: ''

      from:
        description: 'From Time'
        required: false
        type: string
        default: ''

      to:
        description: 'To Time'
        required: false
        type: string
        default: ''

      readability:
        description: 'Use Readability'
        required: false
        type: boolean
        default: true

      # When non-empty, the upload job runs and uses this as the name of the
      # destination folder.
      upload_folder:
        description: 'Save to GDrive with this folder name'
        required: false
        type: string
        default: ''
jobs:
  # Runs generate_task_sheets.py and publishes its `out` directory as the
  # `output` artifact.
  analyze:
    env:
      # Fallbacks used when the matching workflow_dispatch input is empty or
      # the run was not started manually. Scheduled runs use pattern '*';
      # all other runs are limited to '*justice.gov/*'.
      DEFAULT_PATTERN: ${{ github.event_name == 'schedule' && '*' || '*justice.gov/*' }}
      DEFAULT_THRESHOLD: '0.25'
      # NOTE(review): units of the from/to values are not visible here
      # (presumably hours before now) -- confirm in generate_task_sheets.py.
      DEFAULT_FROM: ${{ github.event_name == 'schedule' && '268' || '240' }}
      DEFAULT_TO: '0'
      # Readability is on unless a manual run explicitly unchecks it. The
      # event check matters: outside workflow_dispatch `inputs.readability`
      # is null, and the loose `==` coerces both null and false to 0, which
      # would otherwise disable Readability on scheduled and PR runs.
      USE_READABILITY: >-
        ${{ github.event_name == 'workflow_dispatch'
        && inputs.readability == false
        && 'false' || 'true' }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'
          cache: pip

      - name: Install System Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            gcc g++ pkg-config libxml2-dev libxslt-dev libz-dev

      - name: Install Python Dependencies
        run: pip install -r requirements.txt

      - name: Download NLTK Corpora
        run: |
          python -m nltk.downloader stopwords

      # The three readability-server steps are skipped entirely when
      # Readability is disabled.
      - uses: actions/setup-node@v4
        if: env.USE_READABILITY == 'true'
        with:
          node-version: '22'
          cache: 'npm'
          cache-dependency-path: readability-server/package-lock.json

      - name: Install readability-server dependencies
        if: env.USE_READABILITY == 'true'
        run: |
          cd readability-server
          npm ci

      - name: Run readability-server
        if: env.USE_READABILITY == 'true'
        # Started in the background so later steps can use it.
        # NOTE(review): nothing waits for the server to be ready before the
        # analysis starts -- confirm that is acceptable.
        run: |
          cd readability-server
          npm start &

      - name: Analyze!
        env:
          WEB_MONITORING_DB_URL: '${{ secrets.WEB_MONITORING_DB_URL }}'
          READABILITY_OPTIONS: ${{ env.USE_READABILITY == 'false' && '--skip-readability' || '' }}
          # User-controlled inputs are handed to the shell as environment
          # variables instead of being pasted into the script text, so quotes
          # or shell metacharacters in them cannot alter the command.
          ANALYZE_AFTER: ${{ inputs.from || env.DEFAULT_FROM }}
          ANALYZE_BEFORE: ${{ inputs.to || env.DEFAULT_TO }}
          ANALYZE_THRESHOLD: ${{ inputs.threshold || env.DEFAULT_THRESHOLD }}
          ANALYZE_PATTERN: ${{ inputs.pattern || env.DEFAULT_PATTERN }}
        # $READABILITY_OPTIONS is deliberately unquoted: when empty it must
        # expand to no argument at all rather than to an empty string.
        run: |
          echo "Readability options: '${READABILITY_OPTIONS}'"
          python generate_task_sheets.py \
            --output out \
            --after "${ANALYZE_AFTER}" \
            --before "${ANALYZE_BEFORE}" \
            --threshold "${ANALYZE_THRESHOLD}" \
            --pattern "${ANALYZE_PATTERN}" \
            $READABILITY_OPTIONS

      - name: Upload Results
        uses: actions/upload-artifact@v4
        with:
          name: output
          path: out
          if-no-files-found: error
          retention-days: 7

  # Copies the `output` artifact to Google Drive with rclone. Runs on the
  # schedule, or on any run that supplies an upload folder name.
  upload:
    if: inputs.upload_folder || github.event_name == 'schedule'
    needs:
      - analyze
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: output
          # Extract into `out/`, the directory the Upload step copies from.
          # Without this the files land in the workspace root.
          path: out

      - name: Install Rclone
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends rclone

      - name: Configure Rclone
        env:
          # Secrets reach the script through the environment rather than
          # being interpolated into the script text.
          GDRIVE_SERVICE_ACCOUNT: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT }}
          GDRIVE_FOLDER_ID: ${{ secrets.GOOGLE_DRIVE_FOLDER_ID }}
        # The secret is decoded from base64 into the credentials file; the
        # service_account_file entry must name that exact file.
        run: |
          printf '%s' "${GDRIVE_SERVICE_ACCOUNT}" | base64 --decode > gdrive-service-account.json
          (
            echo '[gdrive-task-sheets]'
            echo 'type = drive'
            echo 'scope = drive.file'
            echo 'service_account_file = ./gdrive-service-account.json'
            echo "root_folder_id = ${GDRIVE_FOLDER_ID}"
          ) > rclone.conf

      - name: Upload
        env:
          UPLOAD_FOLDER: ${{ inputs.upload_folder }}
        # Falls back to a dated folder name only when none was supplied
        # (e.g. scheduled runs). rclone does not look for a config file in
        # the working directory, so the generated one is passed explicitly.
        run: |
          FOLDER_NAME="${UPLOAD_FOLDER}"
          if [ -z "${FOLDER_NAME}" ]; then
            FOLDER_NAME="Scanner-sheets-$(date +'%Y-%m-%d')"
          fi
          rclone --config rclone.conf copy out "gdrive-task-sheets:${FOLDER_NAME}"