@@ -1,3 +1,4 @@
+import time
 import glob
 import json
 import os.path
@@ -8,13 +9,41 @@
 import duckdb
 
 
-def advice(prefix, crawl):
+def index_download_advice(prefix, crawl):
     print('Do you need to download this index?')
     print(f' mkdir -p {prefix}/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
     print(f' cd {prefix}/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
     print(f' aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/ .')
 
 
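+# print each row of a result as a CDXJ index line: SURT key, 14-digit timestamp, then a JSON dict of capture fields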
+def print_row_as_cdxj(row):
+    df = row.fetchdf()
+    for ro in df.itertuples(index=False):
+        d = ro._asdict()
+        cdxjd = {
+            'url': d['url'],
+            'mime': d['content_mime_type'],
+            'status': str(d['fetch_status']),
+            'digest': 'sha1:' + d['content_digest'],
+            'length': str(d['warc_record_length']),
+            'offset': str(d['warc_record_offset']),
+            'filename': d['warc_filename'],
+        }
+
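+        # fetch_time is a datetime; drop '-', 'T', ':', spaces, and 'Z'/'+0000' to get the 14-digit CDX-style timestamp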
+        timestamp = d['fetch_time'].isoformat(sep='T')
+        timestamp = timestamp.translate(str.maketrans('', '', '-T :Z')).replace('+0000', '')
+
+        print(d['url_surtkey'], timestamp, json.dumps(cdxjd))
+
+
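+# print every column of a (one-row) result as indented key/value lines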
+def print_row_as_kv_list(row):
+    df = row.fetchdf()
+    for ro in df.itertuples(index=False):
+        d = ro._asdict()
+        for k, v in d.items():
+            print(' ', k, v)
+
+
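+# the strategies get_files() knows for locating the index parquet files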
 all_algos = ('s3_glob', 'local_files', 'ccf_local_files', 'cloudfront_glob', 'cloudfront')
 
 
@@ -28,12 +57,12 @@ def get_files(algo, crawl):
         files = glob.glob(files)
         # did we already download? we expect ~300 files of about a gigabyte each
         if len(files) < 250:
-            advice('~', crawl)
+            index_download_advice('~', crawl)
             exit(1)
     elif algo == 'ccf_local_files':
         files = glob.glob(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
         if len(files) < 250:
-            advice('/home/cc-pds', crawl)
+            index_download_advice('/home/cc-pds', crawl)
             exit(1)
     elif algo == 'cloudfront_glob':
         # duckdb can't glob this, same reason as s3_glob above
@@ -54,12 +83,44 @@ def get_files(algo, crawl):
 
 def main(algo, crawl):
     files = get_files(algo, crawl)
-    ccindex = duckdb.read_parquet(files, hive_partitioning=True)
+    retries_left = 100
+
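+    # opening the remote parquet files can fail transiently (HTTP 403s, truncated responses), so retry for a while before giving up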
+    while True:
+        try:
+            ccindex = duckdb.read_parquet(files, hive_partitioning=True)
+            break
+        except (duckdb.HTTPException, duckdb.InvalidInputException) as e:
+            # read_parquet exception seen: HTTPException("HTTP Error: HTTP GET error on 'https://...' (HTTP 403)")
+            # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
+            print('read_parquet exception seen:', repr(e), file=sys.stderr)
+            if retries_left:
+                print('sleeping for 60s', file=sys.stderr)
+                time.sleep(60)
+                retries_left -= 1
+            else:
+                raise
+
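+    # progress bar for the long scans; http_retries also lets duckdb's httpfs retry flaky requests on its own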
+    duckdb.sql('SET enable_progress_bar = true;')
+    duckdb.sql('SET http_retries = 100;')
+    #duckdb.sql("SET enable_http_logging = true;SET http_logging_output = 'duck.http.log'")
 
     print('total records for crawl:', crawl)
-    print(duckdb.sql('SELECT COUNT(*) FROM ccindex;'))
-
-    sq2 = '''
+    retries_left = 100
+    while True:
+        try:
+            print(duckdb.sql('SELECT COUNT(*) FROM ccindex;'))
+            break
+        except duckdb.InvalidInputException as e:
+            # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
+            print('duckdb exception seen:', repr(e), file=sys.stderr)
+            if retries_left:
+                print('sleeping for 10s', file=sys.stderr)
+                time.sleep(10)
+                retries_left -= 1
+            else:
+                raise
+
+    sq2 = f'''
 select
 *
 from ccindex
@@ -73,7 +134,19 @@ def main(algo, crawl):
 
     row2 = duckdb.sql(sq2)
     print('our one row')
-    row2.show()
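+    # this loop draws down whatever is left of the retries_left budget from the COUNT above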
+    while True:
+        try:
+            row2.show()
+            break
+        except duckdb.InvalidInputException as e:
+            # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
+            print('duckdb exception seen:', repr(e), file=sys.stderr)
+            if retries_left:
+                print('sleeping for 10s', file=sys.stderr)
+                time.sleep(10)
+                retries_left -= 1
+            else:
+                raise
 
     print('writing our one row to a local parquet file, whirlwind.parquet')
     row2.write_parquet('whirlwind.parquet')
@@ -89,29 +162,11 @@ def main(algo, crawl):
     row3.show()
 
     print('complete row:')
-    df = row3.fetchdf()
-    for row in df.itertuples(index=False):
-        d = row._asdict()
-        for k, v in d.items():
-            print(' ', k, v)
+    print_row_as_kv_list(row3)
     print('')
 
     print('equivalent to cdxj:')
-
-    cdxjd = {
-        'url': d['url'],
-        'mime': d['content_mime_type'],
-        'status': str(d['fetch_status']),
-        'digest': 'sha1:' + d['content_digest'],
-        'length': str(d['warc_record_length']),
-        'offset': str(d['warc_record_offset']),
-        'filename': d['warc_filename'],
-    }
-
-    timestamp = d['fetch_time'].isoformat(sep='T')
-    timestamp = timestamp.translate(str.maketrans('', '', '-T :Z')).replace('+0000', '')
-
-    print(d['url_surtkey'], timestamp, json.dumps(cdxjd))
+    print_row_as_cdxj(row3)
 
 
 if __name__ == '__main__':