Skip to content

Commit 2ea7ce1

Browse files
committed
fix: graceful handling of Scholar rate-limit + skip timestamp-only commits
1 parent 0a9c046 commit 2ea7ce1

3 files changed

Lines changed: 74 additions & 22 deletions

File tree

.github/workflows/update-scholar.yml

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,44 @@ jobs:
1818
runs-on: ubuntu-latest
1919
steps:
2020
- name: Checkout
21-
uses: actions/checkout@v4
21+
uses: actions/checkout@v5
2222

2323
- name: Set up Python
24-
uses: actions/setup-python@v5
24+
uses: actions/setup-python@v6
2525
with:
26-
python-version: '3.11'
26+
python-version: '3.12'
2727

2828
- name: Fetch scholar stats
29+
id: fetch
30+
continue-on-error: true
2931
run: python update-scholar.py
3032

31-
- name: Commit & push if changed
33+
- name: Exit gracefully if fetch failed
34+
if: steps.fetch.outcome == 'failure'
35+
run: |
36+
echo "::warning::Scholar fetch failed (likely rate-limited by Google). Skipping commit."
37+
exit 0
38+
39+
- name: Commit & push if stats changed
40+
if: steps.fetch.outcome == 'success'
3241
run: |
3342
git config user.name "github-actions[bot]"
3443
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
35-
if git diff --quiet scholar.json; then
36-
echo "No changes to scholar.json"
44+
45+
# Compare every field except updated_at* and "source"; ignore timestamp-only changes
46+
# so we don't spam commits when nothing actually changed.
47+
if ! git diff --quiet scholar.json; then
48+
# Check whether meaningful fields (excluding updated_at*) changed
49+
changed=$(git diff scholar.json | grep -E '^\+[^+]' | grep -v 'updated_at' | grep -v '"source"' || true)
50+
if [ -n "$changed" ]; then
51+
git add scholar.json
52+
git commit -m "chore: update scholar stats [skip ci]"
53+
git push
54+
echo "✅ Scholar stats updated."
55+
else
56+
echo "ℹ️ Only timestamp changed — skipping commit."
57+
git checkout scholar.json
58+
fi
3759
else
38-
git add scholar.json
39-
git commit -m "chore: update scholar stats [skip ci]"
40-
git push
60+
echo "ℹ️ No changes to scholar.json."
4161
fi

scholar.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
"user": "-XWPfk4AAAAJ",
33
"name": "Linzongying 林宗莹",
44
"paper_count": 7,
5-
"citations_all": 191,
6-
"citations_recent": 191,
5+
"citations_all": 192,
6+
"citations_recent": 192,
77
"h_index_all": 5,
88
"h_index_recent": 5,
99
"i10_index_all": 3,
1010
"i10_index_recent": 3,
11-
"updated_at": "2026-04-24T06:59:07Z",
12-
"updated_at_local": "2026-04-24 14:59 UTC",
11+
"updated_at": "2026-04-25T04:27:13Z",
12+
"updated_at_local": "2026-04-25 12:27 UTC",
1313
"source": "https://scholar.google.com/citations?user=-XWPfk4AAAAJ&hl=en"
1414
}

update-scholar.py

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,22 @@
22
"""
33
Fetch latest Google Scholar stats and save to scholar.json.
44
Runs manually or via GitHub Actions (see .github/workflows/update-scholar.yml).
5+
6+
Exit codes:
7+
0 — success (or Google temporarily unreachable; kept existing JSON)
8+
1 — fatal error (malformed local scholar.json, disk error, etc.)
9+
10+
This script is *tolerant* of transient failures: if Google returns a
11+
CAPTCHA / rate-limit page, we keep the previous scholar.json untouched
12+
and exit 0 so the GitHub Action doesn't falsely report failure.
513
"""
6-
import urllib.request
7-
import re
814
import json
15+
import os
16+
import re
917
import ssl
10-
import time
1118
import sys
19+
import time
20+
import urllib.request
1221

1322
USER = '-XWPfk4AAAAJ'
1423
URL = f'https://scholar.google.com/citations?user={USER}&hl=en'
@@ -21,20 +30,31 @@ def fetch_scholar():
2130
ctx.verify_mode = ssl.CERT_NONE
2231

2332
req = urllib.request.Request(URL, headers={
24-
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
25-
'AppleWebKit/537.36 (KHTML, like Gecko) '
26-
'Chrome/120.0 Safari/537.36',
33+
'User-Agent': (
34+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
35+
'AppleWebKit/537.36 (KHTML, like Gecko) '
36+
'Chrome/120.0 Safari/537.36'
37+
),
2738
'Accept-Language': 'en-US,en;q=0.9',
39+
'Accept': 'text/html,application/xhtml+xml',
2840
})
2941

3042
with urllib.request.urlopen(req, context=ctx, timeout=30) as r:
3143
html = r.read().decode('utf-8', errors='ignore')
3244

45+
# Detect CAPTCHA / unusual traffic block pages
46+
low = html.lower()
47+
if 'unusual traffic' in low or 'captcha' in low or 'sorry' in low[:2000]:
48+
raise RuntimeError('Google Scholar is rate-limiting this request (CAPTCHA page).')
49+
3350
# Six number cells in the stats table:
3451
# citations(all), citations(since X), h(all), h(since X), i10(all), i10(since X)
3552
nums = re.findall(r'gsc_rsb_std">(\d+)', html)
3653
if len(nums) < 6:
37-
raise RuntimeError(f'Could not parse stats. Found: {nums}')
54+
raise RuntimeError(
55+
f'Could not parse stats table. Found {len(nums)} numbers '
56+
f'(expected 6). HTML length: {len(html)}'
57+
)
3858

3959
# Count publication rows
4060
paper_count = len(re.findall(r'class="gsc_a_tr"', html))
@@ -62,12 +82,24 @@ def fetch_scholar():
6282
def main():
6383
try:
6484
data = fetch_scholar()
85+
except Exception as e:
86+
print(f'[WARN] Scholar fetch failed: {e}', file=sys.stderr)
87+
# Graceful exit — keep existing scholar.json intact.
88+
# Because we exit 0, the workflow's commit step still runs but finds no diff to commit.
89+
if os.path.exists(OUT):
90+
print(f'[INFO] Keeping existing {OUT} unchanged.')
91+
return 0
92+
# No existing file — that's a real problem.
93+
print(f'[ERROR] No existing {OUT} to fall back to.', file=sys.stderr)
94+
return 1
95+
96+
try:
6597
with open(OUT, 'w', encoding='utf-8') as f:
6698
json.dump(data, f, indent=2, ensure_ascii=False)
6799
print(json.dumps(data, indent=2, ensure_ascii=False))
68100
return 0
69-
except Exception as e:
70-
print(f'[ERROR] {e}', file=sys.stderr)
101+
except OSError as e:
102+
print(f'[ERROR] Could not write {OUT}: {e}', file=sys.stderr)
71103
return 1
72104

73105

0 commit comments

Comments
 (0)