22"""
33Fetch latest Google Scholar stats and save to scholar.json.
44Runs manually or via GitHub Actions (see .github/workflows/update-scholar.yml).
5+
6+ Exit codes:
7+ 0 — success (or the fetch failed and the existing scholar.json was kept)
8+ 1 — fatal error (fetch failed with no existing scholar.json to fall back to, or scholar.json could not be written)
9+
10+ This script is *tolerant* of transient failures: if Google returns a
11+ CAPTCHA / rate-limit page, we keep the previous scholar.json untouched
12+ and exit 0 so the GitHub Action doesn't falsely report failure.
513"""
6- import urllib .request
7- import re
814import json
15+ import os
16+ import re
917import ssl
10- import time
1118import sys
19+ import time
20+ import urllib .request
1221
1322USER = '-XWPfk4AAAAJ'
1423URL = f'https://scholar.google.com/citations?user={ USER } &hl=en'
@@ -21,20 +30,31 @@ def fetch_scholar():
2130 ctx .verify_mode = ssl .CERT_NONE
2231
2332 req = urllib .request .Request (URL , headers = {
24- 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
25- 'AppleWebKit/537.36 (KHTML, like Gecko) '
26- 'Chrome/120.0 Safari/537.36' ,
33+ 'User-Agent' : (
34+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
35+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
36+ 'Chrome/120.0 Safari/537.36'
37+ ),
2738 'Accept-Language' : 'en-US,en;q=0.9' ,
39+ 'Accept' : 'text/html,application/xhtml+xml' ,
2840 })
2941
3042 with urllib .request .urlopen (req , context = ctx , timeout = 30 ) as r :
3143 html = r .read ().decode ('utf-8' , errors = 'ignore' )
3244
45+ # Detect CAPTCHA / unusual traffic block pages
46+ low = html .lower ()
47+ if 'unusual traffic' in low or 'captcha' in low or 'sorry' in low [:2000 ]:
48+ raise RuntimeError ('Google Scholar is rate-limiting this request (CAPTCHA page).' )
49+
3350 # Six number cells in the stats table:
3451 # citations(all), citations(since X), h(all), h(since X), i10(all), i10(since X)
3552 nums = re .findall (r'gsc_rsb_std">(\d+)' , html )
3653 if len (nums ) < 6 :
37- raise RuntimeError (f'Could not parse stats. Found: { nums } ' )
54+ raise RuntimeError (
55+ f'Could not parse stats table. Found { len (nums )} numbers '
56+ f'(expected 6). HTML length: { len (html )} '
57+ )
3858
3959 # Count publication rows
4060 paper_count = len (re .findall (r'class="gsc_a_tr"' , html ))
@@ -62,12 +82,24 @@ def fetch_scholar():
6282def main ():
6383 try :
6484 data = fetch_scholar ()
85+ except Exception as e :
86+ print (f'[WARN] Scholar fetch failed: { e } ' , file = sys .stderr )
87+ # Graceful exit — keep existing scholar.json intact.
88+ # GitHub Action will skip the commit step.
89+ if os .path .exists (OUT ):
90+ print (f'[INFO] Keeping existing { OUT } unchanged.' )
91+ return 0
92+ # No existing file — that's a real problem.
93+ print (f'[ERROR] No existing { OUT } to fall back to.' , file = sys .stderr )
94+ return 1
95+
96+ try :
6597 with open (OUT , 'w' , encoding = 'utf-8' ) as f :
6698 json .dump (data , f , indent = 2 , ensure_ascii = False )
6799 print (json .dumps (data , indent = 2 , ensure_ascii = False ))
68100 return 0
69- except Exception as e :
70- print (f'[ERROR] { e } ' , file = sys .stderr )
101+ except OSError as e :
102+ print (f'[ERROR] Could not write { OUT } : { e } ' , file = sys .stderr )
71103 return 1
72104
73105
0 commit comments