Skip to content

Commit 46601e5

Browse files
authored
Don't raise on *archived* rate limit errors (#159)
A memento can be an archive of an old rate limit error (status code 429), and in our feverish run to handle rate limit errors better at the end of 2023, we caused `WaybackSession.send()` to raise exceptions for both real rate limits *and* archived ones. However, an archived one might be the actual memento that you were looking for, and should have been exempted from raising. This solves the issue by simply checking whether a response is a memento and returning it immediately without doing any other checks, since the *effective* status code for a memento is always 200. (Checking various attributes of a memento is complicated, so it’s better to just return mementos right away rather than remembering to make complex exceptions in all the places where various response attributes have to be treated differently for mementos.) Fixes #158.
1 parent 477c444 commit 46601e5

File tree

3 files changed

+128
-5
lines changed

3 files changed

+128
-5
lines changed

src/wayback/_client.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,11 @@ def send(self, request: requests.PreparedRequest, **kwargs):
455455
response = super().send(request, **kwargs)
456456
retry_delay = self.get_retry_delay(retries, response)
457457

458-
if retries >= maximum or not self.should_retry(response):
458+
if is_memento_response(response):
459+
# Mementos are necessarily successful responses, so just
460+
# return them without any other checks.
461+
return response
462+
elif retries >= maximum or not self.should_retry(response):
459463
if response.status_code == 429:
460464
read_and_close(response)
461465
raise RateLimitError(response, retry_delay)
@@ -498,10 +502,6 @@ def request(self, method, url, **kwargs):
498502
return super().request(method, url, **kwargs)
499503

500504
def should_retry(self, response):
501-
# A memento may actually be a capture of an error, so don't retry it :P
502-
if is_memento_response(response):
503-
return False
504-
505505
return response.status_code in self.retryable_statuses
506506

507507
def should_retry_error(self, error):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept-Encoding:
6+
- gzip, deflate
7+
User-Agent:
8+
- wayback/0.4.5.dev10+gb7a16cd.d20231218 (+https://github.com/edgi-govdata-archiving/wayback)
9+
method: GET
10+
uri: https://web.archive.org/web/20150129034904id_/http://www.reddit.com/r/PokemonGiveaway
11+
response:
12+
body:
13+
string: "\n<!doctype html>\n<html>\n <head>\n <title>Too Many Requests</title>\n
14+
\ <style>\n body {\n font: small verdana, arial, helvetica,
15+
sans-serif;\n width: 600px;\n margin: 0 auto;\n }\n\n
16+
\ h1 {\n height: 40px;\n background: transparent url(//www.redditstatic.com/reddit.com.header.png)
17+
no-repeat scroll top right;\n }\n </style>\n </head>\n <body>\n
18+
\ <h1>whoa there, pardner!</h1>\n \n\n\n<p>we're sorry, but you appear
19+
to be a bot and we've seen too many requests\nfrom you lately. we enforce
20+
a hard speed limit on requests that appear to come\nfrom bots to prevent abuse.</p>\n\n<p>if
21+
you are not a bot but are spoofing one via your browser's user agent\nstring:
22+
please change your user agent string to avoid seeing this message\nagain.</p>\n\n<p>please
23+
wait 6 second(s) and try again.</p>\n\n <p>as a reminder to developers,
24+
we recommend that clients make no\n more than <a href=\"http://github.com/reddit/reddit/wiki/API\">one\n
25+
\ request every two seconds</a> to avoid seeing this message.</p>\n </body>\n</html>\n"
26+
headers:
27+
Connection:
28+
- keep-alive
29+
Content-Type:
30+
- text/html; charset=UTF-8
31+
Date:
32+
- Thu, 01 Feb 2024 18:20:31 GMT
33+
Permissions-Policy:
34+
- interest-cohort=()
35+
Referrer-Policy:
36+
- no-referrer-when-downgrade
37+
Server:
38+
- nginx
39+
Transfer-Encoding:
40+
- chunked
41+
X-NA:
42+
- '0'
43+
X-NID:
44+
- '-'
45+
X-Page-Cache:
46+
- MISS
47+
X-RL:
48+
- '1'
49+
X-location:
50+
- All
51+
cache-control:
52+
- max-age=1800
53+
content-security-policy:
54+
- 'default-src ''self'' ''unsafe-eval'' ''unsafe-inline'' data: blob: archive.org
55+
web.archive.org web-static.archive.org wayback-api.archive.org analytics.archive.org
56+
pragma.archivelab.org'
57+
link:
58+
- <http://www.reddit.com/r/PokemonGiveaway>; rel="original", <https://web.archive.org/web/timemap/link/http://www.reddit.com/r/PokemonGiveaway>;
59+
rel="timemap"; type="application/link-format", <https://web.archive.org/web/http://www.reddit.com/r/PokemonGiveaway>;
60+
rel="timegate", <https://web.archive.org/web/20120626000027/http://www.reddit.com:80/r/Pokemongiveaway>;
61+
rel="first memento"; datetime="Tue, 26 Jun 2012 00:00:27 GMT", <https://web.archive.org/web/20141209120144/http://www.reddit.com:80/r/Pokemongiveaway/>;
62+
rel="prev memento"; datetime="Tue, 09 Dec 2014 12:01:44 GMT", <https://web.archive.org/web/20150129034904/http://www.reddit.com/r/PokemonGiveaway>;
63+
rel="memento"; datetime="Thu, 29 Jan 2015 03:49:04 GMT", <https://web.archive.org/web/20150208032710/http://www.reddit.com:80/r/Pokemongiveaway>;
64+
rel="next memento"; datetime="Sun, 08 Feb 2015 03:27:10 GMT", <https://web.archive.org/web/20231020104350/https://www.reddit.com/r/Pokemongiveaway/>;
65+
rel="last memento"; datetime="Fri, 20 Oct 2023 10:43:50 GMT"
66+
memento-datetime:
67+
- Thu, 29 Jan 2015 03:49:04 GMT
68+
server-timing:
69+
- exclusion.robots;dur=1.346979, exclusion.robots.policy;dur=1.258865, cdx.remote;dur=0.566878,
70+
esindex;dur=0.070942, LoadShardBlock;dur=668.835646, PetaboxLoader3.datanode;dur=362.949615,
71+
PetaboxLoader3.resolve;dur=109.386489, load_resource;dur=78.884440
72+
x-app-server:
73+
- wwwb-app220
74+
x-archive-orig-cache-control:
75+
- no-cache
76+
x-archive-orig-cf-cache-status:
77+
- EXPIRED
78+
x-archive-orig-cf-ray:
79+
- 1b02752d98b0012c-SJC
80+
x-archive-orig-connection:
81+
- close
82+
x-archive-orig-content-length:
83+
- '-1'
84+
x-archive-orig-date:
85+
- Thu, 29 Jan 2015 03:49:04 GMT
86+
x-archive-orig-edge-control:
87+
- bypass-cache
88+
x-archive-orig-retry-after:
89+
- '6'
90+
x-archive-orig-server:
91+
- cloudflare-nginx
92+
x-archive-orig-vary:
93+
- accept-encoding
94+
x-archive-orig-x-content-type-options:
95+
- nosniff
96+
x-archive-orig-x-frame-options:
97+
- SAMEORIGIN
98+
x-archive-orig-x-moose:
99+
- majestic
100+
x-archive-orig-x-ua-compatible:
101+
- IE=edge
102+
x-archive-orig-x-xss-protection:
103+
- 1; mode=block
104+
x-archive-src:
105+
- liveweb-20150129011011/live-20150129000440-wwwb-app16.us.archive.org.warc.gz
106+
x-tr:
107+
- '1820'
108+
x-ts:
109+
- '429'
110+
status:
111+
code: 429
112+
message: Too Many Requests
113+
version: 1

src/wayback/tests/test_client.py

+10
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,16 @@ def test_get_memento_raises_no_memento_error():
609609
'20170929002712')
610610

611611

612+
@ia_vcr.use_cassette()
613+
def test_get_memento_works_on_archived_rate_limit_responses():
614+
with WaybackClient() as client:
615+
memento = client.get_memento('http://www.reddit.com/r/PokemonGiveaway',
616+
timestamp=datetime(2015, 1, 29, 3, 49, 4),
617+
exact=True)
618+
assert 'http://www.reddit.com/r/PokemonGiveaway' == memento.url
619+
assert 429 == memento.status_code
620+
621+
612622
@ia_vcr.use_cassette()
613623
def test_get_memento_follows_historical_redirects():
614624
with WaybackClient() as client:

0 commit comments

Comments
 (0)