Skip to content

Commit e8c603c

Browse files
authored
Handle binary/octet-stream content type (#190)
Treat `binary/octet-stream` as a generic media type, just like `application/octet-stream`, when trying to determine if content is not HTML. Even though `binary/octet-stream` is not a registered IANA media type, it turns out some AWS SDKs use it when uploading files to S3, so it’s not uncommon.
1 parent 46aecd6 commit e8c603c

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

docs/source/release-history.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Release History
55
In Development
66
--------------
77

8-
n/a
8+
- Treat `binary/octet-stream` as a generic media type, just like `application/octet-stream`, when trying to determine if content is not HTML. Even though `binary/octet-stream` is not a registered IANA media type, it turns out some AWS SDKs use it when uploading files to S3, so it’s somewhat common.
99

1010

1111
Version 0.1.4 (2024-01-01)

web_monitoring_diff/content_type.py

+2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
# Matches Content Types that *could* be acceptable for diffing as HTML
3939
UNKNOWN_CONTENT_TYPE_PATTERN = re.compile(r'^(%s)$' % '|'.join((
4040
r'application/octet-stream',
41+
r'binary/octet-stream',
4142
r'application/x-download',
4243
r'text/.+'
4344
)))
@@ -70,6 +71,7 @@ def is_not_html(text, headers=None, check_options='normal'):
7071
- `nosniff` uses the `Content-Type` header but does not sniff.
7172
- `ignore` doesn’t do any checking at all.
7273
"""
74+
print(f'#is_not_html: check_options="{check_options}", headers={headers}, text={text[:500]}')
7375
if headers and (check_options == 'normal' or check_options == 'nosniff'):
7476
content_type = headers.get('Content-Type', '').split(';', 1)[0].strip()
7577
if content_type and VALID_CONTENT_TYPE_PATTERN.match(content_type):

web_monitoring_diff/tests/test_html_diff_validity.py

+8
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,14 @@ def test_html_diff_render_should_not_check_content_type_header_if_header_is_malf
198198
b_headers={'Content-Type': 'text/html'})
199199

200200

201+
def test_html_diff_render_should_not_check_content_type_header_if_header_is_generic():
202+
html_diff_render(
203+
'<p>Just a little HTML</p>',
204+
'<p>Just some HTML</p>',
205+
a_headers={'Content-Type': 'binary/octet-stream'},
206+
b_headers={'Content-Type': 'application/x-download'})
207+
208+
201209
def test_html_diff_render_should_not_check_content_type_header_if_content_type_options_is_nocheck():
202210
html_diff_render(
203211
'<p>Just a little HTML</p>',

0 commit comments

Comments
 (0)