Skip to content

Commit 2310bd8

Browse files
gsnedders authored and jgraham committed
Make scanning for meta encoding much quicker
Previously, this code tried, at every position in the data, to match against a set of strings that all begin with "<"; now we jump forward to each "<" and only compare there. This also alters the jumpTo implementation to avoid computing a (perhaps long) slice on every call, which had made repeated calls O(n^2).
1 parent 4b8cabf commit 2310bd8

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

html5lib/_inputstream.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -668,15 +668,11 @@ def matchBytes(self, bytes):
668668
def jumpTo(self, bytes):
669669
"""Look for the next sequence of bytes matching a given sequence. If
670670
a match is found advance the position to the last byte of the match"""
671-
newPosition = self[self.position:].find(bytes)
672-
if newPosition > -1:
673-
# XXX: This is ugly, but I can't see a nicer way to fix this.
674-
if self._position == -1:
675-
self._position = 0
676-
self._position += (newPosition + len(bytes) - 1)
677-
return True
678-
else:
671+
try:
672+
self._position = self.index(bytes, self.position) + len(bytes) - 1
673+
except ValueError:
679674
raise StopIteration
675+
return True
680676

681677

682678
class EncodingParser(object):
@@ -697,6 +693,10 @@ def getEncoding(self):
697693
(b"<", self.handlePossibleStartTag))
698694
for _ in self.data:
699695
keepParsing = True
696+
try:
697+
self.data.jumpTo(b"<")
698+
except StopIteration:
699+
break
700700
for key, method in methodDispatch:
701701
if self.data.matchBytes(key):
702702
try:

0 commit comments

Comments
 (0)