Make scanning for meta encoding much quicker

gsnedders · jgraham · commit 2310bd8b610b · 2020-04-14T16:15:26.000+01:00
Previously, this code tried to match everything with strings
beginning with "<"; now we jump forward to each "<" and compare
there. This also alters the jumpTo implementation to avoid
computing a (perhaps long) slice, which made repeated calls O(n^2).
diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
@@ -668,15 +668,11 @@ def matchBytes(self, bytes):
     def jumpTo(self, bytes):
         """Look for the next sequence of bytes matching a given sequence. If
         a match is found advance the position to the last byte of the match"""
-        newPosition = self[self.position:].find(bytes)
-        if newPosition > -1:
-            # XXX: This is ugly, but I can't see a nicer way to fix this.
-            if self._position == -1:
-                self._position = 0
-            self._position += (newPosition + len(bytes) - 1)
-            return True
-        else:
+        try:
+            self._position = self.index(bytes, self.position) + len(bytes) - 1
+        except ValueError:
             raise StopIteration
+        return True
 
 
 class EncodingParser(object):
@@ -697,6 +693,10 @@ def getEncoding(self):
             (b"<", self.handlePossibleStartTag))
         for _ in self.data:
             keepParsing = True
+            try:
+                self.data.jumpTo(b"<")
+            except StopIteration:
+                break
             for key, method in methodDispatch:
                 if self.data.matchBytes(key):
                     try: