bczsalba · leonard-IMBERT · Aug 23, 2023 · bczsalba · Sep 6, 2023 · bczsalba
diff --git a/pytermgui/markup/parsing.py b/pytermgui/markup/parsing.py
@@ -3,13 +3,12 @@
 from __future__ import annotations
 
 import json
-from typing import Callable, Iterator, Protocol, TypedDict
+from typing import Callable, Iterator, Protocol, TypedDict, List
 from warnings import filterwarnings, warn
 
 from ..colors import Color
 from ..exceptions import ColorSyntaxError, MarkupSyntaxError
-from ..regex import RE_ANSI_NEW as RE_ANSI
-from ..regex import RE_MACRO, RE_MARKUP, RE_POSITION
+from ..regex import RE_MACRO, RE_MARKUP
 from .style_maps import CLEARERS, REVERSE_CLEARERS, REVERSE_STYLES, STYLES
 from .tokens import (
     AliasToken,
@@ -189,120 +188,191 @@ def tokenize_markup(text: str) -> Iterator[Token]:
         yield PlainToken(text[cursor:length])
 
 
-def tokenize_ansi(  # pylint: disable=too-many-locals, too-many-branches, too-many-statements
-    text: str,
+def _tokenize_ansi_color(
+    params: List[str],
 ) -> Iterator[Token]:
-    """Converts some ANSI-coded text into a stream of tokens.
+    """Converts the parameters of a single SGR sequence into a stream of tokens.
 
     Args:
-        text: Any valid ANSI-coded text.
+        params: The parameters of one SGR sequence, already split on ";".
 
     Yields:
-        The generated tokens, in the order they occur within the text.
+        The generated tokens, in the order their parameters occur.
     """
+    state = None  # "COLOR" while the parts of a 38/48 color are collected
+    color_code = ""  # parts of that color seen so far, each followed by ";"
+    for part in params:
+        if state is None:
+            if part in REVERSE_STYLES:
+                yield StyleToken(REVERSE_STYLES[part])
+                continue
 
-    cursor = 0
+            if part in REVERSE_CLEARERS:
+                yield ClearToken(REVERSE_CLEARERS[part])
+                continue
 
-    for matchobj in RE_ANSI.finditer(text):
-        start, end = matchobj.span()
+            if part in ("38", "48"):
+                state = "COLOR"
+                color_code += part + ";"
+                continue
 
-        csi = matchobj.groups()[0:2]
-        link_osc = matchobj.groups()[2:4]
+            # Anything else must parse as a color on its own, or it is an error
+            try:
+                yield ColorToken(part, Color.parse(part, localize=False))
+                continue
 
-        if cursor < start:
-            yield PlainToken(text[cursor:start])
+            except ColorSyntaxError as exc:
+                raise ValueError(f"Could not parse color tag {part!r}.") from exc
 
-        if link_osc != (None, None):
-            cursor = end
-            uri, label = link_osc
+        if state != "COLOR":
+            continue
 
-            yield HLinkToken(uri)
-            yield PlainToken(label)
-            yield ClearToken("/~")
+        color_code += part + ";"
 
+        # An RGB color is only complete once all three channels were collected
+        if (
+            color_code.startswith(("38;2;", "48;2;"))
+            and len(color_code.split(";")) != 6
+        ):
             continue
 
-        full, content = csi
+        try:
+            code = color_code
 
-        cursor = end
+            if code.startswith(("38;2;", "48;2;", "38;5;", "48;5;")):
+                stripped = code[5:-1]  # drop the 5-character prefix and the last ";"
 
-        code = ""
+                if code.startswith("4"):  # 48;... codes get an "@" prefix
+                    stripped = "@" + stripped
 
-        # Position
-        posmatch = RE_POSITION.match(full)
+                code = stripped
 
-        if posmatch is not None:
-            ypos, xpos = posmatch.groups()
-            if not ypos and not xpos:
-                raise ValueError(
-                    f"Cannot parse cursor when no position is supplied. Match: {posmatch!r}"
-                )
+            yield ColorToken(code, Color.parse(code, localize=False))
 
-            yield CursorToken(content, int(ypos) or None, int(xpos) or None)
+        except ColorSyntaxError:  # not parsable yet: keep state and collected parts
             continue
 
-        parts = content.split(";")
-
         state = None
         color_code = ""
-        for part in parts:
-            if state is None:
-                if part in REVERSE_STYLES:
-                    yield StyleToken(REVERSE_STYLES[part])
-                    continue
 
-                if part in REVERSE_CLEARERS:
-                    yield ClearToken(REVERSE_CLEARERS[part])
-                    continue
 
-                if part in ("38", "48"):
-                    state = "COLOR"
-                    color_code += part + ";"
-                    continue
+ESC = "\x1b"  # escape character that opens every sequence recognized here
 
-                # standard colors
-                try:
-                    yield ColorToken(part, Color.parse(part, localize=False))
-                    continue
+CSI = "["  # after ESC: introduces a control sequence with ";"-separated params
+SGR = "m"  # final character of a control sequence carrying styles and colors
+CURSOR = "H"  # final character of a control sequence carrying a cursor position
 
-                except ColorSyntaxError as exc:
-                    raise ValueError(f"Could not parse color tag {part!r}.") from exc
+OSC = "]"  # after ESC: introduces an operating system command
+HYPERLINK = "8"  # character after OSC that marks the command as a hyperlink
 
-            if state != "COLOR":
-                continue
+ST = "\\"  # after ESC: string terminator that closes a hyperlink command
 
-            color_code += part + ";"
+SEP = ";"  # separates the parameters of a sequence
 
-            # Ignore incomplete RGB colors
-            if (
-                color_code.startswith(("38;2;", "48;2;"))
-                and len(color_code.split(";")) != 6
-            ):
+def tokenize_ansi(  # pylint: disable=too-many-locals, too-many-branches, too-many-statements
+    text: str,
+) -> Iterator[Token]:
+    """Converts some ANSI-coded text into a stream of tokens.
+
+    Args:
+        text: Any valid ANSI-coded text.
+
+    Yields:
+        The generated tokens, in the order they occur within the text.
+
+    Raises:
+        ValueError: Unknown escape introducer, malformed cursor or unparsable SGR code.
+    """
+
+    # Sequence being read (CSI, OSC or HYPERLINK); None while reading plain text.
+    cstate: str | None = None
+    # Parameters collected so far for the sequence being read.
+    params: list[str] = []
+    # Characters read since the last emitted token or parameter separator.
+    accumulator = ""
+    # True between an ESC and the character that follows it.
+    escaping = False
+
+    for char in text:
+        if char == ESC:
+            if cstate is None and accumulator:
+                yield PlainToken(accumulator)
+                accumulator = ""
+            escaping = True
+            continue
+
+        if escaping:
+            if char == CSI:
+                cstate = CSI
+                escaping = False
                 continue
 
-            try:
-                code = color_code
+            if char == OSC:
+                cstate = OSC
+                escaping = False
+                continue
 
-                if code.startswith(("38;2;", "48;2;", "38;5;", "48;5;")):
-                    stripped = code[5:-1]
+            if char == ST:
+                if cstate == HYPERLINK:
+                    params.append(accumulator)
 
-                    if code.startswith("4"):
-                        stripped = "@" + stripped
+                    # The URI is everything after the second separator, ";" included.
+                    if any(params[2:]):
+                        yield HLinkToken(SEP.join(params[2:]))
+                    else:
+                        yield ClearToken("/~")
+                # ST closes whichever sequence is open; unsupported ones are dropped.
+                params = []
+                accumulator = ""
+                cstate = None
+                escaping = False
+                continue
 
-                    code = stripped
+            # Nothing else is accepted directly after an ESC.
+            raise ValueError(f"Unknown escape character, got {char!r}")
 
-                yield ColorToken(code, Color.parse(code, localize=False))
 
-            except ColorSyntaxError:
+        if cstate == OSC:
+            if char == HYPERLINK and not accumulator:
+                cstate = HYPERLINK
                 continue
 
-            state = None
-            color_code = ""
+        if char == SEP and cstate in (HYPERLINK, CSI):
+            params.append(accumulator)
+            accumulator = ""
+            continue
+
+        if cstate == CSI:
+            if char == SGR:
+                params.append(accumulator)
+                accumulator = ""
+
+                yield from _tokenize_ansi_color(params)
+                params = []
+                cstate = None
+                continue
+
+            if char == CURSOR:
+                params.append(accumulator)
+                accumulator = ""
+
+                if len(params) != 2:
+                    raise ValueError(
+                        "Invalid number of params for cursor token. "
+                        f"Expected 2, got {params!r}"
+                    )
+
+                # An omitted coordinate arrives as an empty string; treat it like 0.
+                ypos, xpos = (int(param or 0) or None for param in params)
+                yield CursorToken(SEP.join(params), ypos, xpos)
+                params = []
+                cstate = None
+                continue
 
-    remaining = text[cursor:]
-    if len(remaining) > 0:
-        yield PlainToken(remaining)
+        accumulator += char
 
+    if accumulator:
+        yield PlainToken(accumulator)
 
 def eval_alias(text: str, context: ContextDict) -> str:
     """Evaluates a space-delimited string of alias tags into their underlying value.

diff --git a/pytermgui/regex.py b/pytermgui/regex.py
@@ -5,11 +5,9 @@
 from typing import Match
 
 RE_LINK = re.compile(r"(?:\x1b\]8;;([^\\]*)\x1b\\([^\\]*?)\x1b\]8;;\x1b\\)")
-RE_ANSI_NEW = re.compile(rf"(\x1b\[(.*?)[mH])|{RE_LINK.pattern}|(\x1b_G(.*?)\x1b\\)")
 RE_ANSI = re.compile(r"(?:\x1b\[(.*?)[mH])|(?:\x1b\](.*?)\x1b\\)|(?:\x1b_G(.*?)\x1b\\)")
 RE_MACRO = re.compile(r"(![a-z0-9_\-]+)(?:\(([\w\/\.?\-=:]+)\))?")
 RE_MARKUP = re.compile(r"((\\*)\[([^\[\]]*)\])")
-RE_POSITION = re.compile(r"\x1b\[(\d*?)(?:;(\d*))?H")
 RE_PIXEL_SIZE = re.compile(r"\x1b\[4;([\d]+);([\d]+)t")
 
 RE_256 = re.compile(r"^([\d]{1,3})$")

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -126,6 +126,12 @@ def test_parse(self):
             == "\x1b[38;5;141m\x1b[48;5;61m\x1b[1mHELLO\x1b[0m"
         ), repr(tim.parse("[141 @61 bold !upper]Hello"))
 
+    def test_mutiple_hypertext_closing_sequence(self):
+        for plain in tim.group_styles(  # one link, then two closing sequences in a row
+            "\x1b]8;;path.py\x1b\\inner\x1b]8;;\x1b\\\x1b]8;;\x1b\\outer"
+        ):
+            assert "\x1b]8;;\x1b\\" not in plain.plain, repr(plain)
+
 
 class TestFunctionality:
     def test_alias(self):