diff --git a/BetterMD/__init__.py b/BetterMD/__init__.py index 9e04a76..c82c49f 100644 --- a/BetterMD/__init__.py +++ b/BetterMD/__init__.py @@ -1,10 +1,25 @@ -import logging -from .elements import A, H1, H2, H3, H4, H5, H6, Head, OL, UL, LI, Text, Div, P, Span, Img, B, I, Br, Blockquote, Hr, Table, Tr, Td, Th, THead, TBody, Input, Code -from .html import CustomHTML -from .markdown import CustomMarkdown -from .rst import CustomRst +from .elements import * +from .parse import Collection, HTMLParser, MDParser, RSTParser +def from_html(html:'str'): + """ + Converts an HTML string into a Symbol. + + Given a string with HTML content, returns the corresponding Symbol object. + """ + return Symbol.from_html(html) -def enable_debug_mode(): - logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger("BetterMD") +def from_md(md:'str'): + """ + Convert a Markdown formatted string into a Symbol object. + + Parses the provided Markdown text and returns the corresponding Symbol + using the Symbol.from_md conversion method. + + Args: + md (str): A string containing Markdown formatted text. + + Returns: + Symbol: The Symbol object generated from the Markdown input. + """ + return Symbol.from_md(md) \ No newline at end of file diff --git a/BetterMD/elements/a.py b/BetterMD/elements/a.py index d7ea329..c7a747f 100644 --- a/BetterMD/elements/a.py +++ b/BetterMD/elements/a.py @@ -1,23 +1,124 @@ -from BetterMD.rst.custom_rst import CustomRst from .symbol import Symbol +from ..rst import CustomRst from ..markdown import CustomMarkdown -from ..html import CustomHTML +import re import typing as t -class MD(CustomMarkdown['A']): - def to_md(self, inner, symbol, parent, **kwargs): - return f"[{" ".join([e.to_md(**kwargs) for e in inner])}]({symbol.get_prop("href")})" +if t.TYPE_CHECKING: + from ..parse import Collection + +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert a list of inner elements to a Markdown link. 
+ + Joins the Markdown representations of the inner elements with spaces and uses the symbol's + href property to format the link. + + Args: + inner: A list of elements, each having a to_md() method. + symbol: An object that provides the link destination via the 'href' property. + parent: The parent element context, currently unused. + + Returns: + A Markdown formatted link as a string. + """ + return f"[{" ".join([e.to_md() for e in inner])}]({symbol.get_prop("href")})" + + def verify(self, text:'str'): + """ + Checks if the text contains any valid Markdown link formats. + + This method tests the input string for three Markdown link styles: + inline links (e.g., [label](url)), automatic links (e.g., ), + and reference links (e.g., [label][ref] with a corresponding reference definition). + It returns True if any valid link pattern is detected, otherwise False. + + Args: + text: The text to search for Markdown link patterns. + + Returns: + bool: True if a Markdown link is found; otherwise, False. + """ + if re.findall("\[([^\]]+)\]\((https?:\/\/[^\s)]+)\)", text): + # Case 1: Inline link + return True + + elif re.findall("<(https?:\/\/[^\s>]+)>", text): + # Case 2: Automatic Links + return True + + elif re.findall("\[([^\]]+)\]\[([^\]]+)\]\s*\n?\[([^\]]+)\]:\s*(https?:\/\/[^\s]+)", text): + # Case 3: Reference Links + return True + + return False -class HTML(CustomHTML['A']): - def to_html(self, inner, symbol, parent, **kwargs): - return f"{" ".join([e.to_html(**kwargs) for e in inner])}" class RST(CustomRst['A']): - def to_rst(self, inner, symbol, parent, **kwargs): - return f"`{' '.join([e.to_rst(**kwargs) for e in inner])} <{symbol.get_prop('href')}>`_" + def to_rst(self, inner, symbol, parent): + """ + Converts a list of elements to a reStructuredText hyperlink. + + The inner elements are converted into their RST representations, joined with a space, + and combined with the URL obtained from the symbol's 'href' property. 
The resulting + string follows the standard RST hyperlink syntax. + + Args: + inner: A list of elements having a to_rst method. + symbol: An object that provides the hyperlink URL via its get_prop('href') method. + parent: The parent element (currently unused). + + Returns: + A string formatted as an RST hyperlink. + """ + return f"`{' '.join([e.to_rst() for e in inner])} <{symbol.get_prop('href')}>`_" class A(Symbol): prop_list = ["href"] + + refs = {} md = MD() - html = HTML() - rst = RST() \ No newline at end of file + html = "a" + rst = RST() + + @classmethod + def md_refs(cls, references: 'list[str]' = None): + """ + Process Markdown references. + + This placeholder class method is intended for handling an optional list of Markdown + reference strings for future processing. Currently, it does not perform any action. + + Args: + references: Optional list of Markdown reference strings. + """ + pass + + @classmethod + def rst_refs(cls, references: 'list[str]' = None): + """ + Processes reStructuredText (RST) references for the symbol. + + This class method serves as a placeholder for handling RST reference links. + If a list of reference identifiers is provided, it may be used in future + enhancements to register or process those references. + + Args: + references: Optional list of reference identifiers. + """ + pass + + @classmethod + def html_refs(cls, references: 'list[str]' = None): + """ + Processes HTML references. + + This is a placeholder method for future processing of HTML reference strings. + If provided, the list of references may be used to update the symbol's HTML links. + Currently, no processing is performed. + + Args: + references: Optional list of HTML reference strings. Defaults to None. 
+ """ + pass \ No newline at end of file diff --git a/BetterMD/elements/code.py b/BetterMD/elements/code.py index 26ba46c..dc5fe37 100644 --- a/BetterMD/elements/code.py +++ b/BetterMD/elements/code.py @@ -2,34 +2,115 @@ from .text import Text from ..markdown import CustomMarkdown from ..html import CustomHTML +from ..rst import CustomRst -class MD(CustomMarkdown['Code']): - def to_md(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert the given content into Markdown code formatting. + + This method transforms the provided content based on the symbol's properties. If the symbol specifies a programming language or the content contains newlines, the content is formatted as a fenced code block using triple backticks. Otherwise, the content is enclosed in single backticks as inline code. If the content is a Text instance, it is first converted to Markdown. - content = " ".join([e.to_md(**kwargs) for e in inner]) + Args: + inner: The content to format as Markdown, which may be a string or a Text instance. + symbol: An object containing properties (e.g., language) that influence the formatting. + parent: The parent context element (unused) for interface consistency. + + Returns: + A Markdown-formatted string representing the content as a code block or inline code. + """ + language = symbol.get_prop("language", "") + if isinstance(inner, Text): + inner = inner.to_md() # If it's a code block (has language or multiline) if language or "\n" in inner: - return f"```{language}\n{content}\n```\n" + return f"```{language}\n{inner}\n```\n" # Inline code - return f"`{content}`" + return f"`{inner}`" class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") + def to_html(self, inner, symbol, parent): + """ + Converts a collection of content elements into an HTML code block. 
- content = " ".join([e.to_html(**kwargs) for e in inner]) + Joins the HTML representation of each item and wraps the result in a element. + If the symbol specifies a programming language, the tag includes a + language-specific CSS class for syntax highlighting. + """ + language = symbol.get_prop("language", "") + inner = "\n".join([i.to_html() for i in inner]) if language: - return f'
{content}
' + return f'{inner}' + + return f"{inner}" + + def verify(self, text: str) -> bool: + """ + Determine if the provided text equals "code", case-insensitively. + + Args: + text: The text to check. + + Returns: + bool: True if the text matches "code" irrespective of case, otherwise False. + """ + return text.lower() == "code" + +class RST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Converts content to reStructuredText formatted code. + + This method processes the given content—either a single element or a list—and converts it into a + reStructuredText representation. It extracts a language property from the provided symbol to determine + if the output should be formatted as a code block. When a language is specified or the content spans + multiple lines, the content is indented and returned as a code block (using the ".. code-block::" + directive if a language is provided, or a literal block otherwise). Inline code is wrapped in backticks, + with special handling if backticks already exist in the content. + + Args: + inner: The content to convert, which may be a single element or a list of elements. + symbol: An object that supplies properties (such as the programming language) affecting formatting. + parent: Unused parameter reserved for interface compatibility. + + Returns: + A string containing the reStructuredText formatted code, either as a code block or inline code. 
+ """ + language = symbol.get_prop("language", "") - return f"{content}" + # Handle inner content + if isinstance(inner, list): + content = "".join([ + i.to_rst() if isinstance(i, Symbol) else str(i) + for i in inner + ]) + else: + content = inner.to_rst() if isinstance(inner, Symbol) else str(inner) + + # If it's a code block (has language or multiline) + if language or "\n" in content: + # Use code-block directive for language-specific blocks + if language: + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f".. code-block:: {language}\n\n{indented_content}\n\n" + + # Use simple literal block for language-less blocks + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f"::\n\n{indented_content}\n\n" + + # Inline code + # Escape backticks if they exist in content + if "`" in content: + return f"``{content}``" + return f"`{content}`" class Code(Symbol): - prop_list = ["language"] html = HTML() md = MD() - rst = "``" + rst = RST() nl = True \ No newline at end of file diff --git a/BetterMD/elements/input.py b/BetterMD/elements/input.py index 4cf9e4b..a3fe863 100644 --- a/BetterMD/elements/input.py +++ b/BetterMD/elements/input.py @@ -3,32 +3,31 @@ from ..markdown import CustomMarkdown from ..rst import CustomRst -class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - # Collect all input attributes - attrs = [] - for prop in Input.props: - value = symbol.get_prop(prop) - if value: - # Handle boolean attributes like 'required', 'disabled', etc. 
- if isinstance(value, bool) and value: - attrs.append(prop) - else: - attrs.append(f'{prop}="{value}"') - - attrs_str = " ".join(attrs) - return f"" - class MD(CustomMarkdown): - def to_md(self, inner, symbol, parent, **kwargs): + def to_md(self, inner, symbol, parent): + """ + Converts an input symbol into its Markdown representation. + + If the symbol's "type" property is "checkbox", returns a Markdown formatted checkbox + (with an "x" if checked or a space if unchecked) followed by the inner content's Markdown. + Otherwise, returns the symbol's HTML representation. + """ if symbol.get_prop("type") == "checkbox": - return f"- [{'x' if symbol.get_prop('checked', '') else ''}] {inner.to_md()}" + return f"- [{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_md()}" return symbol.to_html() class RST(CustomRst): - def to_rst(self, inner, symbol, parent, **kwargs): + def to_rst(self, inner, symbol, parent): + """ + Generate an RST formatted string for a checkbox input element. + + If the symbol's "type" property is "checkbox", returns a string displaying a checkbox + indicator ("x" if the "checked" property is truthy, otherwise a blank space), optionally + followed by the inner element’s RST representation. For other input types, returns an + empty string. 
+ """ if symbol.get_prop("type") == "checkbox": - return f"[ ] {inner.to_rst() if inner else ''}" + return f"[{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_rst() if inner else ''}" return "" # Most input types don't have RST equivalents class Input(Symbol): @@ -50,6 +49,6 @@ class Input(Symbol): "multiple", "step" ] - html = HTML() + html = "input" md = MD() rst = RST() \ No newline at end of file diff --git a/BetterMD/elements/symbol.py b/BetterMD/elements/symbol.py index a01e96e..8136020 100644 --- a/BetterMD/elements/symbol.py +++ b/BetterMD/elements/symbol.py @@ -1,72 +1,86 @@ import typing as t -import logging from ..markdown import CustomMarkdown from ..html import CustomHTML from ..rst import CustomRst - -T = t.TypeVar("T", default=t.Any) -T2 = t.TypeVar("T2", default=t.Any) -logger = logging.getLogger("BetterMD") - -class List(list, t.Generic[T]): - def on_set(self, key, value): ... - - def on_ammend(self, object: 'T'): ... - - - def append(self, object: 'T') -> 'None': - self.on_ammend(object) - return super().append(object) - - def get(self, index, default:'T2'=None) -> 't.Union[T, T2]': - try: - return self[index] - except IndexError: - return default - - def __setitem__(self, key, value): - self.on_set(key, value) - return super().__setitem__(key, value) - - def __getitem__(self, item) -> 'T': - return super().__getitem__(item) - - def __iter__(self) -> 't.Iterator[T]': - return super().__iter__() +from ..parse import HTMLParser, MDParser, RSTParser, ELEMENT, TEXT, Collection class Symbol: styles: 'dict[str, str]' = {} classes: 'list[str]' = [] - html: 't.Union[str, CustomHTML, CustomHTML[Symbol]]' = "" - props: 'dict[str, t.Union[str, list[str], dict[str, str]]]' = {} + html: 't.Union[str, CustomHTML]' = "" + props: 'dict[str, str]' = {} prop_list: 'list[str]' = [] vars:'dict[str,str]' = {} - children:'List[Symbol]' = List() - md: 't.Union[str, CustomMarkdown, CustomMarkdown[Symbol], None]' = None - rst: 't.Union[str, CustomRst, 
CustomRst[Symbol], None]' = None + children:'list[Symbol]' = [] + md: 't.Union[str, CustomMarkdown]' = "" + rst: 't.Union[str, CustomRst]' = "" parent:'Symbol' = None prepared:'bool' = False nl:'bool' = False html_written_props = "" - def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], dom:'bool'=True, inner:'list[Symbol]'=[], **props): - logger.debug(f"Creating new Symbol with {styles=} {classes=} {dom=} {inner=} {props=}") + collection = Collection() + html_parser = HTMLParser() + md_parser = MDParser() + + def __init_subclass__(cls, **kwargs) -> None: + """ + Automatically registers new subclasses in the symbol collection. + + This method adds the new subclass to the class-wide symbol collection and then + delegates additional initialization to the superclass. + """ + cls.collection.add_symbols(cls) + super().__init_subclass__(**kwargs) + + def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=[], **props): + """ + Initialize a new Symbol instance with optional styles, classes, inner symbols, and additional properties. + + Args: + styles (dict[str, str]): CSS style definitions for the symbol. + classes (list[str]): CSS class names for the symbol. + inner (list[Symbol]): Initial child symbols to be assigned. + **props: Additional properties to associate with the symbol. + """ self.styles = styles self.classes = classes - self.children = List(inner) or List() + self.children = list(inner) or [] self.props = props - self.dom = dom - + + def copy(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=None): + """ + Return a copy of the symbol with merged styles and specified children. + + Merges the current symbol's styles into the provided styles dictionary so that the + symbol's own styles override any overlapping keys. 
The resulting dictionary, along with + the given list of class names and inner symbols (defaulting to an empty list if not provided), + is used to create a new Symbol instance. + + Args: + styles (dict[str, str]): Optional dictionary to merge with the symbol's styles. + classes (list[str]): Optional list of class names for the new symbol. + inner (list[Symbol], optional): Optional list of inner symbols; defaults to an empty list. + + Returns: + Symbol: A new Symbol instance with the merged styles, specified classes, and inner symbols. + """ if inner == None: - inner = [Symbol()] + inner = [] styles.update(self.styles) return Symbol(styles, classes, inner = inner) - - + + def set_parent(self, parent:'Symbol'): + """ + Set the parent symbol for this instance and add it as a child of the parent. + + Args: + parent: The Symbol instance to be assigned as the parent. + """ self.parent = parent self.parent.add_child(self) @@ -81,83 +95,232 @@ def remove_child(self, symbol:'Symbol'): self.children.remove(symbol) def has_child(self, child:'type[Symbol]'): + """ + Checks if a child symbol of a specified type exists. + + Iterates over the symbol's children and returns the first child that is an instance of the specified type. + If no matching child is found, returns False. + + Args: + child: The Symbol subclass to search for among the children. + + Returns: + The first child instance matching the specified type, or False if none is found. + """ for e in self.children: if isinstance(e, child): return e - + return False - def prepare(self, parent:'t.Union[Symbol, None]'=None, *args, **kwargs): - self.prepared = True - self.parent = parent + def prepare(self, parent:'Symbol'): + """ + Prepares the symbol by setting its parent and recursively preparing its children. + + Marks the symbol as prepared, assigns the given parent, and applies the same preparation process to every child symbol. 
- [symbol.prepare(self, *args, **kwargs) for symbol in self.children] + Args: + parent: The parent Symbol instance to be assigned to the current symbol. + Returns: + The prepared Symbol instance. + """ + self.prepared = True + self.parent = parent + for symbol in self.children: + symbol.prepare(self) + return self def replace_child(self, old:'Symbol', new:'Symbol'): + """ + Replaces an existing child symbol with a new one. + + Finds the index of the specified old child within the children list, removes it, + and inserts the new symbol at the position immediately preceding the removed child's + former index. + + Args: + old: The child symbol to replace. + new: The new child symbol to insert. + + Raises: + ValueError: If the old child is not found among the children. + """ i = self.children.index(old) self.children.remove(old) self.children[i-1] = new - - def to_html(self) -> 'str': - if not self.prepared: - self.prepare() + + def to_html(self, indent=1) -> 'str': + """ + Converts the symbol and its children into an HTML representation. + + If the symbol's HTML attribute is an instance of CustomHTML, its own + to_html method is called. Otherwise, an HTML tag is constructed with + any associated CSS classes, inline styles, and additional properties. + Child elements are formatted and indented based on the specified indent + level. A self-closing tag is returned when no child content exists. + + Args: + indent: The current indentation level for formatting nested child elements. + Returns: + The complete HTML string representation of the symbol. 
+ """ if isinstance(self.html, CustomHTML): return self.html.to_html(self.children, self, self.parent) + + inner_HTML = f"\n{" "*indent}".join([e.to_html(indent+1) if not (len(self.children) == 1 and self.children[0].html == "text") else e.to_html(0) for e in self.children]) + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{inner_HTML}{"\n" if len(self.children) > 1 else ""}" if inner_HTML else f" />"}" + + def to_md(self) -> 'str': + """ + Generates a Markdown representation of the symbol and its children. - props = [] - for prop, value in self.props.items(): - if isinstance(value, list): - props.append(f"{prop}={'"'}{' '.join(value)}{'"'}") - elif isinstance(value, dict): - props.append(f"{prop}={'"'}{' '.join([f'{k}:{v}' for k,v in value.items()])}{'"'}") - else: - props.append(f"{prop}={value}") - - inner_HTML = "\n".join([e.to_html() for e in self.children]) - logger.debug(f"{inner_HTML=} {self.html=} {self.classes=} {self.styles=} {props=}") - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(props)}>{inner_HTML}" - - def to_md(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) - + If the symbol's md attribute is a CustomMarkdown instance, its to_md() method is invoked. + Otherwise, the Markdown output is constructed by concatenating the symbol's md attribute + with the Markdown representations of its child symbols. 
A newline character is appended + if the nl attribute is set. + """ if isinstance(self.md, CustomMarkdown): - return self.md.to_md(self.children, self, self.parent, **kwargs) - - if self.md == None: - return self.to_html(**kwargs) - - inner_md = " ".join([e.to_md() for e in self.children]) - return f"{self.md} {inner_md}" + ("\n" if self.nl else "") - - def to_rst(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) + return self.md.to_md(self.children, self, self.parent) + + inner_md = "".join([e.to_md() for e in self.children]) + return f"{self.md}{inner_md}" + ("\n" if self.nl else "") + def to_rst(self) -> 'str': + """ + Convert the symbol and its children to an RST string representation. + + If the symbol's "rst" attribute is an instance of CustomRst, the conversion is delegated to + its "to_rst" method with the children, the symbol itself, and its parent. Otherwise, the + method concatenates the symbol's "rst" attribute with the RST representations of its children, + appending the "rst" attribute again and a newline. + """ if isinstance(self.rst, CustomRst): return self.rst.to_rst(self.children, self, self.parent) - - if self.rst == None: - return f".. raw:: html\n\n{" ".join(self.to_html().splitlines())}\n" - + inner_rst = " ".join([e.to_rst() for e in self.children]) return f"{self.rst}{inner_rst}{self.rst}\n" - - def get_prop(self, prop, default="") -> 't.Union[str, list[str], dict[str, str]]': + + @classmethod + def from_html(cls, text:'str') -> 'list[Symbol]': + """ + Parses an HTML string into a list of Symbol instances. + + This class method uses the HTML parser to convert the input string into parsed elements, + and then retrieves the corresponding Symbol for each element from the symbol collection. + Each element is processed by invoking the symbol's parse method. An error is raised if a + matching symbol cannot be found for any parsed element. 
+ + Returns: + list[Symbol]: A list of Symbol instances derived from the parsed HTML elements. + """ + parsed = cls.html_parser.parse(text) + return [cls.collection.find_symbol(elm['name'] , raise_errors=True).parse(elm) for elm in parsed] + + @classmethod + def parse(cls, text:'ELEMENT') -> 'Symbol': + """ + Parses a structured element into a Symbol instance. + + This class method transforms a dictionary representation of an element—either a text + node (with type 'text') or an element node (identified by its 'name')—into a Symbol. + It extracts CSS styles and classes from the element's attributes, recursively parses + child elements, and passes any remaining attributes as properties. + + Args: + text: A dictionary representing the element. Expected to include keys such as + 'attributes' (with optional 'style' and 'class' entries), 'children' (a list + of nested element dictionaries), and 'type'. For text nodes, a 'content' key + is expected; for element nodes, a 'name' key is used. + + Returns: + A Symbol instance corresponding to the parsed element. + """ + def handle_element(element:'ELEMENT|TEXT') -> 'Symbol': + if element['type'] == 'text': + text = cls.collection.find_symbol("text", raise_errors=True) + assert text is not None, "`collection.find_symbol` is broken" + + return text(element['content']) + + symbol_cls = cls.collection.find_symbol(element['name'], raise_errors=True) + assert symbol_cls is not None, "`collection.find_symbol` is broken" + + return symbol_cls.parse(element) + + styles = {s.split(":")[0]: s.split(":")[1] for s in text["attributes"].pop("style", "").split(";") if ":" in s} + classes = list(filter(lambda c: bool(c), text["attributes"].pop("class", "").split(" "))) + + return cls(styles, classes, inner=[handle_element(elm) for elm in text["children"]], **text["attributes"]) + + @classmethod + def from_md(cls, text: str) -> 'Symbol': + """ + Parses a Markdown string and returns the corresponding Symbol instance. 
+ + This method uses the class-level Markdown parser to convert the input text into a + structured representation. It then locates the appropriate symbol from the collection + (using the parsed name, with errors raised if not found) and further processes the parsed + data to create a new Symbol instance. + + Args: + text: A Markdown formatted string representing the symbol's data. + + Returns: + A Symbol instance constructed based on the parsed Markdown content. + + Raises: + Exception: If a symbol with the parsed name is not found in the collection. + """ + parsed = cls.md_parser.parse(text) + return cls.collection.find_symbol(parsed['name'], raise_errors=True).parse(parsed) + + + + def get_prop(self, prop, default="") -> 'str': + """ + Retrieves the value of a property from the symbol's properties. + + If the property is not found, returns the specified default value. + + Args: + prop: The key of the property to retrieve. + default: The value to return if the property key is absent (defaults to an empty string). + + Returns: + The property value as a string, or the default value if the key is not found. + """ return self.props.get(prop, default) - def set_prop(self, prop:'str', value:'t.Union[str, list[str], dict[str, str]]'): + def set_prop(self, prop, value): self.props[prop] = value def __contains__(self, item): + """ + Determines if a specified child symbol or type exists among the children. + + If the given item is callable (typically a type), the method returns True if any + child is an instance of that type. Otherwise, it checks for direct membership in + the list of children. + + Returns: + bool: True if a matching child is found, False otherwise. 
+ """ if callable(item): return any(isinstance(e, item) for e in self.children) return item in self.children def __str__(self): - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(self.props)}/>" + """ + Return a string representation of the symbol as an HTML element. + + The string includes the element's tag name along with any defined CSS classes, + inline styles, and additional properties. It also appends the count of child symbols. + """ + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{"\n" if len(self.children) > 1 else ""}{len(self.children)}"}" + + __repr__ = __str__ \ No newline at end of file diff --git a/BetterMD/elements/table.py b/BetterMD/elements/table.py index 6576662..7d2f29a 100644 --- a/BetterMD/elements/table.py +++ b/BetterMD/elements/table.py @@ -1,393 +1,287 @@ -from .symbol import Symbol, List +from .symbol import Symbol from ..markdown import CustomMarkdown from ..rst import CustomRst from .h import H1, H2, H3, H4, H5, H6 from .text import Text -import logging -import typing as t +import itertools as it -if t.TYPE_CHECKING: - # Wont be imported at runtime - import pandas as pd # If not installed, will not affedt anything at runtime - -logger = logging.getLogger("BetterMD") - -class TrMD(CustomMarkdown['Tr']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Tr element to Markdown") - contents = "\n".join([e.to_md() for e in inner]) - 
split_content = contents.splitlines() - logger.debug(f"Split content: {split_content}") - ret = f"| {" | ".join(split_content)} |" - return ret - - -class THeadMD(CustomMarkdown['THead']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - md = [] - for child in symbol.head.children: - e = child.to_md() - - md.append({"len":len(e), "style":child.styles.get("text-align", "justify")}) - - def parse_md(data: 'dict') -> 'str': - start = " :" if data["style"] in ["left", "center"] else " " - middle = "-"*(data["len"]-2) if data["style"] == "center" else "-"*(data["len"]-1) if data["style"] in ["left", "right"] else "-"*(data["len"]) - end = ": " if data["style"] in ["right", "center"] else " " - - return f"{start}{middle}{end}" - - return f"{inner[0].to_md()}\n|{"|".join([parse_md(item) for item in md])}|" +class TableMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Generate Markdown for a table element. -class TBodyMD(CustomMarkdown['TBody']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - content = [e.to_md() for e in inner if isinstance(e, Tr)] - logger.debug(f"TBody conent: {content}") - return "\n".join(content) - -class TdMD(CustomMarkdown['Td']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) - - length = len(max(symbol.table.cols[symbol.header], key=len).data) - logger.debug(f"Td length: {len(symbol)}") - logger.debug(f"Column length: {length}") - return " ".join([e.to_md() for e in inner]).center(length) - -class ThMD(CustomMarkdown['Th']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) + Iterates over the provided inner elements to construct the table's Markdown output. + Processes a THead element to generate the header and collects non-empty Markdown + from each TBody element as table rows. 
The header and rows are concatenated with + newline separators to form the final Markdown string. - width = len(max(symbol.table.cols[symbol.header], key=len).data) - + Returns: + str: The complete Markdown representation of the table. + """ + result = [] + thead_content = "" + tbody_rows = [] - if symbol.data == "": - return "".center(width) + # Process inner elements + for section in inner: + if isinstance(section, THead): + thead_content = section.to_md() + elif isinstance(section, TBody): + tbody_content = section.to_md() + if tbody_content: + tbody_rows.append(tbody_content) - return f"**{" ".join([e.to_md() for e in inner]).center(width)}**" - -class TableMD(CustomMarkdown['Table']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Table element to Markdown") - head = symbol.head.to_md() if symbol.head else None - body = symbol.body.to_md() - - logger.debug(f"Table conversion complete. Has header: {head is not None}") - return f"{f"{head}\n" if head else ""}{body}" - - -class TableRST(CustomRst['Table']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting Table element to RST") - head = symbol.head.to_rst() if symbol.head else None - body = symbol.body.to_rst() - - return f"{f"{head}\n" if head else ""}{body}" - -class THeadRST(CustomRst['THead']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting THead element to RST") - logger.debug(f"THead has {len(inner)} children: {[e.to_rst() for e in inner]}") - top = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.head.children] - content = "\n".join([e.to_rst() for e in inner]) - return f"+-{"-+-".join([t*"-" for t in top])}-+\n{content}\n+={"=+=".join([t*"=" for t in top])}=+" - -class TBodyRST(CustomRst['TBody']): - def to_rst(self, inner, symbol, parent, **kwargs): - bottom = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.table.head.head.children] - return 
class TableRST(CustomRst):
    def to_rst(self, inner, symbol, parent):
        """
        Render the table sections as a reStructuredText grid table.

        Every row of every ``THead`` / ``TBody`` section in ``inner`` is
        collected, each column is sized to its widest cell, and the rows are
        emitted between ``+---+`` rules. Rows that have fewer cells than the
        widest row are padded with empty cells so that every line has the same
        number of columns. A single ``=`` rule is written after the last
        header row; all other rules use ``-``.

        Args:
            inner: Table section symbols; anything that is not a ``THead`` or
                ``TBody`` is ignored.
            symbol: The table symbol (unused).
            parent: The parent symbol (unused).

        Returns:
            str: The grid table, or an empty string when there are no cells.
        """
        # Flatten the sections into (rendered cells, is_header) pairs.
        rows = []
        for section in inner or ():
            if not isinstance(section, (THead, TBody)):
                continue
            is_header = isinstance(section, THead)
            for row in section.children:
                rows.append(([cell.to_rst() for cell in row.children], is_header))

        column_count = max((len(cells) for cells, _ in rows), default=0)
        if column_count == 0:
            return ""

        # Size each column from *all* rows, not just the first one, so a
        # longer later row can neither lose cells nor shrink the table.
        widths = [0] * column_count
        for cells, _ in rows:
            for index, cell in enumerate(cells):
                widths[index] = max(widths[index], len(cell))

        def rule(char):
            return "+" + "+".join(char * (width + 2) for width in widths) + "+"

        # A grid table may contain only one head/body separator, so the "="
        # rule goes after the final header row only.
        last_header = max(
            (index for index, (_, is_header) in enumerate(rows) if is_header),
            default=-1,
        )

        lines = [rule("-")]
        for index, (cells, _) in enumerate(rows):
            padded = cells + [""] * (column_count - len(cells))
            lines.append(
                "| "
                + " | ".join(cell.ljust(width) for cell, width in zip(padded, widths))
                + " |"
            )
            lines.append(rule("=" if index == last_header else "-"))

        return "\n".join(lines)
class THeadMD(CustomMarkdown):
    def to_md(self, inner, symbol, parent):
        """
        Render the header rows followed by the Markdown delimiter row.

        Each row in ``inner`` is written as ``|cell|cell|``. The delimiter row
        has one run of dashes per column, as long as the widest cell seen in
        that column and never shorter than one dash, so an empty header cell
        still produces a valid delimiter cell.

        Args:
            inner: Row symbols; each exposes ``children`` whose items have a
                ``to_md()`` method.
            symbol: The thead symbol (unused).
            parent: The parent symbol (unused).

        Returns:
            str: The header lines joined by newlines, or an empty string when
            ``inner`` is empty.
        """
        if not inner:
            return ""

        rows = [[cell.to_md() for cell in row.children] for row in inner]

        # Track the widest cell per column across every row; a later row may
        # be longer than the first, so grow the list instead of truncating.
        widths = []
        for cells in rows:
            for index, cell in enumerate(cells):
                if index < len(widths):
                    widths[index] = max(widths[index], len(cell))
                else:
                    widths.append(len(cell))

        lines = ["|" + "|".join(cells) + "|" for cells in rows]
        lines.append("|" + "|".join("-" * max(width, 1) for width in widths) + "|")
        return "\n".join(lines)


class THeadRST(CustomRst):
    def to_rst(self, inner, symbol, parent):
        """
        Return an empty string.

        This converter produces no output of its own; it only satisfies the
        converter interface.
        """
        return ""
class TBodyMD(CustomMarkdown):
    def to_md(self, inner, symbol, parent):
        """
        Render the body rows, one Markdown line per row.

        Args:
            inner: Row symbols, each providing ``to_md()``.
            symbol: The tbody symbol (unused).
            parent: The parent symbol (unused).

        Returns:
            str: The rows joined by newlines, or "" when ``inner`` is empty.
        """
        if not inner:
            return ""
        return "\n".join([row.to_md() for row in inner])


class TrMD(CustomMarkdown):
    def to_md(self, inner, symbol, parent):
        """
        Render one table row as ``|cell|cell|``.

        Each item of ``inner`` is converted with ``to_md()``; the results are
        separated by pipes and wrapped in a leading and trailing pipe.
        """
        joined = "|".join(cell.to_md() for cell in inner)
        return "|" + joined + "|"


class TrRST(CustomRst):
    def to_rst(self, inner, symbol, parent):
        """
        Return an empty string.

        This converter produces no output of its own; it only satisfies the
        converter interface.
        """
        return ""


class TdMD(CustomMarkdown):
    def to_md(self, inner, symbol, parent):
        """
        Render a cell's content as Markdown.

        Returns:
            str: The ``to_md()`` output of every inner element, separated by
            single spaces.
        """
        pieces = (element.to_md() for element in inner)
        return " ".join(pieces)
class TdRST(CustomRst):
    def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol) -> str:
        """Render a cell's content as reStructuredText.

        The ``to_rst()`` output of every inner symbol is joined with a single
        space. A lone child therefore yields exactly its own ``to_rst()``
        output, so no special case per child type is needed.

        Args:
            inner: The cell's child symbols.
            symbol: The cell symbol (unused).
            parent: The parent symbol (unused).

        Returns:
            str: The joined content, or "" when ``inner`` is empty.
        """
        if not inner:
            return ""
        return " ".join(element.to_rst() for element in inner)


class ThRST(CustomRst):
    def to_rst(self, inner, symbol, parent):
        """
        Render a header cell's content as reStructuredText.

        Returns:
            str: The ``to_rst()`` output of every inner element, separated by
            single spaces. ``symbol`` and ``parent`` are unused.
        """
        return " ".join(element.to_rst() for element in inner)


class TBodyRST(CustomRst):
    def to_rst(self, inner, symbol, parent):
        """
        Return an empty string.

        This converter produces no output of its own; it only satisfies the
        converter interface.
        """
        return ""
# Symbol declarations for the table elements. Each class only binds its HTML
# tag name to the Markdown and RST converter instances that render it.

class Table(Symbol):
    html = "table"
    md = TableMD()
    rst = TableRST()
    # NOTE(review): presumably asks the renderer to put the table on its own
    # line -- confirm against Symbol.
    nl = True


class Tr(Symbol):
    html = "tr"
    md = TrMD()
    rst = TrRST()


class Td(Symbol):
    html = "td"
    md = TdMD()
    rst = TdRST()


class Th(Symbol):
    html = "th"
    # Header cells reuse the plain-cell Markdown converter.
    md = TdMD()
    rst = ThRST()


class THead(Symbol):
    html = "thead"
    md = THeadMD()
    rst = THeadRST()


class TBody(Symbol):
    html = "tbody"
    md = TBodyMD()
    rst = TBodyRST()
class MD(CustomMarkdown):
    def to_md(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str:
        """
        Render a title element as a ``title: "..."`` Markdown line.

        Args:
            inner: The title's children; must be exactly one ``Text`` element.
            symbol: The title symbol (unused).
            parent: The parent symbol (unused).
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            str: ``title: "<text>"`` using the child's ``to_md()`` output.

        Raises:
            ValueError: If ``inner`` does not contain exactly one ``Text``
                element (including when it is empty).
        """
        # Check the length first so an empty list raises ValueError rather
        # than IndexError from inner[0].
        if len(inner) != 1 or not isinstance(inner[0], Text):
            raise ValueError("Title element must contain a single Text element")

        return f'title: "{inner[0].to_md()}"'
class RST(CustomRst):
    def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str:
        """
        Render a title element as a ``:title:`` reStructuredText field.

        Args:
            inner: The title's children; must be exactly one ``Text`` element.
            symbol: The title symbol (unused).
            parent: The parent symbol (unused).
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            str: ``:title: <text>`` using the child's ``to_rst()`` output.

        Raises:
            ValueError: If ``inner`` does not contain exactly one ``Text``
                element (including when it is empty).
        """
        # Check the length first so an empty list raises ValueError rather
        # than IndexError from inner[0].
        if len(inner) != 1 or not isinstance(inner[0], Text):
            raise ValueError("Title element must contain a single Text element")

        return f":title: {inner[0].to_rst()}"


class Title(Symbol):
    html = "title"
    md = MD()
    rst = RST()


# --- BetterMD/html/custom_html.py ---

class CustomHTML(t.Generic[T], ABC):
    @abstractmethod
    def to_html(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol') -> str:
        """
        Generate an HTML representation of a symbol structure.

        Subclasses must override this method to convert the provided symbol
        and its inner content into an HTML string.

        Args:
            inner: The child symbols.
            symbol: The symbol being converted.
            parent: The symbol's parent.

        Returns:
            An HTML string representing the symbol structure.
        """
        # The stub body must be indented inside the method; at column 0 it
        # would terminate the class and break the following definitions.
        ...

    def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'):
        """
        Hook for preparatory processing before HTML conversion.

        Does nothing by default; subclasses may override it.

        Args:
            inner: The child symbols.
            symbol: The symbol being converted.
            parent: The symbol's parent.
        """
        ...

    def verify(self, text) -> bool: ...
class CustomMarkdown(t.Generic[T], ABC):
    prop = ""
    md: 'dict[str, str]' = {}

    @abstractmethod
    def to_md(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol') -> str:
        """
        Convert the provided symbols to a Markdown string.

        Subclasses must implement this method.

        Args:
            inner: The child symbols to include in the Markdown content.
            symbol: The symbol being converted.
            parent: The symbol's parent.

        Returns:
            A Markdown-formatted string.
        """
        # The stub body must be indented inside the method; at column 0 it
        # would terminate the class and break the following definitions.
        ...

    def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'):
        """
        Hook for pre-processing before Markdown generation.

        Does nothing by default; subclasses may override it.

        Args:
            inner: The child symbols.
            symbol: The symbol being converted.
            parent: The symbol's parent.
        """
        ...

    def verify(self, text) -> bool:
        """
        Report whether ``text`` matches this converter's Markdown syntax.

        The base implementation is a stub with no return statement;
        subclasses provide the actual check.

        Args:
            text: The text to verify.
        """
        ...
class Collection:
    """A registry of symbol classes that can be looked up by HTML name."""

    def __init__(self, *symbols:'type[Symbol]'):
        """
        Store the given symbol classes and attach the "BetterMD" logger.

        Args:
            *symbols: Symbol classes that initially populate the collection.
        """
        self.logger = logging.getLogger("BetterMD")
        self.symbols = [*symbols]

    def add_symbols(self, symbol:'type[Symbol]'):
        """Append one symbol class to the collection."""
        self.symbols.append(symbol)

    def remove_symbol(self, symbol:'type[Symbol]'):
        """
        Remove a symbol class from the collection.

        Raises:
            ValueError: If the symbol is not in the collection
                (raised by ``list.remove``).
        """
        self.symbols.remove(symbol)

    def find_symbol(self, name:'str', raise_errors:'bool'=False) -> 't.Union[None, type[Symbol]]':
        """
        Return the first symbol whose ``html`` attribute matches ``name``.

        A symbol matches when its ``html`` is a string equal to ``name``, or
        when it is a ``CustomHTML`` instance whose ``verify(name)`` is truthy.

        Args:
            name: The name to look up.
            raise_errors: When True, a failed lookup raises instead of
                returning None.

        Returns:
            The matching symbol, or None when nothing matches and
            ``raise_errors`` is False.

        Raises:
            ValueError: If nothing matches and ``raise_errors`` is True.
        """
        for candidate in self.symbols:
            matched = (
                (isinstance(candidate.html, str) and candidate.html == name)
                or (isinstance(candidate.html, CustomHTML) and candidate.html.verify(name))
            )
            if matched:
                return candidate

        if not raise_errors:
            return None
        raise ValueError(f"Symbol `{name}` not found in collection, if using default symbols it may not be supported.")
class HTMLParser:
    """
    A small character-driven HTML parser that builds a list-of-dicts DOM.

    Element nodes are ``{"type": "element", "name", "attributes", "children"}``
    and text nodes are ``{"type": "text", "content", "name": "text"}``.
    Comments, doctype declarations and processing instructions are skipped.
    """

    # Elements that never have content or an end tag; they must not become
    # the parent of whatever follows them.
    VOID_ELEMENTS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })

    def __init__(self):
        """Create a parser in its initial state."""
        self.reset()

    def reset(self):
        """Clear the DOM, the open-tag stack and all tokenizer buffers."""
        self.current_tag:'t.Optional[ELEMENT]' = None
        self.dom = []
        self.state = 'TEXT'
        self.buffer = ''
        self.attr_name = ''
        self.tag_stack = []

    @staticmethod
    def _new_element(name):
        """Build an empty element node with the given tag name."""
        return {"type": "element", 'name': name, 'attributes': {}, 'children': []}

    @staticmethod
    def _opens_markup(html, pos):
        """Tell whether the character at ``pos`` can follow ``<`` to start markup."""
        following = html[pos:pos + 1]
        return following.isalpha() or following in ('/', '!', '?')

    def _finish_open_tag(self):
        """Emit the current tag as an opening tag and return to text mode."""
        self.handle_tag_open(self.current_tag)
        self.buffer = ''
        self.state = 'TEXT'

    def parse(self, html:'str') -> 'list[ELEMENT]':
        """
        Parse an HTML string into a DOM.

        The parser state is reset first, then the input is consumed one
        character at a time by a state machine that recognises text, tags,
        attributes (quoted and unquoted), self-closing tags, closing tags,
        and comments/declarations (which are discarded). Whitespace-only text
        is dropped. Input that ends in the middle of a tag is ignored.

        Args:
            html (str): The HTML content to parse.

        Returns:
            list[ELEMENT]: The top-level nodes of the constructed DOM.
        """
        self.reset()

        i = 0
        length = len(html)
        while i < length:
            char = html[i]
            state = self.state

            if state == 'TEXT':
                # A "<" only starts markup when followed by a letter, "/",
                # "!" or "?"; otherwise it is literal text (e.g. "1 < 2").
                if char == '<' and self._opens_markup(html, i + 1):
                    if self.buffer.strip():
                        self.handle_text(self.buffer)
                    self.buffer = ''
                    self.state = 'TAG_START'
                else:
                    self.buffer += char

            elif state == 'TAG_START':
                if char == '/':
                    self.state = 'CLOSING_TAG'
                elif char in ('!', '?'):
                    self.state = 'COMMENT_OR_DOCTYPE'
                    self.buffer = char
                else:
                    self.state = 'TAG_NAME'
                    self.buffer = char

            elif state == 'COMMENT_OR_DOCTYPE':
                # "<!-- ... -->" ends at the first ">" preceded by "--";
                # any other declaration ends at the first ">". The content
                # is discarded.
                in_comment = self.buffer.startswith('!--')
                if char == '>' and (not in_comment or self.buffer.endswith('--')):
                    self.buffer = ''
                    self.state = 'TEXT'
                else:
                    self.buffer += char

            elif state == 'TAG_NAME':
                if char.isspace():
                    self.current_tag = self._new_element(self.buffer)
                    self.buffer = ''
                    self.state = 'BEFORE_ATTRIBUTE_NAME'
                elif char == '>':
                    self.current_tag = self._new_element(self.buffer)
                    self._finish_open_tag()
                elif char == '/':
                    self.current_tag = self._new_element(self.buffer)
                    self.buffer = ''
                    self.state = 'SELF_CLOSING_TAG'
                else:
                    self.buffer += char

            elif state == 'BEFORE_ATTRIBUTE_NAME':
                if char.isspace():
                    pass
                elif char == '>':
                    self._finish_open_tag()
                elif char == '/':
                    self.state = 'SELF_CLOSING_TAG'
                else:
                    self.attr_name = char
                    self.state = 'ATTRIBUTE_NAME'

            elif state == 'ATTRIBUTE_NAME':
                if char.isspace():
                    self.current_tag['attributes'][self.attr_name] = ''
                    self.state = 'AFTER_ATTRIBUTE_NAME'
                elif char == '=':
                    self.state = 'BEFORE_ATTRIBUTE_VALUE'
                elif char == '>':
                    self.current_tag['attributes'][self.attr_name] = ''
                    self._finish_open_tag()
                elif char == '/':
                    self.current_tag['attributes'][self.attr_name] = ''
                    self.state = 'SELF_CLOSING_TAG'
                else:
                    self.attr_name += char

            elif state == 'AFTER_ATTRIBUTE_NAME':
                if char.isspace():
                    pass
                elif char == '=':
                    self.state = 'BEFORE_ATTRIBUTE_VALUE'
                elif char == '>':
                    self._finish_open_tag()
                elif char == '/':
                    self.state = 'SELF_CLOSING_TAG'
                else:
                    # The previous attribute had no value; start a new one.
                    self.current_tag['attributes'][self.attr_name] = ''
                    self.attr_name = char
                    self.state = 'ATTRIBUTE_NAME'

            elif state == 'BEFORE_ATTRIBUTE_VALUE':
                if char.isspace():
                    pass
                elif char == '"':
                    self.buffer = ''
                    self.state = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED'
                elif char == "'":
                    self.buffer = ''
                    self.state = 'ATTRIBUTE_VALUE_SINGLE_QUOTED'
                elif char == '>':
                    self.current_tag['attributes'][self.attr_name] = ''
                    self._finish_open_tag()
                else:
                    self.buffer = char
                    self.state = 'ATTRIBUTE_VALUE_UNQUOTED'

            elif state == 'ATTRIBUTE_VALUE_DOUBLE_QUOTED':
                if char == '"':
                    self.current_tag['attributes'][self.attr_name] = self.buffer
                    self.buffer = ''
                    self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED'
                else:
                    self.buffer += char

            elif state == 'ATTRIBUTE_VALUE_SINGLE_QUOTED':
                if char == "'":
                    self.current_tag['attributes'][self.attr_name] = self.buffer
                    self.buffer = ''
                    self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED'
                else:
                    self.buffer += char

            elif state == 'ATTRIBUTE_VALUE_UNQUOTED':
                if char.isspace():
                    self.current_tag['attributes'][self.attr_name] = self.buffer
                    self.buffer = ''
                    self.state = 'BEFORE_ATTRIBUTE_NAME'
                elif char == '>':
                    self.current_tag['attributes'][self.attr_name] = self.buffer
                    self._finish_open_tag()
                elif char == '/' and html[i + 1:i + 2] == '>':
                    # Only "/>" ends the value; a "/" elsewhere belongs to
                    # it (e.g. href=/a/b).
                    self.current_tag['attributes'][self.attr_name] = self.buffer
                    self.buffer = ''
                    self.state = 'SELF_CLOSING_TAG'
                else:
                    self.buffer += char

            elif state == 'AFTER_ATTRIBUTE_VALUE_QUOTED':
                if char.isspace():
                    self.state = 'BEFORE_ATTRIBUTE_NAME'
                elif char == '/':
                    self.state = 'SELF_CLOSING_TAG'
                elif char == '>':
                    self._finish_open_tag()
                else:
                    # Missing whitespace between attributes: reconsider this
                    # character as the start of the next attribute.
                    self.state = 'BEFORE_ATTRIBUTE_NAME'
                    continue

            elif state == 'SELF_CLOSING_TAG':
                if char == '>':
                    self.handle_tag_self_closing(self.current_tag)
                    self.buffer = ''
                    self.state = 'TEXT'
                else:
                    # A stray "/" inside the tag: keep reading attributes
                    # instead of dropping the rest of the tag.
                    self.state = 'BEFORE_ATTRIBUTE_NAME'
                    continue

            elif state == 'CLOSING_TAG':
                if char == '>':
                    self.handle_tag_close(self.buffer)
                    self.buffer = ''
                    self.state = 'TEXT'
                else:
                    self.buffer += char

            i += 1

        # Handle any remaining text
        if self.state == 'TEXT' and self.buffer.strip():
            self.handle_text(self.buffer)

        return self.dom

    def handle_tag_open(self, tag):
        """
        Attach an opening tag to the DOM and make it the current parent.

        The tag becomes a child of the innermost open element, or a top-level
        node when nothing is open. Void elements are attached but not pushed
        on the stack, because they cannot contain anything.
        """
        if self.tag_stack:
            self.tag_stack[-1]['children'].append(tag)
        else:
            self.dom.append(tag)

        if tag['name'].lower() not in self.VOID_ELEMENTS:
            self.tag_stack.append(tag)

    def handle_tag_self_closing(self, tag):
        """
        Attach a self-closing tag to the DOM without opening it.

        The tag becomes a child of the innermost open element, or a top-level
        node when nothing is open.
        """
        if self.tag_stack:
            self.tag_stack[-1]['children'].append(tag)
        else:
            self.dom.append(tag)

    def handle_tag_close(self, tag_name):
        """
        Close the innermost open element named ``tag_name``.

        The name is compared case-insensitively after trimming whitespace.
        Elements opened after the match and never closed are closed with it.
        A closing tag that matches nothing on the stack is ignored.
        """
        parts = tag_name.split()
        wanted = parts[0].lower() if parts else ''
        for depth in range(len(self.tag_stack) - 1, -1, -1):
            if self.tag_stack[depth]['name'].lower() == wanted:
                del self.tag_stack[depth:]
                return

    def handle_text(self, text):
        """
        Append a text node holding ``text``.

        The node is added to the innermost open element, or to the top level
        of the DOM when nothing is open.
        """
        text_node = {'type': 'text', 'content': text, 'name': 'text'}
        if self.tag_stack:
            self.tag_stack[-1]['children'].append(text_node)
        else:
            self.dom.append(text_node)

    def get_dom(self):
        """Return the DOM built by the most recent ``parse`` call."""
        return self.dom
+ """ + self.dom = [] + self.buffer = '' + self.list_stack = [] + self.dom_stack = [] + + def create_element(self, name:'str', attrs:'dict[str, str]'=None, children:'list[ELEMENT|TEXT]'=None) -> 'ELEMENT': + """ + Creates a structured element dictionary for a DOM node. + + Constructs a dictionary representing an element with a specified tag name, + attributes, and children. If no attributes or children are provided, they + default to an empty dictionary or list, respectively. + """ + if children is None: + children = [] + + if attrs is None: + attrs = {} + + return { + "type": "element", + "name": name, + "attributes": attrs, + "children": children + } + + def create_text(self, content:'str') -> 'TEXT': + """ + Creates a text element. + + Returns a dictionary representing a text node with the specified content. + The returned element always includes the keys "type", set to "text", "content", + carrying the provided text value, and "name", also set to "text". + """ + return { + "type": "text", + "content": content, + "name": "text" + } + + def end_block(self): + # Create paragraph from buffered text + """ + Finalizes the buffered text into a paragraph element. + + If the internal buffer contains text, the method strips it and, if non-empty, creates a + paragraph element with the text and appends it to the DOM. After processing, the buffer is cleared. + """ + if self.buffer: + text = self.buffer.strip() + if text: + para = self.create_element("p", children=[self.create_text(text)]) + self.dom.append(para) + self.buffer = '' + + def start_block(self): + """ + Placeholder method for starting a new block. + + This method is currently a no-op and may be extended in the future. + """ + pass + + def handle_blockquote(self, text: 'list[str]', i): + """ + Process a blockquote section in Markdown text. + + Scans the list of Markdown lines starting from the given index to extract the full + blockquote content. 
The function removes blockquote markers, handles paragraph + breaks on empty lines, and accepts continuation lines without explicit markers. + It recursively parses the accumulated text into a blockquote element and appends + it to the document model. + + Args: + text: A list of Markdown lines. + i: The starting index from which to process the blockquote. + + Returns: + An integer indicating the number of lines consumed as part of the blockquote. + """ + elm = self.create_element("blockquote") + new_text = [] + current_line = [] + + for line in text[i:]: + if re.match(self.top_level_tags["blockquote"], line): + # Remove blockquote marker and add to current line + content = line.removeprefix("> ").removeprefix(">").strip() + if content: + current_line.append(content) + elif line.strip() == "": + # Empty line marks paragraph break + if current_line: + new_text.append(" ".join(current_line)) + new_text.append("") + current_line = [] + elif not any(re.match(pattern, line) for pattern in self.top_level_tags.values()): + # Continuation of blockquote without marker + current_line.append(line.strip()) + else: + break + + if current_line: + new_text.append(" ".join(current_line)) + + # Parse blockquote content recursively + elm["children"] = MDParser().parse("\n".join(new_text)) + self.dom.append(elm) + + return len(new_text) - 1 + + def handle_code(self, text: 'list[str]'): + """ + Extracts and processes a code block from Markdown text lines. + + Finalizes any pending text block, then uses a regular expression to match and extract + the language identifier and code content from the joined text lines. It wraps the code + within a element (annotated with the language) nested inside a
<pre> element and
+        appends this structure to the DOM. Returns how many lines follow the opening fence up to
+        and including the closing fence, which is the amount the caller has to skip.
+
+        Args:
+            text (list[str]): Markdown lines, starting at the line holding the opening code fence.
+
+        Returns:
+            int: Number of newlines spanned by the matched block, i.e. the index (relative to
+                text[0]) of the line that holds the closing fence.
+
+        Raises:
+            AssertionError: If the joined lines do not match the code-block pattern.
+        """
+        self.end_block()
+        # Join once and reuse. The "code" pattern is anchored with ^...$ and applied without
+        # re.MULTILINE, so a match always ends at the end of the joined text (or just before
+        # one final newline).
+        joined = "\n".join(text)
+        match = re.match(self.top_level_tags["code"], joined)
+        assert match is not None, "Code block not found"
+
+        lang = match.group(1)
+        # The content group is optional in the pattern and is None for an empty block;
+        # fall back to an empty string so the text node never carries None.
+        content = match.group(2) or ""
+
+        elm = self.create_element("pre", children=[self.create_element("code", {"language": lang}, [self.create_text(content)])])
+        self.dom.append(elm)
+
+        # The previous expression sliced the text at the first fence and then looked for a
+        # fence inside that slice, which always yields 0. parse() advances by this value + 1,
+        # so return the relative index of the closing-fence line instead.
+        return match.group(0).count("\n")
+
+
+    def handle_br(self, text: 'list[str]'):
+        """
+        Handles Markdown line breaks by appending a 
element if two empty lines are encountered. + + Finalizes the current text block and checks the first two lines of the provided markdown + input. If both lines are empty, a
element is appended to the document model and + the method returns 1, indicating that one line was processed. Otherwise, it returns 0. + + Args: + text: A list of markdown lines; the first two lines are inspected for empty strings. + """ + self.end_block() + if text[0] == "" and text[1] == "": + self.dom.append(self.create_element("br", {})) + return 1 + return 0 + + def handle_h(self, line: 'str'): + """ + Process a Markdown header line and append it to the DOM. + + Ends any active text block, parses the line to determine the header level based on the + number of '#' characters, and extracts the header text. A header element (e.g.,

,

) + is then created with the extracted content and added to the DOM. An assertion error is raised + if the line does not match the expected header format. + + Args: + line: A Markdown string representing a header, beginning with '#' characters. + """ + self.end_block() + match = re.match(self.top_level_tags["h"], line) + assert match is not None, "Header not found" + + level = len(match.group(1)) + content = match.group(2) + + self.dom.append(self.create_element(f"h{level}", children=[self.create_text(content)])) + + def handle_hr(self, line: 'str'): + """Appends a horizontal rule to the DOM. + + Ends the current text block and inserts an
element into the document model. + """ + self.end_block() + self.dom.append(self.create_element("hr", {})) + + def handle_text(self, line: 'str'): + # Don't create text nodes for empty lines + """ + Buffers a line of text for paragraph formation. + + If the line is empty or contains only whitespace, it delegates to the line break handler. + Otherwise, it appends the line to an internal buffer to accumulate text for later + conversion into a paragraph element. + """ + if not line.strip(): + self.handle_br(line) + return + + # Buffer text content for paragraph handling + if self.buffer: + self.buffer += '\n' + line + else: + self.buffer = line + + def handle_list(self, text: 'list[str]', i: int, indent_level: int = 0) -> int: + """ + Parses a Markdown list from the provided text and appends it to the DOM. + + This function examines the text starting at the specified index to determine if it + represents an unordered or ordered list based on preset patterns. It aggregates list + items, supports nested lists by comparing indentation levels, and creates corresponding + list elements with their items. The constructed list is then added to the DOM, and the + function returns the number of lines processed. 
+ """ + if re.match(self.top_level_tags["ul"], text[i]): + list_elm = self.create_element("ul") + list_pattern = self.top_level_tags["ul"] + elif re.match(self.top_level_tags["ol"], text[i]): + list_elm = self.create_element("ol") + list_pattern = self.top_level_tags["ol"] + else: + return 0 + + current_item = [] + lines_processed = 0 + + while i + lines_processed < len(text): + line = text[i + lines_processed] + + if not line.strip(): + if current_item: + # Empty line in list item - treat as paragraph break + current_item.append("") + lines_processed += 1 + continue + + list_match = re.match(list_pattern, line) + if list_match: + indent = len(list_match.group(1)) + + if indent < indent_level: + # End of current list level + break + elif indent > indent_level: + # Nested list + nested_lines = lines_processed + self.handle_list(text[i + lines_processed:], 0, indent) + lines_processed += nested_lines + continue + + # Add previous item if exists + if current_item: + content = " ".join(current_item).strip() + if content: + list_elm["children"].append( + self.create_element("li", children=[self.create_text(content)]) + ) + + # Start new item + current_item = [list_match.group(2).strip()] + + elif not any(re.match(pattern, line) for pattern in self.top_level_tags.values()): + # Continuation of list item + current_item.append(line.strip()) + else: + break + + lines_processed += 1 + + # Add final item + if current_item: + content = " ".join(current_item).strip() + if content: + list_elm["children"].append( + self.create_element("li", children=[self.create_text(content)]) + ) + + self.dom.append(list_elm) + return lines_processed + + def handle_table(self, text: 'list[str]', i: int) -> int: + # First check if this is actually a table + # A proper table needs at least two rows (header and separator) + """ + Parse a Markdown table starting at a given index. 
+ + This method inspects the provided list of Markdown lines beginning at index i to + determine if they form a valid table structure. A valid table requires at least a + header row and a subsequent separator line. When detected, the method constructs + a table element with separate header () and body () sections, where + cells in the header are rendered as and those in the body as . If the + structure does not match a table, the line is processed as regular text. + + Args: + text: A list of Markdown text lines. + i: The starting index where table parsing should commence. + + Returns: + The number of lines processed as part of the table. + """ + if i + 1 >= len(text) or not re.match(self.top_level_tags["thead"], text[i + 1]): + # Not a table, treat as regular text + self.handle_text(text[i]) + return 1 + + lines_processed = 0 + table = self.create_element("table") + thead = self.create_element("thead") + tbody = self.create_element("tbody") + current_section = thead + + while i + lines_processed < len(text): + line = text[i + lines_processed] + + if not line.strip(): + break + + if re.match(self.top_level_tags["thead"], line): + # Alignment row - skip it but switch to tbody + current_section = tbody + lines_processed += 1 + continue + + if re.match(self.top_level_tags["tr"], line): + # Process table row + row = self.create_element("tr") + cells = [cell.strip() for cell in line.strip('|').split('|')] + + for cell in cells: + if current_section == thead: + cell_type = "th" + else: + cell_type = "td" + + row["children"].append( + self.create_element(cell_type, children=[self.create_text(cell.strip())]) + ) + + current_section["children"].append(row) + lines_processed += 1 + else: + break + + if thead["children"]: + table["children"].append(thead) + if tbody["children"]: + table["children"].append(tbody) + + self.dom.append(table) + return lines_processed + + def handle_title(self, line: 'str'): + """ + Processes a Markdown title line and updates the document head. 
+ + Finalizes any ongoing text block, extracts the title from the provided line using a + predefined pattern, and creates a head element containing a title element with the + extracted text. An assertion error is raised if the line does not match the expected title format. + """ + self.end_block() + match = re.match(self.top_level_tags["title"], line) + assert match is not None, "Title not found" + + title = match.group(1) + self.head = self.create_element("head", children=[self.create_element("title", children=[self.create_text(title)])]) + + def parse(self, markdown: 'str') -> 'ELEMENT': + """ + Parses Markdown text into a structured HTML element. + + Splits the input Markdown text into lines and processes each one to identify + block-level elements such as headers, blockquotes, code blocks, horizontal rules, + lists, tables, and titles. It buffers regular text for paragraph creation and + finalizes any pending content before assembling a DOM with head and body elements, + which is then returned as an HTML element. 
+ """ + self.reset() + lines = markdown.splitlines() + i = 0 + + while i < len(lines): + line = lines[i].strip() # Strip whitespace from each line + + # Empty line ends current block + if not line: + self.end_block() + i += 1 + continue + + # Check for block-level elements + if re.search(self.top_level_tags["h"], line): + self.end_block() + self.handle_h(line) + i += 1 + continue + + elif re.search(self.top_level_tags["blockquote"], line): + self.end_block() + lines_processed = self.handle_blockquote(lines, i) + i += lines_processed + 1 + continue + + elif re.search(self.top_level_tags["code"], "\n".join(lines[i:])): + self.end_block() + lines_processed = self.handle_code(lines[i:]) + i += lines_processed + 1 + continue + + elif re.search(self.top_level_tags["h"], line): + self.end_block() + self.handle_h(line) + i += 1 + continue + + elif re.search(self.top_level_tags["hr"], line): + self.end_block() + self.handle_hr(line) + i += 1 + continue + + elif re.search(self.top_level_tags["ul"], line) or re.search(self.top_level_tags["ol"], line): + self.end_block() + lines_processed = self.handle_list(lines, i) + i += lines_processed + continue + + elif re.search(self.top_level_tags["tr"], line): + self.end_block() + lines_processed = self.handle_table(lines, i) + i += lines_processed + continue + + elif re.search(self.top_level_tags["title"], line): + self.end_block() + self.handle_title(line) + i += 1 + continue + + elif re.search(self.top_level_tags["br"], line): + self.end_block() + lines_processed = self.handle_br(lines[i:]) + i += lines_processed + continue + + else: + # Regular text gets buffered for paragraph handling + self.handle_text(line) + i += 1 + + # End any remaining block + self.end_block() + + head = self.create_element("head") or self.head + body = self.create_element("body", children=self.dom) + + return self.create_element("html", children=[head, body]) \ No newline at end of file diff --git a/BetterMD/parse/typing.py b/BetterMD/parse/typing.py new 
file mode 100644 index 0000000..429121b --- /dev/null +++ b/BetterMD/parse/typing.py @@ -0,0 +1,28 @@ +import typing as t + +class TEXT(t.TypedDict): + type: t.Literal["text"] + content: str + name: t.Literal["text"] + +class ELEMENT(t.TypedDict): + type: 't.Literal["element"]' + name: 'str' + attributes: 'dict[str, str]' + children: 'list[t.Union[ELEMENT, TEXT]]' + +@t.runtime_checkable +class Parser(t.Protocol): + def parse(self, html:'str') -> 'list[ELEMENT]': """Parse an HTML string into a list of element structures. + +Parses the provided HTML markup and returns a list of ELEMENT dictionaries. +Each dictionary represents an HTML element with its tag name, attributes, and child +nodes, conforming to the ELEMENT TypedDict schema. + +Args: + html: A string containing the HTML markup to parse. + +Returns: + A list of ELEMENT dictionaries representing the parsed HTML structure. +""" +... \ No newline at end of file diff --git a/BetterMD/rst/custom_rst.py b/BetterMD/rst/custom_rst.py index c3fa565..8dab732 100644 --- a/BetterMD/rst/custom_rst.py +++ b/BetterMD/rst/custom_rst.py @@ -1,16 +1,57 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomRst(t.Generic[T]): +class CustomRst(t.Generic[T], ABC): prop = "" rst: 'dict[str, str]' = {} - def to_rst(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol', **kwargs) -> str: ... + @abstractmethod + def to_rst(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol') -> 'str': """ +Convert a symbol and its children into reStructuredText format. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +This abstract method converts the provided symbol into its corresponding reStructuredText +representation. The conversion may incorporate nested symbols from the inner list to form a +complete and structured output. - def verify(self, text) -> bool: ... 
\ No newline at end of file +Args: + inner: A list of child symbols to include in the conversion. + symbol: The symbol to be converted. + parent: The parent symbol providing contextual information. + +Returns: + A string representing the reStructuredText formatted output. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'): """ +Prepares contextual data for reStructuredText conversion. + +This method provides a hook for performing any necessary preparatory tasks prior to +conversion. Subclasses may override this method to initialize internal state or +modify the provided symbols based on their context. + +Parameters: + inner: A list of symbols representing inner content. + symbol: The symbol to be prepared. + parent: The parent symbol associated with the current symbol. +""" +... + + def verify(self, text) -> 'bool': """ +Verifies if the given text meets the required validation criteria. + +This method checks whether the provided text conforms to the expected rules or format. +It returns True when the text is valid, and False otherwise. + +Args: + text: The text string to validate. + +Returns: + bool: True if the text passes validation, False otherwise. +""" +... \ No newline at end of file