diff --git a/BetterMD/__init__.py b/BetterMD/__init__.py index 9e04a76..a5de000 100644 --- a/BetterMD/__init__.py +++ b/BetterMD/__init__.py @@ -1,10 +1,28 @@ -import logging -from .elements import A, H1, H2, H3, H4, H5, H6, Head, OL, UL, LI, Text, Div, P, Span, Img, B, I, Br, Blockquote, Hr, Table, Tr, Td, Th, THead, TBody, Input, Code -from .html import CustomHTML -from .markdown import CustomMarkdown -from .rst import CustomRst +from .elements import * +from .parse import Collection, HTMLParser, MDParser, RSTParser +def from_html(html:'str'): + """ + Converts an HTML string to a list of Symbol objects. + + This function processes the provided HTML content by calling the Symbol.from_html method and returns the resulting list of Symbols. + + Args: + html: The HTML content to convert. + + Returns: + A list of Symbol objects, one per element returned by the HTML parser. + """ + return Symbol.from_html(html) -def enable_debug_mode(): - logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger("BetterMD") +def from_md(md:'str'): + """ + Converts a Markdown string into a Symbol instance. + + This function processes a Markdown-formatted string by invoking the + Symbol.from_md method to generate a corresponding Symbol object. + + Args: + md (str): The Markdown content to be converted. 
+ """ + return Symbol.from_md(md) \ No newline at end of file diff --git a/BetterMD/elements/a.py b/BetterMD/elements/a.py index d7ea329..8811ddc 100644 --- a/BetterMD/elements/a.py +++ b/BetterMD/elements/a.py @@ -1,23 +1,109 @@ -from BetterMD.rst.custom_rst import CustomRst from .symbol import Symbol +from ..rst import CustomRst from ..markdown import CustomMarkdown -from ..html import CustomHTML +import re import typing as t -class MD(CustomMarkdown['A']): - def to_md(self, inner, symbol, parent, **kwargs): - return f"[{" ".join([e.to_md(**kwargs) for e in inner])}]({symbol.get_prop("href")})" +if t.TYPE_CHECKING: + from ..parse import Collection + +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts inner elements and a symbol's href property into a Markdown link. + + This method constructs a Markdown link by joining the Markdown representations + of each element in the inner list as the link text and using the 'href' property + (from symbol.get_prop("href")) as the link destination. + + Args: + inner: A list of elements that implement a to_md() method. + symbol: An object providing a 'href' value via its get_prop() method. + parent: Unused parameter that exists for interface compatibility. + + Returns: + A string representing a Markdown link. + """ + return f"[{" ".join([e.to_md() for e in inner])}]({symbol.get_prop("href")})" + + def verify(self, text:'str'): + """ + Check if the given text contains Markdown link patterns. + + This method searches for inline links ([text](url)), automatic links (<url>), + and reference links ([text][ref] with corresponding definitions) in the input text. + It returns True if any of these patterns is detected; otherwise, it returns False. + + Args: + text: The text to analyze for Markdown link formats. + + Returns: + True if a valid link format is found, False otherwise. 
+ """ + if re.findall("\[([^\]]+)\]\((https?:\/\/[^\s)]+)\)", text): + # Case 1: Inline link + return True + + elif re.findall("<(https?:\/\/[^\s>]+)>", text): + # Case 2: Automatic Links + return True + + elif re.findall("\[([^\]]+)\]\[([^\]]+)\]\s*\n?\[([^\]]+)\]:\s*(https?:\/\/[^\s]+)", text): + # Case 3: Reference Links + return True + + return False -class HTML(CustomHTML['A']): - def to_html(self, inner, symbol, parent, **kwargs): - return f"{" ".join([e.to_html(**kwargs) for e in inner])}" class RST(CustomRst['A']): - def to_rst(self, inner, symbol, parent, **kwargs): - return f"`{' '.join([e.to_rst(**kwargs) for e in inner])} <{symbol.get_prop('href')}>`_" + def to_rst(self, inner, symbol, parent): + """ + Converts inner elements to an RST-formatted hyperlink. + + This method concatenates the RST representations of the provided inner elements, + retrieves the 'href' property from the symbol, and returns a formatted RST link in the + form: `inner_text <href>`_. + + Note: The parent parameter is included for interface compatibility but is not used. + """ + return f"`{' '.join([e.to_rst() for e in inner])} <{symbol.get_prop('href')}>`_" class A(Symbol): prop_list = ["href"] + + refs = {} md = MD() - html = HTML() - rst = RST() \ No newline at end of file + html = "a" + rst = RST() + + @classmethod + def md_refs(cls, references: 'list[str]' = None): + """ + Registers Markdown references for the symbol. + + This class method is a placeholder for future integration of Markdown references. + If provided, the optional 'references' parameter should be a list of strings that + represent reference identifiers. No operation is performed in the current implementation. + """ + pass + + @classmethod + def rst_refs(cls, references: 'list[str]' = None): + """ + Processes reStructuredText references for the symbol. + + This class method serves as a placeholder for future functionality to handle + reStructuredText-specific link references. 
An optional list of reference strings + may be provided for processing. + """ + pass + + @classmethod + def html_refs(cls, references: 'list[str]' = None): + """ + Handles HTML references for the symbol. + + If a list of reference strings is provided, they may be processed or registered. + Currently, this method is a placeholder with no implemented functionality. + """ + pass \ No newline at end of file diff --git a/BetterMD/elements/code.py b/BetterMD/elements/code.py index 26ba46c..b2b3874 100644 --- a/BetterMD/elements/code.py +++ b/BetterMD/elements/code.py @@ -2,34 +2,115 @@ from .text import Text from ..markdown import CustomMarkdown from ..html import CustomHTML +from ..rst import CustomRst -class MD(CustomMarkdown['Code']): - def to_md(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert content to Markdown code. - content = " ".join([e.to_md(**kwargs) for e in inner]) + Transforms the provided content into its Markdown representation. If a language is specified + via the symbol or the content contains newline characters, the content is wrapped in a fenced + code block with the optional language identifier; otherwise, it is formatted as inline code. + If the input is a Text instance, it is first converted to Markdown. + + Returns: + str: The Markdown-formatted code. 
+ """ + language = symbol.get_prop("language", "") + if isinstance(inner, Text): + inner = inner.to_md() # If it's a code block (has language or multiline) if language or "\n" in inner: - return f"```{language}\n{content}\n```\n" + return f"```{language}\n{inner}\n```\n" # Inline code - return f"`{content}`" + return f"`{inner}`" class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") + def to_html(self, inner, symbol, parent): + """ + Converts inner elements to an HTML code element with optional syntax highlighting. - content = " ".join([e.to_html(**kwargs) for e in inner]) + Joins the HTML representations of each inner element with newline characters and + wraps the result in a <code> tag. If a programming language is specified in the symbol's + properties, the code element is assigned a language-specific class. + """ + language = symbol.get_prop("language", "") + inner = "\n".join([i.to_html() for i in inner]) if language: - return f'
{content}
' + return f'{inner}' + + return f"{inner}" + + def verify(self, text: str) -> bool: + """ + Verifies that the input text equals "code" in a case-insensitive manner. + + Args: + text: The string to validate against the keyword "code". + + Returns: + True if the lowercase version of text is "code", otherwise False. + """ + return text.lower() == "code" + +class RST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Convert content to a reStructuredText format. - return f"{content}" + This method transforms the provided content into reStructuredText syntax suited for code + representation. It extracts an optional programming language from the symbol and processes the + inner content accordingly. When a language is specified or the content spans multiple lines, it + formats the content as an indented code block—using a language-specific code-block directive if + available or a literal block if not. Otherwise, the content is formatted as inline code with backticks + properly escaped if necessary. + + Parameters: + inner: The content to convert, which may be a list of elements or a single element. Each item is + either an object with its own RST conversion method or is convertible to a string. + symbol: An object that provides properties (including a "language" attribute) for determining + code block formatting. + parent: A contextual parameter that is part of the interface but is not used in this conversion. + + Returns: + A string containing the content formatted in reStructuredText. 
+ """ + language = symbol.get_prop("language", "") + + # Handle inner content + if isinstance(inner, list): + content = "".join([ + i.to_rst() if isinstance(i, Symbol) else str(i) + for i in inner + ]) + else: + content = inner.to_rst() if isinstance(inner, Symbol) else str(inner) + + # If it's a code block (has language or multiline) + if language or "\n" in content: + # Use code-block directive for language-specific blocks + if language: + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f".. code-block:: {language}\n\n{indented_content}\n\n" + + # Use simple literal block for language-less blocks + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f"::\n\n{indented_content}\n\n" + + # Inline code + # Escape backticks if they exist in content + if "`" in content: + return f"``{content}``" + return f"`{content}`" class Code(Symbol): - prop_list = ["language"] html = HTML() md = MD() - rst = "``" + rst = RST() nl = True \ No newline at end of file diff --git a/BetterMD/elements/input.py b/BetterMD/elements/input.py index 4cf9e4b..61b8354 100644 --- a/BetterMD/elements/input.py +++ b/BetterMD/elements/input.py @@ -3,32 +3,48 @@ from ..markdown import CustomMarkdown from ..rst import CustomRst -class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - # Collect all input attributes - attrs = [] - for prop in Input.props: - value = symbol.get_prop(prop) - if value: - # Handle boolean attributes like 'required', 'disabled', etc. - if isinstance(value, bool) and value: - attrs.append(prop) - else: - attrs.append(f'{prop}="{value}"') - - attrs_str = " ".join(attrs) - return f"" - class MD(CustomMarkdown): - def to_md(self, inner, symbol, parent, **kwargs): + def to_md(self, inner, symbol, parent): + """ + Convert an input element to its Markdown representation. 
+ + If the symbol represents a checkbox, returns a Markdown-formatted checklist item + with an 'x' when checked (or a space when unchecked) followed by the rendered inner content. + For other input types, the element’s HTML representation is returned. + + Parameters: + inner: An object with a to_md() method that renders inner content. + symbol: An element descriptor whose 'type' property determines rendering; if its 'type' + is "checkbox", the 'checked' property is used to indicate its state. + parent: The parent element context (unused in this conversion). + + Returns: + A string containing either the Markdown or HTML representation of the input element. + """ if symbol.get_prop("type") == "checkbox": - return f"- [{'x' if symbol.get_prop('checked', '') else ''}] {inner.to_md()}" + return f"- [{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_md()}" return symbol.to_html() class RST(CustomRst): - def to_rst(self, inner, symbol, parent, **kwargs): + def to_rst(self, inner, symbol, parent): + """ + Return a reStructuredText representation of a checkbox input element. + + If the symbol's "type" property is "checkbox", formats a checkbox with an "x" + if checked (or a space if not) and appends any nested content rendered via its + to_rst method. For input types other than checkbox, returns an empty string. + + Args: + inner: An optional element to be rendered in RST, if provided. + symbol: The input element symbol whose properties determine formatting. + parent: The parent element; not used in this conversion. + + Returns: + A string with the RST representation of the checkbox input, or an empty + string. 
+ """ if symbol.get_prop("type") == "checkbox": - return f"[ ] {inner.to_rst() if inner else ''}" + return f"[{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_rst() if inner else ''}" return "" # Most input types don't have RST equivalents class Input(Symbol): @@ -50,6 +66,6 @@ class Input(Symbol): "multiple", "step" ] - html = HTML() + html = "input" md = MD() rst = RST() \ No newline at end of file diff --git a/BetterMD/elements/symbol.py b/BetterMD/elements/symbol.py index a01e96e..97af5be 100644 --- a/BetterMD/elements/symbol.py +++ b/BetterMD/elements/symbol.py @@ -1,72 +1,90 @@ import typing as t -import logging from ..markdown import CustomMarkdown from ..html import CustomHTML from ..rst import CustomRst - -T = t.TypeVar("T", default=t.Any) -T2 = t.TypeVar("T2", default=t.Any) -logger = logging.getLogger("BetterMD") - -class List(list, t.Generic[T]): - def on_set(self, key, value): ... - - def on_ammend(self, object: 'T'): ... - - - def append(self, object: 'T') -> 'None': - self.on_ammend(object) - return super().append(object) - - def get(self, index, default:'T2'=None) -> 't.Union[T, T2]': - try: - return self[index] - except IndexError: - return default - - def __setitem__(self, key, value): - self.on_set(key, value) - return super().__setitem__(key, value) - - def __getitem__(self, item) -> 'T': - return super().__getitem__(item) - - def __iter__(self) -> 't.Iterator[T]': - return super().__iter__() +from ..parse import HTMLParser, MDParser, RSTParser, ELEMENT, TEXT, Collection class Symbol: styles: 'dict[str, str]' = {} classes: 'list[str]' = [] - html: 't.Union[str, CustomHTML, CustomHTML[Symbol]]' = "" - props: 'dict[str, t.Union[str, list[str], dict[str, str]]]' = {} + html: 't.Union[str, CustomHTML]' = "" + props: 'dict[str, str]' = {} prop_list: 'list[str]' = [] vars:'dict[str,str]' = {} - children:'List[Symbol]' = List() - md: 't.Union[str, CustomMarkdown, CustomMarkdown[Symbol], None]' = None - rst: 't.Union[str, CustomRst, 
CustomRst[Symbol], None]' = None + children:'list[Symbol]' = [] + md: 't.Union[str, CustomMarkdown]' = "" + rst: 't.Union[str, CustomRst]' = "" parent:'Symbol' = None prepared:'bool' = False nl:'bool' = False html_written_props = "" - def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], dom:'bool'=True, inner:'list[Symbol]'=[], **props): - logger.debug(f"Creating new Symbol with {styles=} {classes=} {dom=} {inner=} {props=}") + collection = Collection() + html_parser = HTMLParser() + md_parser = MDParser() + + def __init_subclass__(cls, **kwargs) -> None: + """ + Automatically registers a new symbol subclass in the global collection. + + This special method is called when a new subclass is defined. It adds the + subclass to the collection by invoking its add_symbols method and then delegates + further subclass initialization to the superclass. + """ + cls.collection.add_symbols(cls) + super().__init_subclass__(**kwargs) + + def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=[], **props): + """ + Initialize a Symbol instance with optional styles, classes, children, and properties. + + This constructor sets up the symbol with provided CSS styles, class names, and a list + of child Symbol instances (stored in the children attribute). Any additional keyword + arguments are stored as properties. + + Args: + styles (dict[str, str]): Optional mapping of CSS property names to values. + classes (list[str]): Optional list of CSS class names. + inner (list[Symbol]): Optional list of child Symbol instances. + **props: Additional properties to be associated with the symbol. 
+ """ self.styles = styles self.classes = classes - self.children = List(inner) or List() + self.children = list(inner) or [] self.props = props - self.dom = dom - + + def copy(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=None): + """ + Creates a copy of the current symbol with merged styles, custom classes, and inner symbols. + + The provided styles dictionary is updated with the symbol's own styles so that any key in the + original symbol takes precedence. The classes and inner parameters are used to set the new symbol's + CSS classes and child symbols, with inner defaulting to an empty list if not provided. + + Args: + styles: A dictionary of CSS styles to merge with the symbol's existing styles. + classes: A list of CSS class names for the new symbol. + inner: An optional list of child Symbol instances to include in the copy. Defaults to an empty list. + + Returns: + A new Symbol instance with updated styles, classes, and inner symbols. + """ if inner == None: - inner = [Symbol()] + inner = [] styles.update(self.styles) return Symbol(styles, classes, inner = inner) - - + + def set_parent(self, parent:'Symbol'): + """ + Sets the parent of this symbol and registers it as a child. + + Assigns the provided Symbol instance as the parent of the current symbol and + adds this symbol to the parent's list of children. + """ self.parent = parent self.parent.add_child(self) @@ -81,83 +99,235 @@ def remove_child(self, symbol:'Symbol'): self.children.remove(symbol) def has_child(self, child:'type[Symbol]'): + """ + Checks if a child symbol of the specified type exists. + + Iterates over the symbol's children and returns the first instance that is of the given type. + If no such child is found, returns False. + + Parameters: + child: A subclass of Symbol to look for among the children. + + Returns: + The first child that is an instance of the specified type, or False if none exists. 
+ """ for e in self.children: if isinstance(e, child): return e - + return False - def prepare(self, parent:'t.Union[Symbol, None]'=None, *args, **kwargs): - self.prepared = True - self.parent = parent + def prepare(self, parent:'Symbol'): + """ + Prepare the symbol and its children for processing. - [symbol.prepare(self, *args, **kwargs) for symbol in self.children] + Marks the symbol as prepared, assigns the provided symbol as its parent, and + recursively prepares each child by setting the current symbol as their parent. + Args: + parent: The new parent symbol for this symbol. + + Returns: + The prepared symbol instance. + """ + self.prepared = True + self.parent = parent + for symbol in self.children: + symbol.prepare(self) + return self def replace_child(self, old:'Symbol', new:'Symbol'): + """ + Replaces a child Symbol with a new Symbol. + + Finds the first occurrence of the specified old child in the parent's children list, + removes it, and assigns the new child to the position immediately preceding the removed + child's original index. Note that if the old child is the first element, the new child + will replace the last element. + + Args: + old: The child Symbol instance to be replaced. + new: The replacement Symbol instance. + """ i = self.children.index(old) self.children.remove(old) self.children[i-1] = new - - def to_html(self) -> 'str': - if not self.prepared: - self.prepare() + + def to_html(self, indent=1) -> 'str': + """ + Converts the symbol to an HTML string. + + Generates an HTML representation of the symbol, including its attributes and child + elements. If the symbol's HTML attribute is an instance of a custom HTML handler, that + handler’s conversion is used. Otherwise, an HTML tag is constructed with any associated + CSS classes, inline styles, and additional properties, and its child symbols are rendered + recursively. The indent parameter adjusts the indentation level for nested elements. 
+ + Args: + indent: The current indentation level for formatting nested elements (default is 1). + Returns: + A string containing the HTML representation of the symbol. + """ if isinstance(self.html, CustomHTML): return self.html.to_html(self.children, self, self.parent) + + inner_HTML = f"\n{" "*indent}".join([e.to_html(indent+1) if not (len(self.children) == 1 and self.children[0].html == "text") else e.to_html(0) for e in self.children]) + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{inner_HTML}{"\n" if len(self.children) > 1 else ""}" if inner_HTML else f" />"}" + + def to_md(self) -> 'str': + """ + Convert the symbol and its children to a Markdown string. 
- props = [] - for prop, value in self.props.items(): - if isinstance(value, list): - props.append(f"{prop}={'"'}{' '.join(value)}{'"'}") - elif isinstance(value, dict): - props.append(f"{prop}={'"'}{' '.join([f'{k}:{v}' for k,v in value.items()])}{'"'}") - else: - props.append(f"{prop}={value}") - - inner_HTML = "\n".join([e.to_html() for e in self.children]) - logger.debug(f"{inner_HTML=} {self.html=} {self.classes=} {self.styles=} {props=}") - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(props)}>{inner_HTML}" - - def to_md(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) + If the symbol's Markdown attribute is a CustomMarkdown instance, its custom + conversion method is used with the symbol's children, self, and parent. + Otherwise, the base Markdown content is concatenated with the Markdown + outputs of its children, and a newline is appended if the newline flag is set. + Returns: + A string containing the Markdown representation of the symbol. + """ if isinstance(self.md, CustomMarkdown): - return self.md.to_md(self.children, self, self.parent, **kwargs) - - if self.md == None: - return self.to_html(**kwargs) - - inner_md = " ".join([e.to_md() for e in self.children]) - return f"{self.md} {inner_md}" + ("\n" if self.nl else "") - - def to_rst(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) + return self.md.to_md(self.children, self, self.parent) + + inner_md = "".join([e.to_md() for e in self.children]) + return f"{self.md}{inner_md}" + ("\n" if self.nl else "") + def to_rst(self) -> 'str': + """ + Converts the symbol and its children to a reStructuredText (RST) string. + + If the symbol's `rst` attribute is a CustomRst instance, the conversion is delegated to its + `to_rst` method with the symbol’s children, itself, and its parent. 
Otherwise, the method + concatenates the RST representations of its children, wrapping them with the symbol's `rst` + string and appending a newline. + """ if isinstance(self.rst, CustomRst): return self.rst.to_rst(self.children, self, self.parent) - - if self.rst == None: - return f".. raw:: html\n\n{" ".join(self.to_html().splitlines())}\n" - + inner_rst = " ".join([e.to_rst() for e in self.children]) return f"{self.rst}{inner_rst}{self.rst}\n" - - def get_prop(self, prop, default="") -> 't.Union[str, list[str], dict[str, str]]': + + @classmethod + def from_html(cls, text:'str') -> 'list[Symbol]': + """ + Parses an HTML string and returns a list of Symbol instances. + + This method utilizes the class-level HTML parser to convert the provided HTML text into a list of element dictionaries. For each element, it finds the corresponding Symbol from the collection (raising errors if not found) and parses it into a Symbol instance. + + Args: + text: A string containing the HTML content to be parsed. + + Returns: + A list of Symbol instances corresponding to the parsed HTML elements. + """ + parsed = cls.html_parser.parse(text) + return [cls.collection.find_symbol(elm['name'] , raise_errors=True).parse(elm) for elm in parsed] + + @classmethod + def parse(cls, text:'ELEMENT') -> 'Symbol': + """ + Parses an ELEMENT dictionary into a Symbol instance. + + Processes a structured dictionary representing an element by extracting inline + styles from the "style" attribute and CSS class names from the "class" attribute. + Recursively parses child elements and text nodes via the symbol collection, + passing remaining attributes to the Symbol constructor. + + Args: + text: A dictionary representing an ELEMENT with keys such as "attributes", + "children", "type", "name", and "content". + + Returns: + A Symbol instance corresponding to the parsed element. 
+ """ + def handle_element(element:'ELEMENT|TEXT') -> 'Symbol': + if element['type'] == 'text': + text = cls.collection.find_symbol("text", raise_errors=True) + assert text is not None, "`collection.find_symbol` is broken" + + return text(element['content']) + + symbol_cls = cls.collection.find_symbol(element['name'], raise_errors=True) + assert symbol_cls is not None, "`collection.find_symbol` is broken" + + return symbol_cls.parse(element) + + styles = {s.split(":")[0]: s.split(":")[1] for s in text["attributes"].pop("style", "").split(";") if ":" in s} + classes = list(filter(lambda c: bool(c), text["attributes"].pop("class", "").split(" "))) + + return cls(styles, classes, inner=[handle_element(elm) for elm in text["children"]], **text["attributes"]) + + @classmethod + def from_md(cls, text: str) -> 'Symbol': + """ + Creates a Symbol instance from a Markdown string. + + Parses the provided Markdown text using the class's Markdown parser to produce an + intermediate representation of the symbol. It then uses the 'name' field from the parsed + data to look up the corresponding symbol in the collection and constructs a Symbol instance + by parsing the intermediate representation. + + Args: + text: A Markdown formatted string representing a symbol element. + + Returns: + A Symbol instance constructed from the parsed Markdown data. + """ + parsed = cls.md_parser.parse(text) + return cls.collection.find_symbol(parsed['name'], raise_errors=True).parse(parsed) + + + + def get_prop(self, prop, default="") -> 'str': + """ + Retrieves a property value from the symbol's properties. + + Looks up the specified key in the symbol's properties dictionary and returns its + value. If the key is not found, the provided default value is returned. + + Args: + prop: The name of the property to retrieve. + default: The fallback value if the property key is absent (defaults to an empty string). + + Returns: + The value associated with the key as a string. 
+ """ return self.props.get(prop, default) - def set_prop(self, prop:'str', value:'t.Union[str, list[str], dict[str, str]]'): + def set_prop(self, prop, value): self.props[prop] = value def __contains__(self, item): + """ + Determines whether a child symbol is present based on the specified condition. + + If the given item is callable (typically a type), the method returns True if + any child is an instance of that callable. Otherwise, it performs a direct + membership test against the children. + + Args: + item: A child symbol instance or a callable (usually a type) to check + against each child. + + Returns: + bool: True if the condition is met; otherwise, False. + """ if callable(item): return any(isinstance(e, item) for e in self.children) return item in self.children def __str__(self): - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(self.props)}/>" + """ + Return a string representation of the symbol as an HTML element. + + Formats the symbol into an HTML-like string that includes its tag name and conditionally + adds CSS classes, inline styles, and additional properties. Also indicates the number of + child symbols, with extra line breaks when more than one child is present. 
+ """ + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{"\n" if len(self.children) > 1 else ""}{len(self.children)}"}" + + __repr__ = __str__ \ No newline at end of file diff --git a/BetterMD/elements/table.py b/BetterMD/elements/table.py index 6576662..896762f 100644 --- a/BetterMD/elements/table.py +++ b/BetterMD/elements/table.py @@ -1,393 +1,295 @@ -from .symbol import Symbol, List +from .symbol import Symbol from ..markdown import CustomMarkdown from ..rst import CustomRst from .h import H1, H2, H3, H4, H5, H6 from .text import Text -import logging -import typing as t +import itertools as it -if t.TYPE_CHECKING: - # Wont be imported at runtime - import pandas as pd # If not installed, will not affedt anything at runtime - -logger = logging.getLogger("BetterMD") - -class TrMD(CustomMarkdown['Tr']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Tr element to Markdown") - contents = "\n".join([e.to_md() for e in inner]) - split_content = contents.splitlines() - logger.debug(f"Split content: {split_content}") - ret = f"| {" | ".join(split_content)} |" - return ret - - -class THeadMD(CustomMarkdown['THead']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - md = [] - for child in symbol.head.children: - e = child.to_md() - - md.append({"len":len(e), "style":child.styles.get("text-align", "justify")}) - - def parse_md(data: 'dict') -> 'str': - start = " :" if data["style"] in ["left", "center"] else " " - middle = "-"*(data["len"]-2) if 
data["style"] == "center" else "-"*(data["len"]-1) if data["style"] in ["left", "right"] else "-"*(data["len"]) - end = ": " if data["style"] in ["right", "center"] else " " - - return f"{start}{middle}{end}" - - return f"{inner[0].to_md()}\n|{"|".join([parse_md(item) for item in md])}|" +class TableMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Generate a Markdown representation of a table. -class TBodyMD(CustomMarkdown['TBody']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - content = [e.to_md() for e in inner if isinstance(e, Tr)] - logger.debug(f"TBody conent: {content}") - return "\n".join(content) - -class TdMD(CustomMarkdown['Td']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) - - length = len(max(symbol.table.cols[symbol.header], key=len).data) - logger.debug(f"Td length: {len(symbol)}") - logger.debug(f"Column length: {length}") - return " ".join([e.to_md() for e in inner]).center(length) - -class ThMD(CustomMarkdown['Th']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) + Processes provided elements by converting header (THead) and body (TBody) sections into + their Markdown representations and concatenates them with appropriate line breaks. + If a header is present, it is rendered first, followed by any body rows. - width = len(max(symbol.table.cols[symbol.header], key=len).data) - + Returns: + str: The combined Markdown formatted string representing the table. 
+ """ + result = [] + thead_content = "" + tbody_rows = [] - if symbol.data == "": - return "".center(width) + # Process inner elements + for section in inner: + if isinstance(section, THead): + thead_content = section.to_md() + elif isinstance(section, TBody): + tbody_content = section.to_md() + if tbody_content: + tbody_rows.append(tbody_content) - return f"**{" ".join([e.to_md() for e in inner]).center(width)}**" - -class TableMD(CustomMarkdown['Table']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Table element to Markdown") - head = symbol.head.to_md() if symbol.head else None - body = symbol.body.to_md() - - logger.debug(f"Table conversion complete. Has header: {head is not None}") - return f"{f"{head}\n" if head else ""}{body}" - - -class TableRST(CustomRst['Table']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting Table element to RST") - head = symbol.head.to_rst() if symbol.head else None - body = symbol.body.to_rst() - - return f"{f"{head}\n" if head else ""}{body}" - -class THeadRST(CustomRst['THead']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting THead element to RST") - logger.debug(f"THead has {len(inner)} children: {[e.to_rst() for e in inner]}") - top = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.head.children] - content = "\n".join([e.to_rst() for e in inner]) - return f"+-{"-+-".join([t*"-" for t in top])}-+\n{content}\n+={"=+=".join([t*"=" for t in top])}=+" - -class TBodyRST(CustomRst['TBody']): - def to_rst(self, inner, symbol, parent, **kwargs): - bottom = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.table.head.head.children] - return f'{f"\n+-{"-+-".join(["-"*b for b in bottom])}-+\n".join([e.to_rst() for e in inner if isinstance(e, Tr)])}\n+-{"-+-".join(["-"*b for b in bottom])}-+' - -class TrRST(CustomRst['Tr']): - def to_rst(self, inner, symbol, parent, 
**kwargs): - return f'| {" |\n| ".join(" | ".join([e.to_rst() for e in inner]).split("\n"))} |' - - -class TdRST(CustomRst['Td']): - def to_rst(self, inner, symbol, parent, **kwargs): - content = " ".join([e.to_rst() for e in inner]) - width = len(max(symbol.table.cols[symbol.header], key=len).data) - return content.center(width) + # Combine all parts + if thead_content: + result.append(thead_content) + + if tbody_rows: + result.append("\n".join(tbody_rows)) + + return "\n".join(result) + +class TableRST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Generates a reStructuredText representation of a table. + + This method processes a sequence of table sections (headers and bodies) to compute + consistent column widths and generate a formatted table with borders and row separators. + It iterates over header and body sections to determine the spacing for each column and + then renders the final table. Returns an empty string if no valid table rows are found. + + Args: + inner: A list of table section symbols (THead or TBody) containing table rows. + symbol: The current table symbol (not used in the RST formatting). + parent: The parent symbol of the current table element (not used in the RST formatting). + + Returns: + A string representing the table in reStructuredText format. 
+ """ + if not inner: + return "" + + # First pass: collect all cell widths from both thead and tbody + col_widths = [] + all_rows = [] + + for section in inner: + if isinstance(section, THead) or isinstance(section, TBody): + for row in section.children: + cells = [cell.to_rst() for cell in row.children] + all_rows.append((cells, isinstance(section, THead))) + + # Update column widths + if not col_widths: + col_widths = [len(cell) for cell in cells] + else: + col_widths = [max(old, len(new)) for old, new in zip(col_widths, cells + [''] * (len(col_widths) - len(cells)))] + + if not all_rows: + return "" + + # Second pass: generate RST with consistent widths + result = [] + + # Top border + top_border = "+" + "+".join(["-" * (width + 2) for width in col_widths]) + "+" + result.append(top_border) + + for i, (cells, is_header) in enumerate(all_rows): + # Create row with proper spacing using consistent column widths + row = "| " + " | ".join(cell.ljust(width) for cell, width in zip(cells, col_widths)) + " |" + result.append(row) + + # Add separator after each row + if is_header: + separator = "+" + "+".join(["=" * (width + 2) for width in col_widths]) + "+" + else: + separator = "+" + "+".join(["-" * (width + 2) for width in col_widths]) + "+" + result.append(separator) + + return "\n".join(result) -class ThRST(CustomRst['Th']): - def to_rst(self, inner, symbol, parent, **kwargs): - content = " ".join([e.to_rst() for e in inner]) - width = len(max(symbol.table.cols[symbol.header], key=len).data) - if content == "": - return "".center(width) - return f"**{content}**".center(width) +class THeadMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Generate markdown for table header rows and a separator. + + This method iterates over the provided row elements, converting each child cell + to its markdown representation while determining the maximum column widths for + consistent formatting. 
It then constructs each row with pipe delimiters and appends + a final separator row composed of dashes. Returns an empty string if no rows are provided. + + Args: + inner: A collection of row elements, each containing cells to be formatted. + symbol: Unused parameter representing the current symbol. + parent: Unused parameter representing the parent element. + + Returns: + A markdown formatted string representing the table header rows followed by a separator, + or an empty string if there are no rows. + """ + if not inner: + return "" + + rows = [] + widths = [] + + # First pass: collect all rows and calculate column widths + for row in inner: + row_cells = [cell.to_md() for cell in row.children] + if not widths: + widths = [len(cell) for cell in row_cells] + else: + widths = [max(old, len(new)) for old, new in zip(widths, row_cells)] + rows.append(row_cells) + + if not rows: + return "" + + # Second pass: generate properly formatted markdown + result = [] + for row_cells in rows: + row = "|" + "|".join(row_cells) + "|" + result.append(row) + + # Add separator row + separator = "|" + "|".join(["-" * width for width in widths]) + "|" + result.append(separator) + + return "\n".join(result) +class THeadRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """ + Placeholder for RST conversion. + + This method does not perform any conversion and simply returns an empty string, + as reStructuredText rendering is delegated to the TableRST class. + """ + return "" + +class TBodyMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts a list of row elements to Markdown. + + Iterates over each element in the provided inner list, calling its to_md method, + and joins the resulting strings with newline characters. Returns an empty string + if no inner elements are provided. 
+ """ + if not inner: + return "" + + rows = [] + for row in inner: + rows.append(row.to_md()) + + return "\n".join(rows) +class TrMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts a list of cell elements into a Markdown table row. + + Iterates over each cell in `inner`, calling its `to_md()` method, and joins the resulting + strings with pipe characters. A leading and trailing pipe are added to complete the row format. + + Args: + inner: A list of cell objects where each object provides a Markdown representation. + symbol: Represents the current row's symbol (provided for interface consistency; unused). + parent: The parent element in the document structure (provided for interface consistency; unused). + + Returns: + A string formatted as a Markdown table row. + """ + cells = [cell.to_md() for cell in inner] + return f"|{'|'.join(cells)}|" + +class TrRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """ + Return an empty string because RST conversion is handled by TableRST. + + This placeholder method exists solely to fulfill the interface and does not perform any conversion. + """ + return "" + +class TdMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert inner elements to a Markdown string. + + Processes each element in the provided inner list by invoking its to_md method + and joining the resulting strings with a space. The symbol and parent parameters + are included to comply with the interface but are not used in the conversion. + + Returns: + str: The concatenated Markdown output of all inner elements. + """ + return " ".join([e.to_md() for e in inner]) + +class TdRST(CustomRst): + def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol) -> str: + """ + Generate an RST representation for cell content. + + Converts a list of inner symbols into an RST-formatted string. Returns an empty + string if no symbols are provided. 
When more than one symbol is present or when the + single symbol is not a standard text or header type, their RST outputs are joined + with spaces as a fallback mechanism. Otherwise, returns the RST output of the single + symbol. + + Note: + The parameters 'symbol' and 'parent' are included for interface consistency. + """ + if not inner: + return "" + + if len(inner) > 1 or not isinstance(inner[0], (Text, H1, H2, H3, H4, H5, H6)): + return " ".join([e.to_rst() for e in inner]) # Fallback to join instead of raising error + return inner[0].to_rst() + +class ThRST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Renders a collection of elements as a reStructuredText string. + + Iterates over each element in the provided inner list, calling its to_rst() method, + and concatenates the results with a space as the separator. + """ + return " ".join([e.to_rst() for e in inner]) + +class TBodyRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """ + Return an empty reStructuredText string. + + This placeholder method returns an empty string because the conversion of + these elements is delegated to the TableRST class. 
+ """ + return "" class Table(Symbol): html = "table" md = TableMD() rst = TableRST() - head:'THead' = None - body:'TBody' = None - - cols: 'dict[Th, list[Td]]' = {} - headers: 'list[Th]' = [] - - def to_pandas(self): - if not self.prepared: - self.prepare() - - logger.debug("Converting Table to pandas DataFrame") - try: - import pandas as pd - df = pd.DataFrame([e.to_pandas() for e in self.body.children], columns=self.head.to_pandas()) - logger.debug(f"Successfully converted table to DataFrame with shape {df.shape}") - return df - except ImportError: - logger.error("pandas not installed - tables extra required") - raise ImportError("`tables` extra is required to use `to_pandas`") - except Exception as e: - logger.error(f"Error converting table to pandas: {str(e)}") - raise - - @classmethod - def from_pandas(cls, df:'pd.DataFrame'): - logger.debug(f"Creating Table from pandas DataFrame with shape {df.shape}") - try: - import pandas as pd - self = cls() - head = THead.from_pandas(list(df.columns)) - body = TBody.from_pandas(df) - - self.head = head - self.body = body - - self.add_child(head) - self.add_child(body) - - logger.debug("Successfully created Table from DataFrame") - logger.debug(f"Table has {len(self.head.children)} columns and {len(self.body.children)} rows with shape {df.shape}") - logger.debug(f"Table head: {self.head.to_pandas()}") - logger.debug(f"Table body: {[e.to_list() for e in self.body.children]}") - return self - except ImportError: - logger.error("pandas not installed - tables extra required") - raise ImportError("`tables` extra is required to use `from_pandas`") - except Exception as e: - logger.error(f"Error creating table from pandas: {str(e)}") - raise - - def prepare(self, parent = None, *args, **kwargs): - return super().prepare(parent, table=self, *args, **kwargs) - -class THead(Symbol): - html = "thead" - rst = THeadRST() - md = THeadMD() - - table:'Table' = None - children:'List[Tr]' = List() - - head:'Tr' = None - - - def 
to_pandas(self) -> 'list[str]': - return self.to_list() - - def to_list(self) -> 'list[str]': - if not self.prepared: - self.prepare() - - return self.children[0].to_list() - - @classmethod - def from_pandas(cls, data:'list[str]'): - return cls.from_list(data) - - @classmethod - def from_list(cls, data:'list[str]'): - self = cls() - tr = Tr.from_list(data) - self.add_child(tr) - - return self - - def prepare(self, parent = None, table=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.table.head = self - return super().prepare(parent, table=table, head=True, *args, **kwargs) - -class TBody(Symbol): - html = "tbody" - rst = TBodyRST() - md = TBodyMD() - - table:'Table' = None - children:'List[Tr]' = List() - - def to_pandas(self): - if not self.prepared: - self.prepare() - - logger.debug("Converting TBody to pandas format") - data = [e.to_pandas() for e in self.children] - logger.debug(f"Converted {len(data)} rows from TBody") - return data - - @classmethod - def from_pandas(cls, df:'pd.DataFrame'): - logger.debug(f"Creating TBody from DataFrame with {len(df)} rows") - try: - import pandas as pd - self = cls() - - for i, row in df.iterrows(): - tr = Tr.from_pandas(row) - self.children.append(tr) - logger.debug(f"Added row {i} to TBody") - - return self - except ImportError: - logger.error("pandas not installed - tables extra required") - raise ImportError("`tables` extra is required to use `from_pandas`") - - def prepare(self, parent = None, table=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.table.body = self - return super().prepare(parent, table=table, head=False, *args, **kwargs) + nl = True class Tr(Symbol): html = "tr" md = TrMD() rst = TrRST() - table:'Table' = None - - children:'List[t.Union[Td, Th]]' = List() - - def __init__(self, styles = {}, classes = [], dom = True, inner = [], **props): - super().__init__(styles, classes, dom, inner, **props) - - self.is_header = False - if 
isinstance(self.parent, THead): - self.is_header = True - logger.debug("Tr element identified as header row") - - def to_pandas(self): - if not self.prepared: - self.prepare() - - def get(o, f): - return [getattr(v, f) for v in o] - - try: - import pandas as pd - if self.is_header: - raise ValueError("This `Tr` is a header row and cannot be converted to a pandas `Series`") - return pd.Series({h.data: v.data for h, v in zip(self.table.head.head.children, self.children)}, index=self.table.head.to_pandas()) - - except ImportError: - raise ImportError("`tables` extra is required to use `to_pandas`") - - def to_list(self): - if not self.prepared: - self.prepare() - - return [e.data for e in self.children] - - @classmethod - def from_pandas(cls, series:'pd.Series'): - try: - import pandas as pd - self = cls() - self.children.clear() - for v in series: - td = Td(inner=[Text(v)]) - self.children.append(td) - - return self - except ImportError: - raise ImportError("`tables` extra is required to use `from_pandas`") - - @classmethod - def from_list(cls, data:'list[str]'): - self = cls() - for value in data: - td = Td(inner=[Text(value)]) - self.children.append(td) - - return self - - def prepare(self, parent = None, table=None, head=False, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - if head: self.table.head.head = self - return super().prepare(parent, table=table, row=self, *args, **kwargs) - class Td(Symbol): html = "td" md = TdMD() rst = TdRST() - children:'List[Text]' = List() - row:'Tr' = None - - @property - def data(self): - return self.children.get(0, Text("")).text - - @property - def width(self): - return len(self.data) - - def prepare(self, parent = None, table=None, row=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.row = row - - self.header = self.table.headers[self.row.children.index(self)] - self.table.cols[self.header].append(self) - return super().prepare(parent, table=table, *args, 
**kwargs) - - def __len__(self): - return len(self.data) - class Th(Symbol): html = "th" - md = ThMD() + md = TdMD() rst = ThRST() - children:'List[Text]' = List() - row:'Tr' = None - - def __init__(self, styles: dict[str, str] = {}, classes: list[str] = [], dom: bool = True, inner: list[Symbol] = [], **props): - super().__init__(styles, classes, dom, inner, **props) - - @property - def data(self): - contents = self.children.get(0, Text("")).text - logger.debug(f"Th data: {contents}") - if contents == "": - logger.debug("Th data is empty") - return "" - logger.debug("Th data is not empty") - return f"**{contents}**" - - @property - def width(self): - """Width of the data""" - if self.data == "": - return 0 - return len(self.data)-4 +class THead(Symbol): + html = "thead" + md = THeadMD() + rst = THeadRST() - def prepare(self, parent = None, table=None, row=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.row = row - self.header = self - self.table.headers.append(self) - self.table.cols[self] = [self] - return super().prepare(parent, table=table, *args, **kwargs) - - def __len__(self): - """Width of the element (data + bolding)""" - return len(self.data) \ No newline at end of file +class TBody(Symbol): + html = "tbody" + md = TBodyMD() + rst = TBodyRST() \ No newline at end of file diff --git a/BetterMD/elements/text.py b/BetterMD/elements/text.py index f09900c..07a3879 100644 --- a/BetterMD/elements/text.py +++ b/BetterMD/elements/text.py @@ -2,33 +2,46 @@ from ..markdown import CustomMarkdown from ..html import CustomHTML -import typing as t - -class Str(t.Protocol): - def __str__(self) -> str: ... 
- # This is not equivelant to the html span or p tags but instead just raw text class Text(Symbol): - md = "{t}" - html = "{t}" - rst = "{t}" - - def __init__(self, text:'Str', dom = True, **props): - self.text = str(text) - return super().__init__(dom=dom, **props) - - def to_html(self) -> 'str': + md = "text" + html = "text" + rst = "text" + + def __init__(self, text:str, **props): + """Initialize a Text instance with the specified content. + + Args: + text (str): The text content to be assigned to the instance. + **props: Additional keyword arguments forwarded to the parent initializer. + """ + self.text = text + return super().__init__(**props) + + def to_html(self, indent=0, parent=None): + """ + Return the text formatted with optional indentation for HTML output. + + The function prefixes the text attribute with four spaces per indent level. + The parent parameter is not used. + """ + return f"{' '*indent}{self.text}" + + def to_md(self): + """ + Return the text attribute as Markdown output. + + This method returns the stored text without modification, making it suitable for Markdown rendering. + """ return self.text - def to_md(self) -> 'str': + def to_rst(self): + """ + Return the text formatted as reStructuredText. + + Returns: + str: The text attribute, suitable for reStructuredText rendering. 
+ """ return self.text - - def to_rst(self) -> 'str': - return self.text - - def __str__(self): - return f"{self.text}" - - __repr__ = __str__ \ No newline at end of file diff --git a/BetterMD/elements/title.py b/BetterMD/elements/title.py new file mode 100644 index 0000000..01c7c1a --- /dev/null +++ b/BetterMD/elements/title.py @@ -0,0 +1,54 @@ +from typing import Text +from .symbol import Symbol +from ..markdown import CustomMarkdown +from ..rst import CustomRst +from .text import Text + +class MD(CustomMarkdown): + def to_md(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str: + """ + Converts a title element into its Markdown representation. + + Expects the content list to contain exactly one Text element; otherwise, + raises a ValueError. The resulting string is formatted as: title: "". + + Args: + inner: A list of Symbol objects representing the title content. Must contain + exactly one Text element. + symbol: The Symbol instance corresponding to the title element. + parent: The parent Symbol of the title element. + **kwargs: Additional keyword arguments (unused). + + Returns: + A Markdown-formatted title string. + + Raises: + ValueError: If the inner list does not contain exactly one Text element. + """ + if not isinstance(inner[0], Text) or len(inner) != 1: + raise ValueError("Title element must contain a single Text element") + + return f'title: "{inner[0].to_md()}"' + +class RST(CustomRst): + def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str: + """ + Converts a title element to reStructuredText format. + + This method expects the 'inner' list to contain exactly one Text element representing the title. + It returns the title as a reStructuredText string prefixed with ":title: ". + Raises: + ValueError: If 'inner' does not contain exactly one Text element. 
+ """ + if not isinstance(inner[0], Text) or len(inner) != 1: + raise ValueError("Title element must contain a single Text element") + + return f":title: {inner[0].to_rst()}" + + +class Title(Symbol): + html = "title" + md = MD() + rst = RST() + + diff --git a/BetterMD/html/custom_html.py b/BetterMD/html/custom_html.py index 2ba3b19..74d9cd6 100644 --- a/BetterMD/html/custom_html.py +++ b/BetterMD/html/custom_html.py @@ -1,13 +1,42 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomHTML(t.Generic[T]): - def to_html(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol') -> str: ... +class CustomHTML(t.Generic[T], ABC): + @abstractmethod + def to_html(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol') -> str: """ +Converts a symbol and its context into an HTML string. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +This abstract method should be implemented by subclasses to generate an HTML +representation from the provided symbol, its inner content, and its parent. + +Args: + inner: A list of inner symbols to be included in the HTML. + symbol: The symbol to be rendered as HTML. + parent: The parent symbol providing contextual information. + +Returns: + A string containing the HTML representation. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'):""" +Prepare the HTML element for conversion. + +This method serves as a hook for performing any preparatory operations on the +element's content prior to HTML conversion. Subclasses can override this method +to implement custom preprocessing logic using the provided inner elements, +the current symbol, and its parent symbol. + +Args: + inner: A list of Symbol instances representing the inner content. + symbol: The symbol representing the current element. + parent: The symbol representing the parent element. 
+""" +... def verify(self, text) -> bool: ... \ No newline at end of file diff --git a/BetterMD/markdown/custom_markdown.py b/BetterMD/markdown/custom_markdown.py index db4e535..65c6903 100644 --- a/BetterMD/markdown/custom_markdown.py +++ b/BetterMD/markdown/custom_markdown.py @@ -1,16 +1,53 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomMarkdown(t.Generic[T]): +class CustomMarkdown(t.Generic[T], ABC): prop = "" md: 'dict[str, str]' = {} - def to_md(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol', **kwargs) -> 'str': ... + @abstractmethod + def to_md(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol') -> str: """ +Converts a symbol along with its inner elements to a Markdown string. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +Subclasses must implement this method to generate a Markdown +representation based on the provided current symbol, its nested child +symbols, and its parent context. - def verify(self, text) -> 'bool': ... \ No newline at end of file +Args: + inner: A list of child symbols to be processed. + symbol: The symbol to convert. + parent: The contextual parent symbol of the given symbol. + +Returns: + The resulting Markdown string. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'): """ +Performs any necessary preparation prior to markdown conversion. + +This hook allows subclasses to preprocess symbols before generating +their markdown representation. + +Args: + inner: A list of symbols representing nested elements. + symbol: The current symbol to be prepared. + parent: The parent symbol related to the current symbol. +""" +... + + def verify(self, text) -> bool: """ +Verifies whether the provided text meets the expected criteria. 
+ +This method should determine if the given text is valid according to the custom rules +defined in a subclass. It returns True if the text passes validation, otherwise False. + +Args: + text: The text content to verify. +""" +... diff --git a/BetterMD/parse/collection.py b/BetterMD/parse/collection.py new file mode 100644 index 0000000..2e258a1 --- /dev/null +++ b/BetterMD/parse/collection.py @@ -0,0 +1,65 @@ +import typing as t +import logging +from ..html import CustomHTML + +if t.TYPE_CHECKING: + from ..elements import Symbol + +class Collection: + def __init__(self, *symbols:'type[Symbol]'): + """ + Initialize a Collection instance with optional symbols. + + Args: + *symbols: Initial Symbol instances to include in the collection. + + The logger is configured using the "BetterMD" namespace. + """ + self.symbols = list(symbols) + self.logger = logging.getLogger("BetterMD") + + def add_symbols(self, symbol:'type[Symbol]'): + """ + Appends a symbol to the collection. + + The provided symbol is added to the internal list of symbols. + """ + self.symbols.append(symbol) + + def remove_symbol(self, symbol:'type[Symbol]'): + """ + Removes the specified symbol from the collection. + + Args: + symbol: The Symbol instance to remove. + """ + self.symbols.remove(symbol) + + def find_symbol(self, name:'str', raise_errors:'bool'=False) -> 't.Union[None, type[Symbol]]': + """ + Finds a symbol in the collection by matching its HTML attribute. + + Iterates over the collection's symbols and returns the first symbol for which the `html` + attribute matches the provided name either directly (if it is a string) or via the + `verify` method (if it is an instance of CustomHTML). If no matching symbol is found, + the function returns None unless raise_errors is True, in which case it raises a ValueError. + + Args: + name: The name to search for in the symbol's HTML representation. + raise_errors: If True, raises a ValueError when no matching symbol is found. 
+ + Returns: + The matching symbol if found; otherwise, None. + + Raises: + ValueError: If no symbol is found and raise_errors is True. + """ + for symbol in self.symbols: + if isinstance(symbol.html, str) and symbol.html == name: + return symbol + elif isinstance(symbol.html, CustomHTML) and symbol.html.verify(name): + return symbol + + if raise_errors: + raise ValueError(f"Symbol `{name}` not found in collection, if using default symbols it may not be supported.") + return None \ No newline at end of file diff --git a/BetterMD/parse/html.py b/BetterMD/parse/html.py new file mode 100644 index 0000000..4850ff6 --- /dev/null +++ b/BetterMD/parse/html.py @@ -0,0 +1,268 @@ +from .typing import ELEMENT +import typing as t + +class HTMLParser: + def __init__(self): + """ + Initialize an HTMLParser instance. + + Resets the parser's internal state to prepare for HTML parsing. + """ + self.reset() + + def reset(self): + """ + Resets the parser to its initial state. + + Clears all stored parsing data including the current tag, DOM, internal state, buffer, + attribute name, and tag stack to prepare for new HTML input. + """ + self.current_tag:'t.Optional[ELEMENT]' = None + self.dom = [] + self.state = 'TEXT' + self.buffer = '' + self.attr_name = '' + self.tag_stack = [] + + def parse(self, html:'str') -> 'list[ELEMENT]': + """ + Parses an HTML string into a Document Object Model (DOM). + + This method resets the parser's state and processes the input one character at a time using + a state machine, handling text nodes, opening tags, attributes, self-closing tags, and closing + tags. Any remaining text is appended as a text node, and the complete DOM is returned as a list + of elements. 
+ """ + self.reset() + + i = 0 + while i < len(html): + char = html[i] + + if self.state == 'TEXT': + if char == '<': + if self.buffer.strip(): + self.handle_text(self.buffer) + self.buffer = '' + self.state = 'TAG_START' + else: + self.buffer += char + + elif self.state == 'TAG_START': + if char == '/': + self.state = 'CLOSING_TAG' + elif char == '!': + self.state = 'COMMENT_OR_DOCTYPE' + self.buffer = '!' + else: + self.state = 'TAG_NAME' + self.buffer = char + + elif self.state == 'TAG_NAME': + if char.isspace(): + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.buffer = '' + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '>': + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.state = 'SELF_CLOSING_TAG' + else: + self.buffer += char + + elif self.state == 'BEFORE_ATTRIBUTE_NAME': + if char.isspace(): + pass + elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + else: + self.attr_name = char + self.state = 'ATTRIBUTE_NAME' + + elif self.state == 'ATTRIBUTE_NAME': + if char.isspace(): + self.current_tag['attributes'][self.attr_name] = '' + self.state = 'AFTER_ATTRIBUTE_NAME' + elif char == '=': + self.state = 'BEFORE_ATTRIBUTE_VALUE' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = '' + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag['attributes'][self.attr_name] = '' + self.state = 'SELF_CLOSING_TAG' + else: + self.attr_name += char + + elif self.state == 'AFTER_ATTRIBUTE_NAME': + if char.isspace(): + pass + elif char == '=': + self.state = 'BEFORE_ATTRIBUTE_VALUE' + 
elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + else: + self.current_tag['attributes'][self.attr_name] = '' + self.attr_name = char + self.state = 'ATTRIBUTE_NAME' + + elif self.state == 'BEFORE_ATTRIBUTE_VALUE': + if char.isspace(): + pass + elif char == '"': + self.buffer = '' + self.state = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED' + elif char == "'": + self.buffer = '' + self.state = 'ATTRIBUTE_VALUE_SINGLE_QUOTED' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = '' + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + else: + self.buffer = char + self.state = 'ATTRIBUTE_VALUE_UNQUOTED' + + elif self.state == 'ATTRIBUTE_VALUE_DOUBLE_QUOTED': + if char == '"': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED' + else: + self.buffer += char + + elif self.state == 'ATTRIBUTE_VALUE_SINGLE_QUOTED': + if char == "'": + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED' + else: + self.buffer += char + + elif self.state == 'ATTRIBUTE_VALUE_UNQUOTED': + if char.isspace(): + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'SELF_CLOSING_TAG' + else: + self.buffer += char + + elif self.state == 'AFTER_ATTRIBUTE_VALUE_QUOTED': + if char.isspace(): + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 
'TEXT' + else: + self.state = 'BEFORE_ATTRIBUTE_NAME' + i -= 1 # Reconsider this character + + elif self.state == 'SELF_CLOSING_TAG': + if char == '>': + self.handle_tag_self_closing(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + else: + # Error handling + pass + + elif self.state == 'CLOSING_TAG': + if char == '>': + self.handle_tag_close(self.buffer) + self.buffer = '' + self.state = 'TEXT' + else: + self.buffer += char + + # Additional states would be implemented here + + i += 1 + + # Handle any remaining text + if self.state == 'TEXT' and self.buffer.strip(): + self.handle_text(self.buffer) + + return self.dom + + def handle_tag_open(self, tag): + """ + Handles an opening tag by incorporating it into the DOM and updating the tag stack. + + If there is an active tag, the new tag is added as its child; otherwise, it is added + as a top-level element. The tag is then pushed onto the tag stack to track nested elements. + """ + if len(self.tag_stack) > 0: + self.tag_stack[-1]['children'].append(tag) + else: + self.dom.append(tag) + + self.tag_stack.append(tag) + + def handle_tag_self_closing(self, tag): + """ + Handles a self-closing HTML tag by appending it to the appropriate parent in the DOM. + + If there is an open tag (i.e., the tag stack is not empty), the tag is added as a child of + the most recent tag on the stack. Otherwise, it is appended directly to the DOM. + """ + if len(self.tag_stack) > 0: + self.tag_stack[-1]['children'].append(tag) + else: + self.dom.append(tag) + + def handle_tag_close(self, tag_name): + """ + Closes the last opened tag if its name matches the provided tag name. + + If the tag stack is not empty and the top tag's name equals tag_name, it is removed + from the stack to correctly update the open tags during parsing. + """ + if len(self.tag_stack) > 0 and self.tag_stack[-1]['name'] == tag_name: + self.tag_stack.pop() + + def handle_text(self, text): + """ + Adds a text node to the DOM. 
+ + Constructs a text node as a dictionary with keys 'type', 'content', and 'name' set to "text" and the provided text. The node is appended to the children of the most recently opened tag if one exists; otherwise, it is added to the DOM root. + """ + text_node = {'type': 'text', 'content': text, 'name': 'text'} + if len(self.tag_stack) > 0: + self.tag_stack[-1]['children'].append(text_node) + else: + self.dom.append(text_node) + + def get_dom(self): + """ + Returns the constructed Document Object Model. + + This method provides access to the DOM built from the parsed HTML content. + """ + return self.dom \ No newline at end of file diff --git a/BetterMD/parse/markdown.py b/BetterMD/parse/markdown.py new file mode 100644 index 0000000..898e309 --- /dev/null +++ b/BetterMD/parse/markdown.py @@ -0,0 +1,512 @@ +import re +import typing as t +from .typing import ELEMENT, TEXT +import threading as th + +class MDParser: + + top_level_tags = { + "blockquote": r"^> (.+)$", # Blockquote + "br": r"\n\n", # Br + "code": r"^```([A-Za-z]*)[^.](?:([^`]*)[^.])?```$", # Code block + + "h": r"^(#{1,6})(?: (.*))?$", + + "hr": r"^---+$", # Hr + + "ul" : r"^([ | ]*)(?:-|\+|\*)(?: (.*))?$", # Ul Li + "ol" : r"^([ | ]*)(\d)\.(?: (.*))?$", # Ol Li + + "tr": r"^\|(?:[^|\n]+\|)+$", # tr - must start and end with | and have at least one | + "thead": r"^\|(?::?-+:?\|)+$", # thead / tbody + + "title": r"^title: .+$", # Title + } + + def __init__(self): + """ + Initializes a new MDParser instance and resets its internal state. + + Calls reset() to clear any existing state and prepare the parser for a new Markdown input. + """ + self.reset() + + def reset(self): + """ + Resets the parser's internal state. + + Clears the document model, text buffer, and both list and DOM stacks to prepare for a new parsing operation. 
+ """ + self.dom = [] + self.buffer = '' + self.list_stack = [] + self.dom_stack = [] + + def create_element(self, name:'str', attrs:'dict[str, str]'=None, children:'list[ELEMENT|TEXT]'=None) -> 'ELEMENT': + """ + Creates a DOM element dictionary. + + Constructs a dictionary representing a DOM element with a specified name, along with + optional attributes and child nodes. If attributes or children are omitted, they default + to an empty dictionary and an empty list, respectively. + + Args: + name: The tag name for the element. + attrs: An optional dictionary of attributes for the element. + children: An optional list of child elements or text nodes. + + Returns: + A dictionary with keys "type", "name", "attributes", and "children" representing the element. + """ + if children is None: + children = [] + + if attrs is None: + attrs = {} + + return { + "type": "element", + "name": name, + "attributes": attrs, + "children": children + } + + def create_text(self, content:'str') -> 'TEXT': + """ + Creates a text node for the Markdown DOM. + + Constructs and returns a dictionary representing a text node with a fixed type and name. + The node encapsulates the provided text content. + + Args: + content: The text content for the node. + + Returns: + A dictionary with keys "type", "content", and "name" representing the text node. + """ + return { + "type": "text", + "content": content, + "name": "text" + } + + def end_block(self): + # Create paragraph from buffered text + """ + Finalizes the current text block into a paragraph element. + + If any text is buffered, it is stripped of leading and trailing whitespace. If the resulting text is non-empty, a text node wrapped in a paragraph element is created and appended to the DOM. The text buffer is then cleared. 
+ """ + if self.buffer: + text = self.buffer.strip() + if text: + para = self.create_element("p", children=[self.create_text(text)]) + self.dom.append(para) + self.buffer = '' + + def start_block(self): + """ + Marks the beginning of a new block. + + This placeholder method is reserved for future functionality to indicate the start + of a new block during Markdown parsing. Currently, it does not perform any actions. + """ + pass + + def handle_blockquote(self, text: 'list[str]', i): + """ + Parses a Markdown blockquote and appends it to the DOM. + + Starting at index i, this function extracts blockquote content by stripping the + blockquote marker (">") from lines and accumulating text. Empty lines trigger a + paragraph break, and lines without a marker that don't match other Markdown + elements are treated as continuations. The consolidated text is recursively + parsed into child elements of a new blockquote, which is then added to the DOM. + + Parameters: + text (list[str]): The sequence of Markdown lines. + i (int): The starting index for blockquote processing. + + Returns: + int: The line offset based on the number of consumed blockquote lines. 
+ """ + elm = self.create_element("blockquote") + new_text = [] + current_line = [] + + for line in text[i:]: + if re.match(self.top_level_tags["blockquote"], line): + # Remove blockquote marker and add to current line + content = line.removeprefix("> ").removeprefix(">").strip() + if content: + current_line.append(content) + elif line.strip() == "": + # Empty line marks paragraph break + if current_line: + new_text.append(" ".join(current_line)) + new_text.append("") + current_line = [] + elif not any(re.match(pattern, line) for pattern in self.top_level_tags.values()): + # Continuation of blockquote without marker + current_line.append(line.strip()) + else: + break + + if current_line: + new_text.append(" ".join(current_line)) + + # Parse blockquote content recursively + elm["children"] = MDParser().parse("\n".join(new_text)) + self.dom.append(elm) + + return len(new_text) - 1 + + def handle_code(self, text: 'list[str]'): + """ + Parses a fenced code block and appends a corresponding code element to the DOM. + + Finalizes any pending text block and then matches the joined Markdown lines against + a code block pattern. It extracts the language specifier (if any) and the code content, + creates a nested
<pre><code> element (with a language attribute), and adds it to the DOM.
+        
+        Args:
+            text: A list of strings representing the lines of a Markdown code block.
+        
+        Returns:
+            An integer offset computed from the positions of the code fence markers in the joined text.
+        """
+        self.end_block()
+        match = re.match(self.top_level_tags["code"], "\n".join(text))
+        assert match is not None, "Code block not found"
+
+        lang = match.group(1)
+        content = match.group(2)
+
+        elm = self.create_element("pre", children=[self.create_element("code", {"language": lang}, [self.create_text(content)])])
+        self.dom.append(elm)
+
+        return "\n".join(text)["\n".join(text).index("```"):].index("```")
+
+
+    def handle_br(self, text: 'list[str]'):
+        """
+        Processes a potential line break in Markdown input.
+        
+        Finalizes the current text block and checks if the first two lines in the provided list are empty.
+        If both lines are empty, a break element is appended to the document, and the method returns 1.
+        Otherwise, it returns 0.
+        
+        Args:
+            text: A list of Markdown lines. The first two elements are inspected to determine if a break should be inserted.
+        
+        Returns:
+            1 if a break element was appended to the DOM; 0 otherwise.
+        """
+        self.end_block()
+        if text[0] == "" and text[1] == "":
+            self.dom.append(self.create_element("br", {}))
+            return 1
+        return 0
+
+    def handle_h(self, line: 'str'):
+        """Process a Markdown header line to add a corresponding header element to the DOM.
+        
+        Finalizes any pending text block, extracts the header level and content from the input
+        line, and appends an HTML header element (e.g., <h1>, <h2>
) with the parsed content to the + document model. Raises an AssertionError if the line does not match the expected header pattern. + + Args: + line: A Markdown header line containing header markup and corresponding text. + """ + self.end_block() + match = re.match(self.top_level_tags["h"], line) + assert match is not None, "Header not found" + + level = len(match.group(1)) + content = match.group(2) + + self.dom.append(self.create_element(f"h{level}", children=[self.create_text(content)])) + + def handle_hr(self, line: 'str'): + """ + Processes a horizontal rule in Markdown. + + Ends the current text block and appends a horizontal rule element to the document. + """ + self.end_block() + self.dom.append(self.create_element("hr", {})) + + def handle_text(self, line: 'str'): + # Don't create text nodes for empty lines + """ + Processes a text line for paragraph buffering. + + If the line contains only whitespace, a line break is processed via the break handler. + Otherwise, the line is appended to the internal buffer to accumulate paragraph content. + """ + if not line.strip(): + self.handle_br(line) + return + + # Buffer text content for paragraph handling + if self.buffer: + self.buffer += '\n' + line + else: + self.buffer = line + + def handle_list(self, text: 'list[str]', i: int, indent_level: int = 0) -> int: + """ + Parses a Markdown list block, handling nested lists. + + This method examines the line at the given index to determine if it starts an unordered or ordered + list. It then iterates through subsequent lines, grouping them as list items and managing indentation + to detect nested lists via recursive calls. Each completed list item is added to the parser's DOM, + and the method returns the total number of lines processed for the list. + + Args: + text: A list of Markdown text lines. + i: The starting index in text where the list begins. + indent_level: The indentation level used to determine the current list’s scope (default is 0). 
+ + Returns: + The total number of lines processed as part of the list. + """ + if re.match(self.top_level_tags["ul"], text[i]): + list_elm = self.create_element("ul") + list_pattern = self.top_level_tags["ul"] + elif re.match(self.top_level_tags["ol"], text[i]): + list_elm = self.create_element("ol") + list_pattern = self.top_level_tags["ol"] + else: + return 0 + + current_item = [] + lines_processed = 0 + + while i + lines_processed < len(text): + line = text[i + lines_processed] + + if not line.strip(): + if current_item: + # Empty line in list item - treat as paragraph break + current_item.append("") + lines_processed += 1 + continue + + list_match = re.match(list_pattern, line) + if list_match: + indent = len(list_match.group(1)) + + if indent < indent_level: + # End of current list level + break + elif indent > indent_level: + # Nested list + nested_lines = lines_processed + self.handle_list(text[i + lines_processed:], 0, indent) + lines_processed += nested_lines + continue + + # Add previous item if exists + if current_item: + content = " ".join(current_item).strip() + if content: + list_elm["children"].append( + self.create_element("li", children=[self.create_text(content)]) + ) + + # Start new item + current_item = [list_match.group(2).strip()] + + elif not any(re.match(pattern, line) for pattern in self.top_level_tags.values()): + # Continuation of list item + current_item.append(line.strip()) + else: + break + + lines_processed += 1 + + # Add final item + if current_item: + content = " ".join(current_item).strip() + if content: + list_elm["children"].append( + self.create_element("li", children=[self.create_text(content)]) + ) + + self.dom.append(list_elm) + return lines_processed + + def handle_table(self, text: 'list[str]', i: int) -> int: + # First check if this is actually a table + # A proper table needs at least two rows (header and separator) + """ + Parses a Markdown table starting at the specified line index. 
+ + Checks for a valid table by verifying that a header separator exists immediately after the header row. + If found, processes subsequent lines as table rows—adding header cells to the table's thead and regular + cells to its tbody—and appends the constructed table element to the document. If the structure does not + conform to a table, the line is handled as regular text. Returns the number of lines processed. + """ + if i + 1 >= len(text) or not re.match(self.top_level_tags["thead"], text[i + 1]): + # Not a table, treat as regular text + self.handle_text(text[i]) + return 1 + + lines_processed = 0 + table = self.create_element("table") + thead = self.create_element("thead") + tbody = self.create_element("tbody") + current_section = thead + + while i + lines_processed < len(text): + line = text[i + lines_processed] + + if not line.strip(): + break + + if re.match(self.top_level_tags["thead"], line): + # Alignment row - skip it but switch to tbody + current_section = tbody + lines_processed += 1 + continue + + if re.match(self.top_level_tags["tr"], line): + # Process table row + row = self.create_element("tr") + cells = [cell.strip() for cell in line.strip('|').split('|')] + + for cell in cells: + if current_section == thead: + cell_type = "th" + else: + cell_type = "td" + + row["children"].append( + self.create_element(cell_type, children=[self.create_text(cell.strip())]) + ) + + current_section["children"].append(row) + lines_processed += 1 + else: + break + + if thead["children"]: + table["children"].append(thead) + if tbody["children"]: + table["children"].append(tbody) + + self.dom.append(table) + return lines_processed + + def handle_title(self, line: 'str'): + """ + Processes a Markdown title line and sets the document head element. + + Ends any active text block, extracts the title from the given line using a regex + pattern, and assigns a head element with a title child to the document. 
An + AssertionError is raised if the line does not match the expected title format. + """ + self.end_block() + match = re.match(self.top_level_tags["title"], line) + assert match is not None, "Title not found" + + title = match.group(1) + self.head = self.create_element("head", children=[self.create_element("title", children=[self.create_text(title)])]) + + def parse(self, markdown: 'str') -> 'ELEMENT': + """ + Parses Markdown text into a structured DOM element. + + Resets the parser state and processes each line of the input Markdown to build a Document + Object Model (DOM) that reflects various Markdown constructs such as headers, blockquotes, + code blocks, horizontal rules, lists, tables, titles, and line breaks. Regular text is buffered + and converted into paragraph elements when an empty line is encountered. The constructed DOM is + returned as an HTML element containing a head section (either existing or newly created) and a body + with the parsed content. + + Args: + markdown: The Markdown string to be parsed. + + Returns: + The root DOM element representing the parsed HTML structure. 
+ """ + self.reset() + lines = markdown.splitlines() + i = 0 + + while i < len(lines): + line = lines[i].strip() # Strip whitespace from each line + + # Empty line ends current block + if not line: + self.end_block() + i += 1 + continue + + # Check for block-level elements + if re.search(self.top_level_tags["h"], line): + self.end_block() + self.handle_h(line) + i += 1 + continue + + elif re.search(self.top_level_tags["blockquote"], line): + self.end_block() + lines_processed = self.handle_blockquote(lines, i) + i += lines_processed + 1 + continue + + elif re.search(self.top_level_tags["code"], "\n".join(lines[i:])): + self.end_block() + lines_processed = self.handle_code(lines[i:]) + i += lines_processed + 1 + continue + + elif re.search(self.top_level_tags["h"], line): + self.end_block() + self.handle_h(line) + i += 1 + continue + + elif re.search(self.top_level_tags["hr"], line): + self.end_block() + self.handle_hr(line) + i += 1 + continue + + elif re.search(self.top_level_tags["ul"], line) or re.search(self.top_level_tags["ol"], line): + self.end_block() + lines_processed = self.handle_list(lines, i) + i += lines_processed + continue + + elif re.search(self.top_level_tags["tr"], line): + self.end_block() + lines_processed = self.handle_table(lines, i) + i += lines_processed + continue + + elif re.search(self.top_level_tags["title"], line): + self.end_block() + self.handle_title(line) + i += 1 + continue + + elif re.search(self.top_level_tags["br"], line): + self.end_block() + lines_processed = self.handle_br(lines[i:]) + i += lines_processed + continue + + else: + # Regular text gets buffered for paragraph handling + self.handle_text(line) + i += 1 + + # End any remaining block + self.end_block() + + head = self.create_element("head") or self.head + body = self.create_element("body", children=self.dom) + + return self.create_element("html", children=[head, body]) \ No newline at end of file diff --git a/BetterMD/parse/typing.py b/BetterMD/parse/typing.py new 
file mode 100644 index 0000000..b291c05 --- /dev/null +++ b/BetterMD/parse/typing.py @@ -0,0 +1,25 @@ +import typing as t + +class TEXT(t.TypedDict): + type: t.Literal["text"] + content: str + name: t.Literal["text"] + +class ELEMENT(t.TypedDict): + type: 't.Literal["element"]' + name: 'str' + attributes: 'dict[str, str]' + children: 'list[t.Union[ELEMENT, TEXT]]' + +@t.runtime_checkable +class Parser(t.Protocol): + def parse(self, html:'str') -> 'list[ELEMENT]': """ +Parses an HTML string into a list of ELEMENT objects. + +Args: + html: A string containing HTML-like data to be parsed. + +Returns: + A list of ELEMENT objects representing the hierarchical structure of the HTML. +""" +... \ No newline at end of file diff --git a/BetterMD/rst/custom_rst.py b/BetterMD/rst/custom_rst.py index c3fa565..7dbfce1 100644 --- a/BetterMD/rst/custom_rst.py +++ b/BetterMD/rst/custom_rst.py @@ -1,16 +1,57 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomRst(t.Generic[T]): +class CustomRst(t.Generic[T], ABC): prop = "" rst: 'dict[str, str]' = {} - def to_rst(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol', **kwargs) -> str: ... + @abstractmethod + def to_rst(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol') -> 'str': """ +Converts provided symbols into a reStructuredText formatted string. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +Subclasses must override this method to implement conversion logic that transforms the +current symbol, its nested inner symbols, and the contextual parent symbol into a +reStructuredText representation. - def verify(self, text) -> bool: ... \ No newline at end of file +Args: + inner: A list of symbols representing nested content. + symbol: The symbol instance to be converted. + parent: The parent symbol providing contextual information. 
+ +Returns: + A string containing the reStructuredText representation. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'): """ +Prepare context for reStructuredText conversion. + +Subclasses may override this method to perform any preparatory actions before +rendering a symbol into reStructuredText format. The provided parameters supply +context for the conversion process. + +Args: + inner: A list of symbols representing nested or inner elements. + symbol: The symbol to be processed. + parent: The parent symbol of the current symbol. +""" +... + + def verify(self, text) -> 'bool': """ +Verify the validity of the provided text. + +This method should assess whether the given text meets the required criteria. +Subclasses must override this method to implement the specific verification logic. + +Args: + text: The text to be validated. + +Returns: + bool: True if the text is valid, False otherwise. +""" +... \ No newline at end of file