Coverage for tdom/parser.py: 100%
88 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-31 17:14 +0000
1import random
2import string
3import typing as t
4from html.parser import HTMLParser
6from markupsafe import Markup
8from .nodes import (
9 CONTENT_ELEMENTS,
10 VOID_ELEMENTS,
11 Comment,
12 DocumentType,
13 Element,
14 Fragment,
15 Node,
16 Text,
17)
# Stand-in tag name substituted for the custom fragment syntax ``<>...</>``
# before the markup reaches HTMLParser. The four random lowercase letters are
# drawn once, at import time, to make a clash with a real tag name unlikely.
_FRAGMENT_TAG = "t🐍f-" + "".join(random.choices(string.ascii_lowercase, k=4)) + "-"
class NodeParser(HTMLParser):
    """HTMLParser subclass that assembles fed markup into a tree of nodes.

    Elements that have been opened but not yet closed live on ``stack``;
    everything that ends up at the top level is collected under ``root``.
    """

    root: Fragment
    stack: list[Element]

    def __init__(self):
        super().__init__()
        self.root = Fragment(children=[])
        self.stack = []

    def handle_starttag(
        self, tag: str, attrs: t.Sequence[tuple[str, str | None]]
    ) -> None:
        """Push a new open Element; void elements are closed immediately."""
        self.stack.append(Element(tag, attrs=dict(attrs), children=[]))

        # Python's built-in HTMLParser treats void elements inconsistently:
        # handle_endtag() is only invoked for them when they are explicitly
        # self-closed (e.g., <br />), even though the HTML spec makes no
        # distinction between <br> and <br />. We therefore close every void
        # element ourselves, right here. When the input *was* self-closed,
        # handle_endtag() ends up being called a second time; that second
        # call is recognised and ignored there.
        #
        # See https://github.com/python/cpython/issues/69445
        if tag in VOID_ELEMENTS:
            self.handle_endtag(tag)

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost open element and attach it to its parent.

        Raises:
            ValueError: if no element is open, or if the innermost open
                element's tag differs from ``tag``.
        """
        if tag in VOID_ELEMENTS:
            # Work around Python issue #69445 (see handle_starttag).
            previous = self.get_most_recent_closed_element()
            if previous and previous.tag == tag:
                # Duplicate close for a void element we already finished.
                return
            current = self.get_open_element()
            if current and current.tag == tag:
                self.stack.pop()
                self.append_element_child(current)
                return
            # Neither case applied: fall through to the regular handling.

        if not self.stack:
            raise ValueError(f"Unexpected closing tag </{tag}> with no open element.")

        # The element is popped before the tag comparison, so a mismatch
        # leaves it removed from the stack when the error is raised.
        closing = self.stack.pop()
        if closing.tag != tag:
            raise ValueError(f"Mismatched closing tag </{tag}> for <{closing.tag}>.")

        self.append_element_child(closing)

    def handle_data(self, data: str) -> None:
        """Append a Text node; inside a content element the data is wrapped in Markup."""
        content = Markup(data) if self.in_content_element() else data
        self.append_child(Text(content))

    def handle_comment(self, data: str) -> None:
        """Append a Comment node holding ``data``."""
        self.append_child(Comment(data))

    def handle_decl(self, decl: str) -> None:
        """Append a DocumentType node for DOCTYPE declarations; ignore all others."""
        if not decl.upper().startswith("DOCTYPE"):
            # For simplicity, other declarations are dropped.
            return
        # Strip the 7-character "DOCTYPE" keyword and surrounding whitespace.
        self.append_child(DocumentType(decl[7:].strip()))

    def in_content_element(self) -> bool:
        """Tell whether the innermost open element is one of CONTENT_ELEMENTS."""
        current = self.get_open_element()
        if current is None:
            return False
        return current.tag in CONTENT_ELEMENTS

    def get_parent(self) -> Fragment | Element:
        """Return the node that should receive new children right now.

        That is the innermost open element, or ``root`` when nothing is open.
        """
        if self.stack:
            return self.stack[-1]
        return self.root

    def get_open_element(self) -> Element | None:
        """Return the innermost open Element, or None when nothing is open."""
        if not self.stack:
            return None
        return self.stack[-1]

    def get_most_recent_closed_element(self) -> Element | None:
        """Return the current parent's last child if it is an Element, else None."""
        siblings = self.get_parent().children
        if not siblings:
            return None
        last = siblings[-1]
        return last if isinstance(last, Element) else None

    def append_element_child(self, child: Element) -> None:
        """Attach a finished element to the current parent.

        An element carrying the placeholder fragment tag is replaced by a
        Fragment that takes over its children.
        """
        parent = self.get_parent()
        if child.tag != _FRAGMENT_TAG:
            parent.children.append(child)
            return
        assert not child.attrs, (
            "Fragment elements should never be able to have attributes."
        )
        parent.children.append(Fragment(children=child.children))

    def append_child(self, child: Fragment | Text | Comment | DocumentType) -> None:
        """Attach a non-element node to the current parent."""
        self.get_parent().children.append(child)

    def close(self) -> None:
        """Finish parsing.

        Raises:
            ValueError: if any element is still open; in that case the base
                class ``close()`` is not invoked.
        """
        if not self.stack:
            super().close()
            return
        raise ValueError("Invalid HTML structure: unclosed tags remain.")

    def get_node(self) -> Node:
        """Return the parsed result as a single Node.

        No top-level children yields an empty Text node, exactly one yields
        that child itself, and several yield the root Fragment holding them.
        """
        # CONSIDER: Should we invert things and offer streaming parsing?
        assert not self.stack, "Did you forget to call close()?"
        top_level = self.root.children
        count = len(top_level)
        if count == 0:
            # Nothing was parsed; represent that as an empty Text node.
            return Text("")
        if count == 1:
            # A lone top-level node is handed back directly, unwrapped.
            return top_level[0]
        # Several top-level nodes: the root Fragment holds them all.
        return self.root

    def feed(self, data: str) -> None:
        """Feed a chunk of markup, translating the ``<>...</>`` fragment syntax first."""
        # The custom fragment delimiters are swapped for a placeholder tag
        # name that is unlikely to appear in normal HTML.
        opener = f"<{_FRAGMENT_TAG}>"
        closer = f"</{_FRAGMENT_TAG}>"
        super().feed(data.replace("<>", opener).replace("</>", closer))
def parse_html(input: str | t.Iterable[str]) -> Node:
    """
    Build a Node tree from HTML given as one string or as several chunks.

    A plain string is fed to the parser in one go. Any other iterable is
    consumed item by item, each item being fed as its own chunk, which is
    handy when particular text chunks should stay separate in the resulting
    Node tree.
    """
    # A bare string is itself iterable, so wrap it to avoid feeding it
    # character by character.
    chunks = (input,) if isinstance(input, str) else input
    parser = NodeParser()
    for piece in chunks:
        parser.feed(piece)
    parser.close()
    return parser.get_node()