Coverage for tdom/parser.py: 100%
84 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-17 19:54 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-17 19:54 +0000
1import random
2import string
3import typing as t
4from html.parser import HTMLParser
6from .nodes import VOID_ELEMENTS, Comment, DocumentType, Element, Fragment, Node, Text
# Placeholder tag name used to smuggle the custom `<>...</>` fragment syntax
# through HTMLParser: NodeParser.feed() rewrites `<>` / `</>` to this tag and
# NodeParser.append_element_child() turns the resulting element back into a
# Fragment. Four random lowercase letters, chosen once at import time, make a
# clash with a tag in real input unlikely.
_FRAGMENT_TAG = "t🐍f-{}-".format(
    "".join(random.choices(string.ascii_lowercase, k=4))
)
class NodeParser(HTMLParser):
    """
    An ``HTMLParser`` subclass that assembles the parsed input into a Node tree.

    Elements that have been opened but not yet closed are kept on ``stack``.
    When an element is closed it is appended to the children of whatever is
    then the current parent: the innermost open element, or ``root`` when no
    element is open. Text, comments and doctypes are appended to the current
    parent as they arrive.
    """

    # Container for every node parsed while no element is open.
    root: Fragment
    # Elements that are currently open; the innermost one is last.
    stack: list[Element]

    def __init__(self):
        super().__init__()
        self.root = Fragment(children=[])
        self.stack = []

    def handle_starttag(
        self, tag: str, attrs: t.Sequence[tuple[str, str | None]]
    ) -> None:
        """Push a new open Element for ``tag``; void elements are closed at once."""
        node = Element(tag, attrs=dict(attrs), children=[])
        self.stack.append(node)

        # Unfortunately, Python's built-in HTMLParser has inconsistent behavior
        # with void elements. In particular, it calls handle_endtag() for them
        # only if they explicitly self-close (e.g., <br />). But in the HTML
        # spec itself, *there is no distinction* between <br> and <br />.
        # So we need to handle this case ourselves.
        #
        # See https://github.com/python/cpython/issues/69445
        if tag in VOID_ELEMENTS:
            # Always call handle_endtag for void elements. If it happens
            # to be self-closed in the input, handle_endtag() will effectively
            # be called twice. We ignore the second call there.
            self.handle_endtag(tag)

    def handle_endtag(self, tag: str) -> None:
        """
        Close the innermost open element and attach it to its parent.

        Raises:
            ValueError: If no element is open, or if the innermost open
                element's tag differs from ``tag``.
        """
        if tag in VOID_ELEMENTS:
            # Special case: handle Python issue #69445 (see the comment in
            # handle_starttag()).
            most_recent_closed = self.get_most_recent_closed_element()
            if most_recent_closed and most_recent_closed.tag == tag:
                # Ignore this call; we've already closed it.
                return
            open_element = self.get_open_element()
            if open_element and open_element.tag == tag:
                _ = self.stack.pop()
                self.append_element_child(open_element)
                return
            # Neither case matched: fall through to the generic checks below.

        if not self.stack:
            raise ValueError(f"Unexpected closing tag </{tag}> with no open element.")

        element = self.stack.pop()
        if element.tag != tag:
            raise ValueError(f"Mismatched closing tag </{tag}> for <{element.tag}>.")

        self.append_element_child(element)

    def handle_data(self, data: str) -> None:
        """Append ``data`` to the current parent as a Text node."""
        text = Text(data)
        self.append_child(text)

    def handle_comment(self, data: str) -> None:
        """Append ``data`` to the current parent as a Comment node."""
        comment = Comment(data)
        self.append_child(comment)

    def handle_decl(self, decl: str) -> None:
        """Append a DocumentType node for DOCTYPE declarations; ignore others."""
        if decl.upper().startswith("DOCTYPE"):
            # Drop the 7-character "DOCTYPE" keyword and surrounding whitespace.
            doctype_content = decl[7:].strip()
            doctype = DocumentType(doctype_content)
            self.append_child(doctype)
        # For simplicity, we ignore other declarations.

    def get_parent(self) -> Fragment | Element:
        """Return the current parent node to which new children should be added."""
        return self.stack[-1] if self.stack else self.root

    def get_open_element(self) -> Element | None:
        """Return the currently open Element, if any."""
        return self.stack[-1] if self.stack else None

    def get_most_recent_closed_element(self) -> Element | None:
        """Return the most recently closed Element, if any."""
        # A closed element is the last child of the current parent; anything
        # else in that position (text, comment, ...) means "none".
        parent = self.get_parent()
        if parent.children and isinstance(parent.children[-1], Element):
            return parent.children[-1]
        return None

    def append_element_child(self, child: Element) -> None:
        """Attach a just-closed element (or its Fragment form) to the current parent."""
        parent = self.get_parent()
        node: Element | Fragment = child
        # Special case: an element carrying the placeholder fragment tag stands
        # for the custom `<>...</>` syntax, so convert it to a Fragment node.
        if child.tag == _FRAGMENT_TAG:
            assert not child.attrs, (
                "Fragment elements should never be able to have attributes."
            )
            node = Fragment(children=child.children)
        parent.children.append(node)

    def append_child(self, child: Fragment | Text | Comment | DocumentType) -> None:
        """Attach a non-element node to the current parent."""
        parent = self.get_parent()
        parent.children.append(child)

    def close(self) -> None:
        """
        Finish parsing and verify that every opened element was closed.

        Raises:
            ValueError: If any element is still open once all input has been
                processed.
        """
        # Flush first. HTMLParser.close() processes any input that is still
        # buffered, which can invoke the handle_* methods and so change the
        # stack. Checking the stack before the flush could miss an element
        # opened during it, leaving the problem to surface later in get_node().
        super().close()
        if self.stack:
            raise ValueError("Invalid HTML structure: unclosed tags remain.")

    def get_node(self) -> Node:
        """Get the Node tree parsed from the input HTML."""
        # CONSIDER: Should we invert things and offer streaming parsing?
        assert not self.stack, "Did you forget to call close()?"
        if len(self.root.children) > 1:
            # The parse structure results in multiple root elements, so we
            # return a Fragment to hold them all.
            return self.root
        elif len(self.root.children) == 1:
            # The parse structure results in a single root element, so we
            # return that element directly. This will be a non-Fragment Node.
            return self.root.children[0]
        else:
            # Special case: the parse structure is empty; we treat
            # this as an empty Text Node.
            return Text("")

    def feed(self, data: str) -> None:
        """Feed a chunk of HTML to the parser, translating `<>` / `</>` first."""
        # Special case: handle custom fragment syntax <>...</>
        # by replacing it with a unique tag name that is unlikely
        # to appear in normal HTML.
        data = data.replace("<>", f"<{_FRAGMENT_TAG}>").replace(
            "</>", f"</{_FRAGMENT_TAG}>"
        )
        super().feed(data)
def parse_html(input: str | t.Iterable[str]) -> Node:
    """
    Build a Node tree from HTML given as one string or as a series of chunks.

    A plain string is handed to the parser in a single call. Any other
    iterable is consumed in order, with one ``feed()`` call per item; pass
    chunks when particular pieces of text should stay separate nodes in
    the resulting tree.
    """
    # Normalise to an iterable of chunks so both input shapes share one loop.
    chunks: t.Iterable[str] = (input,) if isinstance(input, str) else input

    node_parser = NodeParser()
    for piece in chunks:
        node_parser.feed(piece)
    node_parser.close()

    return node_parser.get_node()