Coverage for tdom/parser.py: 98%
217 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 04:35 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 04:35 +0000
1from collections.abc import Sequence
2from dataclasses import dataclass, field
3from html.parser import HTMLParser
4from string.templatelib import Interpolation, Template
6from .htmlspec import VOID_ELEMENTS
7from .placeholders import PlaceholderConfig, PlaceholderState
8from .template_utils import TemplateRef, combine_template_refs
9from .tnodes import (
10 TAttribute,
11 TComment,
12 TComponent,
13 TDocumentType,
14 TElement,
15 TFragment,
16 TInterpolatedAttribute,
17 TLiteralAttribute,
18 TNode,
19 TSpreadAttribute,
20 TTemplatedAttribute,
21 TText,
22)
# A raw attribute as a (name, value) pair; this is the element type of the
# `attrs` sequences received by handle_starttag()/handle_startendtag() below.
# The value may be None (presumably an attribute written without "=value" --
# confirm against the html.parser documentation).
24type HTMLAttribute = tuple[str, str | None]
# Mapping form of the same data (attribute name -> value or None). It is not
# referenced in the visible part of this file.
25type HTMLAttributesDict = dict[str, str | None]
@dataclass
class OpenTElement:
    """An element that has been opened but not yet finalized into a TElement."""

    # Tag name of the element.
    tag: str
    # Attributes of the start tag, already converted to TAttribute nodes.
    attrs: tuple[TAttribute, ...]
    # Children gathered so far; each instance starts with its own empty list.
    children: list[TNode] = field(default_factory=list)
@dataclass
class OpenTFragment:
    """A fragment under construction: nothing but an ordered list of children."""

    # Children gathered so far; each instance starts with its own empty list.
    children: list[TNode] = field(default_factory=list)
@dataclass
class OpenTComponent:
    """A component whose start tag has been handled but which is not yet finalized."""

    # Interpolation index taken from the component's start tag.
    start_i_index: int

    # The strings index where the component's children template starts.
    children_start_s_index: int

    # The offset INTO the starting string where the component's children
    # template starts.
    offset_into_children_start_s: int

    # Attributes of the start tag, already converted to TAttribute nodes.
    attrs: tuple[TAttribute, ...]

    # @NOTE: The `children` are discarded after parsing and are just used to
    # track template consistency. If the component is processed and
    # returns its children template then that template will be
    # re-parsed (or pulled from the cache).
    children: list[TNode] = field(default_factory=list)
# Any tag that has been opened but not yet finalized. This is the element type
# of TemplateParser.stack and the input type of TemplateParser.finalize_tag().
55type OpenTag = OpenTElement | OpenTFragment | OpenTComponent
@dataclass
class SourceTracker:
    """Tracks source locations within a Template for error reporting."""

    # TODO: write utilities to generate complete error messages, with the
    #       template itself in context and the relevant line/column underlined/etc.

    template: Template
    # Both cursors start at -1, i.e. before anything has been fed.
    # if i_index >= s_index, feeding an interpolation;
    # otherwise, when i_index < s_index, feeding a string.
    i_index: int = -1  # The current interpolation index.
    s_index: int = -1  # The current string index.

    @property
    def interpolations(self) -> tuple[Interpolation, ...]:
        """Shortcut for the interpolations of the tracked template."""
        return self.template.interpolations

    def advance_interpolation(self) -> int:
        """Call before processing an interpolation to move to the next one."""
        current = self.i_index + 1
        self.i_index = current
        return current

    def advance_string(self) -> int:
        """Move the string cursor forward by one and return its new value."""
        current = self.s_index + 1
        self.s_index = current
        return current

    def get_expression(
        self, i_index: int, fallback_prefix: str = "interpolation"
    ) -> str:
        """
        Resolve an interpolation index to its original expression for error messages.

        When the interpolation's expression is empty, a synthetic
        "{<fallback_prefix>-<i_index>}" string is returned instead.
        """
        expression = self.interpolations[i_index].expression
        if expression:
            return expression
        return f"{{{fallback_prefix}-{i_index}}}"

    def format_starttag(self, i_index: int) -> str:
        """Format a component start tag for error messages."""
        return self.get_expression(i_index, "component-starttag")
99class TemplateParser(HTMLParser):
100 root: OpenTFragment
101 stack: list[OpenTag]
102 placeholders: PlaceholderState
103 source: SourceTracker | None
105 def __init__(self, *, convert_charrefs: bool = True):
106 # This calls HTMLParser.reset() which we override to set up our state.
107 super().__init__(convert_charrefs=convert_charrefs)
109 # ------------------------------------------
110 # Parse state helpers
111 # ------------------------------------------
113 def get_parent(self) -> OpenTag:
114 """Return the current parent node to which new children should be added."""
115 return self.stack[-1] if self.stack else self.root
117 def append_child(self, child: TNode) -> None:
118 parent = self.get_parent()
119 parent.children.append(child)
121 # ------------------------------------------
122 # Attribute Helpers
123 # ------------------------------------------
125 def make_tattr(self, attr: HTMLAttribute) -> TAttribute:
126 """Build a TAttribute from a raw attribute tuple."""
128 name, value = attr
130 name_ref = self.placeholders.remove_placeholders(name)
131 value_ref = (
132 self.placeholders.remove_placeholders(value) if value is not None else None
133 )
135 if name_ref.is_literal:
136 if value_ref is None or value_ref.is_literal:
137 return TLiteralAttribute(name=name, value=value)
138 elif value_ref.is_singleton:
139 return TInterpolatedAttribute(
140 name=name, value_i_index=value_ref.i_indexes[0]
141 )
142 else:
143 return TTemplatedAttribute(name=name, value_ref=value_ref)
144 if value_ref is not None:
145 raise ValueError(
146 "Attribute names cannot contain interpolations if the value is also interpolated."
147 )
148 if not name_ref.is_singleton:
149 raise ValueError(
150 "Spread attributes must have exactly one interpolation in the name."
151 )
152 return TSpreadAttribute(i_index=name_ref.i_indexes[0])
154 def make_tattrs(self, attrs: Sequence[HTMLAttribute]) -> tuple[TAttribute, ...]:
155 """Build TAttributes from raw attribute tuples."""
156 return tuple(self.make_tattr(attr) for attr in attrs)
158 # ------------------------------------------
159 # Tag Helpers
160 # ------------------------------------------
162 def make_open_tag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> OpenTag:
163 """Build an OpenTag from a raw tag and attribute tuples."""
164 tag_ref = self.placeholders.remove_placeholders(tag)
166 if tag_ref.is_literal:
167 return OpenTElement(tag=tag, attrs=self.make_tattrs(attrs))
169 if not tag_ref.is_singleton:
170 raise ValueError(
171 "Component element tags must have exactly one interpolation."
172 )
174 # HERE BE DRAGONS: the interpolation at i_index should be a
175 # component callable. We do not check this in the parser, instead
176 # relying on higher layers to validate types and render correctly.
177 i_index = tag_ref.i_indexes[0]
179 # @NOTE: This must be stored when the tag is handled since it is
180 # set based on when the template parts are fed in and otherwise
181 # might be out of sync.
182 # The starting s_index of the component's children template. Note that
183 # this string either contains ">" or " />". It might not be
184 # i_index + 1 because attributes WITHIN the component's tag might
185 # contain interpolations causing the i_index (and s_index) to advance
186 # arbitrarily.
187 children_start_s_index = self.get_source().s_index
189 # @NOTE: This must be called when the tag is handled since it is
190 # populated based on the most recently finished start tag. Otherwise
191 # the value will be out of sync.
192 starttag_text = self.get_starttag_text()
193 if starttag_text is None:
194 raise AssertionError(
195 f"Expected startag_text to be set when parsing component at {i_index}."
196 )
198 tattrs = self.make_tattrs(attrs)
200 offset_into_children_start_s = self.compute_offset_into_children_start_s(
201 start_i_index=i_index,
202 tattrs=tattrs,
203 config=self.placeholders.config,
204 starttag_text=starttag_text,
205 )
207 return OpenTComponent(
208 start_i_index=i_index,
209 children_start_s_index=children_start_s_index,
210 offset_into_children_start_s=offset_into_children_start_s,
211 attrs=tattrs,
212 )
214 def compute_offset_into_children_start_s(
215 self,
216 start_i_index: int,
217 tattrs: tuple[TAttribute, ...],
218 config: PlaceholderConfig,
219 starttag_text: str,
220 ) -> int:
221 """
222 Compute offset into "string" containing the start of children template.
224 @NOTE: This is to actually OFFLOAD work to the parser itself. If we try
225 to "rebuild" the tag from the parse result we are bound to fail in some
226 way(s). We essentially re-run the placeholder process but with content
227 we KNOWN ends at the end of the starttag, ie. ">", because the parser
228 told us that is where it ends (rather than trying to scan for ">"
229 because ">" might be in literal tags).
231 Examples:
233 <{Comp}></{Comp}> -- len(">")
234 <{Comp}>children</{Comp}> -- len(">")
235 <{Comp} title="1>0">children</{Comp}> -- len(' title="1>0">')
236 <{Comp} title="{'1>0'}">children</{Comp}> -- len('">')
237 """
238 # Rebuild known interpolations in the starttag.
239 known: set[int] = {start_i_index} # The component callable itself.
240 for attr in tattrs:
241 if isinstance(attr, TInterpolatedAttribute):
242 known.add(attr.value_i_index)
243 elif isinstance(attr, TSpreadAttribute):
244 known.add(attr.i_index)
245 elif isinstance(attr, TTemplatedAttribute):
246 known.update(attr.value_ref.i_indexes)
247 # Now re-remove those placeholders using the same config we used to
248 # make them.
249 temp_placeholders = PlaceholderState(known=known, config=config)
250 tag_ref = temp_placeholders.remove_placeholders(starttag_text)
251 if not temp_placeholders.is_empty:
252 raise AssertionError(
253 "There are extra placeholders still in the starttag_text."
254 )
255 # Now the last string should terminate the starttag and end with ">"
256 # So this length is the offset from the last interpolation to the start
257 # of the children's leading string.
258 return len(tag_ref.strings[-1])
260 def finalize_tag(
261 self, open_tag: OpenTag, endtag_i_index: int | None = None
262 ) -> TNode:
263 """Finalize an OpenTag into a TNode."""
264 match open_tag:
265 case OpenTElement(tag=tag, attrs=attrs, children=children):
266 return TElement(tag=tag, attrs=attrs, children=tuple(children))
267 case OpenTFragment(children=children):
268 return TFragment(children=tuple(children))
269 case OpenTComponent(
270 start_i_index=start_i_index,
271 children_start_s_index=children_start_s_index,
272 offset_into_children_start_s=offset_into_children_start_s,
273 attrs=attrs,
274 ):
275 children_ref = self.extract_component_children_ref(
276 start_i_index=start_i_index,
277 endtag_i_index=endtag_i_index,
278 children_start_s_index=children_start_s_index,
279 offset_into_children_start_s=offset_into_children_start_s,
280 template=self.get_source().template,
281 )
282 return TComponent(
283 start_i_index=start_i_index,
284 end_i_index=endtag_i_index,
285 children_ref=children_ref,
286 attrs=attrs,
287 )
289 def extract_component_children_ref(
290 self,
291 start_i_index: int,
292 endtag_i_index: int | None,
293 children_start_s_index: int,
294 offset_into_children_start_s: int,
295 template: Template,
296 ) -> TemplateRef:
297 """
298 Extract the component children template from the entire template.
300 We use this template as a "key" into the cache to get the TNode tree.
301 """
302 if start_i_index != endtag_i_index and endtag_i_index is not None:
303 # CASE: <{Comp}>...</{Comp}> or <{Comp}></{Comp}>
305 # Use the interpolation index of the callable in the closing tag
306 # preceding "string" index is always the same as an interpolation index
307 # The "string" should look like this: "...</"
308 children_end_s_index = endtag_i_index
309 # Offset past the trailing part of the component's start tag to get to
310 # where the first "string" of the children's template starts.
311 leading = template.strings[children_start_s_index][
312 offset_into_children_start_s:
313 ]
314 if children_start_s_index == children_end_s_index:
315 # CASE: Entire children template is a string, leading == trailing.
316 leading = leading[: leading.rfind("</")]
317 children_ref = TemplateRef(strings=(leading,), i_indexes=())
318 else:
319 # CASE: Children template contains interpolations so the trailing
320 # "string" will not be the same as the leading "string".
321 trailing = template.strings[children_end_s_index]
322 trailing = trailing[: trailing.rfind("</")]
323 children_ref = TemplateRef(
324 strings=(
325 leading,
326 *template.strings[
327 children_start_s_index + 1 : children_end_s_index
328 ],
329 trailing,
330 ),
331 i_indexes=tuple(
332 range(children_start_s_index, children_end_s_index)
333 ),
334 )
335 else:
336 # CASE: <{Comp} /> -- no children template
337 children_ref = TemplateRef(strings=("",), i_indexes=())
338 return children_ref
340 def validate_end_tag(self, tag: str, open_tag: OpenTag) -> int | None:
341 """Validate that closing tag matches open tag. Return component end index if applicable."""
342 assert self.source, "Parser source tracker not initialized."
343 tag_ref = self.placeholders.remove_placeholders(tag)
345 match open_tag:
346 case OpenTElement():
347 if not tag_ref.is_literal:
348 raise ValueError(
349 f"Component closing tag found for element <{open_tag.tag}>."
350 )
351 if tag != open_tag.tag:
352 raise ValueError(
353 f"Mismatched closing tag </{tag}> for element <{open_tag.tag}>."
354 )
355 return None
357 case OpenTFragment():
358 raise NotImplementedError("We do not support anonymous fragments.")
360 case OpenTComponent(start_i_index=start_i_index):
361 if tag_ref.is_literal:
362 raise ValueError(
363 f"Mismatched closing tag </{tag}> for component starting at {self.source.format_starttag(start_i_index)}."
364 )
365 if not tag_ref.is_singleton:
366 raise ValueError(
367 "Component end tags must have exactly one interpolation."
368 )
369 # HERE BE DRAGONS: the interpolation at end_i_index shuld be a
370 # component callable that matches the start tag. We do not check
371 # any of this in the parser, instead relying on higher layers.
372 return tag_ref.i_indexes[0]
374 # ------------------------------------------
375 # HTMLParser tag callbacks
376 # ------------------------------------------
378 def handle_starttag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> None:
379 open_tag = self.make_open_tag(tag, attrs)
380 if isinstance(open_tag, OpenTElement) and open_tag.tag in VOID_ELEMENTS:
381 final_tag = self.finalize_tag(open_tag)
382 self.append_child(final_tag)
383 else:
384 self.stack.append(open_tag)
386 def handle_startendtag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> None:
387 """Dispatch a self-closing tag, `<tag />` to specialized handlers."""
388 open_tag = self.make_open_tag(tag, attrs)
389 final_tag = self.finalize_tag(open_tag)
390 self.append_child(final_tag)
392 def handle_endtag(self, tag: str) -> None:
393 if not self.stack:
394 raise ValueError(f"Unexpected closing tag </{tag}> with no open tag.")
396 open_tag = self.stack.pop()
397 endtag_i_index = self.validate_end_tag(tag, open_tag)
398 final_tag = self.finalize_tag(open_tag, endtag_i_index)
399 self.append_child(final_tag)
401 # ------------------------------------------
402 # HTMLParser other callbacks
403 # ------------------------------------------
405 def handle_data(self, data: str) -> None:
406 ref = self.placeholders.remove_placeholders(data)
407 parent = self.get_parent()
408 if parent.children and isinstance(parent.children[-1], TText):
409 parent.children[-1] = TText(
410 ref=combine_template_refs(parent.children[-1].ref, ref)
411 )
412 else:
413 self.append_child(TText(ref=ref))
415 def handle_comment(self, data: str) -> None:
416 ref = self.placeholders.remove_placeholders(data)
417 comment = TComment(ref)
418 self.append_child(comment)
420 def handle_decl(self, decl: str) -> None:
421 ref = self.placeholders.remove_placeholders(decl)
422 if not ref.is_literal:
423 raise ValueError("Interpolations are not allowed in declarations.")
424 elif decl.upper().startswith("DOCTYPE "):
425 doctype_content = decl[7:].strip()
426 doctype = TDocumentType(doctype_content)
427 self.append_child(doctype)
428 else:
429 raise NotImplementedError(
430 "Only well formed DOCTYPE declarations are currently supported."
431 )
433 def reset(self):
434 super().reset()
435 self.root = OpenTFragment()
436 self.stack = []
437 self.placeholders = PlaceholderState()
438 self.source = None
440 def close(self) -> None:
441 if self.waiting_for_data():
442 # We apply heuristics here to try to guess why the parser didn't finish.
443 if self.rawdata.count('"') % 2 == 1 or self.rawdata.count("'") % 2 == 1:
444 raise ValueError(
445 "Parser expects more data, maybe you left an attribute quote unclosed?"
446 )
447 else:
448 raise ValueError(
449 "Parser expects more data, is the template valid html?"
450 )
451 if self.stack:
452 raise ValueError("Invalid HTML structure: unclosed tags remain.")
453 if not self.placeholders.is_empty:
454 raise ValueError("Some placeholders were never resolved.")
455 super().close()
457 def waiting_for_data(self):
458 return len(self.rawdata) > 0
460 # ------------------------------------------
461 # Getting the parsed node tree
462 # ------------------------------------------
464 def get_tnode(self) -> TNode:
465 """Get the Node tree parsed from the input HTML."""
466 # TODO: consider always returning a TTag?
467 if len(self.root.children) > 1:
468 # The parse structure results in multiple root elements, so we
469 # return a Fragment to hold them all.
470 return self.finalize_tag(self.root)
471 elif len(self.root.children) == 1:
472 # The parse structure results in a single root element, so we
473 # return that element directly. This will be a non-Fragment Node.
474 return self.root.children[0]
475 else:
476 # Special case: the parse structure is empty; we treat
477 # this as an empty document fragment.
478 # CONSIDER: or as an empty text node?
479 return self.finalize_tag(self.root)
481 # ------------------------------------------
482 # Feeding and parsing
483 # ------------------------------------------
485 def get_source(self) -> SourceTracker:
486 if self.source is None:
487 raise AssertionError("Source has not been initialized.")
488 return self.source
490 def feed_str(self, s: str) -> None:
491 """Feed a string part of a Template to the parser."""
492 self.feed(s)
494 def feed_interpolation(self, index: int) -> None:
495 placeholder = self.placeholders.add_placeholder(index)
496 self.feed(placeholder)
498 def feed_template(self, template: Template) -> None:
499 """Feed a Template's content to the parser."""
500 assert self.source is None, "Did you forget to call reset?"
501 self.source = SourceTracker(template)
502 for i_index in range(len(template.interpolations)):
503 self.source.advance_string()
504 self.feed_str(template.strings[i_index])
505 self.source.advance_interpolation()
506 self.feed_interpolation(i_index)
507 self.source.advance_string()
508 self.feed_str(template.strings[-1])
510 @staticmethod
511 def parse(t: Template) -> TNode:
512 """
513 Parse a Template containing valid HTML and substitutions and return
514 a TNode tree representing its structure. This cachable structure can later
515 be resolved against actual interpolation values to produce a Node tree.
516 """
517 parser = TemplateParser()
518 parser.feed_template(t)
519 parser.close()
520 return parser.get_tnode()