Coverage for tdom/parser.py: 98%

217 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 04:35 +0000

1from collections.abc import Sequence 

2from dataclasses import dataclass, field 

3from html.parser import HTMLParser 

4from string.templatelib import Interpolation, Template 

5 

6from .htmlspec import VOID_ELEMENTS 

7from .placeholders import PlaceholderConfig, PlaceholderState 

8from .template_utils import TemplateRef, combine_template_refs 

9from .tnodes import ( 

10 TAttribute, 

11 TComment, 

12 TComponent, 

13 TDocumentType, 

14 TElement, 

15 TFragment, 

16 TInterpolatedAttribute, 

17 TLiteralAttribute, 

18 TNode, 

19 TSpreadAttribute, 

20 TTemplatedAttribute, 

21 TText, 

22) 

23 

24type HTMLAttribute = tuple[str, str | None] 

25type HTMLAttributesDict = dict[str, str | None] 

26 

27 

28@dataclass 

29class OpenTElement: 

30 tag: str 

31 attrs: tuple[TAttribute, ...] 

32 children: list[TNode] = field(default_factory=list) 

33 

34 

35@dataclass 

36class OpenTFragment: 

37 children: list[TNode] = field(default_factory=list) 

38 

39 

40@dataclass 

41class OpenTComponent: 

42 start_i_index: int 

43 children_start_s_index: int 

44 """The strings index where the component's children template starts.""" 

45 offset_into_children_start_s: int 

46 """The offset INTO the starting string where the component's children template starts.""" 

47 attrs: tuple[TAttribute, ...] 

48 # @NOTE: The `children` are discarded after parsing and are just used to 

49 # track template consistency. If the component is processed and 

50 # returns its children template then that template will be 

51 # re-parsed (or pulled from the cache). 

52 children: list[TNode] = field(default_factory=list) 

53 

54 

55type OpenTag = OpenTElement | OpenTFragment | OpenTComponent 

56 

57 

58@dataclass 

59class SourceTracker: 

60 """Tracks source locations within a Template for error reporting.""" 

61 

62 # TODO: write utilities to generate complete error messages, with the 

63 # template itself in context and the relevant line/column underlined/etc. 

64 

65 template: Template 

66 # if i_index >= s_index, feeding an interpolation; 

67 # otherwise, when i_index < s_index, feeding a string. 

68 i_index: int = -1 # The current interpolation index. 

69 s_index: int = -1 # The current string index. 

70 

71 @property 

72 def interpolations(self) -> tuple[Interpolation, ...]: 

73 return self.template.interpolations 

74 

75 def advance_interpolation(self) -> int: 

76 """Call before processing an interpolation to move to the next one.""" 

77 self.i_index += 1 

78 return self.i_index 

79 

80 def advance_string(self) -> int: 

81 self.s_index += 1 

82 return self.s_index 

83 

84 def get_expression( 

85 self, i_index: int, fallback_prefix: str = "interpolation" 

86 ) -> str: 

87 """ 

88 Resolve an interpolation index to its original expression for error messages. 

89 Falls back to a synthetic expression if the original is empty. 

90 """ 

91 ip = self.interpolations[i_index] 

92 return ip.expression if ip.expression else f"{{{fallback_prefix}-{i_index}}}" 

93 

94 def format_starttag(self, i_index: int) -> str: 

95 """Format a component start tag for error messages.""" 

96 return self.get_expression(i_index, fallback_prefix="component-starttag") 

97 

98 

99class TemplateParser(HTMLParser): 

100 root: OpenTFragment 

101 stack: list[OpenTag] 

102 placeholders: PlaceholderState 

103 source: SourceTracker | None 

104 

105 def __init__(self, *, convert_charrefs: bool = True): 

106 # This calls HTMLParser.reset() which we override to set up our state. 

107 super().__init__(convert_charrefs=convert_charrefs) 

108 

109 # ------------------------------------------ 

110 # Parse state helpers 

111 # ------------------------------------------ 

112 

113 def get_parent(self) -> OpenTag: 

114 """Return the current parent node to which new children should be added.""" 

115 return self.stack[-1] if self.stack else self.root 

116 

117 def append_child(self, child: TNode) -> None: 

118 parent = self.get_parent() 

119 parent.children.append(child) 

120 

121 # ------------------------------------------ 

122 # Attribute Helpers 

123 # ------------------------------------------ 

124 

125 def make_tattr(self, attr: HTMLAttribute) -> TAttribute: 

126 """Build a TAttribute from a raw attribute tuple.""" 

127 

128 name, value = attr 

129 

130 name_ref = self.placeholders.remove_placeholders(name) 

131 value_ref = ( 

132 self.placeholders.remove_placeholders(value) if value is not None else None 

133 ) 

134 

135 if name_ref.is_literal: 

136 if value_ref is None or value_ref.is_literal: 

137 return TLiteralAttribute(name=name, value=value) 

138 elif value_ref.is_singleton: 

139 return TInterpolatedAttribute( 

140 name=name, value_i_index=value_ref.i_indexes[0] 

141 ) 

142 else: 

143 return TTemplatedAttribute(name=name, value_ref=value_ref) 

144 if value_ref is not None: 

145 raise ValueError( 

146 "Attribute names cannot contain interpolations if the value is also interpolated." 

147 ) 

148 if not name_ref.is_singleton: 

149 raise ValueError( 

150 "Spread attributes must have exactly one interpolation in the name." 

151 ) 

152 return TSpreadAttribute(i_index=name_ref.i_indexes[0]) 

153 

154 def make_tattrs(self, attrs: Sequence[HTMLAttribute]) -> tuple[TAttribute, ...]: 

155 """Build TAttributes from raw attribute tuples.""" 

156 return tuple(self.make_tattr(attr) for attr in attrs) 

157 

158 # ------------------------------------------ 

159 # Tag Helpers 

160 # ------------------------------------------ 

161 

162 def make_open_tag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> OpenTag: 

163 """Build an OpenTag from a raw tag and attribute tuples.""" 

164 tag_ref = self.placeholders.remove_placeholders(tag) 

165 

166 if tag_ref.is_literal: 

167 return OpenTElement(tag=tag, attrs=self.make_tattrs(attrs)) 

168 

169 if not tag_ref.is_singleton: 

170 raise ValueError( 

171 "Component element tags must have exactly one interpolation." 

172 ) 

173 

174 # HERE BE DRAGONS: the interpolation at i_index should be a 

175 # component callable. We do not check this in the parser, instead 

176 # relying on higher layers to validate types and render correctly. 

177 i_index = tag_ref.i_indexes[0] 

178 

179 # @NOTE: This must be stored when the tag is handled since it is 

180 # set based on when the template parts are fed in and otherwise 

181 # might be out of sync. 

182 # The starting s_index of the component's children template. Note that 

183 # this string either contains ">" or " />". It might not be 

184 # i_index + 1 because attributes WITHIN the component's tag might 

185 # contain interpolations causing the i_index (and s_index) to advance 

186 # arbitrarily. 

187 children_start_s_index = self.get_source().s_index 

188 

189 # @NOTE: This must be called when the tag is handled since it is 

190 # populated based on the most recently finished start tag. Otherwise 

191 # the value will be out of sync. 

192 starttag_text = self.get_starttag_text() 

193 if starttag_text is None: 

194 raise AssertionError( 

195 f"Expected startag_text to be set when parsing component at {i_index}." 

196 ) 

197 

198 tattrs = self.make_tattrs(attrs) 

199 

200 offset_into_children_start_s = self.compute_offset_into_children_start_s( 

201 start_i_index=i_index, 

202 tattrs=tattrs, 

203 config=self.placeholders.config, 

204 starttag_text=starttag_text, 

205 ) 

206 

207 return OpenTComponent( 

208 start_i_index=i_index, 

209 children_start_s_index=children_start_s_index, 

210 offset_into_children_start_s=offset_into_children_start_s, 

211 attrs=tattrs, 

212 ) 

213 

214 def compute_offset_into_children_start_s( 

215 self, 

216 start_i_index: int, 

217 tattrs: tuple[TAttribute, ...], 

218 config: PlaceholderConfig, 

219 starttag_text: str, 

220 ) -> int: 

221 """ 

222 Compute offset into "string" containing the start of children template. 

223 

224 @NOTE: This is to actually OFFLOAD work to the parser itself. If we try 

225 to "rebuild" the tag from the parse result we are bound to fail in some 

226 way(s). We essentially re-run the placeholder process but with content 

227 we KNOWN ends at the end of the starttag, ie. ">", because the parser 

228 told us that is where it ends (rather than trying to scan for ">" 

229 because ">" might be in literal tags). 

230 

231 Examples: 

232 

233 <{Comp}></{Comp}> -- len(">") 

234 <{Comp}>children</{Comp}> -- len(">") 

235 <{Comp} title="1>0">children</{Comp}> -- len(' title="1>0">') 

236 <{Comp} title="{'1>0'}">children</{Comp}> -- len('">') 

237 """ 

238 # Rebuild known interpolations in the starttag. 

239 known: set[int] = {start_i_index} # The component callable itself. 

240 for attr in tattrs: 

241 if isinstance(attr, TInterpolatedAttribute): 

242 known.add(attr.value_i_index) 

243 elif isinstance(attr, TSpreadAttribute): 

244 known.add(attr.i_index) 

245 elif isinstance(attr, TTemplatedAttribute): 

246 known.update(attr.value_ref.i_indexes) 

247 # Now re-remove those placeholders using the same config we used to 

248 # make them. 

249 temp_placeholders = PlaceholderState(known=known, config=config) 

250 tag_ref = temp_placeholders.remove_placeholders(starttag_text) 

251 if not temp_placeholders.is_empty: 

252 raise AssertionError( 

253 "There are extra placeholders still in the starttag_text." 

254 ) 

255 # Now the last string should terminate the starttag and end with ">" 

256 # So this length is the offset from the last interpolation to the start 

257 # of the children's leading string. 

258 return len(tag_ref.strings[-1]) 

259 

260 def finalize_tag( 

261 self, open_tag: OpenTag, endtag_i_index: int | None = None 

262 ) -> TNode: 

263 """Finalize an OpenTag into a TNode.""" 

264 match open_tag: 

265 case OpenTElement(tag=tag, attrs=attrs, children=children): 

266 return TElement(tag=tag, attrs=attrs, children=tuple(children)) 

267 case OpenTFragment(children=children): 

268 return TFragment(children=tuple(children)) 

269 case OpenTComponent( 

270 start_i_index=start_i_index, 

271 children_start_s_index=children_start_s_index, 

272 offset_into_children_start_s=offset_into_children_start_s, 

273 attrs=attrs, 

274 ): 

275 children_ref = self.extract_component_children_ref( 

276 start_i_index=start_i_index, 

277 endtag_i_index=endtag_i_index, 

278 children_start_s_index=children_start_s_index, 

279 offset_into_children_start_s=offset_into_children_start_s, 

280 template=self.get_source().template, 

281 ) 

282 return TComponent( 

283 start_i_index=start_i_index, 

284 end_i_index=endtag_i_index, 

285 children_ref=children_ref, 

286 attrs=attrs, 

287 ) 

288 

289 def extract_component_children_ref( 

290 self, 

291 start_i_index: int, 

292 endtag_i_index: int | None, 

293 children_start_s_index: int, 

294 offset_into_children_start_s: int, 

295 template: Template, 

296 ) -> TemplateRef: 

297 """ 

298 Extract the component children template from the entire template. 

299 

300 We use this template as a "key" into the cache to get the TNode tree. 

301 """ 

302 if start_i_index != endtag_i_index and endtag_i_index is not None: 

303 # CASE: <{Comp}>...</{Comp}> or <{Comp}></{Comp}> 

304 

305 # Use the interpolation index of the callable in the closing tag 

306 # preceding "string" index is always the same as an interpolation index 

307 # The "string" should look like this: "...</" 

308 children_end_s_index = endtag_i_index 

309 # Offset past the trailing part of the component's start tag to get to 

310 # where the first "string" of the children's template starts. 

311 leading = template.strings[children_start_s_index][ 

312 offset_into_children_start_s: 

313 ] 

314 if children_start_s_index == children_end_s_index: 

315 # CASE: Entire children template is a string, leading == trailing. 

316 leading = leading[: leading.rfind("</")] 

317 children_ref = TemplateRef(strings=(leading,), i_indexes=()) 

318 else: 

319 # CASE: Children template contains interpolations so the trailing 

320 # "string" will not be the same as the leading "string". 

321 trailing = template.strings[children_end_s_index] 

322 trailing = trailing[: trailing.rfind("</")] 

323 children_ref = TemplateRef( 

324 strings=( 

325 leading, 

326 *template.strings[ 

327 children_start_s_index + 1 : children_end_s_index 

328 ], 

329 trailing, 

330 ), 

331 i_indexes=tuple( 

332 range(children_start_s_index, children_end_s_index) 

333 ), 

334 ) 

335 else: 

336 # CASE: <{Comp} /> -- no children template 

337 children_ref = TemplateRef(strings=("",), i_indexes=()) 

338 return children_ref 

339 

340 def validate_end_tag(self, tag: str, open_tag: OpenTag) -> int | None: 

341 """Validate that closing tag matches open tag. Return component end index if applicable.""" 

342 assert self.source, "Parser source tracker not initialized." 

343 tag_ref = self.placeholders.remove_placeholders(tag) 

344 

345 match open_tag: 

346 case OpenTElement(): 

347 if not tag_ref.is_literal: 

348 raise ValueError( 

349 f"Component closing tag found for element <{open_tag.tag}>." 

350 ) 

351 if tag != open_tag.tag: 

352 raise ValueError( 

353 f"Mismatched closing tag </{tag}> for element <{open_tag.tag}>." 

354 ) 

355 return None 

356 

357 case OpenTFragment(): 

358 raise NotImplementedError("We do not support anonymous fragments.") 

359 

360 case OpenTComponent(start_i_index=start_i_index): 

361 if tag_ref.is_literal: 

362 raise ValueError( 

363 f"Mismatched closing tag </{tag}> for component starting at {self.source.format_starttag(start_i_index)}." 

364 ) 

365 if not tag_ref.is_singleton: 

366 raise ValueError( 

367 "Component end tags must have exactly one interpolation." 

368 ) 

369 # HERE BE DRAGONS: the interpolation at end_i_index shuld be a 

370 # component callable that matches the start tag. We do not check 

371 # any of this in the parser, instead relying on higher layers. 

372 return tag_ref.i_indexes[0] 

373 

374 # ------------------------------------------ 

375 # HTMLParser tag callbacks 

376 # ------------------------------------------ 

377 

378 def handle_starttag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> None: 

379 open_tag = self.make_open_tag(tag, attrs) 

380 if isinstance(open_tag, OpenTElement) and open_tag.tag in VOID_ELEMENTS: 

381 final_tag = self.finalize_tag(open_tag) 

382 self.append_child(final_tag) 

383 else: 

384 self.stack.append(open_tag) 

385 

386 def handle_startendtag(self, tag: str, attrs: Sequence[HTMLAttribute]) -> None: 

387 """Dispatch a self-closing tag, `<tag />` to specialized handlers.""" 

388 open_tag = self.make_open_tag(tag, attrs) 

389 final_tag = self.finalize_tag(open_tag) 

390 self.append_child(final_tag) 

391 

392 def handle_endtag(self, tag: str) -> None: 

393 if not self.stack: 

394 raise ValueError(f"Unexpected closing tag </{tag}> with no open tag.") 

395 

396 open_tag = self.stack.pop() 

397 endtag_i_index = self.validate_end_tag(tag, open_tag) 

398 final_tag = self.finalize_tag(open_tag, endtag_i_index) 

399 self.append_child(final_tag) 

400 

401 # ------------------------------------------ 

402 # HTMLParser other callbacks 

403 # ------------------------------------------ 

404 

405 def handle_data(self, data: str) -> None: 

406 ref = self.placeholders.remove_placeholders(data) 

407 parent = self.get_parent() 

408 if parent.children and isinstance(parent.children[-1], TText): 

409 parent.children[-1] = TText( 

410 ref=combine_template_refs(parent.children[-1].ref, ref) 

411 ) 

412 else: 

413 self.append_child(TText(ref=ref)) 

414 

415 def handle_comment(self, data: str) -> None: 

416 ref = self.placeholders.remove_placeholders(data) 

417 comment = TComment(ref) 

418 self.append_child(comment) 

419 

420 def handle_decl(self, decl: str) -> None: 

421 ref = self.placeholders.remove_placeholders(decl) 

422 if not ref.is_literal: 

423 raise ValueError("Interpolations are not allowed in declarations.") 

424 elif decl.upper().startswith("DOCTYPE "): 

425 doctype_content = decl[7:].strip() 

426 doctype = TDocumentType(doctype_content) 

427 self.append_child(doctype) 

428 else: 

429 raise NotImplementedError( 

430 "Only well formed DOCTYPE declarations are currently supported." 

431 ) 

432 

433 def reset(self): 

434 super().reset() 

435 self.root = OpenTFragment() 

436 self.stack = [] 

437 self.placeholders = PlaceholderState() 

438 self.source = None 

439 

440 def close(self) -> None: 

441 if self.waiting_for_data(): 

442 # We apply heuristics here to try to guess why the parser didn't finish. 

443 if self.rawdata.count('"') % 2 == 1 or self.rawdata.count("'") % 2 == 1: 

444 raise ValueError( 

445 "Parser expects more data, maybe you left an attribute quote unclosed?" 

446 ) 

447 else: 

448 raise ValueError( 

449 "Parser expects more data, is the template valid html?" 

450 ) 

451 if self.stack: 

452 raise ValueError("Invalid HTML structure: unclosed tags remain.") 

453 if not self.placeholders.is_empty: 

454 raise ValueError("Some placeholders were never resolved.") 

455 super().close() 

456 

457 def waiting_for_data(self): 

458 return len(self.rawdata) > 0 

459 

460 # ------------------------------------------ 

461 # Getting the parsed node tree 

462 # ------------------------------------------ 

463 

464 def get_tnode(self) -> TNode: 

465 """Get the Node tree parsed from the input HTML.""" 

466 # TODO: consider always returning a TTag? 

467 if len(self.root.children) > 1: 

468 # The parse structure results in multiple root elements, so we 

469 # return a Fragment to hold them all. 

470 return self.finalize_tag(self.root) 

471 elif len(self.root.children) == 1: 

472 # The parse structure results in a single root element, so we 

473 # return that element directly. This will be a non-Fragment Node. 

474 return self.root.children[0] 

475 else: 

476 # Special case: the parse structure is empty; we treat 

477 # this as an empty document fragment. 

478 # CONSIDER: or as an empty text node? 

479 return self.finalize_tag(self.root) 

480 

481 # ------------------------------------------ 

482 # Feeding and parsing 

483 # ------------------------------------------ 

484 

485 def get_source(self) -> SourceTracker: 

486 if self.source is None: 

487 raise AssertionError("Source has not been initialized.") 

488 return self.source 

489 

490 def feed_str(self, s: str) -> None: 

491 """Feed a string part of a Template to the parser.""" 

492 self.feed(s) 

493 

494 def feed_interpolation(self, index: int) -> None: 

495 placeholder = self.placeholders.add_placeholder(index) 

496 self.feed(placeholder) 

497 

498 def feed_template(self, template: Template) -> None: 

499 """Feed a Template's content to the parser.""" 

500 assert self.source is None, "Did you forget to call reset?" 

501 self.source = SourceTracker(template) 

502 for i_index in range(len(template.interpolations)): 

503 self.source.advance_string() 

504 self.feed_str(template.strings[i_index]) 

505 self.source.advance_interpolation() 

506 self.feed_interpolation(i_index) 

507 self.source.advance_string() 

508 self.feed_str(template.strings[-1]) 

509 

510 @staticmethod 

511 def parse(t: Template) -> TNode: 

512 """ 

513 Parse a Template containing valid HTML and substitutions and return 

514 a TNode tree representing its structure. This cachable structure can later 

515 be resolved against actual interpolation values to produce a Node tree. 

516 """ 

517 parser = TemplateParser() 

518 parser.feed_template(t) 

519 parser.close() 

520 return parser.get_tnode()