Coverage for tdom/parser.py: 100%

84 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-17 19:54 +0000

1import random 

2import string 

3import typing as t 

4from html.parser import HTMLParser 

5 

6from .nodes import VOID_ELEMENTS, Comment, DocumentType, Element, Fragment, Node, Text 

7 

# Stand-in tag name that NodeParser.feed() substitutes for the custom
# fragment syntax ``<>`` / ``</>``, which HTMLParser cannot represent as a
# tag on its own. The four random lowercase letters are drawn once, when this
# module is imported, so the name is unlikely to collide with a tag that
# appears in normal HTML.
_FRAGMENT_TAG = f"t🐍f-{''.join(random.choices(string.ascii_lowercase, k=4))}-"

9 

10 

class NodeParser(HTMLParser):
    """
    HTMLParser subclass that assembles the parsed input into a Node tree.

    Elements that have been opened but not yet closed are kept on ``stack``
    (innermost last). Finished nodes are appended to the children of the
    innermost open element or, when nothing is open, to ``root``.

    Typical use: call feed() one or more times, then close(), then
    get_node() to retrieve the result.
    """

    # Top-level container that collects every node closed at depth zero.
    root: Fragment
    # Currently open elements, outermost first.
    stack: list[Element]

    def __init__(self):
        super().__init__()
        self.root = Fragment(children=[])
        self.stack = []

    def handle_starttag(
        self, tag: str, attrs: t.Sequence[tuple[str, str | None]]
    ) -> None:
        """Open a new Element for ``tag`` and push it onto the stack."""
        node = Element(tag, attrs=dict(attrs), children=[])
        self.stack.append(node)

        # Unfortunately, Python's built-in HTMLParser has inconsistent behavior
        # with void elements. In particular, it calls handle_endtag() for them
        # only if they explicitly self-close (e.g., <br />). But in the HTML
        # spec itself, *there is no distinction* between <br> and <br />.
        # So we need to handle this case ourselves.
        #
        # See https://github.com/python/cpython/issues/69445
        if tag in VOID_ELEMENTS:
            # Always call handle_endtag for void elements. If it happens
            # to be self-closed in the input, handle_endtag() will effectively
            # be called twice. We ignore the second call there.
            self.handle_endtag(tag)

    def handle_endtag(self, tag: str) -> None:
        """
        Close the innermost open element and attach it to its parent.

        Raises ValueError if no element is open, or if the innermost open
        element does not have the tag name ``tag``.
        """
        if tag in VOID_ELEMENTS:
            # Special case: handle Python issue #69445 (see the comment in
            # handle_starttag()).
            most_recent_closed = self.get_most_recent_closed_element()
            if most_recent_closed and most_recent_closed.tag == tag:
                # Ignore this call; we've already closed it.
                return
            open_element = self.get_open_element()
            if open_element and open_element.tag == tag:
                self.stack.pop()
                self.append_element_child(open_element)
                return
            # Otherwise fall through to the generic checks below.

        if not self.stack:
            raise ValueError(f"Unexpected closing tag </{tag}> with no open element.")

        element = self.stack.pop()
        if element.tag != tag:
            raise ValueError(f"Mismatched closing tag </{tag}> for <{element.tag}>.")

        self.append_element_child(element)

    def handle_data(self, data: str) -> None:
        """Append ``data`` to the current parent as a Text node."""
        self.append_child(Text(data))

    def handle_comment(self, data: str) -> None:
        """Append ``data`` to the current parent as a Comment node."""
        self.append_child(Comment(data))

    def handle_decl(self, decl: str) -> None:
        """Append a DocumentType node for a DOCTYPE declaration.

        The keyword is matched case-insensitively. For simplicity, any
        other declaration is ignored.
        """
        keyword = "DOCTYPE"
        if decl.upper().startswith(keyword):
            # Everything after the keyword, minus surrounding whitespace.
            doctype_content = decl[len(keyword):].strip()
            self.append_child(DocumentType(doctype_content))

    def get_parent(self) -> Fragment | Element:
        """Return the current parent node to which new children should be added."""
        return self.stack[-1] if self.stack else self.root

    def get_open_element(self) -> Element | None:
        """Return the currently open Element, if any."""
        return self.stack[-1] if self.stack else None

    def get_most_recent_closed_element(self) -> Element | None:
        """Return the most recently closed Element, if any.

        This is the last child of the current parent, provided that child
        is an Element; otherwise None.
        """
        parent = self.get_parent()
        if parent.children and isinstance(parent.children[-1], Element):
            return parent.children[-1]
        return None

    def append_element_child(self, child: Element) -> None:
        """Attach a closed element to the current parent.

        An element whose tag is the fragment stand-in tag is replaced by a
        Fragment holding the same children.
        """
        parent = self.get_parent()
        node: Element | Fragment = child
        # Special case: if the element is a Fragment, convert it to a Fragment node.
        if child.tag == _FRAGMENT_TAG:
            assert not child.attrs, (
                "Fragment elements should never be able to have attributes."
            )
            node = Fragment(children=child.children)
        parent.children.append(node)

    def append_child(self, child: Fragment | Text | Comment | DocumentType) -> None:
        """Attach a non-element node to the current parent."""
        self.get_parent().children.append(child)

    def close(self) -> None:
        """Finish parsing.

        Raises ValueError if any element is still open; in that case the
        base class's close() is not called.
        """
        if self.stack:
            raise ValueError("Invalid HTML structure: unclosed tags remain.")
        super().close()

    def get_node(self) -> Node:
        """Get the Node tree parsed from the input HTML.

        Raises AssertionError if elements are still open.
        """
        # CONSIDER: Should we invert things and offer streaming parsing?
        #
        # Raised explicitly rather than via an ``assert`` statement so the
        # check is not stripped when Python runs with -O.
        if self.stack:
            raise AssertionError("Did you forget to call close()?")

        children = self.root.children
        if len(children) > 1:
            # The parse structure results in multiple root elements, so we
            # return a Fragment to hold them all.
            return self.root
        if len(children) == 1:
            # The parse structure results in a single root element, so we
            # return that element directly. This will be a non-Fragment Node.
            return children[0]
        # Special case: the parse structure is empty; we treat
        # this as an empty Text Node.
        return Text("")

    def feed(self, data: str) -> None:
        """Feed a chunk of HTML to the parser.

        The custom fragment syntax ``<>...</>`` is rewritten to use the
        stand-in fragment tag before the chunk reaches the base parser.
        """
        # Special case: handle custom fragment syntax <>...</>
        # by replacing it with a unique tag name that is unlikely
        # to appear in normal HTML.
        #
        # NOTE: this is a plain textual substitution applied to each chunk
        # independently, so it also rewrites "<>" / "</>" wherever they occur
        # in the chunk, and it does not recognize a marker that is split
        # across two feed() calls.
        data = data.replace("<>", f"<{_FRAGMENT_TAG}>").replace(
            "</>", f"</{_FRAGMENT_TAG}>"
        )
        super().feed(data)

137 

138 

def parse_html(input: str | t.Iterable[str]) -> Node:
    """
    Build a Node tree from HTML given as one string or as several chunks.

    A lone string is handed to the parser in a single call. Any other
    iterable is consumed in order, one feed per item, which lets callers
    keep particular pieces of text as separate nodes in the resulting tree.
    """
    # Normalize to "an iterable of chunks" so both input shapes share a path.
    chunks = (input,) if isinstance(input, str) else input

    html_parser = NodeParser()
    for piece in chunks:
        html_parser.feed(piece)

    # close() must precede get_node(); it rejects input with unclosed tags.
    html_parser.close()
    return html_parser.get_node()