Coverage for tdom/parser.py: 100%

88 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-31 17:14 +0000

1import random 

2import string 

3import typing as t 

4from html.parser import HTMLParser 

5 

6from markupsafe import Markup 

7 

8from .nodes import ( 

9 CONTENT_ELEMENTS, 

10 VOID_ELEMENTS, 

11 Comment, 

12 DocumentType, 

13 Element, 

14 Fragment, 

15 Node, 

16 Text, 

17) 

18 

# Placeholder tag name substituted for the custom fragment syntax ``<>...</>``
# before the text reaches HTMLParser. Four random lowercase letters are mixed
# in each time the module is loaded so the name is unlikely to appear in
# ordinary HTML.
_FRAGMENT_TAG = "t🐍f-" + "".join(random.choices(string.ascii_lowercase, k=4)) + "-"

20 

21 

class NodeParser(HTMLParser):
    """Build a tree of nodes from the callbacks issued by ``HTMLParser``.

    Elements that have been opened but not yet closed are kept on ``stack``.
    Every finished node is appended to the children of the innermost open
    element or, when no element is open, to ``root``.
    """

    # Container for everything parsed while no element is open.
    root: Fragment
    # Currently open elements, innermost last.
    stack: list[Element]

    def __init__(self) -> None:
        super().__init__()
        self.root = Fragment(children=[])
        self.stack = []

    def handle_starttag(
        self, tag: str, attrs: t.Sequence[tuple[str, str | None]]
    ) -> None:
        """Open a new element for ``tag`` and push it onto the stack.

        ``attrs`` is converted with ``dict()``, so when an attribute name is
        repeated the last occurrence is the one that is kept.
        """
        node = Element(tag, attrs=dict(attrs), children=[])
        self.stack.append(node)

        # Unfortunately, Python's built-in HTMLParser has inconsistent behavior
        # with void elements. In particular, it calls handle_endtag() for them
        # only if they explicitly self-close (e.g., <br />). But in the HTML
        # spec itself, *there is no distinction* between <br> and <br />.
        # So we need to handle this case ourselves.
        #
        # See https://github.com/python/cpython/issues/69445
        if tag in VOID_ELEMENTS:
            # Always close void elements right away. If the input happens to
            # self-close the tag, handle_endtag() is effectively called a
            # second time; that second call is ignored there.
            self.handle_endtag(tag)

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost open element and attach it to its parent.

        Raises:
            ValueError: if no element is open, or if the innermost open
                element's tag differs from ``tag``.
        """
        if tag in VOID_ELEMENTS:
            # Special case: handle Python issue #69445 (see handle_starttag).
            most_recent_closed = self.get_most_recent_closed_element()
            if most_recent_closed is not None and most_recent_closed.tag == tag:
                # Ignore this call; the element has already been closed.
                return
            open_element = self.get_open_element()
            if open_element is not None and open_element.tag == tag:
                self.stack.pop()
                self.append_element_child(open_element)
                return
            # Otherwise fall through: a void end tag that matches neither case
            # is validated like any other closing tag.

        if not self.stack:
            raise ValueError(f"Unexpected closing tag </{tag}> with no open element.")

        element = self.stack.pop()
        if element.tag != tag:
            raise ValueError(f"Mismatched closing tag </{tag}> for <{element.tag}>.")

        self.append_element_child(element)

    def handle_data(self, data: str) -> None:
        """Append ``data`` as a Text node to the current parent.

        Inside a content element (see ``in_content_element``) the text is
        wrapped in ``Markup``; everywhere else it is stored as a plain string.
        """
        text = Text(Markup(data) if self.in_content_element() else data)
        self.append_child(text)

    def handle_comment(self, data: str) -> None:
        """Append a Comment node holding ``data`` to the current parent."""
        self.append_child(Comment(data))

    def handle_decl(self, decl: str) -> None:
        """Append a DocumentType node for a DOCTYPE declaration.

        For simplicity, every other kind of declaration is ignored.
        """
        if decl.upper().startswith("DOCTYPE"):
            # Drop the 7-character "DOCTYPE" keyword and surrounding whitespace.
            doctype_content = decl[7:].strip()
            self.append_child(DocumentType(doctype_content))

    def in_content_element(self) -> bool:
        """Return True if the innermost open element is a content element."""
        open_element = self.get_open_element()
        return open_element is not None and open_element.tag in CONTENT_ELEMENTS

    def get_parent(self) -> Fragment | Element:
        """Return the node that new children should be added to.

        This is the innermost open element, or ``root`` when none is open.
        """
        return self.stack[-1] if self.stack else self.root

    def get_open_element(self) -> Element | None:
        """Return the innermost open Element, or None when none is open."""
        return self.stack[-1] if self.stack else None

    def get_most_recent_closed_element(self) -> Element | None:
        """Return the current parent's last child if it is an Element.

        Returns None when the parent has no children or when its last child
        is some other kind of node.
        """
        parent = self.get_parent()
        if parent.children and isinstance(parent.children[-1], Element):
            return parent.children[-1]
        return None

    def append_element_child(self, child: Element) -> None:
        """Attach a closed element to the current parent.

        An element whose tag is the fragment placeholder is replaced by a
        Fragment that carries the same children.
        """
        parent = self.get_parent()
        node: Element | Fragment = child
        if child.tag == _FRAGMENT_TAG:
            assert not child.attrs, (
                "Fragment elements should never be able to have attributes."
            )
            node = Fragment(children=child.children)
        parent.children.append(node)

    def append_child(self, child: Fragment | Text | Comment | DocumentType) -> None:
        """Attach a non-element node to the current parent."""
        parent = self.get_parent()
        parent.children.append(child)

    def close(self) -> None:
        """Finish parsing and verify that every opened element was closed.

        Raises:
            ValueError: if any element is still open once all input has been
                processed.
        """
        # Flush first: the base class close() forces processing of any input
        # that is still buffered, so the stack is only final after it returns.
        super().close()
        if self.stack:
            raise ValueError("Invalid HTML structure: unclosed tags remain.")

    def get_node(self) -> Node:
        """Get the Node tree parsed from the input HTML.

        Returns ``root`` itself when it holds several children, the single
        child when it holds exactly one, and an empty Text node when nothing
        was parsed.
        """
        # CONSIDER: Should we invert things and offer streaming parsing?
        assert not self.stack, "Did you forget to call close()?"
        if len(self.root.children) > 1:
            # Multiple top-level nodes: return the Fragment that holds them.
            return self.root
        elif len(self.root.children) == 1:
            # A single top-level node is returned directly, without a wrapper.
            return self.root.children[0]
        else:
            # Special case: nothing was parsed; treat it as an empty Text node.
            return Text("")

    def feed(self, data: str) -> None:
        """Feed a chunk of HTML to the parser.

        The custom fragment syntax ``<>...</>`` is rewritten to a placeholder
        tag before the chunk is handed to the base class. The rewrite is a
        plain text substitution applied to each chunk on its own, so it
        affects every ``<>`` / ``</>`` in the chunk and does not recognize a
        marker that is split across two calls.
        """
        data = data.replace("<>", f"<{_FRAGMENT_TAG}>").replace(
            "</>", f"</{_FRAGMENT_TAG}>"
        )
        super().feed(data)

153 

154 

def parse_html(input: str | t.Iterable[str]) -> Node:
    """
    Turn HTML text into a Node tree.

    ``input`` may be one string, which is parsed as a whole, or an iterable
    of string chunks, which are fed to the parser one after another in order.
    Supplying chunks is particularly useful when specific pieces of text
    should stay separate in the resulting Node tree.
    """
    if isinstance(input, str):
        # A lone string counts as one chunk; iterating it directly would
        # feed the parser one character at a time.
        chunks: t.Iterable[str] = (input,)
    else:
        chunks = input

    builder = NodeParser()
    for piece in chunks:
        builder.feed(piece)
    builder.close()
    return builder.get_node()