Coverage for tdom / escaping.py: 100%

37 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-03 21:23 +0000

1import re 

2 

3from markupsafe import escape as markup_escape 

4 

5from .protocols import HasHTMLDunder 

6 

# Re-export markupsafe's `escape` under the `escape_html_*` naming used by the
# other escapers in this module, so callers see one uniform API.
escape_html_text = markup_escape

8 

9 

# Entity replacements for the two markup-significant characters. These must be
# the escaped forms: substituting ">" for ">" would make every rule a no-op.
GT = "&gt;"
LT = "&lt;"


def escape_html_comment(text: str, allow_markup: bool = False) -> str:
    """Escape text injected into an HTML comment.

    The rules applied are the HTML restrictions on comment text: it must not
    start with ">" or "->", must not contain "<!--", "-->" or "--!>", and
    must not end with "<!-". Each offending "<" or ">" is replaced by its
    entity (``LT`` / ``GT``) so the comment cannot be closed or re-opened
    from inside.

    Args:
        text: The text to place inside the comment.
        allow_markup: When true and ``text`` satisfies ``HasHTMLDunder``,
            the result of ``text.__html__()`` is returned without escaping.

    Returns:
        The escaped text. Falsy input (e.g. the empty string) is returned
        unchanged.
    """
    if not text:
        return text
    if allow_markup and isinstance(text, HasHTMLDunder):
        return text.__html__()

    if not allow_markup and type(text) is not str:
        # String manipulation triggers regular html escapes on Markup,
        # so coerce any `str` subclass into a true `str` before we start
        # manipulating it.
        text = str(text)

    # - text must not start with the string ">"
    if text.startswith(">"):
        text = GT + text[1:]

    # - nor start with the string "->"
    if text.startswith("->"):
        text = "-" + GT + text[2:]

    # - nor contain the strings "<!--", "-->", or "--!>"
    text = text.replace("<!--", LT + "!--")
    text = text.replace("-->", "--" + GT)
    text = text.replace("--!>", "--!" + GT)

    # - nor end with the string "<!-".
    if text.endswith("<!-"):
        text = text[:-3] + LT + "!-"

    return text

45 

46 

# Pairs of (compiled pattern, replacement template) applied in order to text
# placed inside a <style> element.
# @NOTE: We use a group to preserve the case of the tagname, ie. StylE -> StylE
# @NOTE: A raw string is needed so `\g<tagname>` reaches `re` intact; in a
#        regular string the backslash would have to be doubled.
# @NOTE: The replacement spells the entities out literally. Emitting a real
#        "<" here would reproduce the closing tag and escape nothing.
STYLE_RES = (
    (
        re.compile("</(?P<tagname>style)>", re.IGNORECASE | re.ASCII),
        r"&lt;/\g<tagname>&gt;",
    ),
)

56 

57 

def escape_html_style(text: str, allow_markup: bool = False) -> str:
    """Escape text injected into an HTML style element.

    Every (pattern, replacement) pair in ``STYLE_RES`` is applied in order.
    When ``allow_markup`` is true and ``text`` satisfies ``HasHTMLDunder``,
    the result of ``text.__html__()`` is returned instead, unescaped.
    """
    wants_raw_markup = allow_markup and isinstance(text, HasHTMLDunder)
    if wants_raw_markup:
        return text.__html__()

    escaped = text
    for pattern, replacement in STYLE_RES:
        escaped = pattern.sub(replacement, escaped)
    return escaped

65 

66 

# Pairs of (compiled pattern, replacement template) applied in order to text
# placed inside a <script> element. All patterns match ASCII
# case-insensitively.
# @NOTE: Backslashes in a `re.sub` replacement template are unescaped by `re`
#        in ADDITION to python's own string-literal unescaping. A regular
#        string would therefore need four backslashes to emit one; with a raw
#        string two are enough.
# @NOTE: The `tagname` group preserves the case of the matched tag name,
#        ie. ScripT -> ScripT.
# @NOTE: Raw strings also let `\g<tagname>` reach `re` without doubling the
#        backslash.
SCRIPT_RES = tuple(
    (re.compile(pattern_text, re.IGNORECASE | re.ASCII), replacement)
    for pattern_text, replacement in (
        ("<!--", r"\\x3c!--"),
        ("<(?P<tagname>script)", r"\\x3c\g<tagname>"),
        ("</(?P<tagname>script)", r"\\x3c/\g<tagname>"),
    )
)

84 

85 

def escape_html_script(text: str, allow_markup: bool = False) -> str:
    r"""
    Escape text injected into an HTML script element.

    https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements

    (from link) The easiest and safest way to avoid the rather strange
    restrictions described in this section is to always escape an ASCII
    case-insensitive match for:
        - "<!--" as "\x3c!--"
        - "<script" as "\x3cscript"
        - "</script" as "\x3c/script"

    Every (pattern, replacement) pair in SCRIPT_RES is applied in order.
    When allow_markup is true and text satisfies HasHTMLDunder, the result
    of text.__html__() is returned instead, unescaped.
    """
    # NOTE: the docstring is raw on purpose; otherwise python would turn
    # each "\x3c" above into a literal "<" and the examples would be wrong.
    if allow_markup and isinstance(text, HasHTMLDunder):
        return text.__html__()
    for match_re, replace_text in SCRIPT_RES:
        text = match_re.sub(replace_text, text)
    return text