Html Module¶
The html module provides utilities for working with HTML content, including escaping and unescaping text.
Complexity Reference¶
| Operation | Time | Space | Notes |
|---|---|---|---|
| `escape(text)` | O(n) | O(n) | n = string length |
| `unescape(text)` | O(n) | O(n) | n = string length |
| `parser.HTMLParser()` | O(n) | O(n) | n = HTML size |
| `parser.feed(text)` | O(n) | O(n) | n = text length |
Common Operations¶
Escaping HTML¶
from html import escape
text = '<script>alert("XSS")</script>'
# O(n) where n = string length
escaped = escape(text)
# Returns: &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;
# Explicit quote=True - O(n); " and ' are escaped as well as &, < and >
escaped_with_quotes = escape(text, quote=True)
# Same result as above, because quote=True is the default
# Without quote escaping - O(n); only &, < and > are replaced
no_quote = escape('Hello "World"', quote=False)
# Returns: Hello "World" (the double quotes are left untouched)
Unescaping HTML¶
from html import unescape
# Input must actually contain character references for unescape() to do
# anything; here the markup and the ampersand are escaped.
escaped = '&lt;p&gt;Hello &amp; goodbye&lt;/p&gt;'
# O(n) where n = string length
unescaped = unescape(escaped)
# Returns: <p>Hello & goodbye</p>
# Common entities - O(n): two named, one decimal and one hexadecimal reference
entities = '&copy; &trade; &#169; &#xA9;'
result = unescape(entities)
# Returns: © ™ © ©
Common Use Cases¶
Sanitizing User Input¶
from html import escape
def sanitize_for_html(user_input):
    """Escape user input for safe HTML display - O(n).

    Args:
        user_input: Untrusted text to embed in an HTML document.

    Returns:
        The text with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by
        character references (``escape`` is called with its default
        ``quote=True``).
    """
    # O(n) where n = input length
    return escape(user_input)


# Usage
user_text = '<img src=x onerror="alert(1)">'
safe = sanitize_for_html(user_text)
# &lt;img src=x onerror=&quot;alert(1)&quot;&gt;
# Safe to include in HTML element content or a quoted attribute value
html = f'<p>{safe}</p>'
Parsing HTML¶
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Extract links from HTML - O(n) in the size of the fed document.

    After feeding, ``links`` holds the ``href`` value of every ``<a>`` start
    tag, in document order.
    """

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Called for opening tags - O(a) where a = attributes on the tag.

        Args:
            tag: Tag name as reported by HTMLParser (lower-cased).
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        if tag != 'a':
            return
        for attr, value in attrs:
            # A bare ``<a href>`` is reported with value None; there is no
            # URL to record in that case, so skip it.
            if attr == 'href' and value is not None:
                self.links.append(value)


# Usage - O(n) where n = HTML length
parser = MyHTMLParser()
html = '''
<html>
<a href="/page1">Link 1</a>
<a href="/page2">Link 2</a>
</html>
'''
parser.feed(html)  # O(n)
parser.close()  # process any input the parser is still buffering
print(parser.links)  # ['/page1', '/page2']
Extracting Text from HTML¶
from html.parser import HTMLParser
from html import unescape
class TextExtractor(HTMLParser):
    """Extract plain text from HTML - O(n).

    Text inside ``<script>`` and ``<style>`` elements is ignored. HTMLParser
    decodes character references before calling ``handle_data`` (its
    ``convert_charrefs`` option defaults to True), so the collected text
    needs no further ``unescape()`` call.
    """

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.in_script = False
        self.in_style = False

    def handle_starttag(self, tag, attrs):
        """Track script/style tags - O(1)."""
        if tag in ('script', 'style'):
            self.in_script = tag == 'script'
            self.in_style = tag == 'style'

    def handle_endtag(self, tag):
        """Leave script/style mode when either element closes - O(1)."""
        if tag in ('script', 'style'):
            self.in_script = False
            self.in_style = False

    def handle_data(self, data):
        """Collect text - O(1) append, O(k) strip where k = data length."""
        if self.in_script or self.in_style:
            return
        # Whitespace-only chunks (e.g. newlines between tags) are dropped.
        text = data.strip()
        if text:
            self.text_parts.append(text)

    def get_text(self):
        """Return the collected chunks joined by single spaces - O(n)."""
        return ' '.join(self.text_parts)


# Usage - O(n) where n = HTML length
extractor = TextExtractor()
html = '''
<html>
<head><title>Page</title></head>
<body>
<h1>Hello World</h1>
<p>This is a test</p>
<script>alert('hidden');</script>
</body>
</html>
'''
extractor.feed(html)  # O(n)
extractor.close()  # process any input the parser is still buffering
text = extractor.get_text()  # O(k) where k = text length
# "Page Hello World This is a test"
Building HTML Safely¶
from html import escape
def build_html_page(title, content, links):
    """Build HTML page with escaped content - O(n) in total text length.

    Args:
        title: Page title; used for both ``<title>`` and ``<h1>``.
        content: Body text placed inside a single ``<p>``.
        links: Iterable of ``(url, text)`` pairs rendered as ``<a>``
            elements inside ``<nav>``.

    Returns:
        The page markup as a string. Every argument is passed through
        ``escape()`` (default ``quote=True``) before being interpolated.
    """
    # O(n) for each escape where n = string length
    safe_title = escape(title)
    safe_content = escape(content)

    # O(total characters across all links): each URL and label is escaped once
    link_html = ''.join(
        f'<a href="{escape(url)}">{escape(text)}</a>'
        for url, text in links
    )

    # O(total) to combine; the leading and trailing newlines are part of
    # the returned document.
    return (
        '\n'
        '<html>\n'
        f'<head><title>{safe_title}</title></head>\n'
        '<body>\n'
        f'<h1>{safe_title}</h1>\n'
        f'<p>{safe_content}</p>\n'
        f'<nav>{link_html}</nav>\n'
        '</body>\n'
        '</html>\n'
    )


# Usage - O(n) total where n = total characters
html = build_html_page(
    "My <Site>",
    "User input & special chars",
    [("/?search=test&q=1", "Search")]
)
Performance Tips¶
Batch Escaping¶
from html import escape
# Sample data so the snippet runs on its own
items = ['<b>bold</b>', 'Tom & Jerry']

# Verbose: explicit loop with append - one escape() call per item
html_parts = []
for item in items:
    html_parts.append(f'<li>{escape(item)}</li>')
result = ''.join(html_parts)  # O(n) total, n = total characters

# Idiomatic: the same escape() calls written as a list comprehension.
# Output and O(n) total cost are identical; the comprehension is simply
# more concise than the manual append loop.
html_parts = [f'<li>{escape(item)}</li>' for item in items]
result = ''.join(html_parts)  # O(n) total
Cache Escaped Strings¶
from html import escape
class HtmlBuilder:
    """Cache escaped strings - O(1) average lookup after the first escape.

    The cache is a plain dict keyed by the raw text and entries are never
    evicted, so memory grows with the number of distinct strings requested.
    """

    def __init__(self):
        self._escaped_cache = {}

    def get_escaped(self, text):
        """Return ``escape(text)`` - O(n) first time, O(1) cached.

        Args:
            text: Raw string to escape.

        Returns:
            The escaped string; repeated calls with an equal ``text`` return
            the stored result instead of escaping again.
        """
        try:
            return self._escaped_cache[text]
        except KeyError:
            # O(n) where n = text length; store so later calls hit the cache
            escaped = self._escaped_cache[text] = escape(text)
            return escaped


# Usage
builder = HtmlBuilder()
escaped = builder.get_escaped("Hello & Goodbye")  # O(n)
escaped = builder.get_escaped("Hello & Goodbye")  # O(1) - cached
Use unescape Sparingly¶
from html import unescape
# Unconditional: unescape every matching value. Note that unescape()
# itself returns its argument unchanged when the string contains no '&',
# so this is already cheap for entity-free values.
def get_title(attrs):
    """Return the unescaped value of the first 'title' attribute.

    Args:
        attrs: Iterable of ``(name, value)`` pairs with string values.

    Returns:
        The unescaped title, or None when no 'title' attribute is present.
    """
    for name, value in attrs:
        if name == 'title':
            return unescape(value)  # O(n)
    return None
# Guarded: only call unescape() when the value can contain an entity
def get_title_safe(attrs):
    """Return the first 'title' attribute, unescaping it only if needed.

    Args:
        attrs: Iterable of ``(name, value)`` pairs with string values.

    Returns:
        The title with character references decoded, or None when no
        'title' attribute is present.
    """
    for name, value in attrs:
        if name != 'title':
            continue
        # The membership test is itself an O(n) scan of the value, and
        # unescape() performs the same check internally, so this guard
        # saves one function call rather than changing the complexity.
        if '&' in value:
            return unescape(value)  # O(n)
        return value
    return None
Version Notes¶
- Python 3.2+: html.escape() (html.unescape() was added later, in Python 3.4)
- Python 3.4+: Improved html.parser
- Python 3.x: Full Unicode support
Related Documentation¶
- Re Module - Pattern matching for HTML
- Urllib Module - URL handling
- Json Module - Data serialization