Tokenize Module¶
The tokenize module provides a lexical scanner for Python source code, producing a stream of TokenInfo tuples (type, string, start, end, line). The table below summarizes the main entry points; a short round-trip sketch follows it.
Complexity Reference¶
| Operation | Time | Space | Notes |
|---|---|---|---|
| generate_tokens(readline) | O(n) | O(1) | n = source length; generator yields tokens lazily |
| tokenize(readline) | O(n) | O(1) | Binary mode tokenization |
| detect_encoding(readline) | O(n) | O(1) | n = header size |
| untokenize(tokens) | O(n) | O(n) | n = token count |
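A minimal sketch of the two ends of this table, assuming a throwaway one-line source string: generate_tokens() lazily yields TokenInfo tuples, and untokenize() rebuilds source text that tokenizes back to the same stream.

import tokenize
import io

# Sketch: generate_tokens() yields TokenInfo tuples lazily; untokenize()
# reconstructs equivalent source text from the token stream.
source = "total = 1 + 2\n"   # hypothetical one-line source
token_stream = tokenize.generate_tokens(io.StringIO(source).readline)
rebuilt = tokenize.untokenize(token_stream)  # O(n) in token count
print(rebuilt)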
Common Operations¶
Tokenizing Python Source¶
import tokenize
import io
# O(n) where n = source length
source = """
x = 1 + 2
print(x)
"""
readline = io.StringIO(source).readline
# O(n) to tokenize entire source
tokens = list(tokenize.generate_tokens(readline))
# Each token: TokenInfo(type, string, start, end, line)
for token in tokens:
    print(f"{tokenize.tok_name[token.type]} {token.string!r}")
# Output (abridged; leading NL, trailing NEWLINE and ENDMARKER omitted):
# NAME 'x'
# OP '='
# NUMBER '1'
# OP '+'
# NUMBER '2'
# NEWLINE '\n'
# NAME 'print'
# OP '('
# NAME 'x'
# OP ')'
Reading Python Files¶
import tokenize
# O(n) where n = file size
with open('script.py', 'rb') as f:
    # O(n) to tokenize binary file
    tokens = list(tokenize.tokenize(f.readline))

# Process tokens - O(k) where k = token count
for token in tokens:
    if token.type == tokenize.NAME:
        print(f"Identifier: {token.string}")
    elif token.type == tokenize.STRING:
        print(f"String: {token.string}")
Common Use Cases¶
Code Analysis¶
import tokenize
import io
def analyze_python_code(source):
    """Analyze code structure - O(n)"""
    readline = io.StringIO(source).readline
    # O(n) to tokenize where n = source length
    tokens = list(tokenize.generate_tokens(readline))
    analysis = {
        'functions': [],
        'imports': [],
        'operators': set(),
    }
    # O(k) to process tokens where k = token count
    i = 0
    while i < len(tokens):
        token = tokens[i]
        # Function definition: 'def' followed by the name - O(1) per check
        if (token.type == tokenize.NAME and
                token.string == 'def' and
                i + 1 < len(tokens)):
            analysis['functions'].append(tokens[i + 1].string)
        # Import: 'import' followed by the module name - O(1)
        elif (token.type == tokenize.NAME and
                token.string == 'import' and
                i + 1 < len(tokens)):
            analysis['imports'].append(tokens[i + 1].string)
        # Operators - O(1)
        elif token.type == tokenize.OP:
            analysis['operators'].add(token.string)
        i += 1
    return analysis
# Usage - O(n)
code = """
def add(a, b):
    return a + b
x = 10 * 5
"""
result = analyze_python_code(code)
print(result)
Detecting Encoding¶
import tokenize
import io
def detect_python_encoding(filename):
    """Detect file encoding - O(n), n = header size"""
    # detect_encoding() reads at most the first two lines
    with open(filename, 'rb') as f:
        try:
            # Returns (encoding_name, list_of_lines_read)
            encoding, _ = tokenize.detect_encoding(f.readline)
            return encoding
        except (SyntaxError, UnicodeDecodeError):
            return 'utf-8'  # Fallback
# Usage - O(n) for small header
encoding = detect_python_encoding('script.py')
print(f"Encoding: {encoding}")
Pretty-Printing Source Code¶
import tokenize
import io
def highlight_tokens(source):
    """Pretty-print with token info - O(n)"""
    readline = io.StringIO(source).readline
    # O(n) to tokenize
    tokens = list(tokenize.generate_tokens(readline))
    # O(k) to display where k = token count
    for token in tokens:
        token_name = tokenize.tok_name[token.type]
        print(f"{token_name:10} {token.string!r:20} {token.start} -> {token.end}")
# Usage - O(n)
code = "x = 1 + 2"
highlight_tokens(code)
Token Stream Processing¶
import tokenize
import io
def remove_comments(source):
    """Remove comments from code - O(n)"""
    readline = io.StringIO(source).readline
    # O(n) to tokenize where n = source length
    tokens = list(tokenize.generate_tokens(readline))
    # O(k) to filter where k = token count
    filtered = [t for t in tokens
                if t.type != tokenize.COMMENT]
    # O(n) to untokenize
    return tokenize.untokenize(filtered)
# Usage - O(n)
code = """
x = 1 # Initialize x
print(x) # Print value
"""
clean_code = remove_comments(code)
Counting Code Metrics¶
import tokenize
import keyword
import io

def count_code_metrics(source):
    """Count various code metrics - O(n)"""
    readline = io.StringIO(source).readline
    # O(n) to tokenize
    tokens = list(tokenize.generate_tokens(readline))
    metrics = {
        'lines': 0,
        'identifiers': 0,
        'numbers': 0,
        'strings': 0,
        'operators': 0,
        'keywords': 0,
    }
    # O(k) to process
    for token in tokens:
        # O(1) per token type check
        if token.type == tokenize.NEWLINE:
            metrics['lines'] += 1  # logical lines (blank lines emit NL, not NEWLINE)
        elif token.type == tokenize.NAME:
            # Keywords also tokenize as NAME, so split them out explicitly
            if keyword.iskeyword(token.string):
                metrics['keywords'] += 1
            else:
                metrics['identifiers'] += 1
        elif token.type == tokenize.NUMBER:
            metrics['numbers'] += 1
        elif token.type == tokenize.STRING:
            metrics['strings'] += 1
        elif token.type == tokenize.OP:
            metrics['operators'] += 1
    return metrics
# Usage - O(n)
code = "x = 1 + 2; print('hello')"
metrics = count_code_metrics(code)
print(metrics)
Performance Tips¶
Use Generator Form for Large Files¶
import tokenize
def process_large_file(filename):
    """Process file without loading all tokens - O(1) memory"""
    with open(filename, 'rb') as f:
        # O(n) time but O(1) memory - tokenize() yields tokens lazily
        for token in tokenize.tokenize(f.readline):
            # Process one token at a time
            if token.type == tokenize.NAME:
                yield token.string

# Usage - O(1) memory for any file size
for identifier in process_large_file('large_script.py'):
    print(identifier)
Cache Tokenized Results¶
import tokenize
import io
class TokenCache:
    """Cache tokenized code - O(1) lookup"""

    def __init__(self):
        self._cache = {}

    def get_tokens(self, source):
        """O(1) cached or O(n) new tokenization"""
        # Key on the source string itself; a bare hash() could collide
        if source not in self._cache:
            # O(n) first time where n = source length
            readline = io.StringIO(source).readline
            self._cache[source] = list(
                tokenize.generate_tokens(readline)
            )
        return self._cache[source]
# Usage
cache = TokenCache()
tokens = cache.get_tokens(code) # O(n)
tokens = cache.get_tokens(code) # O(1)
Filter Early for Efficiency¶
import tokenize
import io

readline = io.StringIO("x = 1 + 2\n").readline

# Bad: tokenize everything, then filter - same O(n) time, but O(k) extra memory
tokens = list(tokenize.generate_tokens(readline))
identifiers = [t for t in tokens if t.type == tokenize.NAME]

# Good: filter while tokenizing - O(n) time, no intermediate token list
readline = io.StringIO("x = 1 + 2\n").readline  # fresh readline; the first is exhausted
identifiers = [t for t in tokenize.generate_tokens(readline)
               if t.type == tokenize.NAME]
Token Types¶
import tokenize
# Main token types:
# NAME - identifiers
# NUMBER - numeric literals
# STRING - string literals
# COMMENT - comments
# OP - operators
# NEWLINE - end of a logical line (NL marks blank/non-logical lines)
# INDENT/DEDENT - indentation changes
# ERRORTOKEN - unrecognized characters
# ENDMARKER - end of input
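For OP tokens, TokenInfo.exact_type resolves the generic OP type to the specific operator (EQUAL, PLUS, LPAR, ...). A small sketch, assuming a throwaway one-line source:

import tokenize
import io

# tok_name maps numeric token types to their names; exact_type
# distinguishes individual operators within the generic OP type.
for tok in tokenize.generate_tokens(io.StringIO("y = x + 1\n").readline):
    print(f"{tokenize.tok_name[tok.type]:10} "
          f"{tokenize.tok_name[tok.exact_type]:10} {tok.string!r}")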
Version Notes¶
- Python 3.x: tokenize() works on bytes input and yields an initial ENCODING token (see the sketch below); generate_tokens() works on str
- Python 3.8+: TYPE_COMMENT and COLONEQUAL token types added
- Python 3.10+: SOFT_KEYWORD token type added
- Python 3.12+: f-strings produce FSTRING_START / FSTRING_MIDDLE / FSTRING_END tokens (PEP 701)
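A quick sketch of the bytes-mode behaviour noted above: the first token from tokenize() reports the detected encoding.

import tokenize
import io

# tokenize() (bytes mode) yields an ENCODING token before anything else
first = next(tokenize.tokenize(io.BytesIO(b"x = 1\n").readline))
print(tokenize.tok_name[first.type], first.string)  # ENCODING utf-8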
Related Documentation¶
- Ast Module - Abstract syntax trees
- Code Module - Interactive interpreter support
- Re Module - Regular expressions