difflib Module Complexity¶
The difflib module provides tools for comparing sequences (lists, strings) and generating human-readable diffs, useful for text comparison, patching, and change detection.
Complexity Reference¶
| Operation | Time | Space | Notes |
|---|---|---|---|
| SequenceMatcher() init | O(m) | O(m) | Indexes the second sequence; n = len(a), m = len(b) |
| get_matching_blocks() | O(n*m) worst | O(n+m) | Heuristic keeps typical cases near-linear |
| get_opcodes() | O(n*m) worst | O(n+m) | Uses get_matching_blocks |
| ratio() | O(n*m) worst | O(n+m) | First call computes matching blocks; later calls reuse the cache |
| unified_diff() | O(n*m) worst | O(n+m) | Line-based; typically close to O(n+m) |
| context_diff() | O(n*m) worst | O(n+m) | Line-based; typically close to O(n+m) |
| ndiff() | O(n*m) worst | O(n+m) | Adds intraline detail; slowest of the three |
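When only a rough similarity figure is needed, SequenceMatcher also provides two cheaper upper bounds on ratio(); a minimal sketch:
from difflib import SequenceMatcher
matcher = SequenceMatcher(None, "abcdefgh", "abcdefxy")
# Successively cheaper upper bounds on ratio()
print(matcher.real_quick_ratio())  # 1.0  - from lengths alone, O(1)
print(matcher.quick_ratio())       # 0.75 - from element counts, O(n+m)
print(matcher.ratio())             # 0.75 - exact, O(n*m) worst case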
Sequence Matching¶
Basic Comparison¶
from difflib import SequenceMatcher
# Create matcher - O(m): indexes the second sequence
a = "kitten"
b = "sitting"
matcher = SequenceMatcher(None, a, b)
# Get similarity ratio - O(n*m) worst case, cached afterwards
ratio = matcher.ratio()
print(f"Similarity: {ratio:.2%}") # 61.54%
# Get matching blocks - reuses the cached result
blocks = matcher.get_matching_blocks()
for block in blocks:
    print(f"Match: a[{block.a}:{block.a+block.size}] = b[{block.b}:{block.b+block.size}]")
# Match: a[1:4] = b[1:4] ('itt'), a[5:6] = b[5:6] ('n'),
# plus a zero-length terminator block at the end
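If only the single longest common block is needed, find_longest_match() exposes it directly; a small sketch (explicit bounds are passed for compatibility; they became optional in Python 3.9):
from difflib import SequenceMatcher
a, b = "kitten", "sitting"
matcher = SequenceMatcher(None, a, b)
# Longest matching block within the given index ranges
match = matcher.find_longest_match(0, len(a), 0, len(b))
print(match)                            # Match(a=1, b=1, size=3)
print(a[match.a:match.a + match.size])  # 'itt'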
List Comparison¶
from difflib import SequenceMatcher
# Compare lists - O(m) init (indexes list2)
list1 = [1, 2, 3, 4, 5]
list2 = [1, 2, 4, 5, 6]
matcher = SequenceMatcher(None, list1, list2)
# Similarity ratio - O(n*m) worst case
ratio = matcher.ratio()
print(f"Match: {ratio:.2%}") # 80.00%
# Find matching blocks - reuses the cached result
matching = matcher.get_matching_blocks()
for m in matching:
    print(f"Block: {list1[m.a:m.a+m.size]} matches {list2[m.b:m.b+m.size]}")
Edit Operations¶
Get Opcodes¶
from difflib import SequenceMatcher
# Compare strings - O(m) init
s1 = "eating"
s2 = "running"
matcher = SequenceMatcher(None, s1, s2)
# Get edit operations - O(n*m) worst case
opcodes = matcher.get_opcodes()
for op, i1, i2, j1, j2 in opcodes:
    if op == 'equal':
        print(f"Equal: {s1[i1:i2]} == {s2[j1:j2]}")
    elif op == 'replace':
        print(f"Replace: {s1[i1:i2]} -> {s2[j1:j2]}")
    elif op == 'insert':
        print(f"Insert: {s2[j1:j2]} at position {i1}")
    elif op == 'delete':
        print(f"Delete: {s1[i1:i2]}")
# Output describes all changes
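The opcodes fully describe how to turn the first sequence into the second, so they can be replayed as a sanity check; a minimal sketch reusing the strings above:
from difflib import SequenceMatcher
s1, s2 = "eating", "running"
matcher = SequenceMatcher(None, s1, s2)
# Replay the opcodes to rebuild s2 from s1
rebuilt = []
for op, i1, i2, j1, j2 in matcher.get_opcodes():
    if op == 'equal':
        rebuilt.append(s1[i1:i2])
    elif op in ('replace', 'insert'):
        rebuilt.append(s2[j1:j2])
    # 'delete' contributes nothing to the result
print(''.join(rebuilt) == s2)  # True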
String Diffs¶
Unified Diff Format¶
from difflib import unified_diff
# Compare lists of lines - O(n+m)
text1 = "The quick brown fox\njumps over the lazy dog\n".splitlines(keepends=True)
text2 = "The fast brown fox\njumps over the lazy cat\n".splitlines(keepends=True)
# Generate unified diff - O(n+m)
diff = unified_diff(text1, text2, fromfile='original', tofile='modified')
# Print diff
print(''.join(diff))
# Output:
# --- original
# +++ modified
# @@ -1,2 +1,2 @@
# -The quick brown fox
# -jumps over the lazy dog
# +The fast brown fox
# +jumps over the lazy cat
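If the input lines lack trailing newlines (for example from splitlines() without keepends=True), pass lineterm='' so the headers and hunk markers are emitted without extra newlines; a short sketch:
from difflib import unified_diff
old = "alpha\nbeta\ngamma".splitlines()
new = "alpha\nBETA\ngamma".splitlines()
# lineterm='' because the lines carry no trailing '\n'
diff = unified_diff(old, new, fromfile='old', tofile='new', lineterm='')
print('\n'.join(diff))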
Context Diff Format¶
from difflib import context_diff
text1 = ["line1\n", "line2\n", "line3\n", "line4\n"]
text2 = ["line1\n", "modified2\n", "line3\n", "line4\n"]
# Generate context diff - O(n+m)
diff = context_diff(text1, text2, fromfile='old', tofile='new', n=1)
print(''.join(diff))
# Output shows 1 line of context around changes
Ndiff Format¶
from difflib import ndiff
# Detailed line-by-line diff - O(n+m)
text1 = ["apple\n", "banana\n", "cherry\n"]
text2 = ["apple\n", "blueberry\n", "cherry\n"]
diff = ndiff(text1, text2)
for line in diff:
    if line[0] == ' ':
        print(f"  {line[2:]}", end='')  # Unchanged
    elif line[0] == '+':
        print(f"+ {line[2:]}", end='')  # Added
    elif line[0] == '-':
        print(f"- {line[2:]}", end='')  # Removed
# Marks: ' ' for equal, '- ' for removed, '+ ' for added, '? ' for hints
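Because an ndiff delta keeps every line of both inputs, difflib.restore() can recover either original sequence from it; a brief sketch reusing the lists above:
from difflib import ndiff, restore
text1 = ["apple\n", "banana\n", "cherry\n"]
text2 = ["apple\n", "blueberry\n", "cherry\n"]
delta = list(ndiff(text1, text2))
# restore(delta, 1) yields sequence 1, restore(delta, 2) yields sequence 2
print(list(restore(delta, 1)) == text1)  # True
print(list(restore(delta, 2)) == text2)  # True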
Close Matches¶
Find Similar Items¶
from difflib import get_close_matches
# Find close matches - one ratio() per option; each is O(n*m) worst case
options = ["kitten", "sitting", "kitchen", "mittens", "bitten"]
search = "kitten"
# Get the n best matches scoring at or above cutoff
matches = get_close_matches(search, options, n=3, cutoff=0.6)
print(matches) # ['kitten', 'bitten', 'mittens']
# With different cutoff - O(n*m)
matches = get_close_matches("kiten", options, n=2, cutoff=0.8)
print(matches) # ['kitten', 'kitchen']
Advanced Patterns¶
Diff Statistics¶
from difflib import SequenceMatcher
text1 = "The quick brown fox jumps over the lazy dog"
text2 = "The fast brown fox leaps over the lazy cat"
matcher = SequenceMatcher(None, text1, text2)
# Get matching blocks - O(n*m) worst case
blocks = matcher.get_matching_blocks()
total_matches = sum(b.size for b in blocks)
# Calculate stats - cheap, matching blocks already cached
ratio = matcher.ratio()
print(f"Length: {len(text1)} vs {len(text2)}")
print(f"Matching chars: {total_matches}")
print(f"Similarity: {ratio:.2%}")
Three-way Merge¶
from difflib import Differ
# difflib has no built-in three-way merge; compare each branch against the original
original = "The quick brown fox".split()
current = "The fast brown fox".split()
other = "The quick dark fox".split()
differ = Differ()
# Show differences against the base - O(n*m) worst case per comparison
diff_current = list(differ.compare(original, current))
diff_other = list(differ.compare(original, other))
print("Current changes:")
for line in diff_current:
    print(line)
print("\nOther changes:")
for line in diff_other:
    print(line)
# Manual merge needed for conflicts
HTML Report Generation¶
from difflib import HtmlDiff
# Generate HTML diff - O(n+m)
text1 = ["line 1\n", "line 2\n", "line 3\n"]
text2 = ["line 1\n", "modified line 2\n", "line 3\n"]
h = HtmlDiff()
html = h.make_file(text1, text2, fromfile='old.txt', tofile='new.txt')
# html is a complete HTML document
with open('diff.html', 'w') as f:
    f.write(html)
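To embed the comparison in an existing page instead of writing a standalone document, HtmlDiff.make_table() returns only the table fragment; a short sketch:
from difflib import HtmlDiff
text1 = ["line 1\n", "line 2\n", "line 3\n"]
text2 = ["line 1\n", "modified line 2\n", "line 3\n"]
# make_table() yields just the <table> markup; context=True trims unchanged runs
h = HtmlDiff(wrapcolumn=60)
table = h.make_table(text1, text2, fromdesc='old.txt', todesc='new.txt',
                     context=True, numlines=1)
print(table[:60])  # start of the table fragment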
Use Cases¶
File Comparison¶
from difflib import unified_diff
def compare_files(file1, file2):
    """Compare two text files and show unified diff - O(n+m)"""
    with open(file1, 'r') as f:
        lines1 = f.readlines()
    with open(file2, 'r') as f:
        lines2 = f.readlines()
    # Generate diff - O(n+m)
    diff = unified_diff(lines1, lines2,
                        fromfile=file1, tofile=file2)
    return ''.join(diff)
# Usage
result = compare_files('original.txt', 'modified.txt')
print(result)
Spell Checker¶
from difflib import get_close_matches
def spell_check(word, dictionary):
    """Find spelling suggestions - O(n*m)"""
    # Find close matches - O(n*m) where n = dict size
    suggestions = get_close_matches(word, dictionary, n=3, cutoff=0.6)
    if suggestions:
        return f"Did you mean: {', '.join(suggestions)}?"
    return "No suggestions found"
# Usage
dictionary = ['apple', 'application', 'apply', 'appreciate']
print(spell_check('aple', dictionary))
Code Review Helper¶
from difflib import SequenceMatcher, unified_diff
def analyze_changes(old_code, new_code):
    """Analyze code changes - O(n+m)"""
    # Quick similarity check - O(n+m)
    matcher = SequenceMatcher(None, old_code, new_code)
    similarity = matcher.ratio()
    print(f"Code similarity: {similarity:.1%}")
    # Show detailed changes - O(n+m)
    old_lines = old_code.splitlines(keepends=True)
    new_lines = new_code.splitlines(keepends=True)
    diff = unified_diff(old_lines, new_lines)
    for line in diff:
        print(line, end='')
# Usage
old_code = "def add(a, b):\n return a + b"
new_code = "def add(a, b):\n \"\"\"Add two numbers\"\"\"\n return a + b"
analyze_changes(old_code, new_code)
Performance Characteristics¶
Time Complexity¶
- SequenceMatcher: O(m) preprocessing (indexes the second sequence), O(n*m) worst case for matching, near-linear for similar sequences
- ratio(): O(n*m) worst case on the first call; later calls reuse cached matching blocks
- unified_diff/context_diff/ndiff: driven by SequenceMatcher over lines, so O(n*m) worst case but typically near O(n+m); ndiff adds intraline work
- get_close_matches: one ratio() per choice, each quadratic in the pair of lengths in the worst case
Space Complexity¶
- SequenceMatcher: O(n+m) to store matching blocks
- Diffs: O(n+m) for output generation
Algorithm Notes¶
from difflib import SequenceMatcher
# The matching heuristic does not always find a minimal diff
# Best case: roughly linear for identical or highly similar sequences
# Worst case: quadratic (O(n*m)) when many common elements are scattered around
# Completely different sequences are cheap: there are no common elements to try
large_file1 = "x" * 1000000
large_file2 = "y" * 1000000
# Indexing the second sequence is O(m); with no common elements, matching is fast
matcher = SequenceMatcher(None, large_file1, large_file2)
ratio = matcher.ratio()  # 0.0, relatively quick
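Part of this heuristic behaviour is autojunk: when the second sequence has 200+ elements, any element making up more than 1% of it is treated as popular and not used to anchor matches. It can be disabled when that hurts accuracy; a contrived sketch (the 201-line lists are made up):
from difflib import SequenceMatcher
a = ["spam\n"] * 200 + ["eggs\n"]
b = ["eggs\n"] + ["spam\n"] * 200
# Default: "spam\n" is 'popular' (>1% of a 200+ element sequence) and cannot anchor matches
print(f"{SequenceMatcher(None, a, b).ratio():.3f}")                  # ~0.005
# autojunk=False lets the repeated lines match, at a potential speed cost
print(f"{SequenceMatcher(None, a, b, autojunk=False).ratio():.3f}")  # ~0.995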
Filtering Junk Elements¶
Ignore Whitespace¶
from difflib import SequenceMatcher
def is_whitespace(x):
    """Junk filter: whitespace should not anchor matching blocks"""
    return x.isspace()
text1 = "hello world"
text2 = "hello  world"  # extra space
# isjunk only tells the matcher which elements to avoid as synchronization
# points; junk characters still count toward ratio(), so the score is unchanged here
matcher1 = SequenceMatcher(None, text1, text2)
print(f"No junk filter: {matcher1.ratio():.2%}")       # 95.65%
matcher2 = SequenceMatcher(is_whitespace, text1, text2)
print(f"Whitespace as junk: {matcher2.ratio():.2%}")   # 95.65%
# To truly ignore whitespace, normalize the inputs before comparing
norm1, norm2 = " ".join(text1.split()), " ".join(text2.split())
print(f"Normalized: {SequenceMatcher(None, norm1, norm2).ratio():.2%}")  # 100.00%
Best Practices¶
Do's¶
- Use SequenceMatcher for sequence comparison
- Use get_close_matches for spell checking
- Use unified_diff for human-readable output
- Consider cutoff value for fuzzy matching
- Reuse a SequenceMatcher for repeated comparisons via set_seq1()/set_seq2() (see the sketch below)
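A minimal sketch of the reuse point above (candidate names are illustrative): SequenceMatcher precomputes information only about its second sequence, so fix the common target with set_seq2() and swap candidates cheaply with set_seq1().
from difflib import SequenceMatcher
candidates = ["kitten", "kitchen", "mittens", "bitten"]
target = "kiten"
# Index the fixed target once, then compare each candidate against it
matcher = SequenceMatcher(None)
matcher.set_seq2(target)
scores = {}
for cand in candidates:
    matcher.set_seq1(cand)
    scores[cand] = matcher.ratio()
print(max(scores, key=scores.get))  # 'kitten'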
Don'ts¶
- Don't use for very large files without optimization
- Don't ignore important matching blocks
- Don't expect perfect spell checking with low cutoff
- Don't use for exact matching (use == instead)
Limitations¶
- Not optimized for very large files
- Cannot detect moved blocks; a relocated block is reported as a deletion plus an insertion (see the sketch below)
- Similarity metric is heuristic-based
- No context-aware matching
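A quick illustration of the moved-block limitation (the line lists are made up): a line that merely moves shows up as a delete plus an insert.
from difflib import unified_diff
old = ["setup\n", "run\n", "teardown\n"]
new = ["run\n", "teardown\n", "setup\n"]
# The moved "setup" line appears as '-setup' and '+setup', not as a move
print(''.join(unified_diff(old, new, fromfile='before', tofile='after')), end='')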
Alternatives¶
# For version control quality diffs:
# - Use git diff, mercurial
# - Use external tools like 'diff', 'patch'
# For fuzzy matching:
# - Use fuzzywuzzy library (pip install fuzzywuzzy)
# - Use rapidfuzz for faster matching
# For language-aware diffs:
# - Use Pygments for syntax highlighting
# - Use specialized diff tools for code
# For ML-based similarity:
# - Use scikit-learn for text similarity
# - Use sentence-transformers for semantic similarity