gzip Module Complexity¶
The gzip module reads and writes gzip-format files (RFC 1952), compressing and decompressing with DEFLATE via zlib.
Functions & Methods¶
| Operation | Time | Space | Notes |
|---|---|---|---|
| gzip.open(filename) | O(1) | O(1) | Open file handle |
| GzipFile.read() | O(m) | O(m) | Decompress all, m = uncompressed size |
| GzipFile.write(data) | O(n) | O(k) | Compress and write, n = input size, k = buffer |
| compress(data) | O(n) | O(n) | Compress bytes (DEFLATE is linear in practice) |
| decompress(data) | O(m) | O(m) | Decompress bytes |
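As a quick sanity check, here is a minimal round-trip sketch (the payload is illustrative):

import gzip

payload = b'example payload ' * 1000  # n = 16,000 bytes
compressed = gzip.compress(payload)  # O(n) time and space
restored = gzip.decompress(compressed)  # O(m) time and space, m = len(payload)
assert restored == payload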
Opening Files¶
Time Complexity: O(1)¶
import gzip
# Opening a gzip file: O(1) time
# Only opens the file handle; the gzip header is not read until the first read
f = gzip.open('file.gz', 'rb')  # O(1)

# Or with a context manager
with gzip.open('file.gz', 'rb') as f:
    data = f.read()  # O(m) to decompress
Space Complexity: O(1)¶
import gzip
# Just file handle, minimal memory
f = gzip.open('file.gz', 'rb') # O(1) space
Reading (Decompression)¶
Time Complexity: O(m)¶
Where m = uncompressed file size.
import gzip
# Full read: O(m)
with gzip.open('file.gz', 'rb') as f:
    data = f.read()  # O(m) to decompress entire file

# Partial read: O(k)
with gzip.open('file.gz', 'rb') as f:
    chunk = f.read(4096)  # O(k) per chunk, k = chunk size

# Read all chunks: O(m) total
with gzip.open('file.gz', 'rb') as f:
    while True:
        chunk = f.read(4096)  # O(k) per iteration
        if not chunk:
            break  # Total: O(m)
Space Complexity: O(m) or O(k)¶
import gzip
# Full decompression: O(m)
with gzip.open('file.gz', 'rb') as f:
    data = f.read()  # O(m) memory for entire file

# Streaming decompression: O(k)
with gzip.open('file.gz', 'rb') as f:
    while True:
        chunk = f.read(4096)  # O(k) memory, k = chunk size
        if not chunk:
            break
        process(chunk)
Writing (Compression)¶
Time Complexity: O(n)¶
Where n = input size. DEFLATE compression is linear in practice.
import gzip
# Write compressed: O(n)
with gzip.open('output.gz', 'wb') as f:
    f.write(data)  # O(n) with compression

# Multiple writes: O(n) total
with gzip.open('output.gz', 'wb') as f:
    for chunk in chunks:  # O(n) total
        f.write(chunk)
Space Complexity: O(k)¶
Where k = the compressor's fixed internal state: DEFLATE uses a 32 KB sliding window plus bounded buffers, independent of total input size.
import gzip
# Streaming compression: O(k) space
with gzip.open('output.gz', 'wb') as f:
    for chunk in large_chunks:
        f.write(chunk)  # O(k) buffer, not O(n)
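To check the bounded-memory claim empirically, tracemalloc can report peak Python-level allocations during a streaming write. A rough sketch (sizes are illustrative; zlib's C-level buffers are not traced, so treat this as an indicator only):

import gzip
import tracemalloc

tracemalloc.start()
with gzip.open('output.gz', 'wb') as f:
    for _ in range(100):
        f.write(b'x' * 65536)  # ~6.4 MB total input, one 64 KiB chunk at a time
current, peak = tracemalloc.get_traced_memory()
print(f'peak traced allocations: {peak / 1024:.0f} KiB')  # stays bounded as chunk count grows
tracemalloc.stop()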
Compress/Decompress Functions¶
compress() - One-shot Compression¶
import gzip
# Compress entire data: O(n) time, O(n) space
data = b'Large data...' * 10000
compressed = gzip.compress(data) # O(n)
# Space: builds the entire compressed result in memory
# O(n) for the output (usually smaller than the input, though
# incompressible data can grow slightly)
decompress() - One-shot Decompression¶
import gzip
# Decompress entire data: O(m) time, O(m) space
compressed = b'\x1f\x8b...' # compressed data
data = gzip.decompress(compressed) # O(m)
# m = uncompressed size
# Space: creates entire uncompressed result O(m)
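If the compressed bytes are already in memory but the decompressed result is too large to materialize at once, the lower-level zlib module can unpack the gzip container incrementally. A sketch under that assumption (wbits=31 selects gzip framing; the chunk size is arbitrary):

import zlib

def iter_decompress(compressed, chunk_size=1 << 20):
    # wbits=31 tells zlib to expect the gzip header and trailer
    d = zlib.decompressobj(wbits=31)
    for i in range(0, len(compressed), chunk_size):
        yield d.decompress(compressed[i:i + chunk_size])
    yield d.flush()

Each yielded piece can be processed and discarded, so memory tracks the chunk size (times the local expansion ratio) rather than the full O(m) result.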
Compression Levels¶
Effect on Performance¶
import gzip
# Compression level 1 (fastest): O(n)
data = b'x' * 1000000
compressed = gzip.compress(data, compresslevel=1) # Fastest
# Compression level 6 (default): O(n)
compressed = gzip.compress(data, compresslevel=6) # Balanced
# Compression level 9 (best): O(n) but with higher constant factor
compressed = gzip.compress(data, compresslevel=9) # Slowest, best ratio
Trade-offs¶
import gzip
# Fast compression, poor ratio
fast = gzip.compress(data, compresslevel=1) # O(n) time
# Default balance
default = gzip.compress(data) # O(n) time
# Best compression, slower (higher constant factor)
best = gzip.compress(data, compresslevel=9) # O(n) time
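The constant factors are easy to measure directly. A rough micro-benchmark sketch (the payload is illustrative; timings vary with data and machine):

import gzip
import time

data = bytes(range(256)) * 40000  # ~10 MB, moderately compressible
for level in (1, 6, 9):
    t0 = time.perf_counter()
    out = gzip.compress(data, compresslevel=level)
    t1 = time.perf_counter()
    gzip.decompress(out)  # decompression cost is roughly level-independent
    t2 = time.perf_counter()
    print(f'level {level}: compress {t1 - t0:.3f}s, '
          f'decompress {t2 - t1:.3f}s, ratio {len(out) / len(data):.3f}')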
Streaming Decompression¶
Time Complexity: O(m)¶
import gzip
# Streaming: memory efficient
with gzip.open('large.gz', 'rb') as f:
    for chunk in iter(lambda: f.read(8192), b''):
        process_chunk(chunk)  # O(m) total time, O(k) memory
Space Complexity: O(k)¶
import gzip
# Only keeps buffer, not entire file
with gzip.open('large.gz', 'rb') as f:
    buffer_size = 8192
    while True:
        chunk = f.read(buffer_size)  # O(k) memory
        if not chunk:
            break
        process(chunk)
Common Patterns¶
Reading Compressed File¶
import gzip
# Simple: O(m) time and space
with gzip.open('file.gz', 'rb') as f:
    content = f.read().decode('utf-8')  # O(m)

# Streaming: O(m) time, O(k) space (better for large files)
with gzip.open('file.gz', 'rb') as f:
    for line in f:  # iterates line by line (yields bytes in 'rb' mode)
        process_line(line)  # Total: O(m), memory: O(k)
Writing Compressed Data¶
import gzip
# Simple one-shot: O(n)
with gzip.open('output.gz', 'wb') as f:
    f.write(large_data)  # O(n) - DEFLATE is linear

# Streaming write: O(n), O(k) memory
with gzip.open('output.gz', 'wb') as f:
    for chunk in data_chunks:
        f.write(chunk)  # O(n) total
Compress String to Bytes¶
import gzip
# String → compressed bytes: O(n)
text = "Hello world" * 100000
compressed = gzip.compress(text.encode('utf-8')) # O(n) - DEFLATE is linear
# Decompress back: O(m)
decompressed = gzip.decompress(compressed) # O(m)
text = decompressed.decode('utf-8')
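Alternatively, text mode lets gzip.open() handle encoding and decoding for you; a sketch (the file name is a placeholder):

import gzip

with gzip.open('notes.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('Hello world\n' * 100000)  # O(n) total, O(k) memory

with gzip.open('notes.txt.gz', 'rt', encoding='utf-8') as f:
    first_line = f.readline()  # O(length of line) per call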
Processing Line by Line¶
import gzip
# Memory efficient: O(m) time, O(k) space
with gzip.open('data.gz', 'rt') as f: # text mode
    for line in f:  # iterates efficiently
        process(line)
Performance Characteristics¶
Best Practices¶
import gzip
# Good: stream large files in chunks
with gzip.open('large.gz', 'rb') as f:
    for chunk in iter(lambda: f.read(65536), b''):
        process(chunk)  # O(k) memory

# Avoid: loading an entire large file at once
with gzip.open('large.gz', 'rb') as f:
    data = f.read()  # O(m) memory

# Good: use text mode for text
with gzip.open('text.gz', 'rt') as f:  # 't' = text mode
    for line in f:
        process(line)

# Avoid: reading binary and decoding by hand
with gzip.open('text.gz', 'rb') as f:  # 'b' = binary mode
    data = f.read().decode('utf-8')
Compression Level Selection¶
import gzip
# For streaming writes, the default level (6) is usually the right
# balance of speed and ratio.

# For one-shot compression:
data = large_data

# Speed critical: level 1
compressed = gzip.compress(data, compresslevel=1)  # fastest, largest output

# Storage critical: level 9
compressed = gzip.compress(data, compresslevel=9)  # slowest, best ratio

# Default: level 6 (good balance)
compressed = gzip.compress(data)
Memory Considerations¶
import gzip
# Bad: unbounded read of a large file
with gzip.open('huge.gz') as f:
    data = f.read()  # O(m) memory - could be gigabytes

# Good: read in chunks
with gzip.open('huge.gz') as f:
    while True:
        chunk = f.read(1024 * 1024)  # 1 MB chunks
        if not chunk:
            break
        process(chunk)  # O(1 MB) memory

# Good: iterate lines (text)
with gzip.open('huge.gz', 'rt') as f:
    for line in f:  # buffered line iteration
        process(line)  # O(line_size) memory
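For decompress-to-disk workloads, shutil.copyfileobj() gives the same bounded-memory behavior with less code; a sketch (file names are placeholders):

import gzip
import shutil

with gzip.open('huge.gz', 'rb') as fin, open('huge.raw', 'wb') as fout:
    shutil.copyfileobj(fin, fout, length=1024 * 1024)  # 1 MB copy buffer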
Version Notes¶
- Python 3.2: gzip.compress() and gzip.decompress() one-shot functions added
- Python 3.4: 'x' exclusive-creation modes added to gzip.open() and GzipFile
- Python 3.6: gzip.open() and GzipFile accept path-like objects
- Python 3.8: mtime parameter added to gzip.compress() for reproducible output
Related Documentation¶
- zipfile Module - ZIP archive handling
- tarfile Module - TAR archive handling
- bz2 Module - BZIP2 compression
- io Module - I/O operations