pickle Module Complexity¶
The pickle module serializes Python objects into bytes and deserializes them back, enabling object persistence and network transmission while preserving Python object types and structure.
Complexity Reference¶
| Operation | Time | Space | Notes |
|---|---|---|---|
| pickle.dumps(obj) | O(n) | O(n) | Serialize to bytes, n = object size |
| pickle.loads(data) | O(n) | O(n) | Deserialize from bytes, n = data size |
| dump() to file | O(n) | O(n) | n = object size, requires memoization for object graph |
| load() from file | O(n) | O(n) | n = object size, loads all |
| Circular reference handling | O(n) | O(n) | Memoization tracks visited objects |
Basic Pickling¶
Simple Object Serialization¶
import pickle
# Create object - O(1)
data = {'name': 'Alice', 'age': 30, 'scores': [95, 87, 92]}
# Serialize to bytes - O(n) where n = object size
pickled = pickle.dumps(data)
print(type(pickled)) # <class 'bytes'>
print(len(pickled)) # ~50 bytes
# Deserialize from bytes - O(n)
restored = pickle.loads(pickled)
print(restored) # {'name': 'Alice', 'age': 30, 'scores': [95, 87, 92]}
File Persistence¶
import pickle
# Serialize to file - O(n)
data = [1, 2, 3, 4, 5]
with open('data.pkl', 'wb') as f:
pickle.dump(data, f) # O(n) streaming
# Deserialize from file - O(n)
with open('data.pkl', 'rb') as f:
restored = pickle.load(f) # O(n) reads all
print(restored) # [1, 2, 3, 4, 5]
Pickle Protocols¶
Protocol Versions¶
import pickle
obj = {'a': 1, 'b': [2, 3, 4]}
# Protocol 0 (ASCII, human-readable) - O(n) slowest
p0 = pickle.dumps(obj, protocol=0)
print(len(p0)) # Largest
# Protocol 1 (Binary, old) - O(n)
p1 = pickle.dumps(obj, protocol=1)
print(len(p1)) # Smaller
# Protocol 2 (Binary, Python 2.3+) - O(n)
p2 = pickle.dumps(obj, protocol=2)
print(len(p2)) # Similar to protocol 1 (adds a protocol header; more efficient for new-style classes)
# Protocol 3 (Binary, Python 3.0+) - O(n)
p3 = pickle.dumps(obj, protocol=3)
print(len(p3)) # Similar to protocol 2 (adds native bytes support)
# Protocol 4 (Binary, Python 3.4+) - O(n) faster
p4 = pickle.dumps(obj, protocol=4)
print(len(p4)) # Optimized
# Protocol 5 (Binary, Python 3.8+) - O(n), adds out-of-band buffer support
p5 = pickle.dumps(obj, protocol=5)
print(len(p5)) # Same size as protocol 4 for this object
Default Protocol¶
import pickle
import sys
# Default protocol depends on Python version
default = pickle.DEFAULT_PROTOCOL
print(f"Default: {default}") # 3, 4, or 5 depending on version
# Highest protocol available
highest = pickle.HIGHEST_PROTOCOL
print(f"Highest: {highest}") # 5 in Python 3.8+
# For compatibility, specify protocol explicitly
data = {'key': 'value'}
p_compat = pickle.dumps(data, protocol=3) # Python 3.0+ compatible
Custom Serialization¶
`__getstate__` and `__setstate__`¶
import pickle
class Person:
def __init__(self, name, age, password):
self.name = name
self.age = age
self.password = password # Sensitive, don't pickle
# Called during pickling - O(1)
def __getstate__(self):
# Return what to pickle
state = self.__dict__.copy()
del state['password'] # Exclude password
return state
# Called during unpickling - O(1)
def __setstate__(self, state):
self.__dict__.update(state)
self.password = None # Set default
# Pickle - O(n)
person = Person('Alice', 30, 'secret123')
pickled = pickle.dumps(person)
# Unpickle - O(n)
restored = pickle.loads(pickled)
print(restored.name) # 'Alice'
print(restored.password) # None (not stored)
`__reduce__()` Method¶
import pickle
class CustomObject:
def __init__(self, x, y):
self.x = x
self.y = y
# Called during pickling - O(1)
def __reduce__(self):
# Return (callable, args) to recreate object
return (CustomObject, (self.x, self.y))
# Pickle - O(n)
obj = CustomObject(10, 20)
pickled = pickle.dumps(obj)
# Unpickle - O(n)
restored = pickle.loads(pickled)
print(restored.x, restored.y) # 10 20
Complex Object Types¶
Circular References¶
import pickle
# Create circular reference
node1 = {'name': 'A', 'next': None}
node2 = {'name': 'B', 'next': None}
node1['next'] = node2
node2['next'] = node1 # Circular
# Pickle handles with memoization - O(n)
pickled = pickle.dumps(node1)
# Unpickle reconstructs structure - O(n)
restored = pickle.loads(pickled)
print(restored['name']) # 'A'
print(restored['next']['name']) # 'B'
print(restored['next']['next'] is restored) # True (circular preserved)
Class Instances¶
import pickle
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def __repr__(self):
return f"Point({self.x}, {self.y})"
# Pickle - O(n)
p = Point(3, 4)
pickled = pickle.dumps(p)
# Unpickle - O(n) (requires class accessible)
restored = pickle.loads(pickled)
print(restored) # Point(3, 4)
print(type(restored)) # <class '__main__.Point'>
Nested Objects¶
import pickle
class Team:
def __init__(self, name, members):
self.name = name
self.members = members # List of dicts
# Pickle nested structure - O(n*m)
team = Team('Team A', [
{'name': 'Alice', 'role': 'lead'},
{'name': 'Bob', 'role': 'dev'}
])
pickled = pickle.dumps(team)
# Unpickle - O(n*m)
restored = pickle.loads(pickled)
print(restored.name) # 'Team A'
print(restored.members[0]) # {'name': 'Alice', ...}
Performance and Size¶
Protocol Comparison¶
import pickle
import sys
# Create test object
test_data = {
'strings': ['a' * 100 for _ in range(10)],
'numbers': list(range(1000)),
'nested': [{'id': i, 'value': i**2} for i in range(100)]
}
# Compare sizes
sizes = {}
for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
pickled = pickle.dumps(test_data, protocol=protocol)
sizes[protocol] = len(pickled)
print(f"Protocol {protocol}: {len(pickled)} bytes")
# Higher protocols typically produce smaller output
Large Object Serialization¶
import pickle
import io
# Large list - O(n)
large_list = list(range(1000000))
# Method 1: dumps - creates bytes in memory - O(n) space
data_bytes = pickle.dumps(large_list) # Large memory usage
# Method 2: dump to file - O(n) time; output is written to the file in chunks instead of one large bytes object (the memo can still grow with the object graph)
with open('large.pkl', 'wb') as f:
pickle.dump(large_list, f) # Much more memory efficient
# Method 3: Pickler with file - O(n) time; writes to the file incrementally like pickle.dump()
with open('large.pkl', 'wb') as f:
pickler = pickle.Pickler(f)
pickler.dump(large_list)
Unpickler Customization¶
Custom Unpickler¶
import pickle
# Version migration on unpickle
class VersionedUnpickler(pickle.Unpickler):
def find_class(self, module, name):
# Intercept class loading - O(1)
if module == 'old_module':
module = 'new_module'
return super().find_class(module, name)
# Usage
data = b'...' # Pickled data
with open('old_data.pkl', 'rb') as f:
unpickler = VersionedUnpickler(f)
obj = unpickler.load()
Security Considerations¶
Dangerous Unpickling¶
import pickle
# ⚠️ SECURITY RISK: Never unpickle untrusted data!
# Pickle can execute arbitrary code during unpickling
# Unsafe - could execute malicious code
untrusted_data = b'...' # From network/user
# obj = pickle.loads(untrusted_data) # DANGEROUS!
# Safe: Use alternative formats for untrusted data
import json
safe_data = json.loads(untrusted_data) # JSON is safe (parsing never executes code)
Restrict Classes¶
import pickle
import io
class RestrictedUnpickler(pickle.Unpickler):
def find_class(self, module, name):
# Only allow specific classes - O(1)
if module == 'my_app' and name in ['SafeClass', 'Data']:
return super().find_class(module, name)
raise pickle.UnpicklingError(f"Forbidden: {module}.{name}")
# Use restricted unpickler
data = b'...'
with io.BytesIO(data) as f:
unpickler = RestrictedUnpickler(f)
obj = unpickler.load()
Pickle Alternatives¶
JSON (Safe Alternative)¶
import json
# JSON is safe but limited
data = {'name': 'Alice', 'age': 30}
# Serialize - O(n)
json_str = json.dumps(data)
# Deserialize - O(n)
restored = json.loads(json_str)
# Advantages:
# - Human readable
# - Language independent
# - Safe (no code execution)
# Limitation:
# - Limited types (no custom classes)
Dill (Extended Pickling)¶
# pip install dill
import dill
# Dill extends pickle to handle more types
def my_function(x):
return x ** 2
# Can pickle functions - O(n)
pickled = dill.dumps(my_function)
restored = dill.loads(pickled)
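# Standard pickle only stores functions by reference (module + qualified name);
# it cannot pickle lambdas or nested functions, which dill can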
Performance Notes¶
Time Complexity¶
- dumps(): O(n) where n = total size of object graph
- loads(): O(n) where n = size of pickled data
- Circular reference handling: O(n) with memoization
Space Complexity¶
- dumps(): O(n) creates bytes representation
- loads(): O(n) creates deserialized objects
- dump()/load(): Streaming reduces memory for file I/O
Protocol Selection¶
- Protocol 0: Slowest, largest, ASCII (rarely use)
- Protocol 1-2: Legacy compatibility
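- Protocol 3: Default in Python 3.0–3.7; use for compatibility with those versions
- Protocol 4+: More compact encoding (pickle does not compress), faster for large objects; protocol 4 became the default in Python 3.8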
Best Practices¶
Do's¶
- Use protocol 4+ for new code (Python 3.4+)
- Implement `__getstate__` for security/control
- Use file I/O for large objects
- Use json/yaml for inter-language data
Avoid's¶
- Never unpickle untrusted data
- Don't pickle sensitive data
- Don't rely on pickle for long-term storage (fragile across versions)
- Don't use pickle for inter-process communication across Python versions