Diffstat (limited to 'Lib/pickletools.py')
-rw-r--r--  Lib/pickletools.py  871
1 file changed, 604 insertions(+), 267 deletions(-)
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index 612fa8f27e..71c2aa1c79 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -11,6 +11,7 @@ dis(pickle, out=None, memo=None, indentlevel=4)
'''
import codecs
+import io
import pickle
import re
import sys
@@ -34,119 +35,118 @@ bytes_types = pickle.bytes_types
# by a later GET.
-"""
-"A pickle" is a program for a virtual pickle machine (PM, but more accurately
-called an unpickling machine). It's a sequence of opcodes, interpreted by the
-PM, building an arbitrarily complex Python object.
-
-For the most part, the PM is very simple: there are no looping, testing, or
-conditional instructions, no arithmetic and no function calls. Opcodes are
-executed once each, from first to last, until a STOP opcode is reached.
-
-The PM has two data areas, "the stack" and "the memo".
-
-Many opcodes push Python objects onto the stack; e.g., INT pushes a Python
-integer object on the stack, whose value is gotten from a decimal string
-literal immediately following the INT opcode in the pickle bytestream. Other
-opcodes take Python objects off the stack. The result of unpickling is
-whatever object is left on the stack when the final STOP opcode is executed.
-
-The memo is simply an array of objects, or it can be implemented as a dict
-mapping little integers to objects. The memo serves as the PM's "long term
-memory", and the little integers indexing the memo are akin to variable
-names. Some opcodes pop a stack object into the memo at a given index,
-and others push a memo object at a given index onto the stack again.
-
-At heart, that's all the PM has. Subtleties arise for these reasons:
-
-+ Object identity. Objects can be arbitrarily complex, and subobjects
- may be shared (for example, the list [a, a] refers to the same object a
- twice). It can be vital that unpickling recreate an isomorphic object
- graph, faithfully reproducing sharing.
-
-+ Recursive objects. For example, after "L = []; L.append(L)", L is a
- list, and L[0] is the same list. This is related to the object identity
- point, and some sequences of pickle opcodes are subtle in order to
- get the right result in all cases.
-
-+ Things pickle doesn't know everything about. Examples of things pickle
- does know everything about are Python's builtin scalar and container
- types, like ints and tuples. They generally have opcodes dedicated to
- them. For things like module references and instances of user-defined
- classes, pickle's knowledge is limited. Historically, many enhancements
- have been made to the pickle protocol in order to do a better (faster,
- and/or more compact) job on those.
-
-+ Backward compatibility and micro-optimization. As explained below,
- pickle opcodes never go away, not even when better ways to do a thing
- get invented. The repertoire of the PM just keeps growing over time.
- For example, protocol 0 had two opcodes for building Python integers (INT
- and LONG), protocol 1 added three more for more-efficient pickling of short
- integers, and protocol 2 added two more for more-efficient pickling of
- long integers (before protocol 2, the only ways to pickle a Python long
- took time quadratic in the number of digits, for both pickling and
- unpickling). "Opcode bloat" isn't so much a subtlety as a source of
- wearying complication.
-
-
-Pickle protocols:
-
-For compatibility, the meaning of a pickle opcode never changes. Instead new
-pickle opcodes get added, and each version's unpickler can handle all the
-pickle opcodes in all protocol versions to date. So old pickles continue to
-be readable forever. The pickler can generally be told to restrict itself to
-the subset of opcodes available under previous protocol versions too, so that
-users can create pickles under the current version readable by older
-versions. However, a pickle does not contain its version number embedded
-within it. If an older unpickler tries to read a pickle using a later
-protocol, the result is most likely an exception due to seeing an unknown (in
-the older unpickler) opcode.
-
-The original pickle used what's now called "protocol 0", and what was called
-"text mode" before Python 2.3. The entire pickle bytestream is made up of
-printable 7-bit ASCII characters, plus the newline character, in protocol 0.
-That's why it was called text mode. Protocol 0 is small and elegant, but
-sometimes painfully inefficient.
-
-The second major set of additions is now called "protocol 1", and was called
-"binary mode" before Python 2.3. This added many opcodes with arguments
-consisting of arbitrary bytes, including NUL bytes and unprintable "high bit"
-bytes. Binary mode pickles can be substantially smaller than equivalent
-text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
-int as 4 bytes following the opcode, which is cheaper to unpickle than the
-(perhaps) 11-character decimal string attached to INT. Protocol 1 also added
-a number of opcodes that operate on many stack elements at once (like APPENDS
-and SETITEMS), and "shortcut" opcodes (like EMPTY_DICT and EMPTY_TUPLE).
-
-The third major set of additions came in Python 2.3, and is called "protocol
-2". This added:
-
-- A better way to pickle instances of new-style classes (NEWOBJ).
-
-- A way for a pickle to identify its protocol (PROTO).
-
-- Time- and space- efficient pickling of long ints (LONG{1,4}).
-
-- Shortcuts for small tuples (TUPLE{1,2,3}}.
-
-- Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
-
-- The "extension registry", a vector of popular objects that can be pushed
- efficiently by index (EXT{1,2,4}). This is akin to the memo and GET, but
- the registry contents are predefined (there's nothing akin to the memo's
- PUT).
-
-Another independent change with Python 2.3 is the abandonment of any
-pretense that it might be safe to load pickles received from untrusted
-parties -- no sufficient security analysis has been done to guarantee
-this and there isn't a use case that warrants the expense of such an
-analysis.
-
-To this end, all tests for __safe_for_unpickling__ or for
-copyreg.safe_constructors are removed from the unpickling code.
-References to these variables in the descriptions below are to be seen
-as describing unpickling in Python 2.2 and before.
-"""
+# "A pickle" is a program for a virtual pickle machine (PM, but more accurately
+# called an unpickling machine). It's a sequence of opcodes, interpreted by the
+# PM, building an arbitrarily complex Python object.
+#
+# For the most part, the PM is very simple: there are no looping, testing, or
+# conditional instructions, no arithmetic and no function calls. Opcodes are
+# executed once each, from first to last, until a STOP opcode is reached.
+#
+# The PM has two data areas, "the stack" and "the memo".
+#
+# Many opcodes push Python objects onto the stack; e.g., INT pushes a Python
+# integer object on the stack, whose value is gotten from a decimal string
+# literal immediately following the INT opcode in the pickle bytestream. Other
+# opcodes take Python objects off the stack. The result of unpickling is
+# whatever object is left on the stack when the final STOP opcode is executed.
+#
+# The memo is simply an array of objects, or it can be implemented as a dict
+# mapping little integers to objects. The memo serves as the PM's "long term
+# memory", and the little integers indexing the memo are akin to variable
+# names. Some opcodes pop a stack object into the memo at a given index,
+# and others push a memo object at a given index onto the stack again.
+#
+# At heart, that's all the PM has. Subtleties arise for these reasons:
+#
+# + Object identity. Objects can be arbitrarily complex, and subobjects
+# may be shared (for example, the list [a, a] refers to the same object a
+# twice). It can be vital that unpickling recreate an isomorphic object
+# graph, faithfully reproducing sharing.
+#
+# + Recursive objects. For example, after "L = []; L.append(L)", L is a
+# list, and L[0] is the same list. This is related to the object identity
+# point, and some sequences of pickle opcodes are subtle in order to
+# get the right result in all cases.
+#
+# + Things pickle doesn't know everything about. Examples of things pickle
+# does know everything about are Python's builtin scalar and container
+# types, like ints and tuples. They generally have opcodes dedicated to
+# them. For things like module references and instances of user-defined
+# classes, pickle's knowledge is limited. Historically, many enhancements
+# have been made to the pickle protocol in order to do a better (faster,
+# and/or more compact) job on those.
+#
+# + Backward compatibility and micro-optimization. As explained below,
+# pickle opcodes never go away, not even when better ways to do a thing
+# get invented. The repertoire of the PM just keeps growing over time.
+# For example, protocol 0 had two opcodes for building Python integers (INT
+# and LONG), protocol 1 added three more for more-efficient pickling of short
+# integers, and protocol 2 added two more for more-efficient pickling of
+# long integers (before protocol 2, the only ways to pickle a Python long
+# took time quadratic in the number of digits, for both pickling and
+# unpickling). "Opcode bloat" isn't so much a subtlety as a source of
+# wearying complication.
+#
+#
+# Pickle protocols:
+#
+# For compatibility, the meaning of a pickle opcode never changes. Instead new
+# pickle opcodes get added, and each version's unpickler can handle all the
+# pickle opcodes in all protocol versions to date. So old pickles continue to
+# be readable forever. The pickler can generally be told to restrict itself to
+# the subset of opcodes available under previous protocol versions too, so that
+# users can create pickles under the current version readable by older
+# versions. However, a pickle does not contain its version number embedded
+# within it. If an older unpickler tries to read a pickle using a later
+# protocol, the result is most likely an exception due to seeing an unknown (in
+# the older unpickler) opcode.
+#
+# The original pickle used what's now called "protocol 0", and what was called
+# "text mode" before Python 2.3. The entire pickle bytestream is made up of
+# printable 7-bit ASCII characters, plus the newline character, in protocol 0.
+# That's why it was called text mode. Protocol 0 is small and elegant, but
+# sometimes painfully inefficient.
+#
+# The second major set of additions is now called "protocol 1", and was called
+# "binary mode" before Python 2.3. This added many opcodes with arguments
+# consisting of arbitrary bytes, including NUL bytes and unprintable "high bit"
+# bytes. Binary mode pickles can be substantially smaller than equivalent
+# text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
+# int as 4 bytes following the opcode, which is cheaper to unpickle than the
+# (perhaps) 11-character decimal string attached to INT. Protocol 1 also added
+# a number of opcodes that operate on many stack elements at once (like APPENDS
+# and SETITEMS), and "shortcut" opcodes (like EMPTY_DICT and EMPTY_TUPLE).
+#
+# The third major set of additions came in Python 2.3, and is called "protocol
+# 2". This added:
+#
+# - A better way to pickle instances of new-style classes (NEWOBJ).
+#
+# - A way for a pickle to identify its protocol (PROTO).
+#
+# - Time- and space- efficient pickling of long ints (LONG{1,4}).
+#
+# - Shortcuts for small tuples (TUPLE{1,2,3}).
+#
+# - Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
+#
+# - The "extension registry", a vector of popular objects that can be pushed
+# efficiently by index (EXT{1,2,4}). This is akin to the memo and GET, but
+# the registry contents are predefined (there's nothing akin to the memo's
+# PUT).
+#
+# Another independent change with Python 2.3 is the abandonment of any
+# pretense that it might be safe to load pickles received from untrusted
+# parties -- no sufficient security analysis has been done to guarantee
+# this and there isn't a use case that warrants the expense of such an
+# analysis.
+#
+# To this end, all tests for __safe_for_unpickling__ or for
+# copyreg.safe_constructors are removed from the unpickling code.
+# References to these variables in the descriptions below are to be seen
+# as describing unpickling in Python 2.2 and before.
+
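As a rough illustration of the stack/memo model described above, here is a toy interpreter (purely illustrative -- the opcode names below are made up; the real opcodes and their readers are defined later in this module):

    def toy_pm(ops):
        # ops stands in for an already-decoded opcode stream.
        stack, memo = [], {}
        for op, arg in ops:
            if op == "PUSH":       # akin to INT, BINUNICODE, ... pushing a value
                stack.append(arg)
            elif op == "PUT":      # store the stack top in the memo at index arg
                memo[arg] = stack[-1]
            elif op == "GET":      # push a memoized object onto the stack again
                stack.append(memo[arg])
            elif op == "STOP":     # the result is whatever is left on top
                return stack.pop()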
# Meta-rule: Descriptions are stored in instances of descriptor objects,
# with plain constructors. No meta-language is defined from which
@@ -169,6 +169,7 @@ UP_TO_NEWLINE = -1
TAKEN_FROM_ARGUMENT1 = -2 # num bytes is 1-byte unsigned int
TAKEN_FROM_ARGUMENT4 = -3 # num bytes is 4-byte signed little-endian int
TAKEN_FROM_ARGUMENT4U = -4 # num bytes is 4-byte unsigned little-endian int
+TAKEN_FROM_ARGUMENT8U = -5 # num bytes is 8-byte unsigned little-endian int
class ArgumentDescriptor(object):
__slots__ = (
@@ -176,7 +177,7 @@ class ArgumentDescriptor(object):
'name',
# length of argument, in bytes; an int; UP_TO_NEWLINE and
- # TAKEN_FROM_ARGUMENT{1,4} are negative values for variable-length
+ # TAKEN_FROM_ARGUMENT{1,4,8} are negative values for variable-length
# cases
'n',
@@ -197,7 +198,8 @@ class ArgumentDescriptor(object):
n in (UP_TO_NEWLINE,
TAKEN_FROM_ARGUMENT1,
TAKEN_FROM_ARGUMENT4,
- TAKEN_FROM_ARGUMENT4U))
+ TAKEN_FROM_ARGUMENT4U,
+ TAKEN_FROM_ARGUMENT8U))
self.n = n
self.reader = reader
@@ -289,6 +291,27 @@ uint4 = ArgumentDescriptor(
doc="Four-byte unsigned integer, little-endian.")
+def read_uint8(f):
+ r"""
+ >>> import io
+ >>> read_uint8(io.BytesIO(b'\xff\x00\x00\x00\x00\x00\x00\x00'))
+ 255
+ >>> read_uint8(io.BytesIO(b'\xff' * 8)) == 2**64-1
+ True
+ """
+
+ data = f.read(8)
+ if len(data) == 8:
+ return _unpack("<Q", data)[0]
+ raise ValueError("not enough data in stream to read uint8")
+
+uint8 = ArgumentDescriptor(
+ name='uint8',
+ n=8,
+ reader=read_uint8,
+ doc="Eight-byte unsigned integer, little-endian.")
+
+
def read_stringnl(f, decode=True, stripquotes=True):
r"""
>>> import io
@@ -382,6 +405,36 @@ stringnl_noescape_pair = ArgumentDescriptor(
a single blank separating the two strings.
""")
+
+def read_string1(f):
+ r"""
+ >>> import io
+ >>> read_string1(io.BytesIO(b"\x00"))
+ ''
+ >>> read_string1(io.BytesIO(b"\x03abcdef"))
+ 'abc'
+ """
+
+ n = read_uint1(f)
+ assert n >= 0
+ data = f.read(n)
+ if len(data) == n:
+ return data.decode("latin-1")
+ raise ValueError("expected %d bytes in a string1, but only %d remain" %
+ (n, len(data)))
+
+string1 = ArgumentDescriptor(
+ name="string1",
+ n=TAKEN_FROM_ARGUMENT1,
+ reader=read_string1,
+ doc="""A counted string.
+
+ The first argument is a 1-byte unsigned int giving the number
+ of bytes in the string, and the second argument is that many
+ bytes.
+ """)
+
+
def read_string4(f):
r"""
>>> import io
@@ -416,28 +469,28 @@ string4 = ArgumentDescriptor(
""")
-def read_string1(f):
+def read_bytes1(f):
r"""
>>> import io
- >>> read_string1(io.BytesIO(b"\x00"))
- ''
- >>> read_string1(io.BytesIO(b"\x03abcdef"))
- 'abc'
+ >>> read_bytes1(io.BytesIO(b"\x00"))
+ b''
+ >>> read_bytes1(io.BytesIO(b"\x03abcdef"))
+ b'abc'
"""
n = read_uint1(f)
assert n >= 0
data = f.read(n)
if len(data) == n:
- return data.decode("latin-1")
- raise ValueError("expected %d bytes in a string1, but only %d remain" %
+ return data
+ raise ValueError("expected %d bytes in a bytes1, but only %d remain" %
(n, len(data)))
-string1 = ArgumentDescriptor(
- name="string1",
+bytes1 = ArgumentDescriptor(
+ name="bytes1",
n=TAKEN_FROM_ARGUMENT1,
- reader=read_string1,
- doc="""A counted string.
+ reader=read_bytes1,
+ doc="""A counted bytes string.
The first argument is a 1-byte unsigned int giving the number
of bytes in the string, and the second argument is that many
@@ -487,6 +540,7 @@ def read_bytes4(f):
"""
n = read_uint4(f)
+ assert n >= 0
if n > sys.maxsize:
raise ValueError("bytes4 byte count > sys.maxsize: %d" % n)
data = f.read(n)
@@ -506,6 +560,40 @@ bytes4 = ArgumentDescriptor(
""")
+def read_bytes8(f):
+ r"""
+ >>> import io, struct, sys
+ >>> read_bytes8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
+ b''
+ >>> read_bytes8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
+ b'abc'
+ >>> bigsize8 = struct.pack("<Q", sys.maxsize//3)
+ >>> read_bytes8(io.BytesIO(bigsize8 + b"abcdef")) #doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ ValueError: expected ... bytes in a bytes8, but only 6 remain
+ """
+
+ n = read_uint8(f)
+ assert n >= 0
+ if n > sys.maxsize:
+ raise ValueError("bytes8 byte count > sys.maxsize: %d" % n)
+ data = f.read(n)
+ if len(data) == n:
+ return data
+ raise ValueError("expected %d bytes in a bytes8, but only %d remain" %
+ (n, len(data)))
+
+bytes8 = ArgumentDescriptor(
+ name="bytes8",
+ n=TAKEN_FROM_ARGUMENT8U,
+ reader=read_bytes8,
+ doc="""A counted bytes string.
+
+ The first argument is an 8-byte little-endian unsigned int giving
+ the number of bytes, and the second argument is that many bytes.
+ """)
+
def read_unicodestringnl(f):
r"""
>>> import io
@@ -531,6 +619,46 @@ unicodestringnl = ArgumentDescriptor(
escape sequences.
""")
+
+def read_unicodestring1(f):
+ r"""
+ >>> import io
+ >>> s = 'abcd\uabcd'
+ >>> enc = s.encode('utf-8')
+ >>> enc
+ b'abcd\xea\xaf\x8d'
+ >>> n = bytes([len(enc)]) # little-endian 1-byte length
+ >>> t = read_unicodestring1(io.BytesIO(n + enc + b'junk'))
+ >>> s == t
+ True
+
+ >>> read_unicodestring1(io.BytesIO(n + enc[:-1]))
+ Traceback (most recent call last):
+ ...
+ ValueError: expected 7 bytes in a unicodestring1, but only 6 remain
+ """
+
+ n = read_uint1(f)
+ assert n >= 0
+ data = f.read(n)
+ if len(data) == n:
+ return str(data, 'utf-8', 'surrogatepass')
+ raise ValueError("expected %d bytes in a unicodestring1, but only %d "
+ "remain" % (n, len(data)))
+
+unicodestring1 = ArgumentDescriptor(
+ name="unicodestring1",
+ n=TAKEN_FROM_ARGUMENT1,
+ reader=read_unicodestring1,
+ doc="""A counted Unicode string.
+
+ The first argument is a 1-byte unsigned int
+ giving the number of bytes in the string, and the second
+ argument-- the UTF-8 encoding of the Unicode string --
+ contains that many bytes.
+ """)
+
+
def read_unicodestring4(f):
r"""
>>> import io
@@ -550,6 +678,7 @@ def read_unicodestring4(f):
"""
n = read_uint4(f)
+ assert n >= 0
if n > sys.maxsize:
raise ValueError("unicodestring4 byte count > sys.maxsize: %d" % n)
data = f.read(n)
@@ -571,6 +700,47 @@ unicodestring4 = ArgumentDescriptor(
""")
+def read_unicodestring8(f):
+ r"""
+ >>> import io
+ >>> s = 'abcd\uabcd'
+ >>> enc = s.encode('utf-8')
+ >>> enc
+ b'abcd\xea\xaf\x8d'
+ >>> n = bytes([len(enc)]) + bytes(7) # little-endian 8-byte length
+ >>> t = read_unicodestring8(io.BytesIO(n + enc + b'junk'))
+ >>> s == t
+ True
+
+ >>> read_unicodestring8(io.BytesIO(n + enc[:-1]))
+ Traceback (most recent call last):
+ ...
+ ValueError: expected 7 bytes in a unicodestring8, but only 6 remain
+ """
+
+ n = read_uint8(f)
+ assert n >= 0
+ if n > sys.maxsize:
+ raise ValueError("unicodestring8 byte count > sys.maxsize: %d" % n)
+ data = f.read(n)
+ if len(data) == n:
+ return str(data, 'utf-8', 'surrogatepass')
+ raise ValueError("expected %d bytes in a unicodestring8, but only %d "
+ "remain" % (n, len(data)))
+
+unicodestring8 = ArgumentDescriptor(
+ name="unicodestring8",
+ n=TAKEN_FROM_ARGUMENT8U,
+ reader=read_unicodestring8,
+ doc="""A counted Unicode string.
+
+ The first argument is an 8-byte little-endian unsigned int
+ giving the number of bytes in the string, and the second
+ argument-- the UTF-8 encoding of the Unicode string --
+ contains that many bytes.
+ """)
+
+
def read_decimalnl_short(f):
r"""
>>> import io
@@ -799,103 +969,107 @@ class StackObject(object):
return self.name
-pyint = StackObject(
- name='int',
- obtype=int,
- doc="A short (as opposed to long) Python integer object.")
-
-pylong = StackObject(
- name='long',
- obtype=int,
- doc="A long (as opposed to short) Python integer object.")
+pyint = pylong = StackObject(
+ name='int',
+ obtype=int,
+ doc="A Python integer object.")
pyinteger_or_bool = StackObject(
- name='int_or_bool',
- obtype=(int, bool),
- doc="A Python integer object (short or long), or "
- "a Python bool.")
+ name='int_or_bool',
+ obtype=(int, bool),
+ doc="A Python integer or boolean object.")
pybool = StackObject(
- name='bool',
- obtype=(bool,),
- doc="A Python bool object.")
+ name='bool',
+ obtype=bool,
+ doc="A Python boolean object.")
pyfloat = StackObject(
- name='float',
- obtype=float,
- doc="A Python float object.")
+ name='float',
+ obtype=float,
+ doc="A Python float object.")
-pystring = StackObject(
- name='string',
- obtype=bytes,
- doc="A Python (8-bit) string object.")
+pybytes_or_str = pystring = StackObject(
+ name='bytes_or_str',
+ obtype=(bytes, str),
+ doc="A Python bytes or (Unicode) string object.")
pybytes = StackObject(
- name='bytes',
- obtype=bytes,
- doc="A Python bytes object.")
+ name='bytes',
+ obtype=bytes,
+ doc="A Python bytes object.")
pyunicode = StackObject(
- name='str',
- obtype=str,
- doc="A Python (Unicode) string object.")
+ name='str',
+ obtype=str,
+ doc="A Python (Unicode) string object.")
pynone = StackObject(
- name="None",
- obtype=type(None),
- doc="The Python None object.")
+ name="None",
+ obtype=type(None),
+ doc="The Python None object.")
pytuple = StackObject(
- name="tuple",
- obtype=tuple,
- doc="A Python tuple object.")
+ name="tuple",
+ obtype=tuple,
+ doc="A Python tuple object.")
pylist = StackObject(
- name="list",
- obtype=list,
- doc="A Python list object.")
+ name="list",
+ obtype=list,
+ doc="A Python list object.")
pydict = StackObject(
- name="dict",
- obtype=dict,
- doc="A Python dict object.")
+ name="dict",
+ obtype=dict,
+ doc="A Python dict object.")
+
+pyset = StackObject(
+ name="set",
+ obtype=set,
+ doc="A Python set object.")
+
+pyfrozenset = StackObject(
+ name="frozenset",
+ obtype=set,
+ doc="A Python frozenset object.")
anyobject = StackObject(
- name='any',
- obtype=object,
- doc="Any kind of object whatsoever.")
+ name='any',
+ obtype=object,
+ doc="Any kind of object whatsoever.")
markobject = StackObject(
- name="mark",
- obtype=StackObject,
- doc="""'The mark' is a unique object.
-
- Opcodes that operate on a variable number of objects
- generally don't embed the count of objects in the opcode,
- or pull it off the stack. Instead the MARK opcode is used
- to push a special marker object on the stack, and then
- some other opcodes grab all the objects from the top of
- the stack down to (but not including) the topmost marker
- object.
- """)
+ name="mark",
+ obtype=StackObject,
+ doc="""'The mark' is a unique object.
+
+Opcodes that operate on a variable number of objects
+generally don't embed the count of objects in the opcode,
+or pull it off the stack. Instead the MARK opcode is used
+to push a special marker object on the stack, and then
+some other opcodes grab all the objects from the top of
+the stack down to (but not including) the topmost marker
+object.
+""")
stackslice = StackObject(
- name="stackslice",
- obtype=StackObject,
- doc="""An object representing a contiguous slice of the stack.
+ name="stackslice",
+ obtype=StackObject,
+ doc="""An object representing a contiguous slice of the stack.
- This is used in conjunction with markobject, to represent all
- of the stack following the topmost markobject. For example,
- the POP_MARK opcode changes the stack from
+This is used in conjunction with markobject, to represent all
+of the stack following the topmost markobject. For example,
+the POP_MARK opcode changes the stack from
- [..., markobject, stackslice]
- to
- [...]
+ [..., markobject, stackslice]
+to
+ [...]
- No matter how many object are on the stack after the topmost
- markobject, POP_MARK gets rid of all of them (including the
- topmost markobject too).
- """)
+No matter how many objects are on the stack after the topmost
+markobject, POP_MARK gets rid of all of them (including the
+topmost markobject too).
+""")
##############################################################################
# Descriptors for pickle opcodes.
@@ -1032,7 +1206,7 @@ opcodes = [
code='L',
arg=decimalnl_long,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=0,
doc="""Push a long integer.
@@ -1050,7 +1224,7 @@ opcodes = [
code='\x8a',
arg=long1,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=2,
doc="""Long integer using one-byte length.
@@ -1061,7 +1235,7 @@ opcodes = [
code='\x8b',
arg=long4,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=2,
doc="""Long integer using found-byte length.
@@ -1074,45 +1248,50 @@ opcodes = [
code='S',
arg=stringnl,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=0,
doc="""Push a Python string object.
The argument is a repr-style string, with bracketing quote characters,
and perhaps embedded escapes. The argument extends until the next
- newline character. (Actually, they are decoded into a str instance
+ newline character. These are usually decoded into a str instance
using the encoding given to the Unpickler constructor, or the default,
- 'ASCII'.)
+ 'ASCII'. If the encoding given was 'bytes', however, they will be
+ decoded as a bytes object instead.
"""),
I(name='BINSTRING',
code='T',
arg=string4,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=1,
doc="""Push a Python string object.
- There are two arguments: the first is a 4-byte little-endian signed int
- giving the number of bytes in the string, and the second is that many
- bytes, which are taken literally as the string content. (Actually,
- they are decoded into a str instance using the encoding given to the
- Unpickler constructor. or the default, 'ASCII'.)
+ There are two arguments: the first is a 4-byte little-endian
+ signed int giving the number of bytes in the string, and the
+ second is that many bytes, which are taken literally as the string
+ content. These are usually decoded into a str instance using the
+ encoding given to the Unpickler constructor, or the default,
+ 'ASCII'. If the encoding given was 'bytes', however, they will be
+ decoded as a bytes object instead.
"""),
I(name='SHORT_BINSTRING',
code='U',
arg=string1,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=1,
doc="""Push a Python string object.
- There are two arguments: the first is a 1-byte unsigned int giving
- the number of bytes in the string, and the second is that many bytes,
- which are taken literally as the string content. (Actually, they
- are decoded into a str instance using the encoding given to the
- Unpickler constructor. or the default, 'ASCII'.)
+ There are two arguments: the first is a 1-byte unsigned int giving
+ the number of bytes in the string, and the second is that many
+ bytes, which are taken literally as the string content. These are
+ usually decoded into a str instance using the encoding given to
+ the Unpickler constructor, or the default, 'ASCII'. If the
+ encoding given was 'bytes', however, they will be decoded as a bytes
+ object instead.
"""),
# Bytes (protocol 3 only; older protocols don't support bytes at all)
@@ -1143,6 +1322,19 @@ opcodes = [
literally as the string content.
"""),
+ I(name='BINBYTES8',
+ code='\x8e',
+ arg=bytes8,
+ stack_before=[],
+ stack_after=[pybytes],
+ proto=4,
+ doc="""Push a Python bytes object.
+
+ There are two arguments: the first is an 8-byte unsigned int giving
+ the number of bytes in the string, and the second is that many bytes,
+ which are taken literally as the string content.
+ """),
+
# Ways to spell None.
I(name='NONE',
@@ -1191,6 +1383,19 @@ opcodes = [
until the next newline character.
"""),
+ I(name='SHORT_BINUNICODE',
+ code='\x8c',
+ arg=unicodestring1,
+ stack_before=[],
+ stack_after=[pyunicode],
+ proto=4,
+ doc="""Push a Python Unicode string object.
+
+ There are two arguments: the first is a 1-byte unsigned int
+ giving the number of bytes in the string. The second is that many
+ bytes, and is the UTF-8 encoding of the Unicode string.
+ """),
+
I(name='BINUNICODE',
code='X',
arg=unicodestring4,
@@ -1204,6 +1409,19 @@ opcodes = [
bytes, and is the UTF-8 encoding of the Unicode string.
"""),
+ I(name='BINUNICODE8',
+ code='\x8d',
+ arg=unicodestring8,
+ stack_before=[],
+ stack_after=[pyunicode],
+ proto=4,
+ doc="""Push a Python Unicode string object.
+
+ There are two arguments: the first is an 8-byte little-endian unsigned int
+ giving the number of bytes in the string. The second is that many
+ bytes, and is the UTF-8 encoding of the Unicode string.
+ """),
+
# Ways to spell floats.
I(name='FLOAT',
@@ -1429,6 +1647,54 @@ opcodes = [
1, 2, ..., n, and in that order.
"""),
+ # Ways to build sets
+
+ I(name='EMPTY_SET',
+ code='\x8f',
+ arg=None,
+ stack_before=[],
+ stack_after=[pyset],
+ proto=4,
+ doc="Push an empty set."),
+
+ I(name='ADDITEMS',
+ code='\x90',
+ arg=None,
+ stack_before=[pyset, markobject, stackslice],
+ stack_after=[pyset],
+ proto=4,
+ doc="""Add an arbitrary number of items to an existing set.
+
+ The slice of the stack following the topmost markobject is taken as
+ a sequence of items, added to the set immediately under the topmost
+ markobject. Everything at and after the topmost markobject is popped,
+ leaving the mutated set at the top of the stack.
+
+ Stack before: ... pyset markobject item_1 ... item_n
+ Stack after: ... pyset
+
+ where pyset has been modified via pyset.add(item_i) for i in
+ 1, 2, ..., n, and in that order.
+ """),
+
+ # Way to build frozensets
+
+ I(name='FROZENSET',
+ code='\x91',
+ arg=None,
+ stack_before=[markobject, stackslice],
+ stack_after=[pyfrozenset],
+ proto=4,
+ doc="""Build a frozenset out of the topmost slice, after markobject.
+
+ All the stack entries following the topmost markobject are placed into
+ a single Python frozenset, which single frozenset object replaces all
+ of the stack from the topmost markobject onward. For example,
+
+ Stack before: ... markobject 1 2 3
+ Stack after: ... frozenset({1, 2, 3})
+ """),
+
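These opcodes are easy to see in a live trace; assuming this module is importable as pickletools, something like the following shows EMPTY_SET/ADDITEMS for a set and MARK ... FROZENSET for a frozenset (exact output varies with the input):

    import pickle, pickletools
    pickletools.dis(pickle.dumps({1, 2, 3}, protocol=4))
    pickletools.dis(pickle.dumps(frozenset({1, 2}), protocol=4))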
# Stack manipulation.
I(name='POP',
@@ -1550,6 +1816,18 @@ opcodes = [
unsigned little-endian integer following.
"""),
+ I(name='MEMOIZE',
+ code='\x94',
+ arg=None,
+ stack_before=[anyobject],
+ stack_after=[anyobject],
+ proto=4,
+ doc="""Store the stack top into the memo. The stack is not popped.
+
+ The index of the memo location to write is the number of
+ elements currently present in the memo.
+ """),
+
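In other words, MEMOIZE acts like a PUT whose memo index is implicit; a minimal sketch of its effect, with stack and memo standing for the PM's data areas:

    def memoize(stack, memo):
        # The new memo key is simply the current size of the memo;
        # the stack is left unchanged.
        memo[len(memo)] = stack[-1]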
# Access the extension registry (predefined objects). Akin to the GET
# family.
@@ -1615,6 +1893,15 @@ opcodes = [
stack, so unpickling subclasses can override this form of lookup.
"""),
+ I(name='STACK_GLOBAL',
+ code='\x93',
+ arg=None,
+ stack_before=[pyunicode, pyunicode],
+ stack_after=[anyobject],
+ proto=4,
+ doc="""Push a global object (module.attr) on the stack.
+ """),
+
# Ways to build objects of classes pickle doesn't know about directly
# (user-defined classes). I despair of documenting this accurately
# and comprehensibly -- you really have to read the pickle code to
@@ -1771,6 +2058,21 @@ opcodes = [
onto the stack.
"""),
+ I(name='NEWOBJ_EX',
+ code='\x92',
+ arg=None,
+ stack_before=[anyobject, anyobject, anyobject],
+ stack_after=[anyobject],
+ proto=4,
+ doc="""Build an object instance.
+
+ The stack before should be thought of as containing a class
+ object followed by an argument tuple and by a keyword argument dict
+ (the dict being the stack top). Call these cls, args, and kwargs. They are
+ popped off the stack, and the value returned by
+ cls.__new__(cls, *args, **kwargs) is pushed back onto the stack.
+ """),
+
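Roughly, handling NEWOBJ_EX amounts to the following sketch (stack stands for the PM stack; this is not the real unpickler code):

    def load_newobj_ex(stack):
        kwargs = stack.pop()    # the dict is the stack top
        args = stack.pop()
        cls = stack.pop()
        stack.append(cls.__new__(cls, *args, **kwargs))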
# Machine control.
I(name='PROTO',
@@ -1798,6 +2100,20 @@ opcodes = [
empty then.
"""),
+ # Framing support.
+
+ I(name='FRAME',
+ code='\x95',
+ arg=uint8,
+ stack_before=[],
+ stack_after=[],
+ proto=4,
+ doc="""Indicate the beginning of a new frame.
+
+ The unpickler may use this opcode to safely prefetch data from its
+ underlying stream.
+ """),
+
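On the wire a frame is just the FRAME opcode, an 8-byte little-endian length, and then that many bytes of further opcodes; a minimal sketch, where payload is a placeholder for the framed opcode run:

    import struct

    def make_frame(payload):
        # FRAME ('\x95') + 8-byte little-endian length + the framed opcodes.
        return b'\x95' + struct.pack("<Q", len(payload)) + payload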
# Ways to deal with persistent IDs.
I(name='PERSID',
@@ -1904,6 +2220,38 @@ del assure_pickle_consistency
##############################################################################
# A pickle opcode generator.
+def _genops(data, yield_end_pos=False):
+ if isinstance(data, bytes_types):
+ data = io.BytesIO(data)
+
+ if hasattr(data, "tell"):
+ getpos = data.tell
+ else:
+ getpos = lambda: None
+
+ while True:
+ pos = getpos()
+ code = data.read(1)
+ opcode = code2op.get(code.decode("latin-1"))
+ if opcode is None:
+ if code == b"":
+ raise ValueError("pickle exhausted before seeing STOP")
+ else:
+ raise ValueError("at position %s, opcode %r unknown" % (
+ "<unknown>" if pos is None else pos,
+ code))
+ if opcode.arg is None:
+ arg = None
+ else:
+ arg = opcode.arg.reader(data)
+ if yield_end_pos:
+ yield opcode, arg, pos, getpos()
+ else:
+ yield opcode, arg, pos
+ if code == b'.':
+ assert opcode.name == 'STOP'
+ break
+
def genops(pickle):
"""Generate all the opcodes in a pickle.
@@ -1927,62 +2275,48 @@ def genops(pickle):
used. Else (the pickle doesn't have a tell(), and it's not obvious how
to query its current position) pos is None.
"""
-
- if isinstance(pickle, bytes_types):
- import io
- pickle = io.BytesIO(pickle)
-
- if hasattr(pickle, "tell"):
- getpos = pickle.tell
- else:
- getpos = lambda: None
-
- while True:
- pos = getpos()
- code = pickle.read(1)
- opcode = code2op.get(code.decode("latin-1"))
- if opcode is None:
- if code == b"":
- raise ValueError("pickle exhausted before seeing STOP")
- else:
- raise ValueError("at position %s, opcode %r unknown" % (
- pos is None and "<unknown>" or pos,
- code))
- if opcode.arg is None:
- arg = None
- else:
- arg = opcode.arg.reader(pickle)
- yield opcode, arg, pos
- if code == b'.':
- assert opcode.name == 'STOP'
- break
+ return _genops(pickle)
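For example, assuming this module is importable as pickletools, genops() can be used to tally the opcodes in a pickle:

    import collections, pickle, pickletools
    counts = collections.Counter(
        opcode.name
        for opcode, arg, pos in pickletools.genops(pickle.dumps([1, 2, 3], protocol=2)))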
##############################################################################
# A pickle optimizer.
def optimize(p):
'Optimize a pickle string by removing unused PUT opcodes'
- gets = set() # set of args used by a GET opcode
- puts = [] # (arg, startpos, stoppos) for the PUT opcodes
- prevpos = None # set to pos if previous opcode was a PUT
- for opcode, arg, pos in genops(p):
- if prevpos is not None:
- puts.append((prevarg, prevpos, pos))
- prevpos = None
+ not_a_put = object()
+ gets = { not_a_put } # set of args used by a GET opcode
+ opcodes = [] # (startpos, stoppos, putid)
+ proto = 0
+ for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
if 'PUT' in opcode.name:
- prevarg, prevpos = arg, pos
- elif 'GET' in opcode.name:
- gets.add(arg)
-
- # Copy the pickle string except for PUTS without a corresponding GET
- s = []
- i = 0
- for arg, start, stop in puts:
- j = stop if (arg in gets) else start
- s.append(p[i:j])
- i = stop
- s.append(p[i:])
- return b''.join(s)
+ opcodes.append((pos, end_pos, arg))
+ elif 'FRAME' in opcode.name:
+ pass
+ else:
+ if 'GET' in opcode.name:
+ gets.add(arg)
+ elif opcode.name == 'PROTO':
+ assert pos == 0, pos
+ proto = arg
+ opcodes.append((pos, end_pos, not_a_put))
+ prevpos, prevarg = pos, None
+
+ # Copy the opcodes except for PUTS without a corresponding GET
+ out = io.BytesIO()
+ opcodes = iter(opcodes)
+ if proto >= 2:
+ # Write the PROTO header before any framing
+ start, stop, _ = next(opcodes)
+ out.write(p[start:stop])
+ buf = pickle._Framer(out.write)
+ if proto >= 4:
+ buf.start_framing()
+ for start, stop, putid in opcodes:
+ if putid in gets:
+ buf.commit_frame()
+ buf.write(p[start:stop])
+ if proto >= 4:
+ buf.end_framing()
+ return out.getvalue()
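A quick sanity check of the optimizer (the exact savings depend on the pickle, but the optimized pickle must load to an equal object, and at protocol 3 it can only shrink, since only unused PUTs are dropped):

    import pickle, pickletools
    p = pickle.dumps([[1, 2]] * 3, protocol=3)
    q = pickletools.optimize(p)
    assert pickle.loads(q) == pickle.loads(p)
    assert len(q) <= len(p)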
##############################################################################
# A symbolic pickle disassembler.
@@ -2082,17 +2416,20 @@ def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
errormsg = markmsg = "no MARK exists on stack"
# Check for correct memo usage.
- if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"):
- assert arg is not None
- if arg in memo:
+ if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT", "MEMOIZE"):
+ if opcode.name == "MEMOIZE":
+ memo_idx = len(memo)
+ else:
+ assert arg is not None
+ memo_idx = arg
+ if memo_idx in memo:
errormsg = "memo key %r already defined" % arg
elif not stack:
errormsg = "stack is empty -- can't store into memo"
elif stack[-1] is markobject:
errormsg = "can't store markobject in the memo"
else:
- memo[arg] = stack[-1]
-
+ memo[memo_idx] = stack[-1]
elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
if arg in memo:
assert len(after) == 1