diff options
Diffstat (limited to 'Lib/email/header.py')
| -rw-r--r-- | Lib/email/header.py | 136 | 
1 files changed, 96 insertions, 40 deletions
diff --git a/Lib/email/header.py b/Lib/email/header.py index 1d97f8f5ea..e03e42d6c1 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -25,10 +25,11 @@ BSPACE = b' '  SPACE8 = ' ' * 8  EMPTYSTRING = '' -MAXLINELEN = 76 +MAXLINELEN = 78  USASCII = Charset('us-ascii')  UTF8 = Charset('utf-8') +TRANSITIONAL_SPACE = object()  # Match encoded-word strings in the form =?charset?q?Hello_World?=  ecre = re.compile(r''' @@ -109,7 +110,7 @@ def decode_header(header):      last_word = last_charset = None      for word, charset in decoded_words:          if isinstance(word, str): -            word = bytes(ord(c) for c in word) +            word = bytes(word, 'raw-unicode-escape')          if last_word is None:              last_word = word              last_charset = charset @@ -170,7 +171,8 @@ class Header:          The maximum line length can be specified explicit via maxlinelen.  For          splitting the first line to a shorter value (to account for the field          header which isn't included in s, e.g. `Subject') pass in the name of -        the field in header_name.  The default maxlinelen is 76. +        the field in header_name.  The default maxlinelen is 78 as recommended +        by RFC 2822.          continuation_ws must be RFC 2822 compliant folding whitespace (usually          either a space or a hard tab) which will be prepended to continuation @@ -198,9 +200,10 @@ class Header:      def __str__(self):          """Return the string value of the header.""" +        self._normalize()          uchunks = []          lastcs = None -        for s, charset in self._chunks: +        for string, charset in self._chunks:              # We must preserve spaces between encoded and non-encoded word              # boundaries, which means for us we need to add a space when we go              # from a charset to None/us-ascii, or from None/us-ascii to a @@ -214,15 +217,16 @@ class Header:                  elif nextcs not in (None, 'us-ascii'):                      uchunks.append(SPACE)              lastcs = nextcs -            uchunks.append(s) +            uchunks.append(string)          return EMPTYSTRING.join(uchunks)      # Rich comparison operators for equality only.  BAW: does it make sense to      # have or explicitly disable <, <=, >, >= operators?      def __eq__(self, other):          # other may be a Header or a string.  Both are fine so coerce -        # ourselves to a string, swap the args and do another comparison. -        return other == self.encode() +        # ourselves to a unicode (of the unencoded header value), swap the +        # args and do another comparison. +        return other == str(self)      def __ne__(self, other):          return not self == other @@ -267,7 +271,7 @@ class Header:          output_string = input_bytes.decode(output_charset, errors)          self._chunks.append((output_string, charset)) -    def encode(self, splitchars=';, \t'): +    def encode(self, splitchars=';, \t', maxlinelen=None):          """Encode a message header into an RFC-compliant format.          There are many issues involved in converting a given string for use in @@ -290,7 +294,14 @@ class Header:          syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.          """          self._normalize() -        formatter = _ValueFormatter(self._headerlen, self._maxlinelen, +        if maxlinelen is None: +            maxlinelen = self._maxlinelen +        # A maxlinelen of 0 means don't wrap.  For all practical purposes, +        # choosing a huge number here accomplishes that and makes the +        # _ValueFormatter algorithm much simpler. +        if maxlinelen == 0: +            maxlinelen = 1000000 +        formatter = _ValueFormatter(self._headerlen, maxlinelen,                                      self._continuation_ws, splitchars)          for string, charset in self._chunks:              lines = string.splitlines() @@ -301,9 +312,8 @@ class Header:          return str(formatter)      def _normalize(self): -        # Normalize the chunks so that all runs of identical charsets get -        # collapsed into a single unicode string.  You need a space between -        # encoded words, or between encoded and unencoded words. +        # Step 1: Normalize the chunks so that all runs of identical charsets +        # get collapsed into a single unicode string.          chunks = []          last_charset = None          last_chunk = [] @@ -313,8 +323,6 @@ class Header:              else:                  if last_charset is not None:                      chunks.append((SPACE.join(last_chunk), last_charset)) -                    if last_charset != USASCII or charset != USASCII: -                        chunks.append((' ', USASCII))                  last_chunk = [string]                  last_charset = charset          if last_chunk: @@ -333,6 +341,10 @@ class _ValueFormatter:          self._current_line = _Accumulator(headerlen)      def __str__(self): +        # Remove the trailing TRANSITIONAL_SPACE +        last_line = self._current_line.pop() +        if last_line is not TRANSITIONAL_SPACE: +            self._current_line.push(last_line)          self.newline()          return NL.join(self._lines) @@ -348,24 +360,66 @@ class _ValueFormatter:          if len(encoded_string) + len(self._current_line) <= self._maxlen:              self._current_line.push(encoded_string)              return -        # Attempt to split the line at the highest-level syntactic break -        # possible.  Note that we don't have a lot of smarts about field +        # If the charset has no header encoding (i.e. it is an ASCII encoding) +        # then we must split the header at the "highest level syntactic break" +        # possible. Note that we don't have a lot of smarts about field          # syntax; we just try to break on semi-colons, then commas, then -        # whitespace.  Eventually, we'll allow this to be pluggable. -        for ch in self._splitchars: -            if ch in string: -                break -        else: -            # We can't split the string to fit on the current line, so just -            # put it on a line by itself. -            self._lines.append(str(self._current_line)) -            self._current_line.reset(self._continuation_ws) -            self._current_line.push(encoded_string) +        # whitespace.  Eventually, this should be pluggable. +        if charset.header_encoding is None: +            for ch in self._splitchars: +                if ch in string: +                    break +            else: +                ch = None +            # If there's no available split character then regardless of +            # whether the string fits on the line, we have to put it on a line +            # by itself. +            if ch is None: +                if not self._current_line.is_onlyws(): +                    self._lines.append(str(self._current_line)) +                    self._current_line.reset(self._continuation_ws) +                self._current_line.push(encoded_string) +            else: +                self._ascii_split(string, ch)              return -        self._spliterate(string, ch, charset) - -    def _spliterate(self, string, ch, charset): -        holding = _Accumulator(transformfunc=charset.header_encode) +        # Otherwise, we're doing either a Base64 or a quoted-printable +        # encoding which means we don't need to split the line on syntactic +        # breaks.  We can basically just find enough characters to fit on the +        # current line, minus the RFC 2047 chrome.  What makes this trickier +        # though is that we have to split at octet boundaries, not character +        # boundaries but it's only safe to split at character boundaries so at +        # best we can only get close. +        encoded_lines = charset.header_encode_lines(string, self._maxlengths()) +        # The first element extends the current line, but if it's None then +        # nothing more fit on the current line so start a new line. +        try: +            first_line = encoded_lines.pop(0) +        except IndexError: +            # There are no encoded lines, so we're done. +            return +        if first_line is not None: +            self._current_line.push(first_line) +        self._lines.append(str(self._current_line)) +        self._current_line.reset(self._continuation_ws) +        try: +            last_line = encoded_lines.pop() +        except IndexError: +            # There was only one line. +            return +        self._current_line.push(last_line) +        self._current_line.push(TRANSITIONAL_SPACE) +        # Everything else are full lines in themselves. +        for line in encoded_lines: +            self._lines.append(self._continuation_ws + line) + +    def _maxlengths(self): +        # The first line's length. +        yield self._maxlen - len(self._current_line) +        while True: +            yield self._maxlen - self._continuation_ws_len + +    def _ascii_split(self, string, ch): +        holding = _Accumulator()          # Split the line on the split character, preserving it.  If the split          # character is whitespace RFC 2822 $2.2.3 requires us to fold on the          # whitespace, so that the line leads with the original whitespace we @@ -387,8 +441,7 @@ class _ValueFormatter:                      # line, watch out for the current line containing only                      # whitespace.                      holding.pop() -                    if len(self._current_line) == 0 and ( -                        len(holding) == 0 or str(holding).isspace()): +                    if self._current_line.is_onlyws() and holding.is_onlyws():                          # Don't start a new line.                          holding.push(part)                          part = None @@ -492,12 +545,8 @@ def _spliterator(character, string):  class _Accumulator: -    def __init__(self, initial_size=0, transformfunc=None): +    def __init__(self, initial_size=0):          self._initial_size = initial_size -        if transformfunc is None: -            self._transformfunc = lambda string: string -        else: -            self._transformfunc = transformfunc          self._current = []      def push(self, string): @@ -507,14 +556,21 @@ class _Accumulator:          return self._current.pop()      def __len__(self): -        return len(str(self)) + self._initial_size +        return sum((len(string) +                    for string in self._current +                    if string is not TRANSITIONAL_SPACE), +                   self._initial_size)      def __str__(self): -        return self._transformfunc(EMPTYSTRING.join(self._current)) +        return EMPTYSTRING.join( +            (' ' if string is TRANSITIONAL_SPACE else string) +            for string in self._current)      def reset(self, string=None):          self._current = [] -        self._current_len = 0          self._initial_size = 0          if string is not None:              self.push(string) + +    def is_onlyws(self): +        return len(self) == 0 or str(self).isspace()  | 
