fix: Decode field names and filenames correctly

The HTML5 specification states that "field names and filenames for file fields [...] must be escaped by replacing any 0x0A (LF) bytes with the byte sequence %0A, 0x0D (CR) with %0D and 0x22 (") with %22. The user agent must not perform any other escapes.", and tests show that modern browsers actually do this. This differs from traditional header quoting, which backslash-escapes quotes and backslashes. Fixes #60.
defnull · Oct 18, 2024 · 2b75a5a · 2b75a5a
1 parent 1969677
commit 2b75a5a
Show file tree

Hide file tree

Showing 5 changed files with 117 additions and 8 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,13 @@
 Changelog
 =========
 
+Release 1.2
+===========
+
+* fix: Implement modern quoting rules for field names and filenames (#60)
+* feat: Added specialized `content_disposition_[un]quote` functions.
+* feat: `parse_options_header` can now use different unquote functions.
+
 Release 1.1
 ===========
 

diff --git a/multipart.py b/multipart.py
@@ -23,6 +23,21 @@
 from collections.abc import MutableMapping as DictMixin
 import tempfile
 import functools
+import warnings
+
+try:
+ from warnings import deprecated
+except ImportError:
+ from functools import wraps
+ def deprecated(reason):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*a, **ka):
+ warnings.warn(reason, category=DeprecationWarning, stacklevel=2)
+ return func(*a, **ka)
+ func.__deprecated__ = wrapper.__deprecated__ = reason
+ return wrapper
+ return decorator
 
 
 ##############################################################################
@@ -148,13 +163,23 @@ def __get__(self, obj, cls):
 
 
 def header_quote(val):
+ """ Quote header option values if necessary.
+
+ Note: This is NOT the way modern browsers quote field names or filenames
+ in Content-Disposition headers. See :func:`content_disposition_quote`
+ """
  if not _re_special.search(val):
  return val
 
  return '"' + val.replace("\\", "\\\\").replace('"', '\\"') + '"'
 
 
 def header_unquote(val, filename=False):
+ """ Unquote header option values.
+
+ Note: This is NOT the way modern browsers quote field names or filenames
+ in Content-Disposition headers. See :func:`content_disposition_unquote`
+ """
  if val[0] == val[-1] == '"':
  val = val[1:-1]
 
@@ -167,16 +192,52 @@ def header_unquote(val, filename=False):
  return val
 
 
-def parse_options_header(header, options=None):
- value, sep, tail = header.partition(";")
- if not sep:
+def content_disposition_quote(val):
+ """ Quote field names or filenames for Content-Disposition headers the
+ same way modern browsers do it (see WHATWG HTML5 specification).
+ """
+ val = val.replace("\r", "%0D").replace("\n", "%0A").replace('"', "%22")
+ return '"' + val + '"'
+
+
+def content_disposition_unquote(val, filename=False):
+ """ Unquote field names or filenames from Content-Disposition headers.
+
+ Legacy quoting mechanisms are detected to some degree and also supported,
+ but there are rare ambiguous edge cases where we have to guess. If in
+ doubt, this function assumes a modern browser and follows the WHATWG
+ HTML5 specification (limited percent-encoding, no backslash-encoding).
+ """
+
+ if '"' == val[0] == val[-1]:
+ val = val[1:-1]
+ if '\\"' in val: # Legacy backslash-escaped quoted strings
+ val = val.replace("\\\\", "\\").replace('\\"', '"')
+ elif "%" in val: # Modern (HTML5) limited percent-encoding
+ val = val.replace("%0D", "\r").replace("%0A", "\n").replace("%22", '"')
+ # ie6/windows bug: full path instead of just filename
+ if filename and (val[1:3] == ":\\" or val[:2] == "\\\\"):
+ val = val.rpartition("\\")[-1]
+ elif "%" in val: # Modern (HTML5) limited percent-encoding
+ val = val.replace("%0D", "\r").replace("%0A", "\n").replace("%22", '"')
+ return val
+
+
+def parse_options_header(header, options=None, unquote=header_unquote):
+ """ Parse Content-Type (or similar) headers into a primary value 
+ and an options-dict.
+
+ Note: For Content-Disposition headers you need a different unquote
+ function. See `content_disposition_unquote`.
+
+ """
  return header.lower().strip(), {}
 
  options = options or {}
  for match in _re_option.finditer(tail):
  key, val = match.groups()
  key = key.lower()
- options[key] = header_unquote(val, key == "filename")
+ options[key] = unquote(val, key == "filename")
 
  return value.lower(), options
 
@@ -499,7 +560,7 @@ def _close_headers(self):
 
  for h,v in self.headerlist:
  if h == "Content-Disposition":
- dtype, args = parse_options_header(v)
+ dtype, args = parse_options_header(v, unquote=content_disposition_unquote)
  if dtype != "form-data":
  raise self._fail("Invalid Content-Disposition segment header: Wrong type")
  if "name" not in args and self._parser.strict:

diff --git a/test/test_header_utils.py b/test/test_header_utils.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
+import functools
 import unittest
 import multipart
+import pytest
 
 class TestHeaderParser(unittest.TestCase):
 
@@ -11,17 +13,32 @@ def test_token_unquote(self):
  self.assertEqual('ie.exe', unquote('"\\\\network\\ie.exe"', True))
  self.assertEqual('ie.exe', unquote('"c:\\wondows\\ie.exe"', True))
 
+ unquote = multipart.content_disposition_unquote
+ self.assertEqual('foo', unquote('"foo"'))
+ self.assertEqual('foo"bar', unquote('"foo\\"bar"'))
+ self.assertEqual('ie.exe', unquote('"\\\\network\\ie.exe"', True))
+ self.assertEqual('ie.exe', unquote('"c:\\wondows\\ie.exe"', True))
+
  def test_token_quote(self):
  quote = multipart.header_quote
  self.assertEqual(quote('foo'), 'foo')
  self.assertEqual(quote('foo"bar'), '"foo\\"bar"')
 
+ quote = multipart.content_disposition_quote
+ self.assertEqual(quote('foo'), '"foo"')
+ self.assertEqual(quote('foo"bar'), '"foo%22bar"')
+
  def test_options_parser(self):
  parse = multipart.parse_options_header
  head = 'form-data; name="Test"; '
  self.assertEqual(parse(head+'filename="Test.txt"')[0], 'form-data')
  self.assertEqual(parse(head+'filename="Test.txt"')[1]['name'], 'Test')
  self.assertEqual(parse(head+'filename="Test.txt"')[1]['filename'], 'Test.txt')
- self.assertEqual(parse(head+'FileName="Te\\"st.txt"')[1]['filename'], 'Te"st.txt')
+ self.assertEqual(parse(head+'FileName="Te\\"s\\\\t.txt"')[1]['filename'], 'Te"s\\t.txt')
  self.assertEqual(parse(head+'filename="C:\\test\\bla.txt"')[1]['filename'], 'bla.txt')
  self.assertEqual(parse(head+'filename="\\\\test\\bla.txt"')[1]['filename'], 'bla.txt')
+ self.assertEqual(parse(head+'filename="täst.txt"')[1]['filename'], 'täst.txt')
+
+ parse = functools.partial(multipart.parse_options_header, unquote=multipart.content_disposition_unquote)
+ self.assertEqual(parse(head+'FileName="Te%22s\\\\t.txt"')[1]['filename'], 'Te"s\\\\t.txt')
+
diff --git a/test/test_push_parser.py b/test/test_push_parser.py
@@ -63,10 +63,12 @@ def compact_events(self):
  yield current, b''.join(data)
 
  def get_segment(self, index_or_name):
+ allnames = []
  for i, (segment, body) in enumerate(self.compact_events()):
+ allnames.append(segment.name)
  if index_or_name == i or index_or_name == segment.name:
  return segment, body
- self.fail(f"Segment not found: {index_or_name}")
+ self.fail(f"Segment {index_or_name!r} not found in {allnames!r}")
 
 
 class TestPushParser(PushTestBase):
@@ -769,3 +771,25 @@ def test_werkzeug_examples(self):
  self.assertEqual(segment.filename, None)
  self.assertEqual(segment.content_type, None)
  self.assertEqual(body.decode(segment.charset or 'utf8'), forms[field])
+
+
+
+class TestRealWorldExamples(PushTestBase):
+ def test_special_characters(self):
+ """ Test the ultimate segment name/filename from hell. """
+ teststring = 'test \\ \\\\ ; ö " = ;'
+ firefox_131 = ['---------------------------3697486332756351920303607403',
+b'-----------------------------3697486332756351920303607403\r\nContent-Disposition: form-data; name="test \\ \\\\ ; \xc3\xb6 %22 = ;"; filename="test \\ \\\\ ; \xc3\xb6 %22 = ;"\r\nContent-Type: application/octet-stream\r\n\r\ntest \\ \\\\ ; \xc3\xb6 " = ;\r\n-----------------------------3697486332756351920303607403--\r\n']
+ chrome_129 = ["----WebKitFormBoundary9duA54BXJUGUymtb", 
+b'------WebKitFormBoundary9duA54BXJUGUymtb\r\nContent-Disposition: form-data; name="test \\ \\\\ ; \xc3\xb6 %22 = ;"; filename="test \\ \\\\ ; \xc3\xb6 %22 = ;"\r\nContent-Type: application/octet-stream\r\n\r\ntest \\ \\\\ ; \xc3\xb6 " = ;\r\n------WebKitFormBoundary9duA54BXJUGUymtb--\r\n']
+
+ for boundary, body in [firefox_131, chrome_129]:
+ print(repr(boundary))
+ print(repr(body))
+ self.reset(boundary=boundary, strict=True, header_charset='utf8')
+ self.parse(body)
+ segment, body = self.get_segment(teststring)
+ self.assertEqual(segment.name, teststring)
+ self.assertEqual(segment.filename, teststring)
+ self.assertEqual(body, teststring.encode("utf8"))
+
diff --git a/test/utils.py b/test/utils.py
@@ -48,7 +48,7 @@ def write_header(self, header, value, **opts):
  line = to_bytes(header) + b': ' + to_bytes(value)
  for opt, val in opts.items():
  if val is not None:
- line += b"; " + to_bytes(opt) + b'=' + to_bytes(multipart.header_quote(val))
+ line += b"; " + to_bytes(opt) + b'=' + to_bytes(multipart.content_disposition_quote(val))
  self.write(line + b'\r\n')
 
  def write_field(self, name, data, filename=None, content_type=None):