Skip to content

Commit

Permalink
fix: Decode field names and filenames correctly
Browse files Browse the repository at this point in the history
The HTML5 specification defines that "field names and filenames for file fields [...] must be escaped by replacing any 0x0A (LF) bytes with the byte sequence %0A, 0x0D (CR) with %0D and 0x22 (") with %22. The user agent must not perform any other escapes." and tests show that modern browsers actually do that. This is different from traditional header quoting (which involves backslash-escaping quotes and backslashes).

fixes #60
  • Loading branch information
defnull committed Oct 18, 2024
1 parent 1969677 commit 2b75a5a
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 8 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
Changelog
=========

Release 1.2
===========

* fix: Implement modern quoting rules for field names and filenames (#60)
* feat: Added specialized `content_disposition_[un]quote` functions.
* feat: `parse_options_header` can now use different unquote functions.

Release 1.1
===========

Expand Down
71 changes: 66 additions & 5 deletions multipart.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@
from collections.abc import MutableMapping as DictMixin
import tempfile
import functools
import warnings

try:
    from warnings import deprecated
except ImportError:
    from functools import wraps

    def deprecated(reason):
        """ Fallback used when ``warnings.deprecated`` cannot be imported.

            Returns a decorator. The decorated callable emits a
            ``DeprecationWarning`` carrying *reason* each time it is called
            and then delegates to the original function. *reason* is also
            stored as ``__deprecated__`` on both the original function and
            the wrapper.
        """

        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # stacklevel=2 attributes the warning to the caller of the
                # deprecated function instead of to this wrapper.
                warnings.warn(reason, category=DeprecationWarning, stacklevel=2)
                return func(*args, **kwargs)

            for target in (func, wrapper):
                target.__deprecated__ = reason
            return wrapper

        return decorator


##############################################################################
Expand Down Expand Up @@ -148,13 +163,23 @@ def __get__(self, obj, cls):


def header_quote(val):
    """ Return *val* as a header option value, quoting it only when needed.

        Values in which ``_re_special`` finds no match are returned unchanged.
        Otherwise backslashes and double quotes are backslash-escaped and the
        result is wrapped in double quotes.

        Note: This is NOT the way modern browsers quote field names or
        filenames in Content-Disposition headers.
        See :func:`content_disposition_quote`
    """
    if _re_special.search(val):
        # Escape backslashes first so the backslashes added for quotes
        # are not escaped a second time.
        escaped = val.replace("\\", "\\\\").replace('"', '\\"')
        return '"' + escaped + '"'
    return val


def header_unquote(val, filename=False):
""" Unquote header option values.
Note: This is NOT the way modern browsers quote field names or filenames
in Content-Disposition headers. See :func:`content_disposition_unquote`
"""
if val[0] == val[-1] == '"':
val = val[1:-1]

Expand All @@ -167,16 +192,52 @@ def header_unquote(val, filename=False):
return val


def parse_options_header(header, options=None):
value, sep, tail = header.partition(";")
if not sep:
def content_disposition_quote(val):
    """ Quote a field name or filename for a Content-Disposition header the
        way modern browsers do (see WHATWG HTML5 specification).

        Only CR, LF and the double quote are percent-encoded; every other
        character is passed through. The result is always wrapped in double
        quotes.
    """
    # None of the replacement texts contains CR, LF or '"', so a single
    # translate pass yields the same result as chained replace() calls.
    escapes = str.maketrans({"\r": "%0D", "\n": "%0A", '"': "%22"})
    return f'"{val.translate(escapes)}"'


def content_disposition_unquote(val, filename=False):
    """ Unquote field names or filenames from Content-Disposition headers.

        Legacy quoting mechanisms are detected to some degree and also
        supported, but there are rare ambiguous edge cases where we have to
        guess. If in doubt, this function assumes a modern browser and follows
        the WHATWG HTML5 specification (limited percent-encoding, no
        backslash-encoding).

        :param val: Raw option value, with or without surrounding double
            quotes. An empty string is returned unchanged.
        :param filename: If true, a quoted value that looks like a full
            Windows path (``X:\\...`` or ``\\\\host\\...``) is reduced to
            its last path component.
        :return: The decoded value.
    """
    # A quoted value needs both an opening and a closing quote. Requiring
    # two characters avoids an IndexError on "" and stops a lone '"' from
    # being mistaken for a quoted empty string.
    quoted = len(val) >= 2 and val[0] == val[-1] == '"'
    if quoted:
        val = val[1:-1]

    if quoted and '\\"' in val:  # Legacy backslash-escaped quoted strings
        val = val.replace("\\\\", "\\").replace('\\"', '"')
    elif "%" in val:  # Modern (HTML5) limited percent-encoding
        val = val.replace("%0D", "\r").replace("%0A", "\n").replace("%22", '"')

    # ie6/windows bug: full path instead of just filename
    if quoted and filename and (val[1:3] == ":\\" or val[:2] == "\\\\"):
        val = val.rpartition("\\")[-1]
    return val


def parse_options_header(header, options=None, unquote=header_unquote):
""" Parse Content-Type (or similar) headers into a primary value
and an options-dict.
Note: For Content-Disposition headers you need a different unquote
function. See `content_disposition_unquote`.
"""
return header.lower().strip(), {}

options = options or {}
for match in _re_option.finditer(tail):
key, val = match.groups()
key = key.lower()
options[key] = header_unquote(val, key == "filename")
options[key] = unquote(val, key == "filename")

return value.lower(), options

Expand Down Expand Up @@ -499,7 +560,7 @@ def _close_headers(self):

for h,v in self.headerlist:
if h == "Content-Disposition":
dtype, args = parse_options_header(v)
dtype, args = parse_options_header(v, unquote=content_disposition_unquote)
if dtype != "form-data":
raise self._fail("Invalid Content-Disposition segment header: Wrong type")
if "name" not in args and self._parser.strict:
Expand Down
19 changes: 18 additions & 1 deletion test/test_header_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
import functools
import unittest
import multipart
import pytest

class TestHeaderParser(unittest.TestCase):

Expand All @@ -11,17 +13,32 @@ def test_token_unquote(self):
self.assertEqual('ie.exe', unquote('"\\\\network\\ie.exe"', True))
self.assertEqual('ie.exe', unquote('"c:\\wondows\\ie.exe"', True))

unquote = multipart.content_disposition_unquote
self.assertEqual('foo', unquote('"foo"'))
self.assertEqual('foo"bar', unquote('"foo\\"bar"'))
self.assertEqual('ie.exe', unquote('"\\\\network\\ie.exe"', True))
self.assertEqual('ie.exe', unquote('"c:\\wondows\\ie.exe"', True))

def test_token_quote(self):
    """ Check the legacy header quoting and the Content-Disposition
        quoting against the same two inputs.
    """
    legacy_quote = multipart.header_quote
    for raw, expected in [('foo', 'foo'), ('foo"bar', '"foo\\"bar"')]:
        self.assertEqual(legacy_quote(raw), expected)

    modern_quote = multipart.content_disposition_quote
    for raw, expected in [('foo', '"foo"'), ('foo"bar', '"foo%22bar"')]:
        self.assertEqual(modern_quote(raw), expected)

def test_options_parser(self):
    """ Parse a form-data header with several filename spellings, first
        with the default unquote function and then with
        ``content_disposition_unquote``.
    """
    parse = multipart.parse_options_header
    head = 'form-data; name="Test"; '

    dtype, options = parse(head+'filename="Test.txt"')
    self.assertEqual(dtype, 'form-data')
    self.assertEqual(options['name'], 'Test')
    self.assertEqual(options['filename'], 'Test.txt')

    # (filename option as sent, filename expected after parsing)
    filename_cases = [
        ('FileName="Te\\"st.txt"', 'Te"st.txt'),
        ('FileName="Te\\"s\\\\t.txt"', 'Te"s\\t.txt'),
        ('filename="C:\\test\\bla.txt"', 'bla.txt'),
        ('filename="\\\\test\\bla.txt"', 'bla.txt'),
        ('filename="täst.txt"', 'täst.txt'),
    ]
    for option, expected in filename_cases:
        self.assertEqual(parse(head+option)[1]['filename'], expected)

    parse = functools.partial(multipart.parse_options_header, unquote=multipart.content_disposition_unquote)
    self.assertEqual(parse(head+'FileName="Te%22s\\\\t.txt"')[1]['filename'], 'Te"s\\\\t.txt')

26 changes: 25 additions & 1 deletion test/test_push_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,12 @@ def compact_events(self):
yield current, b''.join(data)

def get_segment(self, index_or_name):
    """ Return the ``(segment, body)`` pair from ``self.compact_events()``
        whose position or ``segment.name`` equals *index_or_name*.

        If nothing matches, the test is failed via ``self.fail`` with a
        message listing the names of all segments that were seen.
    """
    allnames = []
    for i, (segment, body) in enumerate(self.compact_events()):
        allnames.append(segment.name)
        if index_or_name == i or index_or_name == segment.name:
            return segment, body
    # Single failure path: the informative message used to sit behind an
    # earlier self.fail() call and was therefore never reached.
    self.fail(f"Segment {index_or_name!r} not found in {allnames!r}")


class TestPushParser(PushTestBase):
Expand Down Expand Up @@ -769,3 +771,25 @@ def test_werkzeug_examples(self):
self.assertEqual(segment.filename, None)
self.assertEqual(segment.content_type, None)
self.assertEqual(body.decode(segment.charset or 'utf8'), forms[field])



class TestRealWorldExamples(PushTestBase):
    def test_special_characters(self):
        """ Parse the recorded ``firefox_131`` and ``chrome_129`` request
            bodies and check that the segment name, the filename and the
            payload all decode to the same string full of special
            characters (backslashes, semicolons, a non-ASCII letter, a
            double quote and an equals sign).
        """
        expected = 'test \\ \\\\ ; ö " = ;'

        # Each sample is a (boundary, raw request body) pair.
        firefox_131 = (
            '---------------------------3697486332756351920303607403',
            b'-----------------------------3697486332756351920303607403\r\nContent-Disposition: form-data; name="test \\ \\\\ ; \xc3\xb6 %22 = ;"; filename="test \\ \\\\ ; \xc3\xb6 %22 = ;"\r\nContent-Type: application/octet-stream\r\n\r\ntest \\ \\\\ ; \xc3\xb6 " = ;\r\n-----------------------------3697486332756351920303607403--\r\n',
        )
        chrome_129 = (
            "----WebKitFormBoundary9duA54BXJUGUymtb",
            b'------WebKitFormBoundary9duA54BXJUGUymtb\r\nContent-Disposition: form-data; name="test \\ \\\\ ; \xc3\xb6 %22 = ;"; filename="test \\ \\\\ ; \xc3\xb6 %22 = ;"\r\nContent-Type: application/octet-stream\r\n\r\ntest \\ \\\\ ; \xc3\xb6 " = ;\r\n------WebKitFormBoundary9duA54BXJUGUymtb--\r\n',
        )

        for boundary, payload in (firefox_131, chrome_129):
            print(repr(boundary))
            print(repr(payload))
            self.reset(boundary=boundary, strict=True, header_charset='utf8')
            self.parse(payload)
            segment, content = self.get_segment(expected)
            self.assertEqual(segment.name, expected)
            self.assertEqual(segment.filename, expected)
            self.assertEqual(content, expected.encode("utf8"))

2 changes: 1 addition & 1 deletion test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def write_header(self, header, value, **opts):
line = to_bytes(header) + b': ' + to_bytes(value)
for opt, val in opts.items():
if val is not None:
line += b"; " + to_bytes(opt) + b'=' + to_bytes(multipart.header_quote(val))
line += b"; " + to_bytes(opt) + b'=' + to_bytes(multipart.content_disposition_quote(val))
self.write(line + b'\r\n')

def write_field(self, name, data, filename=None, content_type=None):
Expand Down

0 comments on commit 2b75a5a

Please sign in to comment.