From 3ee124ea23a7ec27365dded27652049c8ddf2fb4 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 29 Jan 2018 16:29:47 +0100 Subject: [PATCH] Add range read funcionality for single file download --- api/handlers/listhandler.py | 71 +++++++- api/util.py | 62 +++++++ .../integration_tests/python/test_download.py | 167 ++++++++++++++++++ tests/unit_tests/python/test_util.py | 34 ++++ 4 files changed, 325 insertions(+), 9 deletions(-) diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py index 8f601e532..3a10830e6 100644 --- a/api/handlers/listhandler.py +++ b/api/handlers/listhandler.py @@ -1,3 +1,4 @@ +import os import bson import copy import datetime @@ -465,8 +466,8 @@ def get(self, cont_name, list_name, **kwargs): with zipfile.ZipFile(f) as zf: self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) self.response.write(zf.open(zip_member).read()) - except zipfile.BadZipfile: - self.abort(400, 'not a zip file') + except zipfile.BadZipfile as e: + self.abort(400, str(e)) except KeyError: self.abort(400, 'zip file contains no such member') # log download if we haven't already for this ticket @@ -483,13 +484,65 @@ def get(self, cont_name, list_name, **kwargs): if signed_url: self.redirect(signed_url) else: - self.response.app_iter = file_system.open(file_path, 'rb') - self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set after setting app_iter - if self.is_true('view'): - self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) - else: - self.response.headers['Content-Type'] = 'application/octet-stream' - self.response.headers['Content-Disposition'] = 'attachment; filename="' + filename + '"' + range_header = self.request.headers.get('Range', '') + try: + ranges = util.parse_range_header(range_header) + + if ranges: + with file_system.open(file_path, 'rb') as f: + for first, last in ranges: + if first > fileinfo['size']-1: + self.abort(416, 'Invalid range') + + if last 
> fileinfo['size'] - 1: + raise util.ParseError('Invalid range') + + mode = os.SEEK_SET + if first < 0: + mode = os.SEEK_END + length = abs(first) + elif last is None: + length = fileinfo['size'] - first + else: + if last > fileinfo['size']: + length = fileinfo['size'] - first + else: + length = last - first + 1 + + f.seek(first, mode) + data = f.read(length) + + if len(ranges) > 1: + self.response.write('--THIS_STRING_SEPARATES\n') + self.response.write('Content-Type: %s\n' % str( + fileinfo.get('mimetype', 'application/octet-stream'))) + self.response.write('Content-Range: %s' % 'bytes %s-%s/%s\n' % (str(first), + str(last), + str(fileinfo['size']))) + self.response.write('\n') + self.response.write(data) + self.response.write('\n') + else: + self.response.headers['Content-Type'] = str( + fileinfo.get('mimetype', 'application/octet-stream')) + self.response.headers['Content-Range'] = 'bytes %s-%s/%s' % (str(first), + str(last), + str(fileinfo['size'])) + self.response.write(data) + + if len(ranges) > 1: + self.response.headers['Content-Type'] = 'multipart/byteranges; boundary=THIS_STRING_SEPARATES' + + self.response.status = 206 + except util.ParseError: + self.response.app_iter = file_system.open(file_path, 'rb') + self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set after setting app_iter + + if self.is_true('view'): + self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) + else: + self.response.headers['Content-Type'] = 'application/octet-stream' + self.response.headers['Content-Disposition'] = 'attachment; filename="' + filename + '"' # log download if we haven't already for this ticket if ticket: diff --git a/api/util.py b/api/util.py index a0239e821..dbd818377 100644 --- a/api/util.py +++ b/api/util.py @@ -17,6 +17,9 @@ from django.conf import settings from django.template import Template, Context +BYTE_RANGE_REGEX = r'^(?P\d+)-(?P\d+)?$' +SUFFIX_BYTE_RANGE_REGEX = r'^(?P-\d+)$' + # If 
class ParseError(ValueError):
    """Exception class representing a string parsing error."""


# ``first-last`` or open-ended ``first-`` byte range, e.g. ``0-499`` or ``500-``.
_BYTE_RANGE_RE = re.compile(r'^(?P<first>\d+)-(?P<last>\d+)?$')
# Suffix byte range, e.g. ``-500``; the sign is kept so callers can tell it apart.
_SUFFIX_BYTE_RANGE_RE = re.compile(r'^(?P<first>-\d+)$')


def parse_range_header(range_header_val, valid_units=('bytes',)):
    """
    Range header parser according to RFC7233

    https://tools.ietf.org/html/rfc7233

    :param range_header_val: raw value of the ``Range`` header,
        e.g. ``'bytes=0-99, 200-'``
    :param valid_units: iterable of accepted range units
    :return: list of ``(first, last)`` tuples in header order, where
        ``'a-b'`` gives ``(a, b)``, ``'a-'`` gives ``(a, None)`` and the
        suffix form ``'-n'`` gives ``(-n, None)``
    :raises ParseError: if the value is empty, is not ``unit=ranges``, uses
        a unit outside ``valid_units``, contains a malformed range, or has a
        range whose first position is greater than its last
    """
    if not range_header_val:
        raise ParseError('Invalid range header syntax')

    split_range_header_val = range_header_val.split('=')
    if len(split_range_header_val) != 2:
        raise ParseError('Invalid range header syntax')

    unit, ranges_str = split_range_header_val

    if unit not in valid_units:
        raise ParseError('Invalid unit specified')

    ranges = []

    # Whitespace after the comma is optional in a range set, so split on the
    # bare comma and trim each element instead of requiring ', '.
    for range_str in ranges_str.split(','):
        range_str = range_str.strip(' \t')

        re_match = _BYTE_RANGE_RE.match(range_str)
        if re_match:
            # The patterns only admit digits, so int() cannot fail here.
            first = int(re_match.group('first'))
            last = re_match.group('last')
            if last is not None:
                last = int(last)
        else:
            re_match = _SUFFIX_BYTE_RANGE_RE.match(range_str)
            if not re_match:
                raise ParseError('Invalid range format')
            first, last = int(re_match.group('first')), None

        if last is not None and first > last:
            raise ParseError('Invalid range, first %s can\'t be greater than the last %s' % (unit, unit))

        ranges.append((first, last))

    return ranges
def test_filelist_range_download(data_builder, as_admin, file_form):
    """Check ``Range`` header handling when downloading a single file.

    Uploads a nine byte file and then requests it with valid ranges, with
    ranges the endpoint answers by returning the whole file, with a range
    starting past the end of the file, and with a multi-range request.
    """
    whole_file = '123456789'
    session = data_builder.create_session()
    session_files = '/sessions/' + session + '/files'
    file_url = session_files + '/one.csv'
    as_admin.post(session_files, files=file_form(('one.csv', whole_file)))

    def ranged_get(range_value):
        # A new download ticket is requested before every ranged download.
        ticket_response = as_admin.get(file_url, params={'ticket': ''})
        assert ticket_response.ok
        return as_admin.get(file_url,
                            params={'ticket': ticket_response.json()['ticket']},
                            headers={'Range': range_value})

    # Valid single ranges: the response holds exactly the requested bytes.
    satisfiable = [
        ('bytes=0-', '123456789'),  # from byte 0 to the end of the file
        ('bytes=0-0', '1'),         # first byte only
        ('bytes=0-1', '12'),        # first two bytes
        ('bytes=-5', '56789'),      # last five bytes
    ]
    for range_value, expected_content in satisfiable:
        r = ranged_get(range_value)
        assert r.ok
        assert r.content == expected_content

    # Requests that are answered with the whole file and a plain 200.
    whole_file_before_416 = [
        'lol=-5',       # invalid unit
        'bytes=0-500',  # last byte is greater than the size of the file
    ]
    for range_value in whole_file_before_416:
        r = ranged_get(range_value)
        assert r.status_code == 200
        assert r.content == whole_file

    # First byte is greater than the size of the file.
    r = ranged_get('bytes=500-')
    assert r.status_code == 416

    whole_file_after_416 = [
        'bytes=-',      # neither position given
        'bytes=10-5',   # first byte is greater than the last one
        'bytes=r-0',    # first byte is not a number
        'bytes=0-bb',   # last byte is not a number
        'bytes=1+5',    # invalid range syntax
        'bytes-1+5',    # invalid range header syntax
    ]
    for range_value in whole_file_after_416:
        r = ranged_get(range_value)
        assert r.status_code == 200
        assert r.content == whole_file

    # Multiple ranges come back as one multipart body.
    r = ranged_get('bytes=1-2, 3-4')
    expected_multipart = ''.join([
        '--THIS_STRING_SEPARATES\n',
        'Content-Type: text/csv\n',
        'Content-Range: bytes 1-2/9\n\n',
        '23\n',
        '--THIS_STRING_SEPARATES\n',
        'Content-Type: text/csv\n',
        'Content-Range: bytes 3-4/9\n\n',
        '45\n',
    ])
    assert r.content == expected_multipart
import pytest

from api import util


# (range header content, expected output) pairs; an expected output of
# util.ParseError means that parsing has to fail with that exception.
PARSE_RANGE_HEADER_CASES = [
    ('bytes=1-5', [(1, 5)]),
    ('bytes=-5', [(-5, None)]),
    ('bytes=5-', [(5, None)]),
    ('bytes=-', util.ParseError),
    ('bytes=3', util.ParseError),
    ('bytes=a-b', util.ParseError),
    ('by-', util.ParseError),
    ('bytes=5+5', util.ParseError),
    ('bytes=5=', util.ParseError),
    ('b=1-5', util.ParseError),
    ('bytes=1-5, 6-10, 10-15', [(1, 5), (6, 10), (10, 15)]),
    ('bytes=-5, 6-, 10-15', [(-5, None), (6, None), (10, 15)]),
    ('bytes=15, 6-10, 10-15', util.ParseError),
    ('bytes=15, -6--10, 10-15', util.ParseError),
    ('bytes=1-5; 6-10; 10-15', util.ParseError),
]


@pytest.fixture(scope='function', params=PARSE_RANGE_HEADER_CASES)
def parse_range_header_fixture(request):
    """Yield one (header, expected_output) pair per parametrized run."""
    header, expected_output = request.param
    return header, expected_output


def test_parse_range_header(parse_range_header_fixture):
    """Parse the header and compare with the expected ranges or error."""
    header, expected_output = parse_range_header_fixture

    if expected_output is not util.ParseError:
        assert util.parse_range_header(header) == expected_output
        return

    with pytest.raises(expected_output):
        util.parse_range_header(header)


def test_hrsize():
    assert util.hrsize(999) == '999B'