Harden tar extraction in the data generators against path traversal.

Each extractall() call site is wrapped in a safe_extract() helper that checks
every archive member with is_within_directory() before extracting. The
containment test uses os.path.commonpath(), which compares whole path
components; os.path.commonprefix() compares characters and would accept a
sibling such as "<dir>_x" as being inside "<dir>".

NOTE(review): this patch was reflowed from a whitespace-collapsed copy. The
indentation and blank lines of the context lines are reconstructed from the
hunk line counts and an assumed 2-space indent; confirm that it applies
cleanly. The post-image hashes on the "index" lines predate the edited helper
bodies; regenerate them if they matter.

diff --git a/tensor2tensor/data_generators/algorithmic_math_two_variables.py b/tensor2tensor/data_generators/algorithmic_math_two_variables.py
index 18d7704ca..857a6d776 100644
--- a/tensor2tensor/data_generators/algorithmic_math_two_variables.py
+++ b/tensor2tensor/data_generators/algorithmic_math_two_variables.py
@@ -80,7 +80,26 @@ def _download_mlu_data(tmp_dir, data_dir):
     f.write(resp.content)
 
   with tarfile.open(file_path, "r:gz") as tar:
-    tar.extractall(tmp_dir)
+    def is_within_directory(directory, target):
+      """Returns True if `target` is `directory` or lies inside it."""
+      abs_directory = os.path.abspath(directory)
+      abs_target = os.path.abspath(target)
+      # commonpath() matches whole path components; commonprefix()
+      # matches characters, so "<dir>_x" would pass as inside "<dir>".
+      common = os.path.commonpath([abs_directory, abs_target])
+      return common == abs_directory
+
+    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+      """Extracts `tar` into `path`, refusing members that escape it."""
+      # Validate every member before anything is written to disk.
+      for member in tar.getmembers():
+        member_path = os.path.join(path, member.name)
+        if not is_within_directory(path, member_path):
+          raise Exception("Attempted Path Traversal in Tar File")
+
+      tar.extractall(path, members, numeric_owner=numeric_owner)
+
+    safe_extract(tar, tmp_dir)
 
   return tmp_dir
 
diff --git a/tensor2tensor/data_generators/audio.py b/tensor2tensor/data_generators/audio.py
index 10c6bd9f2..889d13bf2 100644
--- a/tensor2tensor/data_generators/audio.py
+++ b/tensor2tensor/data_generators/audio.py
@@ -48,7 +48,26 @@ def _get_timit(directory):
   for path in FLAGS.timit_paths.split(","):
     with tf.gfile.GFile(path) as f:
       with tarfile.open(fileobj=f, mode="r:gz") as timit_compressed:
-        timit_compressed.extractall(directory)
+        def is_within_directory(directory, target):
+          """Returns True if `target` is `directory` or lies inside it."""
+          abs_directory = os.path.abspath(directory)
+          abs_target = os.path.abspath(target)
+          # commonpath() matches whole path components; commonprefix()
+          # matches characters, so "<dir>_x" would pass as inside "<dir>".
+          common = os.path.commonpath([abs_directory, abs_target])
+          return common == abs_directory
+
+        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+          """Extracts `tar` into `path`, refusing members that escape it."""
+          # Validate every member before anything is written to disk.
+          for member in tar.getmembers():
+            member_path = os.path.join(path, member.name)
+            if not is_within_directory(path, member_path):
+              raise Exception("Attempted Path Traversal in Tar File")
+
+          tar.extractall(path, members, numeric_owner=numeric_owner)
+
+        safe_extract(timit_compressed, directory)
 
 
 def _collect_data(directory, input_ext, target_ext):
diff --git a/tensor2tensor/data_generators/cnn_dailymail.py b/tensor2tensor/data_generators/cnn_dailymail.py
index 08dbf5123..637f6659a 100644
--- a/tensor2tensor/data_generators/cnn_dailymail.py
+++ b/tensor2tensor/data_generators/cnn_dailymail.py
@@ -83,12 +83,50 @@ def _maybe_download_corpora(tmp_dir, dataset_split):
     cnn_file = generator_utils.maybe_download_from_drive(
         tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
     with tarfile.open(cnn_file, "r:gz") as cnn_tar:
-      cnn_tar.extractall(tmp_dir)
+      def is_within_directory(directory, target):
+        """Returns True if `target` is `directory` or lies inside it."""
+        abs_directory = os.path.abspath(directory)
+        abs_target = os.path.abspath(target)
+        # commonpath() matches whole path components; commonprefix()
+        # matches characters, so "<dir>_x" would pass as inside "<dir>".
+        common = os.path.commonpath([abs_directory, abs_target])
+        return common == abs_directory
+
+      def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+        """Extracts `tar` into `path`, refusing members that escape it."""
+        # Validate every member before anything is written to disk.
+        for member in tar.getmembers():
+          member_path = os.path.join(path, member.name)
+          if not is_within_directory(path, member_path):
+            raise Exception("Attempted Path Traversal in Tar File")
+
+        tar.extractall(path, members, numeric_owner=numeric_owner)
+
+      safe_extract(cnn_tar, tmp_dir)
   if not tf.gfile.Exists(dailymail_finalpath):
     dailymail_file = generator_utils.maybe_download_from_drive(
         tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
     with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
-      dailymail_tar.extractall(tmp_dir)
+      def is_within_directory(directory, target):
+        """Returns True if `target` is `directory` or lies inside it."""
+        abs_directory = os.path.abspath(directory)
+        abs_target = os.path.abspath(target)
+        # commonpath() matches whole path components; commonprefix()
+        # matches characters, so "<dir>_x" would pass as inside "<dir>".
+        common = os.path.commonpath([abs_directory, abs_target])
+        return common == abs_directory
+
+      def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+        """Extracts `tar` into `path`, refusing members that escape it."""
+        # Validate every member before anything is written to disk.
+        for member in tar.getmembers():
+          member_path = os.path.join(path, member.name)
+          if not is_within_directory(path, member_path):
+            raise Exception("Attempted Path Traversal in Tar File")
+
+        tar.extractall(path, members, numeric_owner=numeric_owner)
+
+      safe_extract(dailymail_tar, tmp_dir)
 
   cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
   dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py
index 93b20bfd3..99a1f0183 100644
--- a/tensor2tensor/data_generators/imdb.py
+++ b/tensor2tensor/data_generators/imdb.py
@@ -82,7 +82,26 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split):
     imdb_dir = os.path.join(tmp_dir, "aclImdb")
     if not tf.gfile.Exists(imdb_dir):
       with tarfile.open(download_path, "r:gz") as tar:
-        tar.extractall(tmp_dir)
+        def is_within_directory(directory, target):
+          """Returns True if `target` is `directory` or lies inside it."""
+          abs_directory = os.path.abspath(directory)
+          abs_target = os.path.abspath(target)
+          # commonpath() matches whole path components; commonprefix()
+          # matches characters, so "<dir>_x" would pass as inside "<dir>".
+          common = os.path.commonpath([abs_directory, abs_target])
+          return common == abs_directory
+
+        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+          """Extracts `tar` into `path`, refusing members that escape it."""
+          # Validate every member before anything is written to disk.
+          for member in tar.getmembers():
+            member_path = os.path.join(path, member.name)
+            if not is_within_directory(path, member_path):
+              raise Exception("Attempted Path Traversal in Tar File")
+
+          tar.extractall(path, members, numeric_owner=numeric_owner)
+
+        safe_extract(tar, tmp_dir)
 
     # Generate examples
     train = dataset_split == problem.DatasetSplit.TRAIN
diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py
index 75ab01632..e5bc841bc 100644
--- a/tensor2tensor/data_generators/lm1b.py
+++ b/tensor2tensor/data_generators/lm1b.py
@@ -100,7 +100,26 @@ def _maybe_download_corpus(tmp_dir):
   if not os.path.exists(corpus_filepath):
     generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
     with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
-      corpus_tar.extractall(tmp_dir)
+      def is_within_directory(directory, target):
+        """Returns True if `target` is `directory` or lies inside it."""
+        abs_directory = os.path.abspath(directory)
+        abs_target = os.path.abspath(target)
+        # commonpath() matches whole path components; commonprefix()
+        # matches characters, so "<dir>_x" would pass as inside "<dir>".
+        common = os.path.commonpath([abs_directory, abs_target])
+        return common == abs_directory
+
+      def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+        """Extracts `tar` into `path`, refusing members that escape it."""
+        # Validate every member before anything is written to disk.
+        for member in tar.getmembers():
+          member_path = os.path.join(path, member.name)
+          if not is_within_directory(path, member_path):
+            raise Exception("Attempted Path Traversal in Tar File")
+
+        tar.extractall(path, members, numeric_owner=numeric_owner)
+
+      safe_extract(corpus_tar, tmp_dir)
 
 
 @registry.register_problem
diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py
index 26186a7b6..cce641181 100644
--- a/tensor2tensor/data_generators/ptb.py
+++ b/tensor2tensor/data_generators/ptb.py
@@ -99,7 +99,26 @@ def _maybe_download_corpus(tmp_dir, vocab_type):
           ptb_files += [m.name]
         files += [m]
 
-    tgz.extractall(tmp_dir, members=files)
+    def is_within_directory(directory, target):
+      """Returns True if `target` is `directory` or lies inside it."""
+      abs_directory = os.path.abspath(directory)
+      abs_target = os.path.abspath(target)
+      # commonpath() matches whole path components; commonprefix()
+      # matches characters, so "<dir>_x" would pass as inside "<dir>".
+      common = os.path.commonpath([abs_directory, abs_target])
+      return common == abs_directory
+
+    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+      """Extracts `tar` into `path`, refusing members that escape it."""
+      # Validate every member before anything is written to disk.
+      for member in tar.getmembers():
+        member_path = os.path.join(path, member.name)
+        if not is_within_directory(path, member_path):
+          raise Exception("Attempted Path Traversal in Tar File")
+
+      tar.extractall(path, members, numeric_owner=numeric_owner)
+
+    safe_extract(tgz, tmp_dir, members=files)
 
   if vocab_type == text_problems.VocabType.CHARACTER:
     return ptb_char_files
diff --git a/tensor2tensor/data_generators/vqa.py b/tensor2tensor/data_generators/vqa.py
index cd932845a..47e69fdcd 100644
--- a/tensor2tensor/data_generators/vqa.py
+++ b/tensor2tensor/data_generators/vqa.py
@@ -49,7 +49,26 @@ def _get_vqa_v2_annotations(directory,
   annotation_file = generator_utils.maybe_download_from_drive(
       directory, annotation_filename, annotation_url)
   with tarfile.open(annotation_file, "r:gz") as annotation_tar:
-    annotation_tar.extractall(directory)
+    def is_within_directory(directory, target):
+      """Returns True if `target` is `directory` or lies inside it."""
+      abs_directory = os.path.abspath(directory)
+      abs_target = os.path.abspath(target)
+      # commonpath() matches whole path components; commonprefix()
+      # matches characters, so "<dir>_x" would pass as inside "<dir>".
+      common = os.path.commonpath([abs_directory, abs_target])
+      return common == abs_directory
+
+    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+      """Extracts `tar` into `path`, refusing members that escape it."""
+      # Validate every member before anything is written to disk.
+      for member in tar.getmembers():
+        member_path = os.path.join(path, member.name)
+        if not is_within_directory(path, member_path):
+          raise Exception("Attempted Path Traversal in Tar File")
+
+      tar.extractall(path, members, numeric_owner=numeric_owner)
+
+    safe_extract(annotation_tar, directory)
 
 
 def _get_vqa_v2_image_raw_dataset(directory, image_root_url, image_urls):
@@ -69,7 +88,26 @@ def _get_vqa_v2_image_feature_dataset(
   feature_file = generator_utils.maybe_download_from_drive(
       directory, feature_filename, feature_url)
   with tarfile.open(feature_file, "r:gz") as feature_tar:
-    feature_tar.extractall(directory)
+    def is_within_directory(directory, target):
+      """Returns True if `target` is `directory` or lies inside it."""
+      abs_directory = os.path.abspath(directory)
+      abs_target = os.path.abspath(target)
+      # commonpath() matches whole path components; commonprefix()
+      # matches characters, so "<dir>_x" would pass as inside "<dir>".
+      common = os.path.commonpath([abs_directory, abs_target])
+      return common == abs_directory
+
+    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+      """Extracts `tar` into `path`, refusing members that escape it."""
+      # Validate every member before anything is written to disk.
+      for member in tar.getmembers():
+        member_path = os.path.join(path, member.name)
+        if not is_within_directory(path, member_path):
+          raise Exception("Attempted Path Traversal in Tar File")
+
+      tar.extractall(path, members, numeric_owner=numeric_owner)
+
+    safe_extract(feature_tar, directory)
 
 
 class ImageQuestion2MultilabelProblem(image_utils.ImageProblem):
diff --git a/tensor2tensor/data_generators/yelp_full.py b/tensor2tensor/data_generators/yelp_full.py
index 02d1b826d..f9891e4e5 100644
--- a/tensor2tensor/data_generators/yelp_full.py
+++ b/tensor2tensor/data_generators/yelp_full.py
@@ -82,7 +82,26 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split):
     yelp_dir = os.path.join(tmp_dir, "yelp_review_full_csv")
     if not tf.gfile.Exists(yelp_dir):
       with tarfile.open(download_path, "r:gz") as tar:
-        tar.extractall(tmp_dir)
+        def is_within_directory(directory, target):
+          """Returns True if `target` is `directory` or lies inside it."""
+          abs_directory = os.path.abspath(directory)
+          abs_target = os.path.abspath(target)
+          # commonpath() matches whole path components; commonprefix()
+          # matches characters, so "<dir>_x" would pass as inside "<dir>".
+          common = os.path.commonpath([abs_directory, abs_target])
+          return common == abs_directory
+
+        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+          """Extracts `tar` into `path`, refusing members that escape it."""
+          # Validate every member before anything is written to disk.
+          for member in tar.getmembers():
+            member_path = os.path.join(path, member.name)
+            if not is_within_directory(path, member_path):
+              raise Exception("Attempted Path Traversal in Tar File")
+
+          tar.extractall(path, members, numeric_owner=numeric_owner)
+
+        safe_extract(tar, tmp_dir)
 
     # Generate examples
     train = dataset_split == problem.DatasetSplit.TRAIN
diff --git a/tensor2tensor/data_generators/yelp_polarity.py b/tensor2tensor/data_generators/yelp_polarity.py
index 60d0d7dcd..b6421f542 100644
--- a/tensor2tensor/data_generators/yelp_polarity.py
+++ b/tensor2tensor/data_generators/yelp_polarity.py
@@ -82,7 +82,26 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split):
     yelp_dir = os.path.join(tmp_dir, "yelp_review_polarity_csv")
     if not tf.gfile.Exists(yelp_dir):
       with tarfile.open(download_path, "r:gz") as tar:
-        tar.extractall(tmp_dir)
+        def is_within_directory(directory, target):
+          """Returns True if `target` is `directory` or lies inside it."""
+          abs_directory = os.path.abspath(directory)
+          abs_target = os.path.abspath(target)
+          # commonpath() matches whole path components; commonprefix()
+          # matches characters, so "<dir>_x" would pass as inside "<dir>".
+          common = os.path.commonpath([abs_directory, abs_target])
+          return common == abs_directory
+
+        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+          """Extracts `tar` into `path`, refusing members that escape it."""
+          # Validate every member before anything is written to disk.
+          for member in tar.getmembers():
+            member_path = os.path.join(path, member.name)
+            if not is_within_directory(path, member_path):
+              raise Exception("Attempted Path Traversal in Tar File")
+
+          tar.extractall(path, members, numeric_owner=numeric_owner)
+
+        safe_extract(tar, tmp_dir)
 
     # Generate examples
     train = dataset_split == problem.DatasetSplit.TRAIN