Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #2446: Abide by file size and number of files limits in shapefile processing #2450

Merged
merged 3 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dataverse-persistence/src/main/resources/Bundle_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2207,7 +2207,7 @@ ingest.error.DB_FAIL=Failed to save the tabular file produced by the ingest.
ingest.error.DB_FAIL_WITH_TAB_PRODUCED=Ingest produced tabular data, but failed to save it in the database; {0} No further information is available.
ingest.error.WRONG_HEADER=reading failure: wrong headerId(Z) here
ingest.error.UNZIP_FAIL=Failed to unzip the file.
ingest.error.UNZIP_SIZE_FAIL=One of the unzipped files exceeds the size limit resorting to saving the file as is, unzipped.
ingest.error.UNZIP_SIZE_FAIL=One of the unzipped files exceeds the size limit resorting to saving the file as is, zipped.
ingest.error.UNZIP_FILE_LIMIT_FAIL=The number of files in the ZIP archive exceeds the limit of {0}. The archive will not be extracted and will be uploaded in its original ZIP format. If you want to add individual files to the dataset, please upload a ZIP archive with fewer files.
ingest.error.GENERAL_TOO_MANY_VARIABLES=Input file has too many variables (columns): {1}. Maximal allowed number of variables is {0}.
ingest.error.UNKNOWN_ERROR=Unknown error occurred during ingest.
Expand Down Expand Up @@ -2466,7 +2466,7 @@ dataset.file.uploadWorked=upload worked
dataset.file.uploadBatchTooBig=The batch size exceeds the limit of {0}.
dataset.file.zip.unpack.failure=Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is.
dataset.file.zip.unzip.failure=Failed to unzip the file. Saving the file as is.
dataset.file.zip.uploadFileSizeLimit.exceeded=One of the unzipped files exceeds the size limit resorting to saving the file as is, unzipped.
dataset.file.zip.uploadFileSizeLimit.exceeded=One of the unzipped files exceeds the size limit resorting to saving the file as is, zipped.

#EmailValidator.java
email.invalid=is not a valid email address.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import static edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestReport.createIngestFailureReport;
import static edu.harvard.iq.dataverse.util.FileUtil.calculateChecksum;
Expand Down Expand Up @@ -148,7 +147,7 @@ private List<DataFile> createDataFiles(Path tempFile, String fileName, String su

} else if (finalType.equals(ShapefileHandler.SHAPEFILE_FILE_TYPE)) {
try {
return createDataFilesFromReshapedShapeFile(tempFile, fileSizeLimit);
return createDataFilesFromReshapedShapeFile(tempFile, fileSizeLimit, zipFileUnpackFilesLimit);
} catch (FileExceedsMaxSizeException femsx) {
logger.error("One of the unzipped shape files exceeded the size limit; giving up. " + femsx.getMessage());
throw new IOException("One of the unzipped shape files exceeded the size limit", femsx);
Expand Down Expand Up @@ -314,8 +313,9 @@ private Tuple2<String, String> extractDirectoryAndFileName(ZipEntry zipEntry) {
* Shape files may have to be split into multiple files,
* one zip archive per each complete set of shape files.
*/
private List<DataFile> createDataFilesFromReshapedShapeFile(Path tempFile, Long fileSizeLimit) throws IOException {
try (IngestServiceShapefileHelper shpHelper = new IngestServiceShapefileHelper(tempFile.toFile(), Paths.get(getFilesTempDirectory()).toFile())) {
private List<DataFile> createDataFilesFromReshapedShapeFile(Path tempFile, Long fileSizeLimit, Long zipFileUnpackFilesLimit) throws IOException {
try (IngestServiceShapefileHelper shpHelper = new IngestServiceShapefileHelper(tempFile.toFile(), Paths.get(getFilesTempDirectory()).toFile(),
fileSizeLimit, zipFileUnpackFilesLimit)) {
List<DataFile> datafiles = new ArrayList<>();

for (File finalFile : shpHelper.processFile()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.datafile;

import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.JhoveFileType;
import edu.harvard.iq.dataverse.util.ShapefileHandler;
import org.apache.commons.io.IOUtils;
Expand All @@ -13,6 +14,7 @@
import javax.activation.MimetypesFileTypeMap;
import javax.ejb.EJBException;
import javax.enterprise.context.ApplicationScoped;
import javax.inject.Inject;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
Expand Down Expand Up @@ -42,6 +44,9 @@ public class FileTypeDetector {

private static final MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap();

@Inject
protected SettingsServiceBean settingsService;

/**
* Detects file type based on file content and filename
*/
Expand Down Expand Up @@ -102,7 +107,10 @@ public String determineFileType(File f, String fileName) throws IOException {
// Is this a zipped Shapefile?
// Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile
try {
ShapefileHandler shapefileHandler = new ShapefileHandler(f);
Long fileSizeLimit = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.MaxFileUploadSizeInBytes);
Long zipFileUnpackFilesLimit = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.ZipUploadFilesLimit);

ShapefileHandler shapefileHandler = new ShapefileHandler(f, fileSizeLimit, zipFileUnpackFilesLimit);
madryk marked this conversation as resolved.
Show resolved Hide resolved
if (shapefileHandler.containsShapefile()) {
fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE;
}
Expand Down Expand Up @@ -199,4 +207,10 @@ private boolean isGraphMLFile(File file) {
logger.debug("end isGraphML()");
return isGraphML;
}

/**
 * Test-only hook: replaces the {@code settingsService} field, which is normally
 * populated through {@code @Inject}, so that a detector created with {@code new}
 * can be given a settings service by hand.
 *
 * @param settingsService settings service to store in this detector
 * @return this detector, allowing the call to be chained onto the constructor
 */
FileTypeDetector withSettingsService(SettingsServiceBean settingsService) {
this.settingsService = settingsService;
return this;
}
madryk marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,24 @@ public class IngestServiceShapefileHelper implements Closeable {
private final File zippedShapefile;
private final File reZipFolder;
private final File unZipFolder;
private final Long fileSizeLimit;
private final Long zipFileUnpackFilesLimit;

// -------------------- CONSTRUCTOR --------------------

/**
* Constructor that accepts a file object
*/
public IngestServiceShapefileHelper(File zippedShapefile, File workingFolderBase) {
public IngestServiceShapefileHelper(File zippedShapefile, File workingFolderBase, Long fileSizeLimit, Long zipFileUnpackFilesLimit) {
Preconditions.checkArgument(isValidFile(zippedShapefile));
Preconditions.checkArgument(isValidFolder(workingFolderBase));

this.zippedShapefile = zippedShapefile;
String id = new SimpleDateFormat("yyyy-MM-dd-hh-mm-ss-SSS").format(new Date());
this.reZipFolder = getShapefileUnzipTempDirectory(workingFolderBase, "shp_" + id + "_rezip");
this.unZipFolder = getShapefileUnzipTempDirectory(workingFolderBase, "shp_" + id + "_unzip");
this.fileSizeLimit = fileSizeLimit;
this.zipFileUnpackFilesLimit = zipFileUnpackFilesLimit;

}

Expand All @@ -55,7 +59,7 @@ public List<File> processFile() {
try {
// (1) Use the ShapefileHandler to check the .zip for a shapefile
//
ShapefileHandler shpHandler = new ShapefileHandler(zippedShapefile);
ShapefileHandler shpHandler = new ShapefileHandler(zippedShapefile, fileSizeLimit, zipFileUnpackFilesLimit);
if (!shpHandler.containsShapefile()) {
logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here");
throw new IllegalStateException("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,6 @@ public boolean ingestAsTabular(Long datafile_id) {
}

if (forceTypeCheck) {
FileTypeDetector fileTypeDetector = new FileTypeDetector();
String newType = fileTypeDetector.detectTabularFileType(localFile.get(), dataFile.getContentType());

ingestPlugin = getTabDataReaderByMimeType(newType);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package edu.harvard.iq.dataverse.util;

import edu.harvard.iq.dataverse.common.BundleUtil;
import edu.harvard.iq.dataverse.common.files.mime.ShapefileMimeType;
import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException;
import edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestError;
import edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
Expand All @@ -12,6 +16,7 @@
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand All @@ -28,6 +33,8 @@
import java.util.stream.Stream;
import java.util.zip.ZipEntry;

import static edu.harvard.iq.dataverse.common.FileSizeUtil.bytesToHumanReadable;

/**
* Used to identify, "repackage", and extract data from Shapefiles in .zip format
* <p>
Expand Down Expand Up @@ -89,10 +96,10 @@ public class ShapefileHandler {

// -------------------- CONSTRUCTOR --------------------

public ShapefileHandler(File zipFile) {
public ShapefileHandler(File zipFile, Long fileSizeLimit, Long zipFileUnpackFilesLimit) {
this.zipfile = zipFile;

examineZipFile();
examineZipFile(fileSizeLimit, zipFileUnpackFilesLimit);
}

// -------------------- GETTERS --------------------
Expand Down Expand Up @@ -346,7 +353,7 @@ private boolean isFileToSkip(String fname) {
* Iterate through the zip file contents.
* Does it contain any shapefiles?
*/
private void examineZipFile() {
private void examineZipFile(Long fileSizeLimit, Long zipFileUnpackFilesLimit) {
if (zipfile == null || !zipfile.isFile()) {
throw new IllegalArgumentException("Invalid zip file: " + zipfile);
}
Expand All @@ -365,7 +372,20 @@ private void examineZipFile() {
if (fileNamesInZip.contains(unzipFileName)) {
throw new IllegalStateException("Found file-name collision: " + unzipFileName);
}

if (fileSizeLimit != null && zipFileEntry.getSize() >= fileSizeLimit) {
throw new FileExceedsMaxSizeException(fileSizeLimit,
MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"),
bytesToHumanReadable(fileSizeLimit)));
madryk marked this conversation as resolved.
Show resolved Hide resolved
}

fileNamesInZip.add(unzipFileName);

if (zipFileUnpackFilesLimit != null && fileNamesInZip.size() >= zipFileUnpackFilesLimit) {
logger.log(Level.WARNING, "Zip upload - too many files.");
throw new IngestException(IngestError.UNZIP_FILE_LIMIT_FAIL);
}

updateFileGroupHash(unzipFileName);
}
} catch (IOException ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ void createDataFiles_shouldRezipShapefiles() throws IOException {
byte[] zipBytes = UnitTestUtils.readFileToByteArray("jhove/fake_shapefile.zip");

lenient().when(settingsService.getValueForKeyAsLong(Key.MaxFileUploadSizeInBytes)).thenReturn(1024*1024L);
lenient().when(settingsService.getValueForKeyAsLong(Key.ZipUploadFilesLimit)).thenReturn(100L);
lenient().when(settingsService.getValueForKey(Key.FileFixityChecksumAlgorithm)).thenReturn("MD5");
lenient().when(fileTypeDetector.determineFileType(any(), any())).thenReturn("application/zipped-shapefile");

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.datafile;

import edu.harvard.iq.dataverse.UnitTestUtils;
import edu.harvard.iq.dataverse.engine.TestSettingsServiceBean;
import edu.harvard.iq.dataverse.util.JhoveConfigurationInitializer;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.io.TempDir;
Expand All @@ -14,7 +15,8 @@

public class FileTypeDetectorTest {

private FileTypeDetector fileTypeDetector = new FileTypeDetector();
private FileTypeDetector fileTypeDetector = new FileTypeDetector()
.withSettingsService(new TestSettingsServiceBean());

@TempDir
File tempDir;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
package edu.harvard.iq.dataverse.util;

import com.google.common.collect.ImmutableMap;
import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException;
import edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestException;
import org.jetbrains.annotations.NotNull;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.testcontainers.shaded.org.apache.commons.lang3.RandomStringUtils;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand All @@ -34,7 +41,7 @@ public void testCreateZippedNonShapefile() throws IOException {
List<String> file_names = Arrays.asList("not-quite-a-shape.shp", "not-quite-a-shape.shx", "not-quite-a-shape.dbf", "not-quite-a-shape.pdf"); //, "prj");
File zipfile_obj = createAndZipFiles(file_names, "not-quite-a-shape.zip");

ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj);
ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj);

assertThat(shp_handler.containsShapefile()).isFalse();

Expand All @@ -55,7 +62,7 @@ public void testZippedTwoShapefiles() throws IOException {

File zipfile_obj = createAndZipFiles(file_names, "two-shapes.zip");

ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj);
ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj);

assertThat(shp_handler.containsShapefile()).isTrue();

Expand All @@ -76,7 +83,8 @@ public void testZipped__duplicate() throws IOException {

File zipfile_obj = createAndZipFiles(file_names, "duplicate_file.zip");

assertThatThrownBy(() -> new ShapefileHandler(zipfile_obj)).hasMessage("Found file-name collision: shape2.pdf");
assertThatThrownBy(() -> newShapeFileHandler(zipfile_obj))
.hasMessage("Found file-name collision: shape2.pdf");
}

@Test
Expand All @@ -91,7 +99,7 @@ public void testZippedTwoShapefiles_reshape() throws IOException {
File test_rezip_folder = this.tempFolder.newFolder("test_rezip").getAbsoluteFile();


ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj);
ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj);
shp_handler.reZipShapefileSets(test_unzip_folder, test_rezip_folder);

assertThat(test_unzip_folder.list().length).isEqualTo(0);
Expand All @@ -106,7 +114,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException {
List<String> file_names = Arrays.asList("shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.pdf", "shape1.cpg", "shape1." + SHP_XML_EXTENSION, "README.md", "shape_notes.txt");
File zipfile_obj = createAndZipFiles(file_names, "shape-plus.zip");

ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj);
ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj);

assertThat(shp_handler.containsShapefile()).isTrue();

Expand All @@ -122,6 +130,37 @@ public void testZippedShapefileWithExtraFiles() throws IOException {
.containsExactly("txt");
}

@Test
public void testZippedShapefile__too_many_files() throws IOException {
    // Archive with nine entries, examined with a files limit of eight.
    List<String> namesToZip = Arrays.asList(
            "shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.pdf",
            "shape1.cpg", "shape1.shp.xml", "README.md", "shape_notes.txt");
    File archive = createAndZipFiles(namesToZip, "shape-plus.zip");

    Long sizeLimit = 1024L;
    Long filesLimit = 8L;

    // The handler examines the archive in its constructor, so construction itself must fail.
    assertThatThrownBy(() -> new ShapefileHandler(archive, sizeLimit, filesLimit))
            .isInstanceOf(IngestException.class)
            .hasMessage("There was a problem during ingest. Passing error key UNZIP_FILE_LIMIT_FAIL to report.");
}

@Test
public void testZippedShapefile__too_big_files() throws IOException {
    // Deterministic 2048-character ASCII payload, i.e. 2048 bytes in UTF-8, which is
    // above the 1024-byte size limit passed to the handler below. Built with the JDK
    // only, so the test neither depends on random data nor on a class shaded inside
    // an unrelated test library.
    String oversizedContent = new String(new char[2048]).replace('\0', 'x');

    Map<String, String> files = ImmutableMap.<String, String>builder()
            .put("shape1.shp", "")
            .put("shape1.shx", "")
            .put("shape1.dbf", "")
            .put("shape1.prj", "")
            .put("shape1.pdf", oversizedContent)
            .put("shape1.cpg", "")
            .put("shape1.shp.xml", "")
            .put("README.md", "")
            .put("shape_notes.txt", "")
            .build();
    File zipfile_obj = createAndZipFiles(files, "shape-plus.zip");

    // The handler examines the archive in its constructor, so construction itself must fail.
    assertThatThrownBy(() -> new ShapefileHandler(zipfile_obj, 1024L, 100L))
            .isInstanceOf(FileExceedsMaxSizeException.class)
            .hasMessage("This file size exceeds the size limit of 1.0 KB.");
}

@Test
public void testZippedShapefileWithExtraFiles_reshape() throws IOException {

Expand All @@ -130,7 +169,7 @@ public void testZippedShapefileWithExtraFiles_reshape() throws IOException {
File unzip2Folder = this.tempFolder.newFolder("test_unzip2").getAbsoluteFile();
File rezip2Folder = this.tempFolder.newFolder("test_rezip2").getAbsoluteFile();

ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj);
ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj);
shp_handler.reZipShapefileSets(unzip2Folder, rezip2Folder);

assertThat(unzip2Folder.list()).isEmpty();
Expand All @@ -145,13 +184,17 @@ public void testZippedShapefileWithExtraFiles_reshape() throws IOException {
* @param zipFileName - Name of .zip file to create
*/
private File createAndZipFiles(List<String> fileNamesToZip, String zipFileName) throws IOException {
    // Each listed name becomes an entry with empty content; the LinkedHashMap keeps
    // the names in the order they were given.
    Map<String, String> emptyContentByName = new LinkedHashMap<>();
    for (String name : fileNamesToZip) {
        emptyContentByName.put(name, "");
    }
    return createAndZipFiles(emptyContentByName, zipFileName);
}

private File createAndZipFiles(Map<String, String> filesToZip, String zipFileName) throws IOException {
File zipFileObj = this.tempFolder.newFile(zipFileName);

try (ZipOutputStream zip_stream = new ZipOutputStream(Files.newOutputStream(zipFileObj.toPath()))) {

for (String fileName : fileNamesToZip) {
this.addToZipFile(fileName, "".getBytes(StandardCharsets.UTF_8), zip_stream);
try (ZipOutputStream zip_stream = new ZipOutputStream(Files.newOutputStream(zipFileObj.toPath()))) {
for (Map.Entry<String, String> entry : filesToZip.entrySet()) {
this.addToZipFile(entry.getKey(), entry.getValue().getBytes(StandardCharsets.UTF_8), zip_stream);
}
}

Expand All @@ -167,6 +210,9 @@ private void addToZipFile(String fileName, byte[] fileAsBytesToZip, ZipOutputStr
zipOutputStream.closeEntry();
}

private @NotNull ShapefileHandler newShapeFileHandler(File zipfile_obj) {
madryk marked this conversation as resolved.
Show resolved Hide resolved
return new ShapefileHandler(zipfile_obj, 1024L, 100L);
}
}


Expand Down
Loading