diff --git a/dataverse-persistence/src/main/resources/Bundle_en.properties b/dataverse-persistence/src/main/resources/Bundle_en.properties index 98ca5c4b87..7b05ef478d 100755 --- a/dataverse-persistence/src/main/resources/Bundle_en.properties +++ b/dataverse-persistence/src/main/resources/Bundle_en.properties @@ -2217,7 +2217,7 @@ ingest.error.DB_FAIL=Failed to save the tabular file produced by the ingest. ingest.error.DB_FAIL_WITH_TAB_PRODUCED=Ingest produced tabular data, but failed to save it in the database; {0} No further information is available. ingest.error.WRONG_HEADER=reading failure: wrong headerId(Z) here ingest.error.UNZIP_FAIL=Failed to unzip the file. -ingest.error.UNZIP_SIZE_FAIL=One of the unzipped files exceeds the size limit resorting to saving the file as is, unzipped. +ingest.error.UNZIP_SIZE_FAIL=One of the unzipped files exceeds the size limit resorting to saving the file as is, zipped. ingest.error.UNZIP_FILE_LIMIT_FAIL=The number of files in the ZIP archive exceeds the limit of {0}. The archive will not be extracted and will be uploaded in its original ZIP format. If you want to add individual files to the dataset, please upload a ZIP archive with fewer files. ingest.error.GENERAL_TOO_MANY_VARIABLES=Input file has too many variables (columns): {1}. Maximal allowed number of variables is {0}. ingest.error.UNKNOWN_ERROR=Unknown error occurred during ingest. @@ -2476,7 +2476,7 @@ dataset.file.uploadWorked=upload worked dataset.file.uploadBatchTooBig=The batch size exceeds the limit of {0}. dataset.file.zip.unpack.failure=Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is. dataset.file.zip.unzip.failure=Failed to unzip the file. Saving the file as is. -dataset.file.zip.uploadFileSizeLimit.exceeded=One of the unzipped files exceeds the size limit resorting to saving the file as is, unzipped. 
+dataset.file.zip.uploadFileSizeLimit.exceeded=One of the unzipped files exceeds the size limit resorting to saving the file as is, zipped. #EmailValidator.java email.invalid=is not a valid email address. diff --git a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/DataFileCreator.java b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/DataFileCreator.java index 31fc0806bf..602a6acbf7 100644 --- a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/DataFileCreator.java +++ b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/DataFileCreator.java @@ -40,7 +40,6 @@ import java.util.List; import java.util.zip.GZIPInputStream; import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; import static edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestReport.createIngestFailureReport; import static edu.harvard.iq.dataverse.util.FileUtil.calculateChecksum; @@ -148,7 +147,7 @@ private List createDataFiles(Path tempFile, String fileName, String su } else if (finalType.equals(ShapefileHandler.SHAPEFILE_FILE_TYPE)) { try { - return createDataFilesFromReshapedShapeFile(tempFile, fileSizeLimit); + return createDataFilesFromReshapedShapeFile(tempFile, fileSizeLimit, zipFileUnpackFilesLimit); } catch (FileExceedsMaxSizeException femsx) { logger.error("One of the unzipped shape files exceeded the size limit; giving up. " + femsx.getMessage()); throw new IOException("One of the unzipped shape files exceeded the size limit", femsx); @@ -314,8 +313,9 @@ private Tuple2 extractDirectoryAndFileName(ZipEntry zipEntry) { * Shape files may have to be split into multiple files, * one zip archive per each complete set of shape files. 
*/ - private List createDataFilesFromReshapedShapeFile(Path tempFile, Long fileSizeLimit) throws IOException { - try (IngestServiceShapefileHelper shpHelper = new IngestServiceShapefileHelper(tempFile.toFile(), Paths.get(getFilesTempDirectory()).toFile())) { + private List createDataFilesFromReshapedShapeFile(Path tempFile, Long fileSizeLimit, Long zipFileUnpackFilesLimit) throws IOException { + try (IngestServiceShapefileHelper shpHelper = new IngestServiceShapefileHelper(tempFile.toFile(), Paths.get(getFilesTempDirectory()).toFile(), + fileSizeLimit, zipFileUnpackFilesLimit)) { List datafiles = new ArrayList<>(); for (File finalFile : shpHelper.processFile()) { diff --git a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/FileTypeDetector.java b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/FileTypeDetector.java index 48ee0c8438..cf768d960f 100644 --- a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/FileTypeDetector.java +++ b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/FileTypeDetector.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.datafile; import edu.harvard.iq.dataverse.ingest.IngestableDataChecker; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.JhoveFileType; import edu.harvard.iq.dataverse.util.ShapefileHandler; import org.apache.commons.io.IOUtils; @@ -13,6 +14,7 @@ import javax.activation.MimetypesFileTypeMap; import javax.ejb.EJBException; import javax.enterprise.context.ApplicationScoped; +import javax.inject.Inject; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; @@ -42,6 +44,18 @@ public class FileTypeDetector { private static final MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap(); + protected SettingsServiceBean settingsService; + + protected FileTypeDetector() { + } + + @Inject + public FileTypeDetector(SettingsServiceBean 
settingsService) { + this.settingsService = settingsService; + } + + // -------------------- CONSTRUCTOR -------------------- + /** * Detects file type based on file content and filename */ @@ -102,7 +116,10 @@ public String determineFileType(File f, String fileName) throws IOException { // Is this a zipped Shapefile? // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile try { - ShapefileHandler shapefileHandler = new ShapefileHandler(f); + Long fileSizeLimit = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.MaxFileUploadSizeInBytes); + Long zipFileUnpackFilesLimit = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.ZipUploadFilesLimit); + + ShapefileHandler shapefileHandler = new ShapefileHandler(f, fileSizeLimit, zipFileUnpackFilesLimit); if (shapefileHandler.containsShapefile()) { fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; } diff --git a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/IngestServiceShapefileHelper.java b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/IngestServiceShapefileHelper.java index 8763391ca3..17a82d15d8 100644 --- a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/IngestServiceShapefileHelper.java +++ b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/IngestServiceShapefileHelper.java @@ -32,13 +32,15 @@ public class IngestServiceShapefileHelper implements Closeable { private final File zippedShapefile; private final File reZipFolder; private final File unZipFolder; + private final Long fileSizeLimit; + private final Long zipFileUnpackFilesLimit; // -------------------- CONSTRUCTOR -------------------- /** * Constructor that accepts a file object */ - public IngestServiceShapefileHelper(File zippedShapefile, File workingFolderBase) { + public IngestServiceShapefileHelper(File zippedShapefile, File workingFolderBase, Long fileSizeLimit, Long zipFileUnpackFilesLimit) { 
Preconditions.checkArgument(isValidFile(zippedShapefile)); Preconditions.checkArgument(isValidFolder(workingFolderBase)); @@ -46,6 +48,8 @@ public IngestServiceShapefileHelper(File zippedShapefile, File workingFolderBase String id = new SimpleDateFormat("yyyy-MM-dd-hh-mm-ss-SSS").format(new Date()); this.reZipFolder = getShapefileUnzipTempDirectory(workingFolderBase, "shp_" + id + "_rezip"); this.unZipFolder = getShapefileUnzipTempDirectory(workingFolderBase, "shp_" + id + "_unzip"); + this.fileSizeLimit = fileSizeLimit; + this.zipFileUnpackFilesLimit = zipFileUnpackFilesLimit; } @@ -55,7 +59,7 @@ public List processFile() { try { // (1) Use the ShapefileHandler to the .zip for a shapefile // - ShapefileHandler shpHandler = new ShapefileHandler(zippedShapefile); + ShapefileHandler shpHandler = new ShapefileHandler(zippedShapefile, fileSizeLimit, zipFileUnpackFilesLimit); if (!shpHandler.containsShapefile()) { logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); throw new IllegalStateException("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); diff --git a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 39b68504ac..0e07b44ed7 100644 --- a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -482,7 +482,6 @@ public boolean ingestAsTabular(Long datafile_id) { } if (forceTypeCheck) { - FileTypeDetector fileTypeDetector = new FileTypeDetector(); String newType = fileTypeDetector.detectTabularFileType(localFile.get(), dataFile.getContentType()); ingestPlugin = getTabDataReaderByMimeType(newType); diff --git a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java 
b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java index 3e98e18840..c8908d682a 100644 --- a/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java +++ b/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @@ -1,6 +1,10 @@ package edu.harvard.iq.dataverse.util; +import edu.harvard.iq.dataverse.common.BundleUtil; import edu.harvard.iq.dataverse.common.files.mime.ShapefileMimeType; +import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException; +import edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestError; +import edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; @@ -12,6 +16,7 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -28,6 +33,8 @@ import java.util.stream.Stream; import java.util.zip.ZipEntry; +import static edu.harvard.iq.dataverse.common.FileSizeUtil.bytesToHumanReadable; + /** * Used to identify, "repackage", and extract data from Shapefiles in .zip format *

@@ -89,10 +96,10 @@ public class ShapefileHandler { // -------------------- CONSTRUCTOR -------------------- - public ShapefileHandler(File zipFile) { + public ShapefileHandler(File zipFile, Long fileSizeLimit, Long zipFileUnpackFilesLimit) { this.zipfile = zipFile; - examineZipFile(); + examineZipFile(fileSizeLimit, zipFileUnpackFilesLimit); } // -------------------- GETTERS -------------------- @@ -346,7 +353,7 @@ private boolean isFileToSkip(String fname) { * Iterate through the zip file contents. * Does it contain any shapefiles? */ - private void examineZipFile() { + private void examineZipFile(Long fileSizeLimit, Long zipFileUnpackFilesLimit) { if (zipfile == null || !zipfile.isFile()) { throw new IllegalArgumentException("Invalid zip file: " + zipfile); } @@ -365,7 +372,18 @@ private void examineZipFile() { if (fileNamesInZip.contains(unzipFileName)) { throw new IllegalStateException("Found file-name collision: " + unzipFileName); } + + if (fileSizeLimit != null && zipFileEntry.getSize() > fileSizeLimit) { + throw new IngestException(IngestError.UNZIP_SIZE_FAIL); + } + fileNamesInZip.add(unzipFileName); + + if (zipFileUnpackFilesLimit != null && fileNamesInZip.size() > zipFileUnpackFilesLimit) { + logger.log(Level.WARNING, "Zip upload - too many files."); + throw new IngestException(IngestError.UNZIP_FILE_LIMIT_FAIL); + } + updateFileGroupHash(unzipFileName); } } catch (IOException ex) { diff --git a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/DataFileCreatorTest.java b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/DataFileCreatorTest.java index 3e14c7e9b2..19d2b678e8 100644 --- a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/DataFileCreatorTest.java +++ b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/DataFileCreatorTest.java @@ -365,6 +365,7 @@ void createDataFiles_shouldRezipShapefiles() throws IOException { byte[] zipBytes = 
UnitTestUtils.readFileToByteArray("jhove/fake_shapefile.zip"); lenient().when(settingsService.getValueForKeyAsLong(Key.MaxFileUploadSizeInBytes)).thenReturn(1024*1024L); + lenient().when(settingsService.getValueForKeyAsLong(Key.ZipUploadFilesLimit)).thenReturn(100L); lenient().when(settingsService.getValueForKey(Key.FileFixityChecksumAlgorithm)).thenReturn("MD5"); lenient().when(fileTypeDetector.determineFileType(any(), any())).thenReturn("application/zipped-shapefile"); diff --git a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/FileTypeDetectorTest.java b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/FileTypeDetectorTest.java index c11ccb256b..83adb0b8ea 100644 --- a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/FileTypeDetectorTest.java +++ b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/datafile/FileTypeDetectorTest.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.datafile; import edu.harvard.iq.dataverse.UnitTestUtils; +import edu.harvard.iq.dataverse.engine.TestSettingsServiceBean; import edu.harvard.iq.dataverse.util.JhoveConfigurationInitializer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; @@ -14,7 +15,7 @@ public class FileTypeDetectorTest { - private FileTypeDetector fileTypeDetector = new FileTypeDetector(); + private FileTypeDetector fileTypeDetector = new FileTypeDetector(new TestSettingsServiceBean()); @TempDir File tempDir; diff --git a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/util/ShapefileHandlerTest.java b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/util/ShapefileHandlerTest.java index f621fb74a1..457dd3bad3 100644 --- a/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/util/ShapefileHandlerTest.java +++ b/dataverse-webapp/src/test/java/edu/harvard/iq/dataverse/util/ShapefileHandlerTest.java @@ -1,15 +1,20 @@ package edu.harvard.iq.dataverse.util; +import com.google.common.collect.ImmutableMap; +import 
edu.harvard.iq.dataverse.persistence.datafile.ingest.IngestException; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.testcontainers.shaded.org.apache.commons.lang3.RandomStringUtils; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -34,7 +39,7 @@ public void testCreateZippedNonShapefile() throws IOException { List file_names = Arrays.asList("not-quite-a-shape.shp", "not-quite-a-shape.shx", "not-quite-a-shape.dbf", "not-quite-a-shape.pdf"); //, "prj"); File zipfile_obj = createAndZipFiles(file_names, "not-quite-a-shape.zip"); - ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); + ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj); assertThat(shp_handler.containsShapefile()).isFalse(); @@ -55,7 +60,7 @@ public void testZippedTwoShapefiles() throws IOException { File zipfile_obj = createAndZipFiles(file_names, "two-shapes.zip"); - ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); + ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj); assertThat(shp_handler.containsShapefile()).isTrue(); @@ -76,7 +81,8 @@ public void testZipped__duplicate() throws IOException { File zipfile_obj = createAndZipFiles(file_names, "duplicate_file.zip"); - assertThatThrownBy(() -> new ShapefileHandler(zipfile_obj)).hasMessage("Found file-name collision: shape2.pdf"); + assertThatThrownBy(() -> newShapeFileHandler(zipfile_obj)) + .hasMessage("Found file-name collision: shape2.pdf"); } @Test @@ -91,7 +97,7 @@ public void testZippedTwoShapefiles_reshape() throws IOException { File test_rezip_folder = this.tempFolder.newFolder("test_rezip").getAbsoluteFile(); - ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); + ShapefileHandler 
shp_handler = newShapeFileHandler(zipfile_obj); shp_handler.reZipShapefileSets(test_unzip_folder, test_rezip_folder); assertThat(test_unzip_folder.list().length).isEqualTo(0); @@ -106,7 +112,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException { List file_names = Arrays.asList("shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.pdf", "shape1.cpg", "shape1." + SHP_XML_EXTENSION, "README.md", "shape_notes.txt"); File zipfile_obj = createAndZipFiles(file_names, "shape-plus.zip"); - ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); + ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj); assertThat(shp_handler.containsShapefile()).isTrue(); @@ -122,6 +128,37 @@ public void testZippedShapefileWithExtraFiles() throws IOException { .containsExactly("txt"); } + @Test + public void testZippedShapefile__too_many_files() throws IOException { + + List file_names = Arrays.asList("shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.pdf", + "shape1.cpg", "shape1.shp.xml", "README.md", "shape_notes.txt"); + File zipfile_obj = createAndZipFiles(file_names, "shape-plus.zip"); + + assertThatThrownBy(() -> new ShapefileHandler(zipfile_obj, 1024L, 8L)) + .isInstanceOf(IngestException.class) + .hasMessage("There was a problem during ingest. Passing error key UNZIP_FILE_LIMIT_FAIL to report."); + } + + @Test + public void testZippedShapefile__too_big_files() throws IOException { + Map files = ImmutableMap.builder().put("shape1.shp", "") + .put("shape1.shx", "") + .put("shape1.dbf", "") + .put("shape1.prj", "") + .put("shape1.pdf", RandomStringUtils.randomAlphanumeric(2048)) + .put("shape1.cpg", "") + .put("shape1.shp.xml", "") + .put("README.md", "") + .put("shape_notes.txt", "") + .build(); + File zipfile_obj = createAndZipFiles(files, "shape-plus.zip"); + + assertThatThrownBy(() -> new ShapefileHandler(zipfile_obj, 1024L, 100L)) + .isInstanceOf(IngestException.class) + .hasMessage("There was a problem during ingest. 
Passing error key UNZIP_SIZE_FAIL to report."); + } + @Test public void testZippedShapefileWithExtraFiles_reshape() throws IOException { @@ -130,7 +167,7 @@ public void testZippedShapefileWithExtraFiles_reshape() throws IOException { File unzip2Folder = this.tempFolder.newFolder("test_unzip2").getAbsoluteFile(); File rezip2Folder = this.tempFolder.newFolder("test_rezip2").getAbsoluteFile(); - ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); + ShapefileHandler shp_handler = newShapeFileHandler(zipfile_obj); shp_handler.reZipShapefileSets(unzip2Folder, rezip2Folder); assertThat(unzip2Folder.list()).isEmpty(); @@ -145,13 +182,17 @@ public void testZippedShapefileWithExtraFiles_reshape() throws IOException { * @param zipFileName - Name of .zip file to create */ private File createAndZipFiles(List fileNamesToZip, String zipFileName) throws IOException { + Map filesWithContent = fileNamesToZip.stream().collect(LinkedHashMap::new, + (map, fileName) -> map.put(fileName, ""), Map::putAll); + return createAndZipFiles(filesWithContent, zipFileName); + } + private File createAndZipFiles(Map filesToZip, String zipFileName) throws IOException { File zipFileObj = this.tempFolder.newFile(zipFileName); - - try (ZipOutputStream zip_stream = new ZipOutputStream(Files.newOutputStream(zipFileObj.toPath()))) { - for (String fileName : fileNamesToZip) { - this.addToZipFile(fileName, "".getBytes(StandardCharsets.UTF_8), zip_stream); + try (ZipOutputStream zip_stream = new ZipOutputStream(Files.newOutputStream(zipFileObj.toPath()))) { + for (Map.Entry entry : filesToZip.entrySet()) { + this.addToZipFile(entry.getKey(), entry.getValue().getBytes(StandardCharsets.UTF_8), zip_stream); } } @@ -167,6 +208,9 @@ private void addToZipFile(String fileName, byte[] fileAsBytesToZip, ZipOutputStr zipOutputStream.closeEntry(); } + private ShapefileHandler newShapeFileHandler(File zipfile_obj) { + return new ShapefileHandler(zipfile_obj, 1024L, 100L); + } }