From 806dd061971026af4da77507bbffa90d1060dd8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Tellnes?= Date: Wed, 14 Jun 2023 16:27:06 +0200 Subject: [PATCH] Ensure only files belonging to a Siard extract is included in IP package --- .../Base/Siard/SiardXmlTableReaderTests.cs | 174 +++++++++++++++++- .../schema1/table1/lob9/unreferenced-file.bin | 0 .../table0/lob9/seg0/unreferenced-file.bin | 0 .../external/lobs/blobs/unreferenced-file.bin | 0 .../Base/Siard/SiardXmlTableReader.cs | 33 +++- .../Identify/TestSessionFactory.cs | 41 ++++- .../Resources/SiardMessages.Designer.cs | 9 + .../Resources/SiardMessages.nb-NO.resx | 3 + .../Resources/SiardMessages.resx | 4 + 9 files changed, 251 insertions(+), 13 deletions(-) create mode 100644 src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/dbPtk/external/t01bclob12_dbptk-desktop-2.5.9_ext.siard_lobseg_1/content/schema1/table1/lob9/unreferenced-file.bin create mode 100644 src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/fullConvert/external/t01bclob12_scfc1654_ext.siard_documents/content/schema0/table0/lob9/seg0/unreferenced-file.bin create mode 100644 src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/siardGui/external/lobs/blobs/unreferenced-file.bin diff --git a/src/Arkivverket.Arkade.Core.Tests/Base/Siard/SiardXmlTableReaderTests.cs b/src/Arkivverket.Arkade.Core.Tests/Base/Siard/SiardXmlTableReaderTests.cs index f4bb29c58..f22fbafa0 100644 --- a/src/Arkivverket.Arkade.Core.Tests/Base/Siard/SiardXmlTableReaderTests.cs +++ b/src/Arkivverket.Arkade.Core.Tests/Base/Siard/SiardXmlTableReaderTests.cs @@ -1,4 +1,4 @@ -using Xunit; +using Xunit; using Arkivverket.Arkade.Core.Base.Siard; using System.Collections.Generic; using System.IO; @@ -234,9 +234,181 @@ public void GetFormatAnalysedLobsFromSiard2_1ArchiveFileWithInternalLobsCreatedB } } + [Fact] + public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedByDatabasePreservationToolkit() + { + const string siardArchiveDirectoryPath = 
"TestData/Siard/siard2/dbPtk/external"; + const string siardArchivePath = $"{siardArchiveDirectoryPath}/dbptk.siard"; + + const string pathToSchemaFolder = $"{siardArchiveDirectoryPath}/t01bclob12_dbptk-desktop-2.5.9_ext.siard_lobseg_1/content/schema1"; + + List referencedLobs = new() + { + $"{pathToSchemaFolder}/table1/lob9/record1.bin", + $"{pathToSchemaFolder}/table1/lob9/record2.bin", + $"{pathToSchemaFolder}/table1/lob9/record3.bin", + $"{pathToSchemaFolder}/table1/lob9/record4.bin", + $"{pathToSchemaFolder}/table1/lob9/record5.bin", + $"{pathToSchemaFolder}/table1/lob9/record6.bin", + $"{pathToSchemaFolder}/table1/lob9/record7.bin", + $"{pathToSchemaFolder}/table1/lob9/record8.bin", + $"{pathToSchemaFolder}/table1/lob9/record9.bin", + $"{pathToSchemaFolder}/table1/lob9/record10.bin", + $"{pathToSchemaFolder}/table1/lob9/record11.bin", + $"{pathToSchemaFolder}/table1/lob9/record12.bin", + $"{pathToSchemaFolder}/table1/lob9/record13.bin", + $"{pathToSchemaFolder}/table1/lob9/record14.bin", + $"{pathToSchemaFolder}/table1/lob9/record15.bin", + $"{pathToSchemaFolder}/table1/lob9/record16.bin", + $"{pathToSchemaFolder}/table1/lob9/record17.bin", + $"{pathToSchemaFolder}/table1/lob9/record18.bin", + $"{pathToSchemaFolder}/table1/lob9/record19.bin", + $"{pathToSchemaFolder}/table1/lob9/record20.bin", + $"{pathToSchemaFolder}/table1/lob9/record21.bin", + $"{pathToSchemaFolder}/table1/lob9/record22.bin", + $"{pathToSchemaFolder}/table1/lob9/record23.bin", + $"{pathToSchemaFolder}/table1/lob9/record24.bin", + $"{pathToSchemaFolder}/table1/lob9/record25.bin", + $"{pathToSchemaFolder}/table1/lob9/record26.bin", + $"{pathToSchemaFolder}/table1/lob9/record27.bin", + $"{pathToSchemaFolder}/table1/lob9/record28.bin", + $"{pathToSchemaFolder}/table2/lob9/record1.txt", + $"{pathToSchemaFolder}/table2/lob9/record2.txt", + $"{pathToSchemaFolder}/table2/lob9/record3.txt", + $"{pathToSchemaFolder}/table2/lob9/record4.txt", + $"{pathToSchemaFolder}/table2/lob9/record5.txt", + 
$"{pathToSchemaFolder}/table2/lob9/record6.txt", + $"{pathToSchemaFolder}/table2/lob9/record7.txt" + }; + + const string pathToNonReferencedLob = $"{pathToSchemaFolder}/table1/lob9/unreferenced-file.bin"; + + ValidateFacts(siardArchivePath, referencedLobs, pathToNonReferencedLob); + } + + [Fact] + public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedBySiardGui() + { + const string siardArchiveDirectoryPath = "TestData/Siard/siard2/siardGui/external"; + const string siardArchivePath = $"{siardArchiveDirectoryPath}/siardGui.siard"; + + const string pathToLobsFolder = $"{siardArchiveDirectoryPath}/lobs"; + + List referencedLobs = new() + { + $"{pathToLobsFolder}/blobs/record0.bin", + $"{pathToLobsFolder}/blobs/record1.bin", + $"{pathToLobsFolder}/blobs/record2.bin", + $"{pathToLobsFolder}/blobs/record3.bin", + $"{pathToLobsFolder}/blobs/record4.bin", + $"{pathToLobsFolder}/blobs/record5.bin", + $"{pathToLobsFolder}/blobs/record6.bin", + $"{pathToLobsFolder}/blobs/record7.bin", + $"{pathToLobsFolder}/blobs/record8.bin", + $"{pathToLobsFolder}/blobs/record9.bin", + $"{pathToLobsFolder}/blobs/record10.bin", + $"{pathToLobsFolder}/blobs/record11.bin", + $"{pathToLobsFolder}/blobs/record12.bin", + $"{pathToLobsFolder}/blobs/record13.bin", + $"{pathToLobsFolder}/blobs/record14.bin", + $"{pathToLobsFolder}/blobs/record15.bin", + $"{pathToLobsFolder}/blobs/record16.bin", + $"{pathToLobsFolder}/blobs/record17.bin", + $"{pathToLobsFolder}/blobs/record18.bin", + $"{pathToLobsFolder}/blobs/record19.bin", + $"{pathToLobsFolder}/blobs/record20.bin", + $"{pathToLobsFolder}/blobs/record21.bin", + $"{pathToLobsFolder}/blobs/record22.bin", + $"{pathToLobsFolder}/blobs/record23.bin", + $"{pathToLobsFolder}/blobs/record24.bin", + $"{pathToLobsFolder}/blobs/record25.bin", + $"{pathToLobsFolder}/blobs/record26.bin", + $"{pathToLobsFolder}/blobs/record27.bin", + $"{pathToLobsFolder}/clobs/record0.txt", + 
$"{pathToLobsFolder}/clobs/record1.txt", + $"{pathToLobsFolder}/clobs/record2.txt", + $"{pathToLobsFolder}/clobs/record3.txt", + $"{pathToLobsFolder}/clobs/record4.txt", + $"{pathToLobsFolder}/clobs/record5.txt", + $"{pathToLobsFolder}/clobs/record6.txt" + }; + + const string pathToNonReferencedLob = $"{pathToLobsFolder}/blobs/unreferenced-file.bin"; + + ValidateFacts(siardArchivePath, referencedLobs, pathToNonReferencedLob); + } + + [Fact] + public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedBySpectralCoreFullConvert() + { + const string siardArchiveDirectoryPath = "TestData/Siard/siard2/fullConvert/external"; + const string siardArchivePath = $"{siardArchiveDirectoryPath}/scfc.siard"; + + const string pathToSchemaFolder = $"{siardArchiveDirectoryPath}/t01bclob12_scfc1654_ext.siard_documents/content/schema0"; + + List referencedLobs = new() + { + $"{pathToSchemaFolder}/table0/lob9/seg0/rec2.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec4.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec6.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec7.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec8.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec9.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec10.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec11.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec12.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec13.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec14.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec15.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec16.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec17.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec18.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec19.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec20.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec21.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec22.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec23.bin", + 
$"{pathToSchemaFolder}/table0/lob9/seg0/rec24.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec25.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec26.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec27.bin", + $"{pathToSchemaFolder}/table0/lob9/seg0/rec28.bin", + $"{pathToSchemaFolder}/table1/lob9/seg0/rec2.txt", + $"{pathToSchemaFolder}/table1/lob9/seg0/rec4.txt", + $"{pathToSchemaFolder}/table1/lob9/seg0/rec6.txt", + $"{pathToSchemaFolder}/table1/lob9/seg0/rec7.txt" + }; + + const string pathToNonReferencedLob = $"{pathToSchemaFolder}/table0/lob9/seg0/unreferenced-file.bin"; + + ValidateFacts(siardArchivePath, referencedLobs, pathToNonReferencedLob); + } + private static SiardXmlTableReader CreateReader() { return new SiardXmlTableReader(new SiardArchiveReader()); } + + private static void ValidateFacts(string siardArchivePath, List referencedLobs, + string pathToNonReferencedLob) + { + SiardXmlTableReader xmlTableReader = CreateReader(); + + List fullPathsToExternalLobs = xmlTableReader.GetFullPathsToExternalLobs(siardArchivePath).ToList(); + + // Ensure only referenced files are fetched + fullPathsToExternalLobs.Except(referencedLobs).Should().BeEmpty(); + + // Ensure all referenced files are fetched + referencedLobs.Except(fullPathsToExternalLobs).Should().BeEmpty(); + + // When the former statements validate successfully, success of the following + // statement implies that the list of fetched lob paths contains no duplicates. + fullPathsToExternalLobs.Should().HaveCount(referencedLobs.Count); + + // Explicitly check that non-referenced files are not included. Redundant, but + // included for emphasis on the importance of not including unreferenced files. 
+ fullPathsToExternalLobs.Should().NotContain(pathToNonReferencedLob); + } } } \ No newline at end of file diff --git a/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/dbPtk/external/t01bclob12_dbptk-desktop-2.5.9_ext.siard_lobseg_1/content/schema1/table1/lob9/unreferenced-file.bin b/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/dbPtk/external/t01bclob12_dbptk-desktop-2.5.9_ext.siard_lobseg_1/content/schema1/table1/lob9/unreferenced-file.bin new file mode 100644 index 000000000..e69de29bb diff --git a/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/fullConvert/external/t01bclob12_scfc1654_ext.siard_documents/content/schema0/table0/lob9/seg0/unreferenced-file.bin b/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/fullConvert/external/t01bclob12_scfc1654_ext.siard_documents/content/schema0/table0/lob9/seg0/unreferenced-file.bin new file mode 100644 index 000000000..e69de29bb diff --git a/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/siardGui/external/lobs/blobs/unreferenced-file.bin b/src/Arkivverket.Arkade.Core.Tests/TestData/Siard/siard2/siardGui/external/lobs/blobs/unreferenced-file.bin new file mode 100644 index 000000000..e69de29bb diff --git a/src/Arkivverket.Arkade.Core/Base/Siard/SiardXmlTableReader.cs b/src/Arkivverket.Arkade.Core/Base/Siard/SiardXmlTableReader.cs index d0ee23457..9bcdd0e75 100644 --- a/src/Arkivverket.Arkade.Core/Base/Siard/SiardXmlTableReader.cs +++ b/src/Arkivverket.Arkade.Core/Base/Siard/SiardXmlTableReader.cs @@ -23,6 +23,32 @@ public SiardXmlTableReader(ISiardArchiveReader siardArchiveReader) _siardArchiveReader = siardArchiveReader; } + internal IEnumerable GetFullPathsToExternalLobs(string siardArchiveFullPath) + { + Dictionary> lobFolderPathsWithColumnIndexes = + _siardArchiveReader.GetLobFolderPathsWithColumnIndexes(siardArchiveFullPath); + + foreach (SiardLobReference siardLobReference in lobFolderPathsWithColumnIndexes.Values.SelectMany(l => l)) + { + XDocument xmlTableDoc = 
XDocument.Parse(GetXmlTableStringContent(siardArchiveFullPath, siardLobReference)); + var xPathQuery = $"//*:c{siardLobReference.Column.Index}"; + + List lobXmlElements = xmlTableDoc.XPath2SelectElements(xPathQuery).ToList(); + + foreach (XElement lobXmlElement in lobXmlElements) + { + var lobReference = new SiardLobReference(siardLobReference) + { + FilePathInTableXml = lobXmlElement.Attributes().FirstOrDefault(a => a.Name.LocalName.Equals("file"))?.Value + }; + + if (!SiardLobIsExternal(lobReference)) continue; + + yield return GetPathToExternalLob(siardArchiveFullPath, lobReference); + } + } + } + public IEnumerable>> CreateLobByteArrays(string siardFileName) { Dictionary> lobFolderPathsWithColumnIndexes = @@ -110,7 +136,8 @@ private static bool LobIsInlinedInXmlTable(string siardLobFileReferenceFromTable private static bool SiardLobIsExternal(SiardLobReference siardLobReference) { - return siardLobReference.FilePathInTableXml.StartsWith("..") || siardLobReference.IsExternal; + return !LobIsInlinedInXmlTable(siardLobReference.FilePathInTableXml) && + (siardLobReference.FilePathInTableXml?.StartsWith("..") == true || siardLobReference.IsExternal); } private KeyValuePair> CreateKeyValuePairForInlinedLob(XElement lobXmlElement, SiardLobReference siardLobReference) @@ -171,13 +198,13 @@ private static string GetPathToExternalLob(string siardFileName, SiardLobReferen { string siardFileDirectoryPath = siardFileName.Replace(Path.GetFileName(siardFileName), string.Empty); - string relativePathToLobFile = siardLobReference.FilePathInTableXml.StartsWith("..") + string relativePathToLobFile = siardLobReference.FilePathInTableXml?.StartsWith("..") == true ? 
siardLobReference.FilePathInTableXml.Remove(0, 3) : Path.Combine(siardLobReference.LobFolderPath.TrimStart('.', '\\', '/'), siardLobReference.FilePathRelativeToLobFolder); string pathToExternalLob = Path.Combine(siardFileDirectoryPath, relativePathToLobFile); - return pathToExternalLob; + return pathToExternalLob.Replace('\\', '/'); } } } diff --git a/src/Arkivverket.Arkade.Core/Identify/TestSessionFactory.cs b/src/Arkivverket.Arkade.Core/Identify/TestSessionFactory.cs index 808c4b8be..8b9b71b0e 100644 --- a/src/Arkivverket.Arkade.Core/Identify/TestSessionFactory.cs +++ b/src/Arkivverket.Arkade.Core/Identify/TestSessionFactory.cs @@ -1,7 +1,9 @@ using System; +using System.Collections.Generic; using System.IO; using Arkivverket.Arkade.Core.Base; using Arkivverket.Arkade.Core.Base.Addml.Definitions; +using Arkivverket.Arkade.Core.Base.Siard; using Arkivverket.Arkade.Core.Logging; using Arkivverket.Arkade.Core.Resources; using Arkivverket.Arkade.Core.Testing.Noark5; @@ -55,7 +57,7 @@ public TestSession NewSession(ArchiveFile archiveFile) if (archiveFile.ArchiveType == ArchiveType.Siard && archiveFile.File.Extension.Equals(".siard")) { - CopySiardFilesToContentDirectory(archiveFile.File.Directory, workingDirectory.Content().ToString()); + CopySiardFilesToContentDirectory(archiveFile, workingDirectory.Content().ToString()); } else { @@ -149,17 +151,38 @@ private void TarExtractionFinishedEvent(WorkingDirectory workingDirectory) OperationMessageStatus.Ok); } - private static void CopySiardFilesToContentDirectory(DirectoryInfo archiveFileDirectory, string contentDirectoryPath) + private void CopySiardFilesToContentDirectory(ArchiveFile siardArchiveFile, string contentDirectoryPath) { - foreach (FileInfo fileInfo in archiveFileDirectory.GetFiles()) - { - File.Copy(fileInfo.FullName, - Path.Combine(contentDirectoryPath, fileInfo.Name)); - } + var siardTableXmlReader = new SiardXmlTableReader(new SiardArchiveReader()); + + 
siardArchiveFile.File.CopyTo(Path.Combine(contentDirectoryPath, siardArchiveFile.File.Name)); - foreach (DirectoryInfo contentDirectory in archiveFileDirectory.GetDirectories()) + IEnumerable fullPathsToExternalLobs = + siardTableXmlReader.GetFullPathsToExternalLobs(siardArchiveFile.File.FullName); + + foreach (string fullPathToExternalLob in fullPathsToExternalLobs) { - contentDirectory.CopyTo(Path.Combine(contentDirectoryPath, contentDirectory.Name), true); + if (!File.Exists(fullPathToExternalLob)) + { + string message = string.Format(SiardMessages.ExternalLobFileNotFoundMessage, fullPathToExternalLob); + _statusEventHandler.RaiseEventOperationMessage("", message, OperationMessageStatus.Error); + _log.Error(message); + continue; + } + + string relativePathFromSiardFileToExternalLob = + Path.GetRelativePath(siardArchiveFile.File.DirectoryName, fullPathToExternalLob); + + string externalLobDestinationPath = + Path.Combine(contentDirectoryPath, relativePathFromSiardFileToExternalLob); + + var destinationDirectoryForExternalLob = Path.GetDirectoryName(externalLobDestinationPath); + + Directory.CreateDirectory(destinationDirectoryForExternalLob); + + File.Copy(fullPathToExternalLob, externalLobDestinationPath); + + _log.Debug("'{0}' has been added to Arkade temporary work area", fullPathToExternalLob); } } } diff --git a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.Designer.cs b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.Designer.cs index b155c2205..86fc9c82b 100644 --- a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.Designer.cs +++ b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.Designer.cs @@ -88,6 +88,15 @@ public static string ErrorMessage { } } + /// + /// Looks up a localized string similar to External LOB file '{0}' not found. 
+ /// + public static string ExternalLobFileNotFoundMessage { + get { + return ResourceManager.GetString("ExternalLobFileNotFoundMessage", resourceCulture); + } + } + /// /// Looks up a localized string similar to LOB is inlined in an unsupported format. /// diff --git a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.nb-NO.resx b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.nb-NO.resx index 28eace4a7..e37ea48bb 100644 --- a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.nb-NO.resx +++ b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.nb-NO.resx @@ -148,4 +148,7 @@ {0} er ikke gyldig for SIARD-versjon {1}: {2} + + Ekstern LOB-fil '{0}' ikke funnet + \ No newline at end of file diff --git a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.resx b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.resx index 87b549b52..fcd610b21 100644 --- a/src/Arkivverket.Arkade.Core/Resources/SiardMessages.resx +++ b/src/Arkivverket.Arkade.Core/Resources/SiardMessages.resx @@ -133,4 +133,8 @@ {0} xml-file, {1} SIARD version, {2} schema errors + + External LOB file '{0}' not found + {0}: full path to file + \ No newline at end of file