Skip to content

Commit

Permalink
Ensure only files belonging to a Siard extract are included in IP package
Browse files Browse the repository at this point in the history
  • Loading branch information
jtellnes committed Jun 14, 2023
1 parent b6e8f42 commit 806dd06
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 13 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Xunit;
using Xunit;
using Arkivverket.Arkade.Core.Base.Siard;
using System.Collections.Generic;
using System.IO;
Expand Down Expand Up @@ -234,9 +234,181 @@ public void GetFormatAnalysedLobsFromSiard2_1ArchiveFileWithInternalLobsCreatedB
}
}

/// <summary>
/// Verifies that exactly the external LOB files referenced from the DBPTK-created
/// SIARD archive are reported, and that a file which is not referenced is left out.
/// </summary>
[Fact]
public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedByDatabasePreservationToolkit()
{
    const string siardArchiveDirectoryPath = "TestData/Siard/siard2/dbPtk/external";
    const string siardArchivePath = $"{siardArchiveDirectoryPath}/dbptk.siard";

    const string pathToSchemaFolder = $"{siardArchiveDirectoryPath}/t01bclob12_dbptk-desktop-2.5.9_ext.siard_lobseg_1/content/schema1";

    List<string> referencedLobs = new()
    {
        $"{pathToSchemaFolder}/table1/lob9/record1.bin",
        $"{pathToSchemaFolder}/table1/lob9/record2.bin",
        $"{pathToSchemaFolder}/table1/lob9/record3.bin",
        $"{pathToSchemaFolder}/table1/lob9/record4.bin",
        $"{pathToSchemaFolder}/table1/lob9/record5.bin",
        $"{pathToSchemaFolder}/table1/lob9/record6.bin",
        $"{pathToSchemaFolder}/table1/lob9/record7.bin",
        $"{pathToSchemaFolder}/table1/lob9/record8.bin",
        $"{pathToSchemaFolder}/table1/lob9/record9.bin",
        $"{pathToSchemaFolder}/table1/lob9/record10.bin",
        $"{pathToSchemaFolder}/table1/lob9/record11.bin",
        $"{pathToSchemaFolder}/table1/lob9/record12.bin",
        $"{pathToSchemaFolder}/table1/lob9/record13.bin",
        $"{pathToSchemaFolder}/table1/lob9/record14.bin",
        $"{pathToSchemaFolder}/table1/lob9/record15.bin",
        $"{pathToSchemaFolder}/table1/lob9/record16.bin",
        $"{pathToSchemaFolder}/table1/lob9/record17.bin",
        $"{pathToSchemaFolder}/table1/lob9/record18.bin",
        $"{pathToSchemaFolder}/table1/lob9/record19.bin",
        $"{pathToSchemaFolder}/table1/lob9/record20.bin",
        $"{pathToSchemaFolder}/table1/lob9/record21.bin",
        $"{pathToSchemaFolder}/table1/lob9/record22.bin",
        $"{pathToSchemaFolder}/table1/lob9/record23.bin",
        $"{pathToSchemaFolder}/table1/lob9/record24.bin",
        $"{pathToSchemaFolder}/table1/lob9/record25.bin",
        $"{pathToSchemaFolder}/table1/lob9/record26.bin",
        $"{pathToSchemaFolder}/table1/lob9/record27.bin",
        $"{pathToSchemaFolder}/table1/lob9/record28.bin",
        $"{pathToSchemaFolder}/table2/lob9/record1.txt",
        $"{pathToSchemaFolder}/table2/lob9/record2.txt",
        $"{pathToSchemaFolder}/table2/lob9/record3.txt",
        $"{pathToSchemaFolder}/table2/lob9/record4.txt",
        $"{pathToSchemaFolder}/table2/lob9/record5.txt",
        $"{pathToSchemaFolder}/table2/lob9/record6.txt",
        $"{pathToSchemaFolder}/table2/lob9/record7.txt"
    };

    // pathToSchemaFolder already ends in ".../content/schema1", so that segment must
    // not be repeated here; a doubled segment would make the "not contained" check
    // pass trivially.
    // NOTE(review): presumably the fixture holds unreferenced-file.bin next to the
    // referenced table1 records - confirm against the test data.
    const string pathToNonReferencedLob = $"{pathToSchemaFolder}/table1/lob9/unreferenced-file.bin";

    ValidateFacts(siardArchivePath, referencedLobs, pathToNonReferencedLob);
}

/// <summary>
/// Verifies that exactly the external LOB files referenced from the SIARD Gui-created
/// archive are reported, and that a file which is not referenced is left out.
/// </summary>
[Fact]
public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedBySiardGui()
{
    const string archiveDirectory = "TestData/Siard/siard2/siardGui/external";
    const string archivePath = $"{archiveDirectory}/siardGui.siard";
    const string lobsRoot = $"{archiveDirectory}/lobs";

    // Expected paths: blobs/record0.bin .. record27.bin followed by clobs/record0.txt .. record6.txt.
    IEnumerable<string> expectedBlobs = Enumerable.Range(0, 28)
        .Select(recordNumber => $"{lobsRoot}/blobs/record{recordNumber}.bin");
    IEnumerable<string> expectedClobs = Enumerable.Range(0, 7)
        .Select(recordNumber => $"{lobsRoot}/clobs/record{recordNumber}.txt");

    List<string> expectedLobPaths = expectedBlobs.Concat(expectedClobs).ToList();

    const string unreferencedLobPath = $"{lobsRoot}/blobs/unreferenced-file.bin";

    ValidateFacts(archivePath, expectedLobPaths, unreferencedLobPath);
}

/// <summary>
/// Verifies that exactly the external LOB files referenced from the Full Convert-created
/// SIARD archive are reported, and that a file which is not referenced is left out.
/// </summary>
[Fact]
public void ShouldGetPathsToExternalLobsReferencedFromSiardCreatedBySpectralCoreFullConvert()
{
    const string archiveDirectory = "TestData/Siard/siard2/fullConvert/external";
    const string archivePath = $"{archiveDirectory}/scfc.siard";
    const string schemaRoot = $"{archiveDirectory}/t01bclob12_scfc1654_ext.siard_documents/content/schema0";

    // Record numbers are not contiguous: 2, 4, 6 and then 7 through 28 for the binary
    // LOBs in table0; 2, 4, 6 and 7 for the text LOBs in table1.
    IEnumerable<int> binaryRecordNumbers = new[] { 2, 4, 6 }.Concat(Enumerable.Range(7, 22));
    int[] textRecordNumbers = { 2, 4, 6, 7 };

    List<string> expectedLobPaths = binaryRecordNumbers
        .Select(recordNumber => $"{schemaRoot}/table0/lob9/seg0/rec{recordNumber}.bin")
        .Concat(textRecordNumbers
            .Select(recordNumber => $"{schemaRoot}/table1/lob9/seg0/rec{recordNumber}.txt"))
        .ToList();

    const string unreferencedLobPath = $"{schemaRoot}/table0/lob9/seg0/unreferenced-file.bin";

    ValidateFacts(archivePath, expectedLobPaths, unreferencedLobPath);
}

/// <summary>
/// Builds the table reader under test on top of a fresh <see cref="SiardArchiveReader"/>.
/// </summary>
private static SiardXmlTableReader CreateReader() => new(new SiardArchiveReader());

/// <summary>
/// Reads the external LOB paths for the given SIARD archive and asserts that they
/// match the expected set of referenced files exactly.
/// </summary>
/// <param name="siardArchivePath">Path to the .siard file to read.</param>
/// <param name="referencedLobs">Paths that are expected to be returned.</param>
/// <param name="pathToNonReferencedLob">A path that must not be returned.</param>
private static void ValidateFacts(string siardArchivePath, List<string> referencedLobs,
    string pathToNonReferencedLob)
{
    List<string> actualLobPaths = CreateReader().GetFullPathsToExternalLobs(siardArchivePath).ToList();

    IEnumerable<string> unexpectedPaths = actualLobPaths.Except(referencedLobs);
    IEnumerable<string> missingPaths = referencedLobs.Except(actualLobPaths);

    // Nothing beyond the referenced files may be fetched ...
    unexpectedPaths.Should().BeEmpty();

    // ... and every referenced file must be fetched.
    missingPaths.Should().BeEmpty();

    // Given that both set differences are empty, equal counts imply that the
    // fetched list holds no duplicates.
    actualLobPaths.Should().HaveCount(referencedLobs.Count);

    // Redundant with the checks above, but kept to stress that unreferenced
    // files must never be included.
    actualLobPaths.Should().NotContain(pathToNonReferencedLob);
}
}
}
33 changes: 30 additions & 3 deletions src/Arkivverket.Arkade.Core/Base/Siard/SiardXmlTableReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,32 @@ public SiardXmlTableReader(ISiardArchiveReader siardArchiveReader)
_siardArchiveReader = siardArchiveReader;
}

/// <summary>
/// Lazily yields the path of every external LOB file referenced from the table XML
/// of the given SIARD archive.
/// </summary>
/// <param name="siardArchiveFullPath">Path to the .siard file.</param>
/// <returns>One path per table cell whose LOB reference is classified as external.</returns>
internal IEnumerable<string> GetFullPathsToExternalLobs(string siardArchiveFullPath)
{
    IEnumerable<SiardLobReference> columnReferences = _siardArchiveReader
        .GetLobFolderPathsWithColumnIndexes(siardArchiveFullPath)
        .Values
        .SelectMany(references => references);

    foreach (SiardLobReference columnReference in columnReferences)
    {
        string tableXml = GetXmlTableStringContent(siardArchiveFullPath, columnReference);
        XDocument tableDocument = XDocument.Parse(tableXml);

        // Select every cell element of the LOB column, regardless of namespace.
        List<XElement> lobCells = tableDocument
            .XPath2SelectElements($"//*:c{columnReference.Column.Index}")
            .ToList();

        foreach (XElement lobCell in lobCells)
        {
            XAttribute fileAttribute = lobCell.Attributes()
                .FirstOrDefault(a => a.Name.LocalName.Equals("file"));

            // Copy the column-level reference and attach this cell's file path (null when
            // the cell carries no "file" attribute).
            var cellReference = new SiardLobReference(columnReference)
            {
                FilePathInTableXml = fileAttribute?.Value
            };

            if (SiardLobIsExternal(cellReference))
            {
                yield return GetPathToExternalLob(siardArchiveFullPath, cellReference);
            }
        }
    }
}

public IEnumerable<KeyValuePair<string, IEnumerable<byte>>> CreateLobByteArrays(string siardFileName)
{
Dictionary<string, List<SiardLobReference>> lobFolderPathsWithColumnIndexes =
Expand Down Expand Up @@ -110,7 +136,8 @@ private static bool LobIsInlinedInXmlTable(string siardLobFileReferenceFromTable

/// <summary>
/// Tells whether a LOB reference points to a file stored outside the SIARD archive.
/// </summary>
/// <param name="siardLobReference">The LOB reference to classify.</param>
/// <returns>
/// <c>true</c> when the LOB is not inlined in the table XML and either its file path in
/// the table XML starts with ".." or the reference is flagged as external.
/// </returns>
private static bool SiardLobIsExternal(SiardLobReference siardLobReference)
{
    string filePathInTableXml = siardLobReference.FilePathInTableXml;

    if (LobIsInlinedInXmlTable(filePathInTableXml))
    {
        return false;
    }

    // The path may be null (cell without a "file" attribute), hence the null-conditional
    // call; a null path can then only be external via the IsExternal flag.
    return filePathInTableXml?.StartsWith("..") == true || siardLobReference.IsExternal;
}

private KeyValuePair<string, IEnumerable<byte>> CreateKeyValuePairForInlinedLob(XElement lobXmlElement, SiardLobReference siardLobReference)
Expand Down Expand Up @@ -171,13 +198,13 @@ private static string GetPathToExternalLob(string siardFileName, SiardLobReferen
{
string siardFileDirectoryPath = siardFileName.Replace(Path.GetFileName(siardFileName), string.Empty);

string relativePathToLobFile = siardLobReference.FilePathInTableXml.StartsWith("..")
string relativePathToLobFile = siardLobReference.FilePathInTableXml?.StartsWith("..") == true
? siardLobReference.FilePathInTableXml.Remove(0, 3)
: Path.Combine(siardLobReference.LobFolderPath.TrimStart('.', '\\', '/'),
siardLobReference.FilePathRelativeToLobFolder);

string pathToExternalLob = Path.Combine(siardFileDirectoryPath, relativePathToLobFile);
return pathToExternalLob;
return pathToExternalLob.Replace('\\', '/');
}
}
}
41 changes: 32 additions & 9 deletions src/Arkivverket.Arkade.Core/Identify/TestSessionFactory.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using System;
using System.Collections.Generic;
using System.IO;
using Arkivverket.Arkade.Core.Base;
using Arkivverket.Arkade.Core.Base.Addml.Definitions;
using Arkivverket.Arkade.Core.Base.Siard;
using Arkivverket.Arkade.Core.Logging;
using Arkivverket.Arkade.Core.Resources;
using Arkivverket.Arkade.Core.Testing.Noark5;
Expand Down Expand Up @@ -55,7 +57,7 @@ public TestSession NewSession(ArchiveFile archiveFile)

if (archiveFile.ArchiveType == ArchiveType.Siard && archiveFile.File.Extension.Equals(".siard"))
{
CopySiardFilesToContentDirectory(archiveFile.File.Directory, workingDirectory.Content().ToString());
CopySiardFilesToContentDirectory(archiveFile, workingDirectory.Content().ToString());
}
else
{
Expand Down Expand Up @@ -149,17 +151,38 @@ private void TarExtractionFinishedEvent(WorkingDirectory workingDirectory)
OperationMessageStatus.Ok);
}

/// <summary>
/// Copies the SIARD archive file, and only those external LOB files that the archive
/// references, into the content directory of the working area.
/// </summary>
/// <param name="siardArchiveFile">The SIARD archive whose files are to be copied.</param>
/// <param name="contentDirectoryPath">Destination content directory.</param>
private void CopySiardFilesToContentDirectory(ArchiveFile siardArchiveFile, string contentDirectoryPath)
{
    var siardTableXmlReader = new SiardXmlTableReader(new SiardArchiveReader());

    siardArchiveFile.File.CopyTo(Path.Combine(contentDirectoryPath, siardArchiveFile.File.Name));

    IEnumerable<string> fullPathsToExternalLobs =
        siardTableXmlReader.GetFullPathsToExternalLobs(siardArchiveFile.File.FullName);

    foreach (string fullPathToExternalLob in fullPathsToExternalLobs)
    {
        // A referenced file that is missing on disk is reported and skipped, so the
        // remaining files are still copied.
        if (!File.Exists(fullPathToExternalLob))
        {
            string message = string.Format(SiardMessages.ExternalLobFileNotFoundMessage, fullPathToExternalLob);
            _statusEventHandler.RaiseEventOperationMessage("", message, OperationMessageStatus.Error);
            _log.Error(message);
            continue;
        }

        // Keep the LOB at the same location relative to the .siard file in the destination.
        string relativePathFromSiardFileToExternalLob =
            Path.GetRelativePath(siardArchiveFile.File.DirectoryName, fullPathToExternalLob);

        string externalLobDestinationPath =
            Path.Combine(contentDirectoryPath, relativePathFromSiardFileToExternalLob);

        var destinationDirectoryForExternalLob = Path.GetDirectoryName(externalLobDestinationPath);

        Directory.CreateDirectory(destinationDirectoryForExternalLob);

        File.Copy(fullPathToExternalLob, externalLobDestinationPath);

        _log.Debug("'{0}' has been added to Arkade temporary work area", fullPathToExternalLob);
    }
}
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,7 @@
<value>{0} er ikke gyldig for SIARD-versjon {1}:
{2}</value>
</data>
<data name="ExternalLobFileNotFoundMessage" xml:space="preserve">
<value>Ekstern LOB-fil '{0}' ikke funnet</value>
</data>
</root>
4 changes: 4 additions & 0 deletions src/Arkivverket.Arkade.Core/Resources/SiardMessages.resx
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,8 @@
<comment>{0} xml-file, {1} SIARD version,
{2} schema errors</comment>
</data>
<data name="ExternalLobFileNotFoundMessage" xml:space="preserve">
<value>External LOB file '{0}' not found</value>
<comment>{0}: full path to the external LOB file</comment>
</data>
</root>

0 comments on commit 806dd06

Please sign in to comment.