Skip to content

Commit

Permalink
wip: Adds empty tar file support
Browse files Browse the repository at this point in the history
  • Loading branch information
meghfossa committed Oct 27, 2023
1 parent a458414 commit 349d032
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 13 deletions.
12 changes: 7 additions & 5 deletions src/App/Fossa/LicenseScanner.hs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import Data.HashMap.Strict qualified as HM
import Data.List.NonEmpty (NonEmpty)
import Data.List.NonEmpty qualified as NE
import Data.Maybe (catMaybes)
import Data.Either.Combinators (rightToMaybe)
import Data.Semigroup (Any (..))
import Data.String.Conversion (toString, toText)
import Data.Text (Text)
Expand Down Expand Up @@ -135,8 +136,8 @@ recursivelyScanArchives pathPrefix licenseScanPathFilters fullFileUploads dir =
-- but it would be easy to allow customers to filter out single files too.
let archivesToSkip = maybe [] licenseScanPathFilterFileExclude licenseScanPathFilters
let filesToProcess = filter (`notElem` archivesToSkip) files
-- withArchive' emits Nothing when archive type is not supported.
archives <- traverse (\file -> withArchive' file (process file)) filesToProcess

archives <- catMaybes <$> traverse (\file -> rightToMaybe <$> withArchive' file (process file)) filesToProcess
pure (concat (catMaybes archives), WalkContinue)

-- When we recursively scan archives, we end up with an array of LicenseUnits that may have multiple entries for a single license.
Expand Down Expand Up @@ -243,12 +244,13 @@ scanArchive ::
ScannableArchive ->
m (NonEmpty LicenseUnit)
scanArchive baseDir licenseScanPathFilters fullFileUploads file = runFinally $ do
-- withArchive' emits Nothing when archive type is not supported.
logSticky $ "scanning archive at " <> toText (scanFile file)
result <- withArchive' (scanFile file) (scanDirectory (Just file) pathPrefix licenseScanPathFilters fullFileUploads)
case result of
Nothing -> fatal . UnsupportedArchive $ scanFile file
Just units -> pure units
Left _ -> fatal . UnsupportedArchive $ scanFile file
Right r -> case r of
Nothing -> fatal . EmptyArchive $ scanFile file
Just units -> pure units
where
pathPrefix :: Text
pathPrefix = getPathPrefix baseDir (parent $ scanFile file)
Expand Down
2 changes: 1 addition & 1 deletion src/App/Fossa/VSI/Analyze.hs
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ discover output filters root renderAncestry =
-- Fork an async task to walk the contents of the archive, if the file is an archive.
-- Note: if you're ever looking at the @TaskPool@ progress message and are like "why does it show 2x the files as are actually in the project?"
-- This is the reason. We fork a task for every file to process its fingerprint, then fork a second task to (maybe) extract it as an archive.
forkTask . recover . fatalOnSomeException "extract archive" . withArchive' file $ \archiveRoot -> context "walking into child archive" $ do
forkTask . recover . fatalOnSomeException "extract archive" $ withArchive' file $ \archiveRoot -> context "walking into child archive" $ do
logDebug . pretty $ "walking into " <> toText file <> " as archive"
logicalParent <- convertArchiveSuffix logicalPath
discover output filters archiveRoot $ ancestryDerived $ FileAncestry logicalParent
Expand Down
24 changes: 17 additions & 7 deletions src/Discovery/Archive.hs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import Prettyprinter (Pretty (pretty), hsep, viaShow, vsep)
import Prelude hiding (zip)

data ArchiveUnpackFailure = ArchiveUnpackFailure (Path Abs File) SomeException
newtype UnsupportedArchiveErr = UnsupportedArchiveErr (Path Abs File)

unpackFailurePath :: ArchiveUnpackFailure -> Path Abs File
unpackFailurePath (ArchiveUnpackFailure path _) = path
Expand Down Expand Up @@ -125,11 +126,11 @@ withArchive' ::
Path Abs File ->
-- | Callback
(Path Abs Dir -> m c) ->
m (Maybe c)
m (Either UnsupportedArchiveErr (Maybe c))
withArchive' file go =
case selectUnarchiver (fileName file) of
Just extract -> withArchive extract file go
Nothing -> pure Nothing
Just extract -> Right <$> withArchive extract file go
Nothing -> pure . Left . UnsupportedArchiveErr $ file

-- | Extract an archive to a temporary directory, and run the provided callback
-- on the temporary directory. Archive contents are removed when the callback
Expand Down Expand Up @@ -162,20 +163,20 @@ mkTempDir name = do

extractTar :: Has (Lift IO) sig m => Path Abs Dir -> Path Abs File -> m ()
extractTar dir tarFile =
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . Tar.read =<< BL.readFile (fromAbsFile tarFile)
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . readTar =<< BL.readFile (fromAbsFile tarFile)

extractTarGz :: Has (Lift IO) sig m => Path Abs Dir -> Path Abs File -> m ()
extractTarGz dir tarGzFile =
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . Tar.read . GZip.decompress =<< BL.readFile (fromAbsFile tarGzFile)
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . readTar . GZip.decompress =<< BL.readFile (fromAbsFile tarGzFile)

extractTarXz :: Has (Lift IO) sig m => Path Abs Dir -> Path Abs File -> m ()
extractTarXz dir tarXzFile = do
decompressed <- sendIO (runResourceT . runConduit $ sourceFileBS (toFilePath tarXzFile) .| CLzma.decompress Nothing .| sinkLbs)
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . Tar.read $ decompressed
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . readTar $ decompressed

extractTarBz2 :: Has (Lift IO) sig m => Path Abs Dir -> Path Abs File -> m ()
extractTarBz2 dir tarGzFile =
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . Tar.read . BZip.decompress =<< BL.readFile (fromAbsFile tarGzFile)
sendIO $ Tar.unpack (fromAbsDir dir) . removeTarLinks . readTar . BZip.decompress =<< BL.readFile (fromAbsFile tarGzFile)

-- The tar unpacker dies when tar files reference files outside of the archive root
removeTarLinks :: Tar.Entries e -> Tar.Entries e
Expand All @@ -187,6 +188,15 @@ removeTarLinks (Tar.Next x xs) =
removeTarLinks Tar.Done = Tar.Done
removeTarLinks (Tar.Fail e) = Tar.Fail e

readTar :: BL.ByteString -> Tar.Entries Tar.FormatError
readTar bs = if BL.null bs
-- Although .tar file spec requires that two 512 block of zero bytes
-- are required to mark end of the tar file. GNU tar
-- and BSD tar utilities consider empty file to be a
-- valid tar file.
then Tar.Done
else Tar.read bs

---------- Zip files

extractZip :: Has (Lift IO) sig m => Path Abs Dir -> Path Abs File -> m ()
Expand Down
9 changes: 9 additions & 0 deletions test/Discovery/ArchiveSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ spec = do
contentB <- sendIO . TIO.readFile . toFilePath $ dir </> $(mkRelDir "simple") </> $(mkRelFile "b.txt")
pure (dir, contentA, contentB)

emptyTarget <- runIO emptyTarPath
emptyResult <- runIO . runStack . runDiagnostics . runFinally $ withArchive extractTar emptyTarget (pure . const ())

it "should have extracted the correct contents" $ do
assertOnSuccess result $ \_ (_, _, extractedContentB) -> extractedContentB `shouldBe` expectedSimpleContentB
assertOnSuccess result $ \_ (_, extractedContentA, _) -> extractedContentA `shouldBe` expectedSimpleContentA
Expand All @@ -65,6 +68,9 @@ spec = do
tempDirExists <- sendIO $ PIO.doesDirExist extractedDir
tempDirExists `shouldBe` False

it "should not thrown an error when working with empty tar file" $ do
assertOnSuccess emptyResult $ \warns _ -> (length warns) `shouldBe` 0

describe "extract tar.gz archive to a temporary location" $ do
target <- runIO simpleTarGzPath
result <- runIO $
Expand Down Expand Up @@ -149,6 +155,9 @@ brokenZipPath = PIO.resolveFile' "test/Discovery/testdata/broken.zip"
simpleTarPath :: IO (Path Abs File)
simpleTarPath = PIO.resolveFile' "test/Discovery/testdata/simple.tar"

emptyTarPath :: IO (Path Abs File)
emptyTarPath = PIO.resolveFile' "test/Discovery/testdata/empty.tar"

simpleTarGzPath :: IO (Path Abs File)
simpleTarGzPath = PIO.resolveFile' "test/Discovery/testdata/simple.tar.gz"

Expand Down
Empty file.

0 comments on commit 349d032

Please sign in to comment.