From a0e6e36fc9ffca73996099dc158fb60bf920a0b0 Mon Sep 17 00:00:00 2001 From: "Roy, Rajat" Date: Mon, 13 Jun 2022 15:00:25 -0700 Subject: [PATCH] Feature/downloader manifest 1014 (#655) * adding appsettings to cloud * using config file for clouds * using the s3 bucket as default for lambdas * using default s3 base url from config file * using Cloud in Downloader * fixing downloader manifest path * adding index for gsa files * adding check for index file * fixing texts * ensuring consistency --- Cloud/Cloud.appsettings.json | 8 ++++ Cloud/Cloud.csproj | 7 ++++ Cloud/Configuration.cs | 27 ++++++++++++ Cloud/LambdaUrlHelper.cs | 36 +++++++++------- Cloud/Utilities/LambdaUtilities.cs | 8 ++-- Downloader/Configuration.cs | 29 ++++++++----- Downloader/Downloader.appsettings.json | 6 +-- Downloader/Downloader.csproj | 1 + .../SupplementaryAnnotationFileExtensions.cs | 1 + Downloader/Manifest.cs | 3 +- Nirvana/ProviderUtilities.cs | 41 +++++++++++++++---- SingleAnnotationLambda/CacheUtilities.cs | 7 ++-- UnitTests/Cloud/ConsistencyTests.cs | 14 +++++++ UnitTests/Cloud/LambdaUrlHelperTests.cs | 15 +++++-- UnitTests/Downloader/ConfigurationTests.cs | 6 ++- 15 files changed, 158 insertions(+), 51 deletions(-) create mode 100644 Cloud/Cloud.appsettings.json create mode 100644 Cloud/Configuration.cs create mode 100644 UnitTests/Cloud/ConsistencyTests.cs diff --git a/Cloud/Cloud.appsettings.json b/Cloud/Cloud.appsettings.json new file mode 100644 index 00000000..23a5cc6f --- /dev/null +++ b/Cloud/Cloud.appsettings.json @@ -0,0 +1,8 @@ +{ + "DataSource": { + "BaseUrl": "http://nirvana-annotations.s3.us-west-2.amazonaws.com/", + "CacheDirectory": "ab0cf104f39708eabd07b8cb67e149ba-Cache", + "ReferencesDirectory": "d95867deadfe690e40f42068d6b59df8-References", + "ManifestDirectory": "a9f54ea6ac0548696c97a3ee64bc39ec2e71b84b-SaManifest" + } +} \ No newline at end of file diff --git a/Cloud/Cloud.csproj b/Cloud/Cloud.csproj index 714602e5..a5414f27 100644 --- a/Cloud/Cloud.csproj +++
b/Cloud/Cloud.csproj @@ -9,6 +9,8 @@ + + @@ -16,5 +18,10 @@ + + + PreserveNewest + + diff --git a/Cloud/Configuration.cs b/Cloud/Configuration.cs new file mode 100644 index 00000000..91e00dcb --- /dev/null +++ b/Cloud/Configuration.cs @@ -0,0 +1,27 @@ +using Cloud.Utilities; + +namespace Cloud; +using Microsoft.Extensions.Configuration; + +public sealed class Configuration +{ + public readonly IConfigurationRoot Config; + public readonly IConfigurationSection DataSources; + public string CacheDirectory => DataSources["CacheDirectory"]; + public string ReferencesDirectory => DataSources["ReferencesDirectory"]; + public string ManifestDirectory => DataSources["ManifestDirectory"]; + public string NirvanaBaseUrl => DataSources["BaseUrl"]; + public Configuration() + { + const string appSettingsFilename = "Cloud.appsettings.json"; + + Config = new ConfigurationBuilder() + .AddJsonFile(appSettingsFilename) + .Build(); + + DataSources = Config.GetSection("DataSource"); + + } + + +} \ No newline at end of file diff --git a/Cloud/LambdaUrlHelper.cs b/Cloud/LambdaUrlHelper.cs index cf379a4b..9e4386dc 100644 --- a/Cloud/LambdaUrlHelper.cs +++ b/Cloud/LambdaUrlHelper.cs @@ -1,4 +1,5 @@ -using Cloud.Utilities; +using System; +using Cloud.Utilities; using Genome; using IO; using ReferenceSequence; @@ -7,16 +8,19 @@ namespace Cloud { public static class LambdaUrlHelper { - public const string UrlBaseEnvironmentVariableName = "NirvanaDataUrlBase"; + public const ushort SaSchemaVersion = 22; + public const string UrlBaseEnvironmentVariableName = "NirvanaDataUrlBase"; + private static readonly Configuration Config = new (); - public const string S3CacheFolderBase = "ab0cf104f39708eabd07b8cb67e149ba-Cache"; - public const string S3ManifestFolderBase = "a9f54ea6ac0548696c97a3ee64bc39ec2e71b84b-SaManifest"; + public static string S3CacheFolderBase = Config.CacheDirectory; + // public const string S3ManifestFolderBase = "a9f54ea6ac0548696c97a3ee64bc39ec2e71b84b-SaManifest"; public 
static readonly string S3CacheFolder = - $"{S3CacheFolderBase}/{CacheConstants.DataVersion}/"; + $"{Config.CacheDirectory}/{CacheConstants.DataVersion}/"; private static readonly string S3RefPrefix = - $"d95867deadfe690e40f42068d6b59df8-References/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens."; + $"{Config.ReferencesDirectory}/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens."; + private const string UgaFileName = "UGA.tsv.gz"; public const string DefaultCacheSource = "Both"; public const string RefSuffix = ".Nirvana.dat"; @@ -24,16 +28,20 @@ public static class LambdaUrlHelper public const string JsonIndexSuffix = ".jsi"; public const string SuccessMessage = "Success"; - public static string GetBaseUrl(string baseUrl = null) => - baseUrl ?? LambdaUtilities.GetEnvironmentVariable(UrlBaseEnvironmentVariableName); + public static string GetBaseUrl() + { + var envBaseUrl = Environment.GetEnvironmentVariable(UrlBaseEnvironmentVariableName); + + return string.IsNullOrEmpty(envBaseUrl) ? 
Config.NirvanaBaseUrl: envBaseUrl; + } - public static string GetManifestBaseUrl(string baseUrl = null) => GetBaseUrl(baseUrl) + S3ManifestFolderBase; + public static string GetManifestBaseUrl() => GetBaseUrl() + Config.ManifestDirectory; - public static string GetCacheFolder(string baseUrl = null) => GetBaseUrl(baseUrl) + S3CacheFolder; - public static string GetUgaUrl(string baseUrl = null) => GetCacheFolder(baseUrl) + UgaFileName; - public static string GetRefPrefix(string baseUrl = null) => GetBaseUrl(baseUrl) + S3RefPrefix; + public static string GetCacheFolder() => GetBaseUrl() + S3CacheFolder; + public static string GetUgaUrl() => GetCacheFolder() + UgaFileName; + public static string GetRefPrefix() => GetBaseUrl() + S3RefPrefix; - public static string GetRefUrl(GenomeAssembly genomeAssembly, string baseUrl = null) => - GetRefPrefix(baseUrl) + genomeAssembly + RefSuffix; + public static string GetRefUrl(GenomeAssembly genomeAssembly) => + GetRefPrefix() + genomeAssembly + RefSuffix; } } \ No newline at end of file diff --git a/Cloud/Utilities/LambdaUtilities.cs b/Cloud/Utilities/LambdaUtilities.cs index 0e780a44..49d31a2c 100644 --- a/Cloud/Utilities/LambdaUtilities.cs +++ b/Cloud/Utilities/LambdaUtilities.cs @@ -30,10 +30,10 @@ public static void DeleteTempOutput() foreach (string tempFile in files) File.Delete(tempFile); } - public static string GetManifestUrl(string version, GenomeAssembly genomeAssembly, int saSchemaVersion, string baseUrl = null) + public static string GetManifestUrl(string version, GenomeAssembly genomeAssembly, int saSchemaVersion = LambdaUrlHelper.SaSchemaVersion) { if (string.IsNullOrEmpty(version)) version = "latest"; - string s3BaseUrl = LambdaUrlHelper.GetManifestBaseUrl(baseUrl) +$"/{saSchemaVersion}/"; + string s3BaseUrl = LambdaUrlHelper.GetManifestBaseUrl() +$"/{saSchemaVersion}/"; switch (version) { case "latest": @@ -47,9 +47,9 @@ public static string GetManifestUrl(string version, GenomeAssembly genomeAssembl } } - public 
static string GetCachePathPrefix(GenomeAssembly genomeAssembly, string baseUrl=null) + public static string GetCachePathPrefix(GenomeAssembly genomeAssembly) { - return LambdaUrlHelper.GetCacheFolder(baseUrl).UrlCombine(genomeAssembly.ToString()) + return LambdaUrlHelper.GetCacheFolder().UrlCombine(genomeAssembly.ToString()) .UrlCombine(LambdaUrlHelper.DefaultCacheSource); } } diff --git a/Downloader/Configuration.cs b/Downloader/Configuration.cs index fb8ae393..518cd844 100644 --- a/Downloader/Configuration.cs +++ b/Downloader/Configuration.cs @@ -1,4 +1,8 @@ -using System.IO; +using System; +using System.IO; +using Cloud; +using Cloud.Utilities; +using Genome; using Microsoft.Extensions.Configuration; namespace Downloader @@ -21,33 +25,36 @@ public static (string HostName, string CacheDir, string ReferencesDir, string Ma hostName = dataSource["HostName"]; if (string.IsNullOrEmpty(hostName)) throw new InvalidDataException($"Could not find the HostName entry in the {appSettingsFilename} file."); + // this env variable will over-ride the configuration in cloud + Environment.SetEnvironmentVariable(LambdaUrlHelper.UrlBaseEnvironmentVariableName, $"http://{hostName}/"); } - string cacheDir = dataSource["CacheDirectory"]; + var cloudConfiguration = new Cloud.Configuration(); + string cacheDir = cloudConfiguration.CacheDirectory; if (string.IsNullOrEmpty(cacheDir)) - throw new InvalidDataException($"Could not find the CacheDirectory entry in the {appSettingsFilename} file."); + throw new InvalidDataException($"Could not find the CacheDirectory entry in the Cloud.appsettings.json file."); - string referencesDir = dataSource["ReferencesDirectory"]; + string referencesDir = cloudConfiguration.ReferencesDirectory; if (string.IsNullOrEmpty(referencesDir)) - throw new InvalidDataException($"Could not find the ReferencesDirectory entry in the {appSettingsFilename} file."); + throw new InvalidDataException($"Could not find the ReferencesDirectory entry in the 
Cloud.appsettings.json file."); - string manifestGRCh37; - string manifestGRCh38; + string manifestGRCh37 ; + string manifestGRCh38 ; if (string.IsNullOrEmpty(manifestPrefix)) { - manifestGRCh37 = dataSource["ManifestGRCh37"]; + manifestGRCh37 = LambdaUtilities.GetManifestUrl(dataSource["ManifestGRCh37"], GenomeAssembly.GRCh37); if (string.IsNullOrEmpty(manifestGRCh37)) throw new InvalidDataException($"Could not find the ManifestGRCh37 entry in the {appSettingsFilename} file."); - manifestGRCh38 = dataSource["ManifestGRCh38"]; + manifestGRCh38 = LambdaUtilities.GetManifestUrl(dataSource["ManifestGRCh38"], GenomeAssembly.GRCh38); if (string.IsNullOrEmpty(manifestGRCh38)) throw new InvalidDataException($"Could not find the ManifestGRCh38 entry in the {appSettingsFilename} file."); } else { - manifestGRCh37 = $"{manifestPrefix}_GRCh37.txt"; - manifestGRCh38 = $"{manifestPrefix}_GRCh38.txt"; + manifestGRCh37 = LambdaUtilities.GetManifestUrl($"{manifestPrefix}", GenomeAssembly.GRCh37); + manifestGRCh38 = LambdaUtilities.GetManifestUrl($"{manifestPrefix}", GenomeAssembly.GRCh38); } return (hostName, '/' + cacheDir, '/' + referencesDir, manifestGRCh37, manifestGRCh38); diff --git a/Downloader/Downloader.appsettings.json b/Downloader/Downloader.appsettings.json index 4a48d89b..b0890f40 100644 --- a/Downloader/Downloader.appsettings.json +++ b/Downloader/Downloader.appsettings.json @@ -1,9 +1,7 @@ { "DataSource": { "HostName": "annotations.nirvana.illumina.com", - "CacheDirectory": "ab0cf104f39708eabd07b8cb67e149ba-Cache", - "ReferencesDirectory": "d95867deadfe690e40f42068d6b59df8-References", - "ManifestGRCh37": "latest_SA_GRCh37.txt", - "ManifestGRCh38": "latest_SA_GRCh38.txt" + "ManifestGRCh37": "latest", + "ManifestGRCh38": "latest" } } \ No newline at end of file diff --git a/Downloader/Downloader.csproj b/Downloader/Downloader.csproj index 594d54e5..e881453f 100644 --- a/Downloader/Downloader.csproj +++ b/Downloader/Downloader.csproj @@ -12,6 +12,7 @@ + diff --git 
a/Downloader/FileExtensions/SupplementaryAnnotationFileExtensions.cs b/Downloader/FileExtensions/SupplementaryAnnotationFileExtensions.cs index 3057681d..e2a78c33 100644 --- a/Downloader/FileExtensions/SupplementaryAnnotationFileExtensions.cs +++ b/Downloader/FileExtensions/SupplementaryAnnotationFileExtensions.cs @@ -13,6 +13,7 @@ static SupplementaryAnnotationFileExtensions() NeedsIndexSet.Add(".nsa"); NeedsIndexSet.Add(".npd"); NeedsIndexSet.Add(".rma"); + NeedsIndexSet.Add(".gsa"); } public static void AddSupplementaryAnnotationFiles(this List files, diff --git a/Downloader/Manifest.cs b/Downloader/Manifest.cs index 12e1b16d..e0574d69 100644 --- a/Downloader/Manifest.cs +++ b/Downloader/Manifest.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using Genome; namespace Downloader diff --git a/Nirvana/ProviderUtilities.cs b/Nirvana/ProviderUtilities.cs index 2a3e8269..6e5fb167 100644 --- a/Nirvana/ProviderUtilities.cs +++ b/Nirvana/ProviderUtilities.cs @@ -1,6 +1,8 @@ using System; using System.Collections.Generic; +using System.IO; using System.Linq; +using ErrorHandling.Exceptions; using IO; using VariantAnnotation.GeneAnnotation; using VariantAnnotation.GeneFusions.IO; @@ -28,9 +30,26 @@ public static ProteinConservationProvider GetProteinConservationProvider(Annotat public static IAnnotationProvider GetConservationProvider(AnnotationFiles files) { if (files == null || files.PhylopFile == default) return null; + (Stream phylopStream, Stream indexStream) = GetDataAndIndexStreams(files.PhylopFile.Npd, files.PhylopFile.Idx); return new ConservationScoreProvider() - .AddPhylopReader(PersistentStreamUtils.GetReadStream(files.PhylopFile.Npd), - PersistentStreamUtils.GetReadStream(files.PhylopFile.Idx)); + .AddPhylopReader(phylopStream, indexStream); + } + + private static (Stream, Stream) GetDataAndIndexStreams(string dataFilePath, string indexPath) + { + var dataStream = 
PersistentStreamUtils.GetReadStream(dataFilePath); + var indexStream = PersistentStreamUtils.GetReadStream(indexPath); + if (dataStream == null) + { + throw new UserErrorException($"Unable to open data file {dataFilePath}"); + } + + if (indexStream == null) + { + throw new UserErrorException($"Unable to open index file {indexPath}"); + } + + return (dataStream, indexStream); } public static IAnnotationProvider GetLcrProvider(AnnotationFiles files) => @@ -38,11 +57,13 @@ public static IAnnotationProvider GetLcrProvider(AnnotationFiles files) => ? null : new LcrProvider(PersistentStreamUtils.GetReadStream(files.LowComplexityRegionFile)); - public static IRefMinorProvider GetRefMinorProvider(AnnotationFiles files) => - files == null || files.RefMinorFile == default - ? null - : new RefMinorProvider(PersistentStreamUtils.GetReadStream(files.RefMinorFile.Rma), + public static IRefMinorProvider GetRefMinorProvider(AnnotationFiles files) + { + if( files == null || files.RefMinorFile == default) return null; + + return new RefMinorProvider(PersistentStreamUtils.GetReadStream(files.RefMinorFile.Rma), PersistentStreamUtils.GetReadStream(files.RefMinorFile.Idx)); + } public static IGeneAnnotationProvider GetGeneAnnotationProvider(AnnotationFiles files) => files?.NsiFiles == null ? 
null @@ -64,7 +85,10 @@ private static INsaReader[] GetNsaReaders(IReadOnlyCollection<(string Nsa, strin { var readers = new List(filePaths.Count); foreach ((string nsaPath, string idxPath) in filePaths) - readers.Add(new NsaReader(PersistentStreamUtils.GetReadStream(nsaPath), PersistentStreamUtils.GetReadStream(idxPath))); + { + var (nsaStream, idxStream) = GetDataAndIndexStreams(nsaPath, idxPath); + readers.Add(new NsaReader(nsaStream, idxStream)); + } return readers.SortByJsonKey(); } @@ -79,7 +103,8 @@ public static IAnnotationProvider GetGsaProvider(AnnotationFiles files) var i = 0; foreach ((string gsaPath, string idxPath) in filePaths) { - readers[i] = ScoreReader.Read(PersistentStreamUtils.GetReadStream(gsaPath), PersistentStreamUtils.GetReadStream(idxPath)); + var (gsaStream, idxStream) = GetDataAndIndexStreams(gsaPath, idxPath); + readers[i] = ScoreReader.Read(gsaStream, idxStream); i++; } diff --git a/SingleAnnotationLambda/CacheUtilities.cs b/SingleAnnotationLambda/CacheUtilities.cs index f48f1f3d..810c62da 100644 --- a/SingleAnnotationLambda/CacheUtilities.cs +++ b/SingleAnnotationLambda/CacheUtilities.cs @@ -20,14 +20,13 @@ public static bool IsVepVersionSupported(int vepVersion) => public static string GetCachePathPrefix(int vepVersion, GenomeAssembly genomeAssembly) { string suffix = $"{genomeAssembly}/{LambdaUrlHelper.DefaultCacheSource}"; - - //LambdaUrlHelper.GetBaseUrl() + + switch (vepVersion) { case 84: - return UrlCombine($"{LambdaUrlHelper.GetBaseUrl()+LambdaUrlHelper.S3CacheFolderBase}/26/VEP84/", suffix); + return UrlCombine($"{LambdaUrlHelper.GetBaseUrl() +LambdaUrlHelper.S3CacheFolderBase}/26/VEP84/", suffix); default: - return UrlCombine($"{LambdaUrlHelper.GetBaseUrl()+LambdaUrlHelper.S3CacheFolder}", suffix); + return UrlCombine($"{LambdaUrlHelper.GetCacheFolder()}", suffix); } } diff --git a/UnitTests/Cloud/ConsistencyTests.cs b/UnitTests/Cloud/ConsistencyTests.cs new file mode 100644 index 00000000..79ef3749 --- /dev/null +++ 
b/UnitTests/Cloud/ConsistencyTests.cs @@ -0,0 +1,14 @@ +using Cloud; +using VariantAnnotation.SA; +using Xunit; + +namespace UnitTests.Cloud; + +public sealed class ConsistencyTests +{ + [Fact] + public void Consistency_with_SAUtils() + { + Assert.Equal(LambdaUrlHelper.SaSchemaVersion, SaCommon.SchemaVersion); + } +} \ No newline at end of file diff --git a/UnitTests/Cloud/LambdaUrlHelperTests.cs b/UnitTests/Cloud/LambdaUrlHelperTests.cs index 41aa7934..44dec182 100644 --- a/UnitTests/Cloud/LambdaUrlHelperTests.cs +++ b/UnitTests/Cloud/LambdaUrlHelperTests.cs @@ -14,14 +14,16 @@ public sealed class LambdaUrlHelperTests [Fact] public void GetDataUrlBase_AsExpected() { - Assert.Equal($"http://somewhere.on.the.earth/ab0cf104f39708eabd07b8cb67e149ba-Cache/{CacheConstants.DataVersion}/", LambdaUrlHelper.GetCacheFolder("http://somewhere.on.the.earth/")); - Assert.Equal($"http://somewhere.on.the.earth/d95867deadfe690e40f42068d6b59df8-References/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens.", LambdaUrlHelper.GetRefPrefix("http://somewhere.on.the.earth/")); + Environment.SetEnvironmentVariable("NirvanaDataUrlBase", "http://somewhere.on.the.earth/"); + Assert.Equal($"http://somewhere.on.the.earth/ab0cf104f39708eabd07b8cb67e149ba-Cache/{CacheConstants.DataVersion}/", LambdaUrlHelper.GetCacheFolder()); + Assert.Equal($"http://somewhere.on.the.earth/d95867deadfe690e40f42068d6b59df8-References/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens.", LambdaUrlHelper.GetRefPrefix()); } [Fact] public void GetS3RefLocation_AsExpected() { - Assert.Equal(LambdaUrlHelper.GetRefPrefix("whatever") + "GRCh37" + LambdaUrlHelper.RefSuffix, LambdaUrlHelper.GetRefUrl(GenomeAssembly.GRCh37, "whatever")); + Environment.SetEnvironmentVariable("NirvanaDataUrlBase", "whatever"); + Assert.Equal(LambdaUrlHelper.GetRefPrefix() + "GRCh37" + LambdaUrlHelper.RefSuffix, LambdaUrlHelper.GetRefUrl(GenomeAssembly.GRCh37)); } [Fact] @@ -31,5 +33,12 @@ public void GetS3_SaManifest_Location_AsExpected() 
var saManifestUrl = LambdaUtilities.GetManifestUrl("latest", GenomeAssembly.GRCh38, SaCommon.SchemaVersion); HttpUtilities.ValidateUrl(saManifestUrl); } + + [Fact] + public void GetS3_SaManifest_Location_from_config() + { + var saManifestUrl = LambdaUtilities.GetManifestUrl("latest", GenomeAssembly.GRCh38, SaCommon.SchemaVersion); + HttpUtilities.ValidateUrl(saManifestUrl); + } } } diff --git a/UnitTests/Downloader/ConfigurationTests.cs b/UnitTests/Downloader/ConfigurationTests.cs index c1f23815..92e77c49 100755 --- a/UnitTests/Downloader/ConfigurationTests.cs +++ b/UnitTests/Downloader/ConfigurationTests.cs @@ -1,4 +1,5 @@ using Downloader; +using VariantAnnotation.SA; using Xunit; namespace UnitTests.Downloader @@ -28,9 +29,10 @@ public void Load_OverrideHostName() [Fact] public void Load_OverrideManifest() { + var config = new global::Cloud.Configuration(); (string _, string _, string _, string manifestGRCh37, string manifestGRCh38) = Configuration.Load(null, "Schema23"); - Assert.Equal("Schema23_GRCh37.txt", manifestGRCh37); - Assert.Equal("Schema23_GRCh38.txt", manifestGRCh38); + Assert.Equal($"http://annotations.nirvana.illumina.com/{config.ManifestDirectory}/{SaCommon.SchemaVersion}/Schema23_SA_GRCh37.txt", manifestGRCh37); + Assert.Equal($"http://annotations.nirvana.illumina.com/{config.ManifestDirectory}/{SaCommon.SchemaVersion}/Schema23_SA_GRCh38.txt", manifestGRCh38); } } } \ No newline at end of file