diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 3ca982993..1f194dbbd 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -16,10 +16,10 @@ namespace { silo::Database buildTestDatabase() { - const std::string input_directory{"./testBaseData/exampleDataset/"}; + const std::string input_directory{"./testBaseData/unitTestDummyDataset/"}; silo::config::PreprocessingConfig config; - config.overwrite(silo::config::YamlFile("./testBaseData/test_preprocessing_config.yaml")); + config.overwrite(silo::config::YamlFile(input_directory + "preprocessing_config.yaml")); const auto database_config = silo::config::ConfigRepository().getValidatedConfig(input_directory + "database_config.yaml"); @@ -43,8 +43,8 @@ TEST(DatabaseTest, shouldBuildDatabaseWithoutErrors) { const auto simple_database_info = database.getDatabaseInfo(); EXPECT_GT(simple_database_info.total_size, 0); - EXPECT_EQ(simple_database_info.sequence_count, 100); - EXPECT_EQ(simple_database_info.number_of_partitions, 11); + EXPECT_EQ(simple_database_info.sequence_count, 5); + EXPECT_EQ(simple_database_info.number_of_partitions, 2); } TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) { @@ -83,10 +83,10 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 2775910 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 148 ); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 2661831 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 128 ); EXPECT_EQ( @@ -97,7 +97,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 2875 + 0 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -106,20 +106,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 42629964 - ); - EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 21433248 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 2108 ); + EXPECT_EQ(detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 1066); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 133240 + 12 ); - EXPECT_EQ(simple_info.total_size, 26589508); - EXPECT_EQ(simple_info.sequence_count, 100); - EXPECT_EQ(simple_info.n_bitmaps_size, 3931); + EXPECT_EQ(simple_info.total_size, 1956); + EXPECT_EQ(simple_info.sequence_count, 5); + EXPECT_EQ(simple_info.n_bitmaps_size, 62); } TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { @@ -141,7 +139,7 @@ TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { const auto simple_database_info = database.getDatabaseInfo(); EXPECT_GT(simple_database_info.total_size, 0); - EXPECT_EQ(simple_database_info.sequence_count, 100); + EXPECT_EQ(simple_database_info.sequence_count, 5); EXPECT_GT(simple_database_info.n_bitmaps_size, 0); - EXPECT_EQ(simple_database_info.number_of_partitions, 11); + EXPECT_EQ(simple_database_info.number_of_partitions, 2); } diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 0f98cc87e..9ce80e315 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -194,7 +194,7 @@ const Scenario EMPTY_INPUT_NDJSON_UNPARTITIONED = { const Scenario NO_GENES = { .input_directory = "testBaseData/noGenes/", - .expected_sequence_count = 30, + .expected_sequence_count = 9, .query = R"( { "action": { @@ -206,7 +206,7 @@ const Scenario NO_GENES = { } )", .expected_query_result = nlohmann::json::parse(R"( -[{"count":30}])") +[{"count":9}])") }; const Scenario NO_NUCLEOTIDE_SEQUENCES = { @@ -227,7 +227,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = { const Scenario NO_SEQUENCES = { .input_directory = "testBaseData/noSequences/", - .expected_sequence_count = 30, + .expected_sequence_count = 6, .query = R"( { "action": { @@ -238,7 +238,7 @@ const Scenario NO_SEQUENCES = { } } )", - .expected_query_result = nlohmann::json::parse(R"([{"count":30}])") + .expected_query_result = nlohmann::json::parse(R"([{"count":6}])") }; const Scenario DIVERSE_SEQUENCE_NAMES = { @@ -275,22 +275,6 @@ const Scenario DIVERSE_SEQUENCE_NAMES_NDJSON = { [{"count":2}])") }; -const Scenario MEDIUM_SIZED_RSV_DATASET = { - .input_directory = "testBaseData/mediumSizedRsvDataset/", - .expected_sequence_count = 19662, - .query = R"( - { - "action": { - "type": "Aggregated" - }, - "filterExpression": { - "type": "True" - } - } - )", - .expected_query_result = nlohmann::json::parse(R"([{"count":19662}])") -}; - class PreprocessorTestFixture : public ::testing::TestWithParam {}; INSTANTIATE_TEST_SUITE_P( @@ -310,8 +294,7 @@ INSTANTIATE_TEST_SUITE_P( EMPTY_INPUT_NDJSON_UNPARTITIONED, NO_GENES, NO_NUCLEOTIDE_SEQUENCES, - NO_SEQUENCES, - MEDIUM_SIZED_RSV_DATASET + NO_SEQUENCES ), printTestName ); diff --git a/testBaseData/noGenes/database_config.yaml b/testBaseData/noGenes/database_config.yaml index cb3f8cb07..4c2d40ebf 100644 --- a/testBaseData/noGenes/database_config.yaml +++ b/testBaseData/noGenes/database_config.yaml @@ -2,24 +2,6 @@ schema: instanceName: RSV A opennessLevel: OPEN metadata: - - name: accession - type: string - - name: version - type: int - - name: submissionId - type: string - name: accessionVersion type: string - - name: isRevocation - type: boolean - - name: submitter - type: string - generateIndex: true - - name: groupId - type: int - - name: dataUseTermsUrl - type: string - - name: sample_collection_date - type: date primaryKey: accessionVersion - dateToSortBy: sample_collection_date diff --git a/testBaseData/noGenes/input.ndjson b/testBaseData/noGenes/input.ndjson new file mode 100644 index 000000000..a1dcd1b95 --- /dev/null +++ b/testBaseData/noGenes/input.ndjson @@ -0,0 +1,9 @@ +{"metadata": {"accessionVersion": "LOC_000CJJP.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G", "7504:A"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJKM.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CCTTTGGTTAGAGACCGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTG"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJLK.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJMH.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TCNCGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTGAGACGAC"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJNF.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJPD.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTAATTTAGTTAATATACATATAA"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJQB.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJR9.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJS7.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGAAAAAATGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} diff --git a/testBaseData/noGenes/input.ndjson.zst b/testBaseData/noGenes/input.ndjson.zst deleted file mode 100644 index 4e90de5c6..000000000 Binary files a/testBaseData/noGenes/input.ndjson.zst and /dev/null differ diff --git a/testBaseData/noGenes/preprocessing_config.yaml b/testBaseData/noGenes/preprocessing_config.yaml index 107a1d2db..a6772f530 100644 --- a/testBaseData/noGenes/preprocessing_config.yaml +++ b/testBaseData/noGenes/preprocessing_config.yaml @@ -1,2 +1,2 @@ -ndjsonInputFilename: "input.ndjson.zst" +ndjsonInputFilename: "input.ndjson" referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/noSequences/database_config.yaml b/testBaseData/noSequences/database_config.yaml index 0bade2884..4c2d40ebf 100644 --- a/testBaseData/noSequences/database_config.yaml +++ b/testBaseData/noSequences/database_config.yaml @@ -2,38 +2,6 @@ schema: instanceName: RSV A opennessLevel: OPEN metadata: - - name: accession - type: string - - name: version - type: int - - name: submissionId - type: string - name: accessionVersion type: string - - name: isRevocation - type: boolean - - name: submitter - type: string - generateIndex: true - - name: groupId - type: int - - name: groupName - type: string - generateIndex: true - - name: submittedAt - type: int - - name: releasedAt - type: int - - name: dataUseTerms - type: string - generateIndex: true - - name: dataUseTermsRestrictedUntil - type: date - - name: versionStatus - type: string - - name: dataUseTermsUrl - type: string - - name: sample_collection_date - type: date primaryKey: accessionVersion - dateToSortBy: sample_collection_date diff --git a/testBaseData/noSequences/input.ndjson b/testBaseData/noSequences/input.ndjson new file mode 100644 index 000000000..cfc7f39c0 --- /dev/null +++ b/testBaseData/noSequences/input.ndjson @@ -0,0 +1,6 @@ +{"metadata": { "accessionVersion": "LOC_000CHXY.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CHYW.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CHZU.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ0S.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ1Q.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ2N.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} diff --git a/testBaseData/noSequences/input.ndjson.zst b/testBaseData/noSequences/input.ndjson.zst deleted file mode 100644 index 51fbac85c..000000000 Binary files a/testBaseData/noSequences/input.ndjson.zst and /dev/null differ diff --git a/testBaseData/noSequences/preprocessing_config.yaml b/testBaseData/noSequences/preprocessing_config.yaml index 107a1d2db..a6772f530 100644 --- a/testBaseData/noSequences/preprocessing_config.yaml +++ b/testBaseData/noSequences/preprocessing_config.yaml @@ -1,2 +1,2 @@ -ndjsonInputFilename: "input.ndjson.zst" +ndjsonInputFilename: "input.ndjson" referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/unitTestDummyDataset/aa_insertions.tsv b/testBaseData/unitTestDummyDataset/aa_insertions.tsv new file mode 100644 index 000000000..c86a8d70a --- /dev/null +++ b/testBaseData/unitTestDummyDataset/aa_insertions.tsv @@ -0,0 +1,6 @@ +primaryKey E M +key1 [214:EPE] [] +key2 [] [] +key3 [] [] +key4 [] [] +key5 [214:EPE] [] diff --git a/testBaseData/unitTestDummyDataset/database_config.yaml b/testBaseData/unitTestDummyDataset/database_config.yaml new file mode 100644 index 000000000..a0185668e --- /dev/null +++ b/testBaseData/unitTestDummyDataset/database_config.yaml @@ -0,0 +1,30 @@ +schema: + instanceName: sars_cov-2_minimal_test_config + metadata: + - name: primaryKey + type: string + - name: date + type: date + - name: unsorted_date + type: date + - name: region + type: string + generateIndex: true + - name: country + type: string + generateIndex: true + - name: pango_lineage + type: pango_lineage + - name: division + type: string + generateIndex: true + - name: age + type: int + - name: qc_value + type: float + - name: test_boolean_column + type: boolean + primaryKey: primaryKey + dateToSortBy: date + partitionBy: pango_lineage +defaultNucleotideSequence: "main" diff --git a/testBaseData/unitTestDummyDataset/gene_0.fasta b/testBaseData/unitTestDummyDataset/gene_0.fasta new file mode 100644 index 000000000..a06dce2c5 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/gene_0.fasta @@ -0,0 +1,10 @@ +>key1 +MYSF* +>key2 +MYSF* +>key3 +MYSF* +>key4 +MYSF* +>key5 +MYSF* diff --git a/testBaseData/unitTestDummyDataset/gene_1.fasta b/testBaseData/unitTestDummyDataset/gene_1.fasta new file mode 100644 index 000000000..85ba153ab --- /dev/null +++ b/testBaseData/unitTestDummyDataset/gene_1.fasta @@ -0,0 +1,10 @@ +>key1 +MADS* +>key2 +MADS* +>key3 +MADS* +>key4 +MADS* +>key5 +XXXX* diff --git a/testBaseData/unitTestDummyDataset/metadata.tsv b/testBaseData/unitTestDummyDataset/metadata.tsv new file mode 100644 index 000000000..a21d3bbff --- /dev/null +++ b/testBaseData/unitTestDummyDataset/metadata.tsv @@ -0,0 +1,6 @@ +primaryKey pango_lineage date region country division unsorted_date age qc_value test_boolean_column +key1 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 true +key2 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 false +key3 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 +key4 XBB 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 true +key5 XBB 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 true diff --git a/testBaseData/unitTestDummyDataset/nuc_0.fasta b/testBaseData/unitTestDummyDataset/nuc_0.fasta new file mode 100644 index 000000000..182af609f --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_0.fasta @@ -0,0 +1,10 @@ +>key1 +ACGTACGT +>key2 +AAGNAAGN +>key3 +ACGTACGT +>key4 +ACGTACGT +>key5 +ACGTACGT diff --git a/testBaseData/unitTestDummyDataset/nuc_1.fasta b/testBaseData/unitTestDummyDataset/nuc_1.fasta new file mode 100644 index 000000000..733eb86b0 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_1.fasta @@ -0,0 +1,10 @@ +>key1 +ACGT +>key2 +AAGN +>key3 +ACGT +>key4 +ACGT +>key5 +ACGT diff --git a/testBaseData/unitTestDummyDataset/nuc_insertions.tsv b/testBaseData/unitTestDummyDataset/nuc_insertions.tsv new file mode 100644 index 000000000..66df8022c --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_insertions.tsv @@ -0,0 +1,6 @@ +primaryKey main testSecondSequence +key1 [] [] +key2 [] [] +key3 [] [] +key4 [] [] +key5 [] [] diff --git a/testBaseData/unitTestDummyDataset/pangolineage_alias.json b/testBaseData/unitTestDummyDataset/pangolineage_alias.json new file mode 100644 index 000000000..28210c7b9 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/pangolineage_alias.json @@ -0,0 +1,501 @@ +{ + "A": "", + "B": "", + "C": "B.1.1.1", + "D": "B.1.1.25", + "G": "B.1.258.2", + "K": "B.1.1.277", + "L": "B.1.1.10", + "M": "B.1.1.294", + "N": "B.1.1.33", + "P": "B.1.1.28", + "Q": "B.1.1.7", + "R": "B.1.1.316", + "S": "B.1.1.217", + "U": "B.1.177.60", + "V": "B.1.177.54", + "W": "B.1.177.53", + "Y": "B.1.177.52", + "Z": "B.1.177.50", + "AA": "B.1.177.15", + "AB": "B.1.160.16", + "AC": "B.1.1.405", + "AD": "B.1.1.315", + "AE": "B.1.1.306", + "AF": "B.1.1.305", + "AG": "B.1.1.297", + "AH": "B.1.1.241", + "AJ": "B.1.1.240", + "AK": "B.1.1.232", + "AL": "B.1.1.231", + "AM": "B.1.1.216", + "AN": "B.1.1.200", + "AP": "B.1.1.70", + "AQ": "B.1.1.39", + "AS": "B.1.1.317", + "AT": "B.1.1.370", + "AU": "B.1.466.2", + "AV": "B.1.1.482", + "AW": "B.1.1.464", + "AY": "B.1.617.2", + "AZ": "B.1.1.318", + "BA": "B.1.1.529", + "BB": "B.1.621.1", + "BC": "B.1.1.529.1.1.1", + "BD": "B.1.1.529.1.17.2", + "BE": "B.1.1.529.5.3.1", + "BF": "B.1.1.529.5.2.1", + "BG": "B.1.1.529.2.12.1", + "BH": "B.1.1.529.2.38.3", + "BJ": "B.1.1.529.2.10.1", + "BK": "B.1.1.529.5.1.10", + "BL": "B.1.1.529.2.75.1", + "BM": "B.1.1.529.2.75.3", + "BN": "B.1.1.529.2.75.5", + "BP": "B.1.1.529.2.3.16", + "BQ": "B.1.1.529.5.3.1.1.1.1", + "BR": "B.1.1.529.2.75.4", + "BS": "B.1.1.529.2.3.2", + "BT": "B.1.1.529.5.1.21", + "BU": "B.1.1.529.5.2.16", + "BV": "B.1.1.529.5.2.20", + "BW": "B.1.1.529.5.6.2", + "BY": "B.1.1.529.2.75.6", + "BZ": "B.1.1.529.5.2.3", + "CA": "B.1.1.529.2.75.2", + "CB": "B.1.1.529.2.75.9", + "CC": "B.1.1.529.5.3.1.1.1.2", + "CD": "B.1.1.529.5.2.31", + "CE": "B.1.1.529.5.2.33", + "CF": "B.1.1.529.5.2.27", + "CG": "B.1.1.529.5.2.26", + "CH": "B.1.1.529.2.75.3.4.1.1", + "CJ": "B.1.1.529.2.75.3.1.1.1", + "CK": "B.1.1.529.5.2.24", + "CL": "B.1.1.529.5.1.29", + "CM": "B.1.1.529.2.3.20", + "CN": "B.1.1.529.5.2.21", + "CP": "B.1.1.529.5.2.6", + "CQ": "B.1.1.529.5.3.1.4.1.1", + "CR": "B.1.1.529.5.2.18", + "CS": "B.1.1.529.4.1.10", + "CT": "B.1.1.529.5.2.36", + "CU": "B.1.1.529.5.1.26", + "CV": "B.1.1.529.2.75.3.1.1.3", + "CW": "B.1.1.529.5.3.1.1.1.1.1.1.14", + "CY": "B.1.1.529.5.2.7", + "CZ": "B.1.1.529.5.3.1.1.1.1.1.1.1", + "DA": "B.1.1.529.5.2.38", + "DB": "B.1.1.529.5.2.25", + "DC": "B.1.1.529.4.6.5", + "DD": "B.1.1.529.2.3.21", + "DE": "B.1.1.529.5.1.23", + "DF": "B.1.1.529.5.10.1", + "DG": "B.1.1.529.5.2.24.2.1.1", + "DH": "B.1.1.529.5.1.22", + "DJ": "B.1.1.529.5.1.25", + "DK": "B.1.1.529.5.3.1.1.1.1.1.1.7", + "DL": "B.1.1.529.5.1.15", + "DM": "B.1.1.529.5.3.1.1.1.1.1.1.15", + "DN": "B.1.1.529.5.3.1.1.1.1.1.1.5", + "DP": "B.1.1.529.5.3.1.1.1.1.1.1.8", + "DQ": "B.1.1.529.5.2.47", + "DR": "B.1.1.529.5.3.1.1.1.1.1.1.3", + "DS": "B.1.1.529.2.75.5.1.3.1", + "DT": "B.1.1.529.5.3.1.1.1.1.1.1.32", + "DU": "B.1.1.529.5.3.1.1.1.1.1.1.2", + "DV": "B.1.1.529.2.75.3.4.1.1.1.1.1", + "DW": "B.1.1.529.5.3.1.1.2.1", + "DY": "B.1.1.529.5.2.48", + "DZ": "B.1.1.529.5.2.49", + "EA": "B.1.1.529.5.3.1.1.1.1.1.1.52", + "EB": "B.1.1.529.5.1.35", + "EC": "B.1.1.529.5.3.1.1.1.1.1.10.1", + "ED": "B.1.1.529.5.3.1.1.1.1.1.1.18", + "EE": "B.1.1.529.5.3.1.1.1.1.1.1.4", + "EF": "B.1.1.529.5.3.1.1.1.1.1.1.13", + "EG": "XBB.1.9.2", + "EH": "B.1.1.529.5.3.1.1.1.1.1.1.28", + "EJ": "B.1.1.529.2.75.5.1.3.8", + "EK": "XBB.1.5.13", + "EL": "XBB.1.5.14", + "EM": "XBB.1.5.7", + "EN": "B.1.1.529.5.3.1.1.1.1.1.1.46", + "EP": "B.1.1.529.2.75.3.1.1.4", + "EQ": "B.1.1.529.5.1.33", + "ER": "B.1.1.529.5.3.1.1.1.1.1.1.22", + "ES": "B.1.1.529.5.3.1.1.1.1.1.1.65", + "ET": "B.1.1.529.5.3.1.1.1.1.1.1.35", + "EU": "XBB.1.5.26", + "EV": "B.1.1.529.5.3.1.1.1.1.1.1.71", + "EW": "B.1.1.529.5.3.1.1.1.1.1.1.38", + "EY": "B.1.1.529.5.3.1.1.1.1.1.1.13.1.1.1", + "EZ": "B.1.1.529.5.3.1.1.1.1.1.1.43", + "FA": "B.1.1.529.5.3.1.1.1.1.1.1.10", + "FB": "B.1.1.529.5.3.1.1.1.1.1.2.1", + "FC": "B.1.1.529.5.3.1.1.1.1.1.1.72", + "FD": "XBB.1.5.15", + "FE": "XBB.1.18.1", + "FF": "B.1.1.529.5.3.1.1.1.1.1.8.2", + "FG": "XBB.1.5.16", + "FH": "XBB.1.5.17", + "FJ": "B.1.1.529.2.75.3.4.1.1.1.1.19", + "FK": "B.1.1.529.2.75.3.4.1.1.1.1.17", + "FL": "XBB.1.9.1", + "FM": "B.1.1.529.5.3.1.1.1.1.1.1.53", + "FN": "B.1.1.529.5.3.1.1.1.1.1.1.74", + "FP": "XBB.1.11.1", + "FQ": "B.1.1.529.5.3.1.1.1.1.1.1.39", + "FR": "B.1.1.529.2.75.5.1.2.3", + "FS": "B.1.1.529.2.75.3.4.1.1.1.1.12", + "FT": "XBB.1.5.39", + "FU": "XBB.1.16.1", + "FV": "B.1.1.529.2.3.20.8.1.1", + "FW": "XBB.1.28.1", + "FY": "XBB.1.22.1", + "FZ": "XBB.1.5.47", + "GA": "XBB.1.17.1", + "GB": "XBB.1.5.46", + "GC": "XBB.1.5.21", + "GD": "XBB.1.9.3", + "GE": "XBB.2.3.10", + "GF": "XBB.1.5.24", + "GG": "XBB.1.5.38", + "GH": "XBB.2.6.1", + "GJ": "XBB.2.3.3", + "GK": "XBB.1.5.70", + "GL": "XAY.1.1.1", + "GM": "XBB.2.3.6", + "GN": "XBB.1.5.73", + "GP": "B.1.1.529.2.75.3.4.1.1.1.1.11", + "GQ": "B.1.1.529.2.75.3.4.1.1.1.1.3", + "GR": "XBB.1.5.42", + "GS": "XBB.2.3.11", + "GT": "XBC.1.6.1", + "GU": "XBB.1.5.41", + "GV": "XBB.1.5.48", + "GW": "XBB.1.19.1", + "GY": "XBB.1.16.2", + "GZ": "XBB.2.3.4", + "HA": "XBB.1.5.86", + "HB": "XBB.1.34.2", + "HC": "XBB.1.5.44", + "XA": [ + "B.1.1.7", + "B.1.177" + ], + "XB": [ + "B.1.634", + "B.1.631" + ], + "XC": [ + "AY.29", + "B.1.1.7" + ], + "XD": [ + "B.1.617.2*", + "BA.1*" + ], + "XE": [ + "BA.1*", + "BA.2*" + ], + "XF": [ + "B.1.617.2*", + "BA.1*" + ], + "XG": [ + "BA.1*", + "BA.2*" + ], + "XH": [ + "BA.1*", + "BA.2*" + ], + "XJ": [ + "BA.1*", + "BA.2*" + ], + "XK": [ + "BA.1*", + "BA.2*" + ], + "XL": [ + "BA.1*", + "BA.2*" + ], + "XM": [ + "BA.1.1*", + "BA.2*" + ], + "XN": [ + "BA.1*", + "BA.2*" + ], + "XP": [ + "BA.1.1*", + "BA.2*" + ], + "XQ": [ + "BA.1.1*", + "BA.2*" + ], + "XR": [ + "BA.1.1*", + "BA.2*" + ], + "XS": [ + "B.1.617.2*", + "BA.1.1*" + ], + "XT": [ + "BA.2*", + "BA.1*" + ], + "XU": [ + "BA.1*", + "BA.2*" + ], + "XV": [ + "BA.1*", + "BA.2*" + ], + "XW": [ + "BA.1*", + "BA.2*" + ], + "XY": [ + "BA.1*", + "BA.2*" + ], + "XZ": [ + "BA.2*", + "BA.1*" + ], + "XAA": [ + "BA.1*", + "BA.2*" + ], + "XAB": [ + "BA.1*", + "BA.2*" + ], + "XAC": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAD": [ + "BA.2*", + "BA.1*" + ], + "XAE": [ + "BA.2*", + "BA.1*" + ], + "XAF": [ + "BA.1*", + "BA.2*" + ], + "XAG": [ + "BA.1*", + "BA.2*" + ], + "XAH": [ + "BA.2*", + "BA.1*" + ], + "XAJ": [ + "BA.2.12.1*", + "BA.4*" + ], + "XAK": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAL": [ + "BA.1*", + "BA.2*" + ], + "XAM": [ + "BA.1.1", + "BA.2.9" + ], + "XAN": [ + "BA.2*", + "BA.5.1" + ], + "XAP": [ + "BA.2*", + "BA.1*" + ], + "XAQ": [ + "BA.1*", + "BA.2*" + ], + "XAR": [ + "BA.1*", + "BA.2*" + ], + "XAS": [ + "BA.5*", + "BA.2*" + ], + "XAT": [ + "BA.2.3.13", + "BA.1*" + ], + "XAU": [ + "BA.1.1*", + "BA.2.9*" + ], + "XAV": [ + "BA.2*", + "BA.5*" + ], + "XAW": [ + "BA.2*", + "AY.122" + ], + "XAY": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XAZ": [ + "BA.2.5", + "BA.5", + "BA.2.5" + ], + "XBA": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XBB": [ + "BJ.1", + "BM.1.1.1" + ], + "XBC": [ + "BA.2*", + "B.1.617.2*", + "BA.2*", + "B.1.617.2*" + ], + "XBD": [ + "BA.2.75.2", + "BF.5" + ], + "XBE": [ + "BA.5.2", + "BE.4.1" + ], + "XBF": [ + "BA.5.2.3", + "CJ.1" + ], + "XBG": [ + "BA.2.76", + "BA.5.2" + ], + "XBH": [ + "BA.2.3.17", + "BA.2.75.2" + ], + "XBJ": [ + "BA.2.3.20", + "BA.5.2" + ], + "XBK": [ + "BA.5.2", + "CJ.1" + ], + "XBL": [ + "XBB.1.5.57", + "BA.2.75*", + "XBB.1.5.57" + ], + "XBM": [ + "BA.2.76", + "BF.3" + ], + "XBN": [ + "BA.2.75", + "XBB.3" + ], + "XBP": [ + "BA.2.75*", + "BQ.1*" + ], + "XBQ": [ + "BA.5.2", + "CJ.1" + ], + "XBR": [ + "BA.2.75", + "BQ.1" + ], + "XBS": [ + "BA.2.75", + "BQ.1" + ], + "XBT": [ + "BA.5.2.34", + "BA.2.75", + "BA.5.2.34" + ], + "XBU": [ + "BA.2.75.3", + "BQ.1", + "BA.2.75.3" + ], + "XBV": [ + "CR.1", + "XBB.1" + ], + "XBW": [ + "XBB.1.5", + "BQ.1.14" + ], + "XBY": [ + "BR.2.1", + "XBF" + ], + "XBZ": [ + "BA.5.2*", + "EF.1.3" + ], + "XCA": [ + "BA.2.75*", + "BQ.1*" + ], + "XCB": [ + "BF.31.1", + "BQ.1.10*" + ], + "XCC": [ + "CH.1.1.1", + "XBB.1.9.1" + ], + "XCD": [ + "XBB.1*", + "BQ.1.1.25*" + ], + "XCE": [ + "BQ.1*", + "FY.1" + ], + "XCF": [ + "XBB*", + "FE.1" + ], + "XCG": [ + "BA.5.2*", + "XBB.1" + ] +} diff --git a/testBaseData/unitTestDummyDataset/preprocessing_config.yaml b/testBaseData/unitTestDummyDataset/preprocessing_config.yaml new file mode 100644 index 000000000..498dd9d1a --- /dev/null +++ b/testBaseData/unitTestDummyDataset/preprocessing_config.yaml @@ -0,0 +1,5 @@ +inputDirectory: "./testBaseData/unitTestDummyDataset/" +outputDirectory: "./output/" +metadataFilename: "metadata.tsv" +pangoLineageDefinitionFilename: "pangolineage_alias.json" +referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/unitTestDummyDataset/reference_genomes.json b/testBaseData/unitTestDummyDataset/reference_genomes.json new file mode 100644 index 000000000..4235418ca --- /dev/null +++ b/testBaseData/unitTestDummyDataset/reference_genomes.json @@ -0,0 +1,22 @@ +{ + "nucleotideSequences": [ + { + "name": "main", + "sequence": "ACGTACGT" + }, + { + "name": "testSecondSequence", + "sequence": "ACGT" + } + ], + "genes": [ + { + "name": "E", + "sequence": "MYSF*" + }, + { + "name": "M", + "sequence": "MADS*" + } + ] +} diff --git a/testBaseData/unitTestDummyDataset/unaligned_0.fasta b/testBaseData/unitTestDummyDataset/unaligned_0.fasta new file mode 100644 index 000000000..182af609f --- /dev/null +++ b/testBaseData/unitTestDummyDataset/unaligned_0.fasta @@ -0,0 +1,10 @@ +>key1 +ACGTACGT +>key2 +AAGNAAGN +>key3 +ACGTACGT +>key4 +ACGTACGT +>key5 +ACGTACGT diff --git a/testBaseData/unitTestDummyDataset/unaligned_1.fasta b/testBaseData/unitTestDummyDataset/unaligned_1.fasta new file mode 100644 index 000000000..733eb86b0 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/unaligned_1.fasta @@ -0,0 +1,10 @@ +>key1 +ACGT +>key2 +AAGN +>key3 +ACGT +>key4 +ACGT +>key5 +ACGT