From 362b1ed6960a7a9b39ebbf9a1926ed2c52b9b136 Mon Sep 17 00:00:00 2001 From: Fabian Engelniederhammer Date: Wed, 11 Sep 2024 10:06:32 +0200 Subject: [PATCH] test: speed up unit tests by using smaller datasets * database.test.cpp: Use a smaller dataset. Speed up locally: 7s -> 800ms * preprocessor.test.cpp: The RSV dataset took 45s -> shift it to the new e2e tests * preprocessor.test.cpp: Use smaller datasets for some tests: 1.8s -> ~500ms --- src/silo/database.test.cpp | 32 +- src/silo/preprocessing/preprocessor.test.cpp | 27 +- testBaseData/noGenes/database_config.yaml | 18 - testBaseData/noGenes/input.ndjson | 9 + testBaseData/noGenes/input.ndjson.zst | Bin 4137 -> 0 bytes .../noGenes/preprocessing_config.yaml | 2 +- testBaseData/noSequences/database_config.yaml | 32 -- testBaseData/noSequences/input.ndjson | 6 + testBaseData/noSequences/input.ndjson.zst | Bin 3657 -> 0 bytes .../noSequences/preprocessing_config.yaml | 2 +- .../unitTestDummyDataset/aa_insertions.tsv | 6 + .../unitTestDummyDataset/database_config.yaml | 30 ++ .../unitTestDummyDataset/gene_0.fasta | 10 + .../unitTestDummyDataset/gene_1.fasta | 10 + .../unitTestDummyDataset/metadata.tsv | 6 + .../unitTestDummyDataset/metadata.tsv_ | 6 + testBaseData/unitTestDummyDataset/nuc_0.fasta | 10 + testBaseData/unitTestDummyDataset/nuc_1.fasta | 10 + .../unitTestDummyDataset/nuc_insertions.tsv | 6 + .../pangolineage_alias.json | 501 ++++++++++++++++++ .../preprocessing_config.yaml | 5 + .../reference_genomes.json | 22 + .../unitTestDummyDataset/unaligned_0.fasta | 10 + .../unitTestDummyDataset/unaligned_1.fasta | 10 + 24 files changed, 679 insertions(+), 91 deletions(-) create mode 100644 testBaseData/noGenes/input.ndjson delete mode 100644 testBaseData/noGenes/input.ndjson.zst create mode 100644 testBaseData/noSequences/input.ndjson delete mode 100644 testBaseData/noSequences/input.ndjson.zst create mode 100644 testBaseData/unitTestDummyDataset/aa_insertions.tsv create mode 100644 testBaseData/unitTestDummyDataset/database_config.yaml create mode 100644 testBaseData/unitTestDummyDataset/gene_0.fasta create mode 100644 testBaseData/unitTestDummyDataset/gene_1.fasta create mode 100644 testBaseData/unitTestDummyDataset/metadata.tsv create mode 100644 testBaseData/unitTestDummyDataset/metadata.tsv_ create mode 100644 testBaseData/unitTestDummyDataset/nuc_0.fasta create mode 100644 testBaseData/unitTestDummyDataset/nuc_1.fasta create mode 100644 testBaseData/unitTestDummyDataset/nuc_insertions.tsv create mode 100644 testBaseData/unitTestDummyDataset/pangolineage_alias.json create mode 100644 testBaseData/unitTestDummyDataset/preprocessing_config.yaml create mode 100644 testBaseData/unitTestDummyDataset/reference_genomes.json create mode 100644 testBaseData/unitTestDummyDataset/unaligned_0.fasta create mode 100644 testBaseData/unitTestDummyDataset/unaligned_1.fasta diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 3ca982993..1f194dbbd 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -16,10 +16,10 @@ namespace { silo::Database buildTestDatabase() { - const std::string input_directory{"./testBaseData/exampleDataset/"}; + const std::string input_directory{"./testBaseData/unitTestDummyDataset/"}; silo::config::PreprocessingConfig config; - config.overwrite(silo::config::YamlFile("./testBaseData/test_preprocessing_config.yaml")); + config.overwrite(silo::config::YamlFile(input_directory + "preprocessing_config.yaml")); const auto database_config = silo::config::ConfigRepository().getValidatedConfig(input_directory + "database_config.yaml"); @@ -43,8 +43,8 @@ TEST(DatabaseTest, shouldBuildDatabaseWithoutErrors) { const auto simple_database_info = database.getDatabaseInfo(); EXPECT_GT(simple_database_info.total_size, 0); - EXPECT_EQ(simple_database_info.sequence_count, 100); - EXPECT_EQ(simple_database_info.number_of_partitions, 11); + EXPECT_EQ(simple_database_info.sequence_count, 5); + EXPECT_EQ(simple_database_info.number_of_partitions, 2); } TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) { @@ -83,10 +83,10 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 2775910 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 148 ); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 2661831 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 128 ); EXPECT_EQ( @@ -97,7 +97,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 2875 + 0 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -106,20 +106,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 42629964 - ); - EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 21433248 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 2108 ); + EXPECT_EQ(detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 1066); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 133240 + 12 ); - EXPECT_EQ(simple_info.total_size, 26589508); - EXPECT_EQ(simple_info.sequence_count, 100); - EXPECT_EQ(simple_info.n_bitmaps_size, 3931); + EXPECT_EQ(simple_info.total_size, 1956); + EXPECT_EQ(simple_info.sequence_count, 5); + EXPECT_EQ(simple_info.n_bitmaps_size, 62); } TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { @@ -141,7 +139,7 @@ TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { const auto simple_database_info = database.getDatabaseInfo(); EXPECT_GT(simple_database_info.total_size, 0); - EXPECT_EQ(simple_database_info.sequence_count, 100); + EXPECT_EQ(simple_database_info.sequence_count, 5); EXPECT_GT(simple_database_info.n_bitmaps_size, 0); - EXPECT_EQ(simple_database_info.number_of_partitions, 11); + EXPECT_EQ(simple_database_info.number_of_partitions, 2); } diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 0f98cc87e..9ce80e315 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -194,7 +194,7 @@ const Scenario EMPTY_INPUT_NDJSON_UNPARTITIONED = { const Scenario NO_GENES = { .input_directory = "testBaseData/noGenes/", - .expected_sequence_count = 30, + .expected_sequence_count = 9, .query = R"( { "action": { @@ -206,7 +206,7 @@ const Scenario NO_GENES = { } )", .expected_query_result = nlohmann::json::parse(R"( -[{"count":30}])") +[{"count":9}])") }; const Scenario NO_NUCLEOTIDE_SEQUENCES = { @@ -227,7 +227,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = { const Scenario NO_SEQUENCES = { .input_directory = "testBaseData/noSequences/", - .expected_sequence_count = 30, + .expected_sequence_count = 6, .query = R"( { "action": { @@ -238,7 +238,7 @@ const Scenario NO_SEQUENCES = { } } )", - .expected_query_result = nlohmann::json::parse(R"([{"count":30}])") + .expected_query_result = nlohmann::json::parse(R"([{"count":6}])") }; const Scenario DIVERSE_SEQUENCE_NAMES = { @@ -275,22 +275,6 @@ const Scenario DIVERSE_SEQUENCE_NAMES_NDJSON = { [{"count":2}])") }; -const Scenario MEDIUM_SIZED_RSV_DATASET = { - .input_directory = "testBaseData/mediumSizedRsvDataset/", - .expected_sequence_count = 19662, - .query = R"( - { - "action": { - "type": "Aggregated" - }, - "filterExpression": { - "type": "True" - } - } - )", - .expected_query_result = nlohmann::json::parse(R"([{"count":19662}])") -}; - class PreprocessorTestFixture : public ::testing::TestWithParam {}; INSTANTIATE_TEST_SUITE_P( @@ -310,8 +294,7 @@ INSTANTIATE_TEST_SUITE_P( EMPTY_INPUT_NDJSON_UNPARTITIONED, NO_GENES, NO_NUCLEOTIDE_SEQUENCES, - NO_SEQUENCES, - MEDIUM_SIZED_RSV_DATASET + NO_SEQUENCES ), printTestName ); diff --git a/testBaseData/noGenes/database_config.yaml b/testBaseData/noGenes/database_config.yaml index cb3f8cb07..4c2d40ebf 100644 --- a/testBaseData/noGenes/database_config.yaml +++ b/testBaseData/noGenes/database_config.yaml @@ -2,24 +2,6 @@ schema: instanceName: RSV A opennessLevel: OPEN metadata: - - name: accession - type: string - - name: version - type: int - - name: submissionId - type: string - name: accessionVersion type: string - - name: isRevocation - type: boolean - - name: submitter - type: string - generateIndex: true - - name: groupId - type: int - - name: dataUseTermsUrl - type: string - - name: sample_collection_date - type: date primaryKey: accessionVersion - dateToSortBy: sample_collection_date diff --git a/testBaseData/noGenes/input.ndjson b/testBaseData/noGenes/input.ndjson new file mode 100644 index 000000000..a1dcd1b95 --- /dev/null +++ b/testBaseData/noGenes/input.ndjson @@ -0,0 +1,9 @@ +{"metadata": {"accessionVersion": "LOC_000CJJP.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G", "7504:A"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJKM.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CCTTTGGTTAGAGACCGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTG"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJLK.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJMH.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TCNCGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTGAGACGAC"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJNF.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJPD.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTAATTTAGTTAATATACATATAA"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJQB.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJR9.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": {"accessionVersion": "LOC_000CJS7.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGAAAAAATGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} diff --git a/testBaseData/noGenes/input.ndjson.zst b/testBaseData/noGenes/input.ndjson.zst deleted file mode 100644 index 4e90de5c69b0475aedc0b08dfe799571e00536ff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4137 zcmV+^5Z3P~wJ-go^T+`J&42(ZtV|y-z-ox-0U&77FzZc#jmnmI8JJcOf`hs_$6;AW zPxOz+1_7hgcbFQF>ne&?5F8KO0p|hf0TRuA@)V^V7SVBrVEB2~3~;}g!V<}isEhaa zTV)1xRv=#vPNryFx^gsqf9j`7cdLrv(Ufn*^8|CIXc>Rk+sa>My>TUfV~gW2NjJ*Z zj_&4-Elj1X$ih4<7&G5S`l?6TIkdM8{GjX)4k`=@Lxg(N}j%CaoonEtQFr zq3$)XM(1Wta7@uS|NQ+atJ0EmO?q2V#H1sQEe!z!vYxaQS-Nv8qDN0z)#VY6NV#)O zi6WPZINm?ypHOaP9#OaJKH=#1)u~IO^v;-yDC(ug7K*wuQ?#GlsB=weY-zXQu5;G` z2@8!W8sD1IuiM!2-i(ha8l_M2Q>_7}Xt&?#_ODt40gs|e_$cCo3`7A9L;+0$pN#R* zMmEmHNMoF`4!}STz@SPw0eJ9W!jjb)R~CWBFpXrIsG#vN4aKYwHX4dY-D?IU$k5~1 zb}J6~33vUJbTy{W6HMKFHMTrx`7N#BSind;9C&~NP*6~Ct9QzV=ow=AF5%tzKNuVj zZ9nuJ;`ZBkNC0^-EQlZo17sZ<#L>o<3SUFreA7;?@vKYlNn=af&#$QSpJd$obLvVY zUHJ#c3#J5(Ew8bqF(9D9an41~gG0OhP7K_=F9ThodG!3^`FYlo-gv*&;H&-pr$(Pm zEPy-`4kqeLJRTGzep_wpi@KGHfBp&1SJd-Yzb}5j!ngWL=buC`<=-#)Ey+0Im&TRf zR(}55iuzCX?U95#*LiaB^c0Cl`zySyXiJs@obP1oF5O5p{gtsK-3speExoPiFV1m9 z>D{xIqKJ;Edo8fPcU(m-UR}HMj3X}Rw@aUZ-j>D@e(7t1Bh{V0f?xU_eFb;vM!&7b zmi7`T>XJk!C7ljOi$wXcbve_Omaqi({8-}g?dKWmZAJMjEWO3Gq`Tf$B+=1xZSBjN z;H@hy^-E9pcSK_@;oUEu-oYfA_54-e*ZKo&`F?fWc}~Dy%8w-oqprpaZ262OJUxR2 z{2%NqjU#$Xn8Bue8JFrXH$^Vnziv$Q;E=s@qg5XTKkSkxxbSrj2 zlu**Tpmo7O2NmX=uqGTD7D9mlhC%|q1OpvxOjBLmm%5~K=o>{dGKepxo%i2-78a4;SmE#U^wkkHAdk+MQZ>!kBR2e|Ktgr-Qh zx(-|T#rsAQB<`MkzLH*Wu;uCqbr%v2e#$rRZ*WL(*h2Z~QZNOQ(3E!I;A%LqwD=o(tzV_bF8y+|u%ppy>F!hu;x=$?C45he6k+klJOfQtxg zWtd2tN#krDgp$_A(dh(u3o5`{KuGC>a#E(LQ!=zNlA+cr8Y*XFi({>VPKaAfh+8~3 zXCs+NCzX_u)WKOB=o$#6ldrMG1`{$N4$9{ePevJIjda#0>!Ya;#!)Lwz@~YOd{w%p zN~XFAUrlh8!Wb6>SP&q_7$amV^IY>_<5Cp@9#MQTk@S?ii8QdLb4h0DgY?)-d zuj>Fwu+>G!Dx#CqI@W>g1K9_19u4%-z)VUXl?;s$I>sts1My6VCo*-;TBobce2+5# zf@S~&wP_mn8v3lY3>YK>1{$S=aJoq=gzi9e2P6Yyj0|0;)F~lknx@Xz#AZ?mQpoq@dmzZM);iWY7J8gV z;Tqf6*F5=P0^*?|K>(y+ArJ~yVFBdPz_^5)?!9Lli-)5zPgY2q$-LNLSV*Ht`+kE> zsN%w?V+I+akr5e5lB5HI3lN0DFb;GOqQV{$K+rgjqc{#?5QIS(1VIo21VIQ81|dXX z6fuSnq7YLK^8x?jIKokFOL+OxjV~^P{i%6$h`Ofp@W;M{<3j7diE@$BELW1i5!n6|(RWYn*+woXep9ftC&?E-5a3 zU0SgWsPLg*r~HR9oytb>9&T|UjR)kKqjt>Dj?9S{f)*Oty`QWNqiGMgEP8TD8biv`A62E|?gN73 zJS&`LDtOil<|X$S+j7Y*^PkK)*fCJ=-N41s_F@1*d@a}V-d4zviBHf&+Sg4%0-Oud z^n+-5>qjLTbvipohueDk3{XI5*AbAJ%?;#c5iHfYozJ_e|OT}BjH`9 zJ+Yi^KzU(q8gmx0=mNQcO^K+ZUICEt895OWe__9>ARg4)_Pv;t#<5dZs zm&KeVU=|wd(b^|T&d7c8@@1g{HJmrrmmR6Jt$umLj2!ne+K%KamHx{D9<*6o`O6$4 zm9Zm%CVm{gs>B3U2WHpo3yUh-y~?Ko<=t;nzuB%}LPaD)RNlBjpf+VO?#yheqfW

18h?&l{;R#~wW&O+*KLZ3x3 z&Y+`MVm}hzCVKZeKa2~H!6tSu`8}4U^U>bXJK|B-?4bx6WZHD=w;cY(g^Ug52Dh># znSaIK^c5e;I=DH`;vV^kdNByA;Z!Ngti5u5lU0_ek82g7)mq&`|(BGwi!LF6@M0+>HohGbk6 z(2T}Gk!NuH&1LMq?_whNjwtsHHI-X!nJV@W?j>)5#I|YQIB)O#oTDI-I_&ij@&r+~xRJDy1m1I3fp&(H&u4;7rFM#XG zUh2A+Y*E`bQ7Yd<|GI+N1%+Z8bcf^)kqZEdu~{)-K_Pb)WA|!rf%40Yq6#G-M{A+U ziz35QqeO7WHjL9yjukpE?u!J{h3YF;7Nd-nQ&_s|t~B{=D_Q5L(q zQDoPVhlr~TL<;RY-}Qz4Z`T*dIZB<@DH_ytL?9xrT>al)EwUwh3s|^T+NigBG0J0# za?;{bV72dOx%mYbE`=Q*XIw9-2fklR5tO2M-Us+FIC5m;tDwMBmYX`uvbq-}L{@8X zG^pRtw`toZD5|Cq;DiMts_`;bYK?D9(Q>veH2(~m6s@rCsnQ=4(aWg)*c3QPJds^v z9y_FDe)O_SdFh)xN701^9DEhhfA6?s8=2lST=?`r@Mtg;+d8JArq6$nb&k$E)%8mlxQu0KKi)B n%v_)ncKv+0b_s-D@9|5MD>_Tq^EAWoA!|&uu diff --git a/testBaseData/noGenes/preprocessing_config.yaml b/testBaseData/noGenes/preprocessing_config.yaml index 107a1d2db..a6772f530 100644 --- a/testBaseData/noGenes/preprocessing_config.yaml +++ b/testBaseData/noGenes/preprocessing_config.yaml @@ -1,2 +1,2 @@ -ndjsonInputFilename: "input.ndjson.zst" +ndjsonInputFilename: "input.ndjson" referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/noSequences/database_config.yaml b/testBaseData/noSequences/database_config.yaml index 0bade2884..4c2d40ebf 100644 --- a/testBaseData/noSequences/database_config.yaml +++ b/testBaseData/noSequences/database_config.yaml @@ -2,38 +2,6 @@ schema: instanceName: RSV A opennessLevel: OPEN metadata: - - name: accession - type: string - - name: version - type: int - - name: submissionId - type: string - name: accessionVersion type: string - - name: isRevocation - type: boolean - - name: submitter - type: string - generateIndex: true - - name: groupId - type: int - - name: groupName - type: string - generateIndex: true - - name: submittedAt - type: int - - name: releasedAt - type: int - - name: dataUseTerms - type: string - generateIndex: true - - name: dataUseTermsRestrictedUntil - type: date - - name: versionStatus - type: string - - name: dataUseTermsUrl - type: string - - name: sample_collection_date - type: date primaryKey: accessionVersion - dateToSortBy: sample_collection_date diff --git a/testBaseData/noSequences/input.ndjson b/testBaseData/noSequences/input.ndjson new file mode 100644 index 000000000..cfc7f39c0 --- /dev/null +++ b/testBaseData/noSequences/input.ndjson @@ -0,0 +1,6 @@ +{"metadata": { "accessionVersion": "LOC_000CHXY.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CHYW.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CHZU.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ0S.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ1Q.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} +{"metadata": { "accessionVersion": "LOC_000CJ2N.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}} diff --git a/testBaseData/noSequences/input.ndjson.zst b/testBaseData/noSequences/input.ndjson.zst deleted file mode 100644 index 51fbac85cd452e28a61bd8504b067a06cd704abc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3657 zcmV-P4z}?qwJ-goalHWm&2a#lp0pe<&}gdW0l*a`mWWPkA8mT#Rx*iJuz{;dT*YxY zw$-B>&8us2c^{S>0k{Fh0lHQfihjTJ(S$cOS!gJ_4(WTbdNtU!zj3DS zPZwvKxzAS9^w;Xptuo|Uv(-3s&h{4g*8b9Ghy#LR;*hYdAl!d$yGRysl5x16vLDlV3Lnc6BGd?Bs4U&Hzrr?TM_-PHgElt2FV1=Pw~3Kw@XYwP$mrrAPPzYg#*GU zZL-zS=bqW5la7ck8WXFW2tXTIlz`{#32Y12(CbRy*B4iuZGLLOPx<#pY&r+y0A<2xk3k({G7-V}OtFkB$upDw z`xkpFXy2bcSK5EFKK+F6U!bq|^IJY!u7z}kZRRs|Uw@{c{g><;s&;F9FK(m9R1M{y zY^I=GsSA7klJTxyfugUjt<^Jj=cjO{ps%#Hkc9KTxIz}&N?sw5e{VZgDm_(s>uW7s zYv1(`z?s6fqAOfoY-RH3GxqCe=reX#hdxtmHOwJMQe}ZH#wjHdR)OwY%Jsq%uGV7r zeOonj`SrCpQ_%g%)~7VCc;ie#77V?{RIaFt&3M9fS2*&|6>N32dAp5XvMhM*`_s-& zJ;Q2#SKDrVA=cOXwo;bl*ea~%udO!v$g<$|mzN~Guq?K{70bT9IDy@6C2O%bB)QGM zeYy$YtKe4c$<<&12=3+Rtz^PjLBbJj?{x}`N+O!jN<@|FHZWH^x?er5~P%mG9VI;iUa{735@3$&TEf34{bENRMnSo zWLYe)cCr-${?UqjOI7s(tGW0g3unHPSRXB)@Ws1wtjMDViUx=zGO>WGUGS0kT6~@u z>1&h9Dc!rUJ5Mk$ruyUytI=+=S0EUkFMqva1<7h0S0q=6`@7EUX6ekz*`| z!oU)~xTqu`t8q!PTQ@MYXKVz)u^Qj`s|Ut)$$=2D8Z1d)@dbuoWOrCVc&x^w&wjBF zj0;;$3LrROi;n4-&WSF%_FQyebkLb|Itdr#f@}_it>I~SsN)>h9HK*<&pp@P6CF8U ze2_xtntTqv2Yf*3oO2DfXHw^!Mq6wSPCA{}BE)$>oJV}kH7D^NRN_70qttQWeB^}k zaqq=RpEhMP;2zU}d%QQNdr?m3p)xv?I?`}11fW4=s{>yQg_1ftdFaHEQ2H3}d7Kj+jqRb)p-nzV5gMHjLInd*!2pENq0Vg4z4#)7 zghBEqqz-hTY}7qkK%#)eeDA?1A0qA{BA@7>gYSt=!q?u5fJ8vxYp}sMV}vdeK_d|m zK1!nl=R7c9lsqJlNP8v@bYNg!W4OkEO-d@;gKjb`T|q3yddYA8eIZtJ{bDiIMwVq_ zS#0_^*BUf{GfLJog=?2fUM?n zZLS;Jk%rI*ya?IgbMs9#o(*zUMp-mF|I$oR8yixW`5J z7!?x_3;`$-9F&R#N5TQh7 z$cnBnU9Inrtc)!y|GvMl6ubh-k+Edq3QlYEO(qoPpj5Eu3!Z|tg6pWl(kfKO7@?67 z8A+0)1JVmL4C5q}jF9ybK(IKFqc{v=5QISx1VIo21VIQ8h5$hb42Bp)2vLeC$NJ5P zIY0%W-yas_1i-xcOy)!{VU23V3QY6iv2&$ORnC_q0ZFGFcf4$hk6=YCZ$el|DPOq2 zA(E}B5s6aD(L)pFK`eBq48uR}5x{aBuyc97GkrQk>&#kbaTmB4HsY?XW z6CdW(V|=a#kDr7%VapictQbf>p2JW9r5}ZKN=^~$<=eQ5k)h#BeXP=FsAyM|rUc+8 z7ZAllKd%J9P@(QjP{F1)M=gdMlZ}t)^NKHK10=}7R;eLO^b0U(fYxEoFV~1+@mvyM zPX#a{<&s&DLHO2`nRYP6&CZ#_ zDq$E?khS%57LOhfmXD{WW!h>;%k;4ELR3JAaLQE8JuUR>8sD0ZEAJm(2 z$i*FKo;f^)YCqtc(c`NLu{-d`Q*p7}lFtB*;=6Mps=h*ft{@p85Q7|W>;pMX==WM$ zG?_Fx0T>AweR= z;jcUKnZxnm3Y}^Na8BF>$d5GYlsppiQg&28gq*+e$R}I|Ub{`bpj&8xF@}a`;tM~8 z1-;rrN`3%tQV%$k#q45SAAD@ds(8x)fIPfiU5!FRSFDWaM}u|x-x;GYG4rtKP7jrODm8gcRJLgIpe*X z?DKp~gTQ8ERaYL5U{25H88Cd2{tFK|2IVM^!t`kPNML+&o}S~RGLrPSVaZ;jUZ8pB(Qp;=^L7^OpQUFR+}NoTYC_~Xp3&4) zuXVDz>qWNB9ByiYvaC3PIY|+`Ag=JTxVVDK=_`GT^(^9Ln2Bozp#Bcy6Bji~sp}#4 zaTS8Z<Yw8+Q}Mdo6aG&-Eni3P!Ed&AjSuY_io^4o&f@VSsXzQQ2I}J9KIoBIg8()42+Z zEd%CvT+#{0d`+j0l~N8+@mhv36Eiw1ikthy#%sI0h@C|; zdw%Cf-kDs^bfHsOq_C0U9zs>z1r}L$VwVzX1weK@r*``2DN#nz0UNJj#Yblwxihi< zKW`8;;;IVjegdv9{xnR>8jWVBj#*d1H#M&!5?-WvQ{plja7D$^3x%m4--n^R zvj98UGB4hj#<%@y@efKr^IM{DB#!xuGKZjSw(GY$e`RE z{R3aK0}919=nlzNA_tzu#tXwR0L6S2Xq?BXkzXU;~xm1m=>dGQ-Hfl*7iA!+@rO1XH z%)dSZ{~Ph<(j4BTC6R`29SPJC|EvGySDU@0kq^t5${Ok1Lri~6VN6S6uY28oh=&HETT3;;*=W)+kgWij}(@~g9az-DuRRhN2~U|ZYPVWetW zz+rh45$Q#_R6YEWMayYup@uV)0>~30;`4+v8}Z$YWyrNm$e%AXA^DpVx@5WDoDi1fmtjs zzgN+W`H7t&NN5ZpIc#|0e^Oz{RS7UvTI!W3N(sp#r1+i#=%eR)m=y_BWv+ic*A5CB b5BF>GqG#Mod7YLsYEjO*$(EW{O41qcHTL!@ diff --git a/testBaseData/noSequences/preprocessing_config.yaml b/testBaseData/noSequences/preprocessing_config.yaml index 107a1d2db..a6772f530 100644 --- a/testBaseData/noSequences/preprocessing_config.yaml +++ b/testBaseData/noSequences/preprocessing_config.yaml @@ -1,2 +1,2 @@ -ndjsonInputFilename: "input.ndjson.zst" +ndjsonInputFilename: "input.ndjson" referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/unitTestDummyDataset/aa_insertions.tsv b/testBaseData/unitTestDummyDataset/aa_insertions.tsv new file mode 100644 index 000000000..c86a8d70a --- /dev/null +++ b/testBaseData/unitTestDummyDataset/aa_insertions.tsv @@ -0,0 +1,6 @@ +primaryKey E M +key1 [214:EPE] [] +key2 [] [] +key3 [] [] +key4 [] [] +key5 [214:EPE] [] diff --git a/testBaseData/unitTestDummyDataset/database_config.yaml b/testBaseData/unitTestDummyDataset/database_config.yaml new file mode 100644 index 000000000..a0185668e --- /dev/null +++ b/testBaseData/unitTestDummyDataset/database_config.yaml @@ -0,0 +1,30 @@ +schema: + instanceName: sars_cov-2_minimal_test_config + metadata: + - name: primaryKey + type: string + - name: date + type: date + - name: unsorted_date + type: date + - name: region + type: string + generateIndex: true + - name: country + type: string + generateIndex: true + - name: pango_lineage + type: pango_lineage + - name: division + type: string + generateIndex: true + - name: age + type: int + - name: qc_value + type: float + - name: test_boolean_column + type: boolean + primaryKey: primaryKey + dateToSortBy: date + partitionBy: pango_lineage +defaultNucleotideSequence: "main" diff --git a/testBaseData/unitTestDummyDataset/gene_0.fasta b/testBaseData/unitTestDummyDataset/gene_0.fasta new file mode 100644 index 000000000..a06dce2c5 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/gene_0.fasta @@ -0,0 +1,10 @@ +>key1 +MYSF* +>key2 +MYSF* +>key3 +MYSF* +>key4 +MYSF* +>key5 +MYSF* diff --git a/testBaseData/unitTestDummyDataset/gene_1.fasta b/testBaseData/unitTestDummyDataset/gene_1.fasta new file mode 100644 index 000000000..85ba153ab --- /dev/null +++ b/testBaseData/unitTestDummyDataset/gene_1.fasta @@ -0,0 +1,10 @@ +>key1 +MADS* +>key2 +MADS* +>key3 +MADS* +>key4 +MADS* +>key5 +XXXX* diff --git a/testBaseData/unitTestDummyDataset/metadata.tsv b/testBaseData/unitTestDummyDataset/metadata.tsv new file mode 100644 index 000000000..a21d3bbff --- /dev/null +++ b/testBaseData/unitTestDummyDataset/metadata.tsv @@ -0,0 +1,6 @@ +primaryKey pango_lineage date region country division unsorted_date age qc_value test_boolean_column +key1 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 true +key2 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 false +key3 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 +key4 XBB 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 true +key5 XBB 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 true diff --git a/testBaseData/unitTestDummyDataset/metadata.tsv_ b/testBaseData/unitTestDummyDataset/metadata.tsv_ new file mode 100644 index 000000000..709ee204e --- /dev/null +++ b/testBaseData/unitTestDummyDataset/metadata.tsv_ @@ -0,0 +1,6 @@ +primaryKey pango_lineage date region country division unsorted_date age qc_value test_boolean_column +key1 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 true +key2 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 false +key3 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 +key4 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 true +key5 B.1.1.7 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 true diff --git a/testBaseData/unitTestDummyDataset/nuc_0.fasta b/testBaseData/unitTestDummyDataset/nuc_0.fasta new file mode 100644 index 000000000..182af609f --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_0.fasta @@ -0,0 +1,10 @@ +>key1 +ACGTACGT +>key2 +AAGNAAGN +>key3 +ACGTACGT +>key4 +ACGTACGT +>key5 +ACGTACGT diff --git a/testBaseData/unitTestDummyDataset/nuc_1.fasta b/testBaseData/unitTestDummyDataset/nuc_1.fasta new file mode 100644 index 000000000..733eb86b0 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_1.fasta @@ -0,0 +1,10 @@ +>key1 +ACGT +>key2 +AAGN +>key3 +ACGT +>key4 +ACGT +>key5 +ACGT diff --git a/testBaseData/unitTestDummyDataset/nuc_insertions.tsv b/testBaseData/unitTestDummyDataset/nuc_insertions.tsv new file mode 100644 index 000000000..66df8022c --- /dev/null +++ b/testBaseData/unitTestDummyDataset/nuc_insertions.tsv @@ -0,0 +1,6 @@ +primaryKey main testSecondSequence +key1 [] [] +key2 [] [] +key3 [] [] +key4 [] [] +key5 [] [] diff --git a/testBaseData/unitTestDummyDataset/pangolineage_alias.json b/testBaseData/unitTestDummyDataset/pangolineage_alias.json new file mode 100644 index 000000000..28210c7b9 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/pangolineage_alias.json @@ -0,0 +1,501 @@ +{ + "A": "", + "B": "", + "C": "B.1.1.1", + "D": "B.1.1.25", + "G": "B.1.258.2", + "K": "B.1.1.277", + "L": "B.1.1.10", + "M": "B.1.1.294", + "N": "B.1.1.33", + "P": "B.1.1.28", + "Q": "B.1.1.7", + "R": "B.1.1.316", + "S": "B.1.1.217", + "U": "B.1.177.60", + "V": "B.1.177.54", + "W": "B.1.177.53", + "Y": "B.1.177.52", + "Z": "B.1.177.50", + "AA": "B.1.177.15", + "AB": "B.1.160.16", + "AC": "B.1.1.405", + "AD": "B.1.1.315", + "AE": "B.1.1.306", + "AF": "B.1.1.305", + "AG": "B.1.1.297", + "AH": "B.1.1.241", + "AJ": "B.1.1.240", + "AK": "B.1.1.232", + "AL": "B.1.1.231", + "AM": "B.1.1.216", + "AN": "B.1.1.200", + "AP": "B.1.1.70", + "AQ": "B.1.1.39", + "AS": "B.1.1.317", + "AT": "B.1.1.370", + "AU": "B.1.466.2", + "AV": "B.1.1.482", + "AW": "B.1.1.464", + "AY": "B.1.617.2", + "AZ": "B.1.1.318", + "BA": "B.1.1.529", + "BB": "B.1.621.1", + "BC": "B.1.1.529.1.1.1", + "BD": "B.1.1.529.1.17.2", + "BE": "B.1.1.529.5.3.1", + "BF": "B.1.1.529.5.2.1", + "BG": "B.1.1.529.2.12.1", + "BH": "B.1.1.529.2.38.3", + "BJ": "B.1.1.529.2.10.1", + "BK": "B.1.1.529.5.1.10", + "BL": "B.1.1.529.2.75.1", + "BM": "B.1.1.529.2.75.3", + "BN": "B.1.1.529.2.75.5", + "BP": "B.1.1.529.2.3.16", + "BQ": "B.1.1.529.5.3.1.1.1.1", + "BR": "B.1.1.529.2.75.4", + "BS": "B.1.1.529.2.3.2", + "BT": "B.1.1.529.5.1.21", + "BU": "B.1.1.529.5.2.16", + "BV": "B.1.1.529.5.2.20", + "BW": "B.1.1.529.5.6.2", + "BY": "B.1.1.529.2.75.6", + "BZ": "B.1.1.529.5.2.3", + "CA": "B.1.1.529.2.75.2", + "CB": "B.1.1.529.2.75.9", + "CC": "B.1.1.529.5.3.1.1.1.2", + "CD": "B.1.1.529.5.2.31", + "CE": "B.1.1.529.5.2.33", + "CF": "B.1.1.529.5.2.27", + "CG": "B.1.1.529.5.2.26", + "CH": "B.1.1.529.2.75.3.4.1.1", + "CJ": "B.1.1.529.2.75.3.1.1.1", + "CK": "B.1.1.529.5.2.24", + "CL": "B.1.1.529.5.1.29", + "CM": "B.1.1.529.2.3.20", + "CN": "B.1.1.529.5.2.21", + "CP": "B.1.1.529.5.2.6", + "CQ": "B.1.1.529.5.3.1.4.1.1", + "CR": "B.1.1.529.5.2.18", + "CS": "B.1.1.529.4.1.10", + "CT": "B.1.1.529.5.2.36", + "CU": "B.1.1.529.5.1.26", + "CV": "B.1.1.529.2.75.3.1.1.3", + "CW": "B.1.1.529.5.3.1.1.1.1.1.1.14", + "CY": "B.1.1.529.5.2.7", + "CZ": "B.1.1.529.5.3.1.1.1.1.1.1.1", + "DA": "B.1.1.529.5.2.38", + "DB": "B.1.1.529.5.2.25", + "DC": "B.1.1.529.4.6.5", + "DD": "B.1.1.529.2.3.21", + "DE": "B.1.1.529.5.1.23", + "DF": "B.1.1.529.5.10.1", + "DG": "B.1.1.529.5.2.24.2.1.1", + "DH": "B.1.1.529.5.1.22", + "DJ": "B.1.1.529.5.1.25", + "DK": "B.1.1.529.5.3.1.1.1.1.1.1.7", + "DL": "B.1.1.529.5.1.15", + "DM": "B.1.1.529.5.3.1.1.1.1.1.1.15", + "DN": "B.1.1.529.5.3.1.1.1.1.1.1.5", + "DP": "B.1.1.529.5.3.1.1.1.1.1.1.8", + "DQ": "B.1.1.529.5.2.47", + "DR": "B.1.1.529.5.3.1.1.1.1.1.1.3", + "DS": "B.1.1.529.2.75.5.1.3.1", + "DT": "B.1.1.529.5.3.1.1.1.1.1.1.32", + "DU": "B.1.1.529.5.3.1.1.1.1.1.1.2", + "DV": "B.1.1.529.2.75.3.4.1.1.1.1.1", + "DW": "B.1.1.529.5.3.1.1.2.1", + "DY": "B.1.1.529.5.2.48", + "DZ": "B.1.1.529.5.2.49", + "EA": "B.1.1.529.5.3.1.1.1.1.1.1.52", + "EB": "B.1.1.529.5.1.35", + "EC": "B.1.1.529.5.3.1.1.1.1.1.10.1", + "ED": "B.1.1.529.5.3.1.1.1.1.1.1.18", + "EE": "B.1.1.529.5.3.1.1.1.1.1.1.4", + "EF": "B.1.1.529.5.3.1.1.1.1.1.1.13", + "EG": "XBB.1.9.2", + "EH": "B.1.1.529.5.3.1.1.1.1.1.1.28", + "EJ": "B.1.1.529.2.75.5.1.3.8", + "EK": "XBB.1.5.13", + "EL": "XBB.1.5.14", + "EM": "XBB.1.5.7", + "EN": "B.1.1.529.5.3.1.1.1.1.1.1.46", + "EP": "B.1.1.529.2.75.3.1.1.4", + "EQ": "B.1.1.529.5.1.33", + "ER": "B.1.1.529.5.3.1.1.1.1.1.1.22", + "ES": "B.1.1.529.5.3.1.1.1.1.1.1.65", + "ET": "B.1.1.529.5.3.1.1.1.1.1.1.35", + "EU": "XBB.1.5.26", + "EV": "B.1.1.529.5.3.1.1.1.1.1.1.71", + "EW": "B.1.1.529.5.3.1.1.1.1.1.1.38", + "EY": "B.1.1.529.5.3.1.1.1.1.1.1.13.1.1.1", + "EZ": "B.1.1.529.5.3.1.1.1.1.1.1.43", + "FA": "B.1.1.529.5.3.1.1.1.1.1.1.10", + "FB": "B.1.1.529.5.3.1.1.1.1.1.2.1", + "FC": "B.1.1.529.5.3.1.1.1.1.1.1.72", + "FD": "XBB.1.5.15", + "FE": "XBB.1.18.1", + "FF": "B.1.1.529.5.3.1.1.1.1.1.8.2", + "FG": "XBB.1.5.16", + "FH": "XBB.1.5.17", + "FJ": "B.1.1.529.2.75.3.4.1.1.1.1.19", + "FK": "B.1.1.529.2.75.3.4.1.1.1.1.17", + "FL": "XBB.1.9.1", + "FM": "B.1.1.529.5.3.1.1.1.1.1.1.53", + "FN": "B.1.1.529.5.3.1.1.1.1.1.1.74", + "FP": "XBB.1.11.1", + "FQ": "B.1.1.529.5.3.1.1.1.1.1.1.39", + "FR": "B.1.1.529.2.75.5.1.2.3", + "FS": "B.1.1.529.2.75.3.4.1.1.1.1.12", + "FT": "XBB.1.5.39", + "FU": "XBB.1.16.1", + "FV": "B.1.1.529.2.3.20.8.1.1", + "FW": "XBB.1.28.1", + "FY": "XBB.1.22.1", + "FZ": "XBB.1.5.47", + "GA": "XBB.1.17.1", + "GB": "XBB.1.5.46", + "GC": "XBB.1.5.21", + "GD": "XBB.1.9.3", + "GE": "XBB.2.3.10", + "GF": "XBB.1.5.24", + "GG": "XBB.1.5.38", + "GH": "XBB.2.6.1", + "GJ": "XBB.2.3.3", + "GK": "XBB.1.5.70", + "GL": "XAY.1.1.1", + "GM": "XBB.2.3.6", + "GN": "XBB.1.5.73", + "GP": "B.1.1.529.2.75.3.4.1.1.1.1.11", + "GQ": "B.1.1.529.2.75.3.4.1.1.1.1.3", + "GR": "XBB.1.5.42", + "GS": "XBB.2.3.11", + "GT": "XBC.1.6.1", + "GU": "XBB.1.5.41", + "GV": "XBB.1.5.48", + "GW": "XBB.1.19.1", + "GY": "XBB.1.16.2", + "GZ": "XBB.2.3.4", + "HA": "XBB.1.5.86", + "HB": "XBB.1.34.2", + "HC": "XBB.1.5.44", + "XA": [ + "B.1.1.7", + "B.1.177" + ], + "XB": [ + "B.1.634", + "B.1.631" + ], + "XC": [ + "AY.29", + "B.1.1.7" + ], + "XD": [ + "B.1.617.2*", + "BA.1*" + ], + "XE": [ + "BA.1*", + "BA.2*" + ], + "XF": [ + "B.1.617.2*", + "BA.1*" + ], + "XG": [ + "BA.1*", + "BA.2*" + ], + "XH": [ + "BA.1*", + "BA.2*" + ], + "XJ": [ + "BA.1*", + "BA.2*" + ], + "XK": [ + "BA.1*", + "BA.2*" + ], + "XL": [ + "BA.1*", + "BA.2*" + ], + "XM": [ + "BA.1.1*", + "BA.2*" + ], + "XN": [ + "BA.1*", + "BA.2*" + ], + "XP": [ + "BA.1.1*", + "BA.2*" + ], + "XQ": [ + "BA.1.1*", + "BA.2*" + ], + "XR": [ + "BA.1.1*", + "BA.2*" + ], + "XS": [ + "B.1.617.2*", + "BA.1.1*" + ], + "XT": [ + "BA.2*", + "BA.1*" + ], + "XU": [ + "BA.1*", + "BA.2*" + ], + "XV": [ + "BA.1*", + "BA.2*" + ], + "XW": [ + "BA.1*", + "BA.2*" + ], + "XY": [ + "BA.1*", + "BA.2*" + ], + "XZ": [ + "BA.2*", + "BA.1*" + ], + "XAA": [ + "BA.1*", + "BA.2*" + ], + "XAB": [ + "BA.1*", + "BA.2*" + ], + "XAC": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAD": [ + "BA.2*", + "BA.1*" + ], + "XAE": [ + "BA.2*", + "BA.1*" + ], + "XAF": [ + "BA.1*", + "BA.2*" + ], + "XAG": [ + "BA.1*", + "BA.2*" + ], + "XAH": [ + "BA.2*", + "BA.1*" + ], + "XAJ": [ + "BA.2.12.1*", + "BA.4*" + ], + "XAK": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAL": [ + "BA.1*", + "BA.2*" + ], + "XAM": [ + "BA.1.1", + "BA.2.9" + ], + "XAN": [ + "BA.2*", + "BA.5.1" + ], + "XAP": [ + "BA.2*", + "BA.1*" + ], + "XAQ": [ + "BA.1*", + "BA.2*" + ], + "XAR": [ + "BA.1*", + "BA.2*" + ], + "XAS": [ + "BA.5*", + "BA.2*" + ], + "XAT": [ + "BA.2.3.13", + "BA.1*" + ], + "XAU": [ + "BA.1.1*", + "BA.2.9*" + ], + "XAV": [ + "BA.2*", + "BA.5*" + ], + "XAW": [ + "BA.2*", + "AY.122" + ], + "XAY": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XAZ": [ + "BA.2.5", + "BA.5", + "BA.2.5" + ], + "XBA": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XBB": [ + "BJ.1", + "BM.1.1.1" + ], + "XBC": [ + "BA.2*", + "B.1.617.2*", + "BA.2*", + "B.1.617.2*" + ], + "XBD": [ + "BA.2.75.2", + "BF.5" + ], + "XBE": [ + "BA.5.2", + "BE.4.1" + ], + "XBF": [ + "BA.5.2.3", + "CJ.1" + ], + "XBG": [ + "BA.2.76", + "BA.5.2" + ], + "XBH": [ + "BA.2.3.17", + "BA.2.75.2" + ], + "XBJ": [ + "BA.2.3.20", + "BA.5.2" + ], + "XBK": [ + "BA.5.2", + "CJ.1" + ], + "XBL": [ + "XBB.1.5.57", + "BA.2.75*", + "XBB.1.5.57" + ], + "XBM": [ + "BA.2.76", + "BF.3" + ], + "XBN": [ + "BA.2.75", + "XBB.3" + ], + "XBP": [ + "BA.2.75*", + "BQ.1*" + ], + "XBQ": [ + "BA.5.2", + "CJ.1" + ], + "XBR": [ + "BA.2.75", + "BQ.1" + ], + "XBS": [ + "BA.2.75", + "BQ.1" + ], + "XBT": [ + "BA.5.2.34", + "BA.2.75", + "BA.5.2.34" + ], + "XBU": [ + "BA.2.75.3", + "BQ.1", + "BA.2.75.3" + ], + "XBV": [ + "CR.1", + "XBB.1" + ], + "XBW": [ + "XBB.1.5", + "BQ.1.14" + ], + "XBY": [ + "BR.2.1", + "XBF" + ], + "XBZ": [ + "BA.5.2*", + "EF.1.3" + ], + "XCA": [ + "BA.2.75*", + "BQ.1*" + ], + "XCB": [ + "BF.31.1", + "BQ.1.10*" + ], + "XCC": [ + "CH.1.1.1", + "XBB.1.9.1" + ], + "XCD": [ + "XBB.1*", + "BQ.1.1.25*" + ], + "XCE": [ + "BQ.1*", + "FY.1" + ], + "XCF": [ + "XBB*", + "FE.1" + ], + "XCG": [ + "BA.5.2*", + "XBB.1" + ] +} diff --git a/testBaseData/unitTestDummyDataset/preprocessing_config.yaml b/testBaseData/unitTestDummyDataset/preprocessing_config.yaml new file mode 100644 index 000000000..498dd9d1a --- /dev/null +++ b/testBaseData/unitTestDummyDataset/preprocessing_config.yaml @@ -0,0 +1,5 @@ +inputDirectory: "./testBaseData/unitTestDummyDataset/" +outputDirectory: "./output/" +metadataFilename: "metadata.tsv" +pangoLineageDefinitionFilename: "pangolineage_alias.json" +referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/unitTestDummyDataset/reference_genomes.json b/testBaseData/unitTestDummyDataset/reference_genomes.json new file mode 100644 index 000000000..4235418ca --- /dev/null +++ b/testBaseData/unitTestDummyDataset/reference_genomes.json @@ -0,0 +1,22 @@ +{ + "nucleotideSequences": [ + { + "name": "main", + "sequence": "ACGTACGT" + }, + { + "name": "testSecondSequence", + "sequence": "ACGT" + } + ], + "genes": [ + { + "name": "E", + "sequence": "MYSF*" + }, + { + "name": "M", + "sequence": "MADS*" + } + ] +} diff --git a/testBaseData/unitTestDummyDataset/unaligned_0.fasta b/testBaseData/unitTestDummyDataset/unaligned_0.fasta new file mode 100644 index 000000000..182af609f --- /dev/null +++ b/testBaseData/unitTestDummyDataset/unaligned_0.fasta @@ -0,0 +1,10 @@ +>key1 +ACGTACGT +>key2 +AAGNAAGN +>key3 +ACGTACGT +>key4 +ACGTACGT +>key5 +ACGTACGT diff --git a/testBaseData/unitTestDummyDataset/unaligned_1.fasta b/testBaseData/unitTestDummyDataset/unaligned_1.fasta new file mode 100644 index 000000000..733eb86b0 --- /dev/null +++ b/testBaseData/unitTestDummyDataset/unaligned_1.fasta @@ -0,0 +1,10 @@ +>key1 +ACGT +>key2 +AAGN +>key3 +ACGT +>key4 +ACGT +>key5 +ACGT