test: speed up unit tests by using smaller datasets

* database.test.cpp: Use a smaller dataset. Local speed-up: 7s -> 800ms
* preprocessor.test.cpp: The RSV dataset took 45s -> shift it to the new e2e tests
* preprocessor.test.cpp: Use smaller datasets for some tests: 1.8s -> ~500ms
GenSpectrum · Sep 11, 2024 · 84e0d05 · 84e0d05
1 parent 7924e82
commit 84e0d05
Show file tree

Hide file tree

Showing 23 changed files with 673 additions and 91 deletions.
diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp
@@ -16,10 +16,10 @@
 
 namespace {
 silo::Database buildTestDatabase() {
-   const std::string input_directory{"./testBaseData/exampleDataset/"};
+   const std::string input_directory{"./testBaseData/unitTestDummyDataset/"};
 
    silo::config::PreprocessingConfig config;
-   config.overwrite(silo::config::YamlFile("./testBaseData/test_preprocessing_config.yaml"));
+   config.overwrite(silo::config::YamlFile(input_directory + "preprocessing_config.yaml"));
 
    const auto database_config =
       silo::config::ConfigRepository().getValidatedConfig(input_directory + "database_config.yaml");
@@ -43,8 +43,8 @@ TEST(DatabaseTest, shouldBuildDatabaseWithoutErrors) {
    const auto simple_database_info = database.getDatabaseInfo();
 
    EXPECT_GT(simple_database_info.total_size, 0);
-   EXPECT_EQ(simple_database_info.sequence_count, 100);
-   EXPECT_EQ(simple_database_info.number_of_partitions, 11);
+   EXPECT_EQ(simple_database_info.sequence_count, 5);
+   EXPECT_EQ(simple_database_info.number_of_partitions, 2);
 }
 
 TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) {
@@ -83,10 +83,10 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
    const auto simple_info = database.getDatabaseInfo();
 
    EXPECT_EQ(
-      detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 2775910
+      detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 148
    );
    EXPECT_EQ(
-      detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 2661831
+      detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 128
    );
 
    EXPECT_EQ(
@@ -97,7 +97,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
    EXPECT_EQ(
       detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
          .number_of_values_stored_in_run_containers,
-      2875
+      0
    );
    EXPECT_EQ(
       detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
@@ -106,20 +106,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
    );
 
    EXPECT_EQ(
-      detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 42629964
-   );
-   EXPECT_EQ(
-      detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 21433248
+      detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 2108
    );
+   EXPECT_EQ(detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 1066);
    EXPECT_EQ(
       detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
          .total_bitmap_size_array_containers,
-      133240
+      12
    );
 
-   EXPECT_EQ(simple_info.total_size, 26589508);
-   EXPECT_EQ(simple_info.sequence_count, 100);
-   EXPECT_EQ(simple_info.n_bitmaps_size, 3931);
+   EXPECT_EQ(simple_info.total_size, 1956);
+   EXPECT_EQ(simple_info.sequence_count, 5);
+   EXPECT_EQ(simple_info.n_bitmaps_size, 62);
 }
 
 TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) {
@@ -141,7 +139,7 @@ TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) {
    const auto simple_database_info = database.getDatabaseInfo();
 
    EXPECT_GT(simple_database_info.total_size, 0);
-   EXPECT_EQ(simple_database_info.sequence_count, 100);
+   EXPECT_EQ(simple_database_info.sequence_count, 5);
    EXPECT_GT(simple_database_info.n_bitmaps_size, 0);
-   EXPECT_EQ(simple_database_info.number_of_partitions, 11);
+   EXPECT_EQ(simple_database_info.number_of_partitions, 2);
 }
diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp
@@ -194,7 +194,7 @@ const Scenario EMPTY_INPUT_NDJSON_UNPARTITIONED = {
 
 const Scenario NO_GENES = {
    .input_directory = "testBaseData/noGenes/",
-   .expected_sequence_count = 30,
+   .expected_sequence_count = 9,
    .query = R"(
       {
          "action": {
@@ -206,7 +206,7 @@ const Scenario NO_GENES = {
       }
    )",
    .expected_query_result = nlohmann::json::parse(R"(
-[{"count":30}])")
+[{"count":9}])")
 };
 
 const Scenario NO_NUCLEOTIDE_SEQUENCES = {
@@ -227,7 +227,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = {
 
 const Scenario NO_SEQUENCES = {
    .input_directory = "testBaseData/noSequences/",
-   .expected_sequence_count = 30,
+   .expected_sequence_count = 6,
    .query = R"(
       {
          "action": {
@@ -238,7 +238,7 @@ const Scenario NO_SEQUENCES = {
          }
       }
    )",
-   .expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
+   .expected_query_result = nlohmann::json::parse(R"([{"count":6}])")
 };
 
 const Scenario DIVERSE_SEQUENCE_NAMES = {
@@ -275,22 +275,6 @@ const Scenario DIVERSE_SEQUENCE_NAMES_NDJSON = {
 [{"count":2}])")
 };
 
-const Scenario MEDIUM_SIZED_RSV_DATASET = {
-   .input_directory = "testBaseData/mediumSizedRsvDataset/",
-   .expected_sequence_count = 19662,
-   .query = R"(
-      {
-         "action": {
-           "type": "Aggregated"
-         },
-         "filterExpression": {
-            "type": "True"
-         }
-      }
-   )",
-   .expected_query_result = nlohmann::json::parse(R"([{"count":19662}])")
-};
-
 class PreprocessorTestFixture : public ::testing::TestWithParam<Scenario> {};
 
 INSTANTIATE_TEST_SUITE_P(
@@ -310,8 +294,7 @@ INSTANTIATE_TEST_SUITE_P(
       EMPTY_INPUT_NDJSON_UNPARTITIONED,
       NO_GENES,
       NO_NUCLEOTIDE_SEQUENCES,
-      NO_SEQUENCES,
-      MEDIUM_SIZED_RSV_DATASET
+      NO_SEQUENCES
    ),
    printTestName
 );

diff --git a/testBaseData/noGenes/database_config.yaml b/testBaseData/noGenes/database_config.yaml
@@ -2,24 +2,6 @@ schema:
   instanceName: RSV A
   opennessLevel: OPEN
   metadata:
-    - name: accession
-      type: string
-    - name: version
-      type: int
-    - name: submissionId
-      type: string
     - name: accessionVersion
       type: string
-    - name: isRevocation
-      type: boolean
-    - name: submitter
-      type: string
-      generateIndex: true
-    - name: groupId
-      type: int
-    - name: dataUseTermsUrl
-      type: string
-    - name: sample_collection_date
-      type: date
   primaryKey: accessionVersion
-  dateToSortBy: sample_collection_date
diff --git a/testBaseData/noGenes/input.ndjson b/testBaseData/noGenes/input.ndjson
@@ -0,0 +1,9 @@
+{"metadata": {"accessionVersion": "LOC_000CJJP.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G", "7504:A"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJKM.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CCTTTGGTTAGAGACCGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTG"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJLK.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJMH.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TCNCGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTGAGACGAC"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJNF.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJPD.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTAATTTAGTTAATATACATATAA"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJQB.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJR9.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": {"accessionVersion": "LOC_000CJS7.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGAAAAAATGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
diff --git a/testBaseData/noGenes/input.ndjson.zst b/testBaseData/noGenes/input.ndjson.zst
diff --git a/testBaseData/noGenes/preprocessing_config.yaml b/testBaseData/noGenes/preprocessing_config.yaml
@@ -1,2 +1,2 @@
-ndjsonInputFilename: "input.ndjson.zst"
+ndjsonInputFilename: "input.ndjson"
 referenceGenomeFilename: "reference_genomes.json"
diff --git a/testBaseData/noSequences/database_config.yaml b/testBaseData/noSequences/database_config.yaml
@@ -2,38 +2,6 @@ schema:
   instanceName: RSV A
   opennessLevel: OPEN
   metadata:
-    - name: accession
-      type: string
-    - name: version
-      type: int
-    - name: submissionId
-      type: string
     - name: accessionVersion
       type: string
-    - name: isRevocation
-      type: boolean
-    - name: submitter
-      type: string
-      generateIndex: true
-    - name: groupId
-      type: int
-    - name: groupName
-      type: string
-      generateIndex: true
-    - name: submittedAt
-      type: int
-    - name: releasedAt
-      type: int
-    - name: dataUseTerms
-      type: string
-      generateIndex: true
-    - name: dataUseTermsRestrictedUntil
-      type: date
-    - name: versionStatus
-      type: string
-    - name: dataUseTermsUrl
-      type: string
-    - name: sample_collection_date
-      type: date
   primaryKey: accessionVersion
-  dateToSortBy: sample_collection_date
diff --git a/testBaseData/noSequences/input.ndjson b/testBaseData/noSequences/input.ndjson
@@ -0,0 +1,6 @@
+{"metadata": { "accessionVersion": "LOC_000CHXY.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": { "accessionVersion": "LOC_000CHYW.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": { "accessionVersion": "LOC_000CHZU.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": { "accessionVersion": "LOC_000CJ0S.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": { "accessionVersion": "LOC_000CJ1Q.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
+{"metadata": { "accessionVersion": "LOC_000CJ2N.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
diff --git a/testBaseData/noSequences/input.ndjson.zst b/testBaseData/noSequences/input.ndjson.zst
diff --git a/testBaseData/noSequences/preprocessing_config.yaml b/testBaseData/noSequences/preprocessing_config.yaml
@@ -1,2 +1,2 @@
-ndjsonInputFilename: "input.ndjson.zst"
+ndjsonInputFilename: "input.ndjson"
 referenceGenomeFilename: "reference_genomes.json"
diff --git a/testBaseData/unitTestDummyDataset/aa_insertions.tsv b/testBaseData/unitTestDummyDataset/aa_insertions.tsv
@@ -0,0 +1,6 @@
+primaryKey	E	M
+key1	[214:EPE]	[]
+key2	[]	[]
+key3	[]	[]
+key4	[]	[]
+key5	[214:EPE]	[]
diff --git a/testBaseData/unitTestDummyDataset/database_config.yaml b/testBaseData/unitTestDummyDataset/database_config.yaml
@@ -0,0 +1,30 @@
+schema:
+  instanceName: sars_cov-2_minimal_test_config
+  metadata:
+    - name: primaryKey
+      type: string
+    - name: date
+      type: date
+    - name: unsorted_date
+      type: date
+    - name: region
+      type: string
+      generateIndex: true
+    - name: country
+      type: string
+      generateIndex: true
+    - name: pango_lineage
+      type: pango_lineage
+    - name: division
+      type: string
+      generateIndex: true
+    - name: age
+      type: int
+    - name: qc_value
+      type: float
+    - name: test_boolean_column
+      type: boolean
+  primaryKey: primaryKey
+  dateToSortBy: date
+  partitionBy: pango_lineage
+defaultNucleotideSequence: "main"
diff --git a/testBaseData/unitTestDummyDataset/gene_0.fasta b/testBaseData/unitTestDummyDataset/gene_0.fasta
@@ -0,0 +1,10 @@
+>key1
+MYSF*
+>key2
+MYSF*
+>key3
+MYSF*
+>key4
+MYSF*
+>key5
+MYSF*
diff --git a/testBaseData/unitTestDummyDataset/gene_1.fasta b/testBaseData/unitTestDummyDataset/gene_1.fasta
@@ -0,0 +1,10 @@
+>key1
+MADS*
+>key2
+MADS*
+>key3
+MADS*
+>key4
+MADS*
+>key5
+XXXX*
diff --git a/testBaseData/unitTestDummyDataset/metadata.tsv b/testBaseData/unitTestDummyDataset/metadata.tsv
@@ -0,0 +1,6 @@
+primaryKey	pango_lineage	date	region	country	division	unsorted_date	age	qc_value	test_boolean_column
+key1	B.1.1.7	2021-03-18	Europe	Switzerland	Basel-Land		4	0.98	true
+key2	B.1.1.7	2021-04-13	Europe	Switzerland	Bern	2020-03-08	5	0.97	false
+key3	B.1.1.7	2021-04-25	Europe	Switzerland	Aargau	2021-01-29	6	0.96	
+key4	XBB	2021-04-13	Europe	Switzerland	Bern	2020-12-24	4	0.95	true
+key5	XBB	2021-03-19	Europe	Switzerland	Solothurn	2021-02-10	54	0.94	true
diff --git a/testBaseData/unitTestDummyDataset/nuc_0.fasta b/testBaseData/unitTestDummyDataset/nuc_0.fasta
@@ -0,0 +1,10 @@
+>key1
+ACGTACGT
+>key2
+AAGNAAGN
+>key3
+ACGTACGT
+>key4
+ACGTACGT
+>key5
+ACGTACGT
diff --git a/testBaseData/unitTestDummyDataset/nuc_1.fasta b/testBaseData/unitTestDummyDataset/nuc_1.fasta
@@ -0,0 +1,10 @@
+>key1
+ACGT
+>key2
+AAGN
+>key3
+ACGT
+>key4
+ACGT
+>key5
+ACGT
diff --git a/testBaseData/unitTestDummyDataset/nuc_insertions.tsv b/testBaseData/unitTestDummyDataset/nuc_insertions.tsv
@@ -0,0 +1,6 @@
+primaryKey	main	testSecondSequence
+key1	[]	[]
+key2	[]	[]
+key3	[]	[]
+key4	[]	[]
+key5	[]	[]
-Original file line number
+Diff line change
@@ -0,0 +1,10 @@
+    >key1
+    MYSF*
+    >key2
+    MYSF*
+    >key3
+    MYSF*
+    >key4
+    MYSF*
+    >key5
+    MYSF*