Skip to content

Commit

Permalink
test: speed up unit tests by using smaller datasets
Browse files Browse the repository at this point in the history
* database.test.cpp: Use a smaller dataset. Local runtime: 7s -> 800ms
* preprocessor.test.cpp: The RSV dataset test took 45s, so it was moved to the new e2e tests
* preprocessor.test.cpp: Use smaller datasets for some tests. Runtime: 1.8s -> ~500ms
  • Loading branch information
fengelniederhammer committed Sep 11, 2024
1 parent 7924e82 commit 84e0d05
Show file tree
Hide file tree
Showing 23 changed files with 673 additions and 91 deletions.
32 changes: 15 additions & 17 deletions src/silo/database.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

namespace {
silo::Database buildTestDatabase() {
const std::string input_directory{"./testBaseData/exampleDataset/"};
const std::string input_directory{"./testBaseData/unitTestDummyDataset/"};

silo::config::PreprocessingConfig config;
config.overwrite(silo::config::YamlFile("./testBaseData/test_preprocessing_config.yaml"));
config.overwrite(silo::config::YamlFile(input_directory + "preprocessing_config.yaml"));

const auto database_config =
silo::config::ConfigRepository().getValidatedConfig(input_directory + "database_config.yaml");
Expand All @@ -43,8 +43,8 @@ TEST(DatabaseTest, shouldBuildDatabaseWithoutErrors) {
const auto simple_database_info = database.getDatabaseInfo();

EXPECT_GT(simple_database_info.total_size, 0);
EXPECT_EQ(simple_database_info.sequence_count, 100);
EXPECT_EQ(simple_database_info.number_of_partitions, 11);
EXPECT_EQ(simple_database_info.sequence_count, 5);
EXPECT_EQ(simple_database_info.number_of_partitions, 2);
}

TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) {
Expand Down Expand Up @@ -83,10 +83,10 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
const auto simple_info = database.getDatabaseInfo();

EXPECT_EQ(
detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 2775910
detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::A), 148
);
EXPECT_EQ(
detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 2661831
detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::Nucleotide::Symbol::GAP), 128
);

EXPECT_EQ(
Expand All @@ -97,7 +97,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
EXPECT_EQ(
detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
.number_of_values_stored_in_run_containers,
2875
0
);
EXPECT_EQ(
detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
Expand All @@ -106,20 +106,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
);

EXPECT_EQ(
detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 42629964
);
EXPECT_EQ(
detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 21433248
detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 2108
);
EXPECT_EQ(detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 1066);
EXPECT_EQ(
detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic
.total_bitmap_size_array_containers,
133240
12
);

EXPECT_EQ(simple_info.total_size, 26589508);
EXPECT_EQ(simple_info.sequence_count, 100);
EXPECT_EQ(simple_info.n_bitmaps_size, 3931);
EXPECT_EQ(simple_info.total_size, 1956);
EXPECT_EQ(simple_info.sequence_count, 5);
EXPECT_EQ(simple_info.n_bitmaps_size, 62);
}

TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) {
Expand All @@ -141,7 +139,7 @@ TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) {
const auto simple_database_info = database.getDatabaseInfo();

EXPECT_GT(simple_database_info.total_size, 0);
EXPECT_EQ(simple_database_info.sequence_count, 100);
EXPECT_EQ(simple_database_info.sequence_count, 5);
EXPECT_GT(simple_database_info.n_bitmaps_size, 0);
EXPECT_EQ(simple_database_info.number_of_partitions, 11);
EXPECT_EQ(simple_database_info.number_of_partitions, 2);
}
27 changes: 5 additions & 22 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ const Scenario EMPTY_INPUT_NDJSON_UNPARTITIONED = {

const Scenario NO_GENES = {
.input_directory = "testBaseData/noGenes/",
.expected_sequence_count = 30,
.expected_sequence_count = 9,
.query = R"(
{
"action": {
Expand All @@ -206,7 +206,7 @@ const Scenario NO_GENES = {
}
)",
.expected_query_result = nlohmann::json::parse(R"(
[{"count":30}])")
[{"count":9}])")
};

const Scenario NO_NUCLEOTIDE_SEQUENCES = {
Expand All @@ -227,7 +227,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = {

const Scenario NO_SEQUENCES = {
.input_directory = "testBaseData/noSequences/",
.expected_sequence_count = 30,
.expected_sequence_count = 6,
.query = R"(
{
"action": {
Expand All @@ -238,7 +238,7 @@ const Scenario NO_SEQUENCES = {
}
}
)",
.expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
.expected_query_result = nlohmann::json::parse(R"([{"count":6}])")
};

const Scenario DIVERSE_SEQUENCE_NAMES = {
Expand Down Expand Up @@ -275,22 +275,6 @@ const Scenario DIVERSE_SEQUENCE_NAMES_NDJSON = {
[{"count":2}])")
};

// Scenario over the input directory "testBaseData/mediumSizedRsvDataset/".
// It issues an "Aggregated" action with a "True" filter expression and
// expects both the sequence count and the aggregated "count" in the query
// result to be 19662.
const Scenario MEDIUM_SIZED_RSV_DATASET = {
.input_directory = "testBaseData/mediumSizedRsvDataset/",
.expected_sequence_count = 19662,
.query = R"(
{
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "True"
}
}
)",
.expected_query_result = nlohmann::json::parse(R"([{"count":19662}])")
};

// Value-parameterized gtest fixture: each instantiated test receives one
// Scenario as its parameter. The fixture itself adds no state or setup.
class PreprocessorTestFixture : public ::testing::TestWithParam<Scenario> {};

INSTANTIATE_TEST_SUITE_P(
Expand All @@ -310,8 +294,7 @@ INSTANTIATE_TEST_SUITE_P(
EMPTY_INPUT_NDJSON_UNPARTITIONED,
NO_GENES,
NO_NUCLEOTIDE_SEQUENCES,
NO_SEQUENCES,
MEDIUM_SIZED_RSV_DATASET
NO_SEQUENCES
),
printTestName
);
Expand Down
18 changes: 0 additions & 18 deletions testBaseData/noGenes/database_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,6 @@ schema:
instanceName: RSV A
opennessLevel: OPEN
metadata:
- name: accession
type: string
- name: version
type: int
- name: submissionId
type: string
- name: accessionVersion
type: string
- name: isRevocation
type: boolean
- name: submitter
type: string
generateIndex: true
- name: groupId
type: int
- name: dataUseTermsUrl
type: string
- name: sample_collection_date
type: date
primaryKey: accessionVersion
dateToSortBy: sample_collection_date
9 changes: 9 additions & 0 deletions testBaseData/noGenes/input.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{"metadata": {"accessionVersion": "LOC_000CJJP.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G", "7504:A"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJKM.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CCTTTGGTTAGAGACCGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTG"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJLK.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJMH.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TCNCGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTGTGAGACGAC"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJNF.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:TGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJPD.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:CGAAAAAATGCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTTTTTCTCGTAATTTAGTTAATATACATATAA"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJQB.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGTACAACAAACTTGC", "15225:GTTTTTAACACTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJR9.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["2299:G"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": {"accessionVersion": "LOC_000CJS7.1"}, "unalignedNucleotideSequences": {"main": "ACGT"}, "alignedNucleotideSequences": {"main": "ACGT"}, "nucleotideInsertions": {"main": ["0:GCGAAAAAATGCGTACAACAAACTTGC", "4175:A", "15225:GTTTTTGACACTTTTTT"]}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
Binary file removed testBaseData/noGenes/input.ndjson.zst
Binary file not shown.
2 changes: 1 addition & 1 deletion testBaseData/noGenes/preprocessing_config.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ndjsonInputFilename: "input.ndjson.zst"
ndjsonInputFilename: "input.ndjson"
referenceGenomeFilename: "reference_genomes.json"
32 changes: 0 additions & 32 deletions testBaseData/noSequences/database_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,6 @@ schema:
instanceName: RSV A
opennessLevel: OPEN
metadata:
- name: accession
type: string
- name: version
type: int
- name: submissionId
type: string
- name: accessionVersion
type: string
- name: isRevocation
type: boolean
- name: submitter
type: string
generateIndex: true
- name: groupId
type: int
- name: groupName
type: string
generateIndex: true
- name: submittedAt
type: int
- name: releasedAt
type: int
- name: dataUseTerms
type: string
generateIndex: true
- name: dataUseTermsRestrictedUntil
type: date
- name: versionStatus
type: string
- name: dataUseTermsUrl
type: string
- name: sample_collection_date
type: date
primaryKey: accessionVersion
dateToSortBy: sample_collection_date
6 changes: 6 additions & 0 deletions testBaseData/noSequences/input.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"metadata": { "accessionVersion": "LOC_000CHXY.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": { "accessionVersion": "LOC_000CHYW.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": { "accessionVersion": "LOC_000CHZU.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": { "accessionVersion": "LOC_000CJ0S.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": { "accessionVersion": "LOC_000CJ1Q.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
{"metadata": { "accessionVersion": "LOC_000CJ2N.1" }, "unalignedNucleotideSequences": {}, "alignedNucleotideSequences": {}, "nucleotideInsertions": {}, "alignedAminoAcidSequences": {}, "aminoAcidInsertions": {}}
Binary file removed testBaseData/noSequences/input.ndjson.zst
Binary file not shown.
2 changes: 1 addition & 1 deletion testBaseData/noSequences/preprocessing_config.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ndjsonInputFilename: "input.ndjson.zst"
ndjsonInputFilename: "input.ndjson"
referenceGenomeFilename: "reference_genomes.json"
6 changes: 6 additions & 0 deletions testBaseData/unitTestDummyDataset/aa_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
primaryKey E M
key1 [214:EPE] []
key2 [] []
key3 [] []
key4 [] []
key5 [214:EPE] []
30 changes: 30 additions & 0 deletions testBaseData/unitTestDummyDataset/database_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Database schema config for the small unitTestDummyDataset fixture.
# The metadata column names below match the header row of this directory's
# metadata.tsv.
schema:
instanceName: sars_cov-2_minimal_test_config
metadata:
- name: primaryKey
type: string
- name: date
type: date
- name: unsorted_date
type: date
- name: region
type: string
generateIndex: true
- name: country
type: string
generateIndex: true
- name: pango_lineage
type: pango_lineage
- name: division
type: string
generateIndex: true
- name: age
type: int
- name: qc_value
type: float
- name: test_boolean_column
type: boolean
# The fixture rows are keyed key1..key5 in this column.
primaryKey: primaryKey
dateToSortBy: date
# NOTE(review): the metadata.tsv fixture contains two distinct pango_lineage
# values; presumably this is why the unit tests expect 2 partitions - confirm.
partitionBy: pango_lineage
# "main" is also the first sequence column in nuc_insertions.tsv.
defaultNucleotideSequence: "main"
10 changes: 10 additions & 0 deletions testBaseData/unitTestDummyDataset/gene_0.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>key1
MYSF*
>key2
MYSF*
>key3
MYSF*
>key4
MYSF*
>key5
MYSF*
10 changes: 10 additions & 0 deletions testBaseData/unitTestDummyDataset/gene_1.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>key1
MADS*
>key2
MADS*
>key3
MADS*
>key4
MADS*
>key5
XXXX*
6 changes: 6 additions & 0 deletions testBaseData/unitTestDummyDataset/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
primaryKey pango_lineage date region country division unsorted_date age qc_value test_boolean_column
key1 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 true
key2 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 false
key3 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96
key4 XBB 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 true
key5 XBB 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 true
10 changes: 10 additions & 0 deletions testBaseData/unitTestDummyDataset/nuc_0.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>key1
ACGTACGT
>key2
AAGNAAGN
>key3
ACGTACGT
>key4
ACGTACGT
>key5
ACGTACGT
10 changes: 10 additions & 0 deletions testBaseData/unitTestDummyDataset/nuc_1.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>key1
ACGT
>key2
AAGN
>key3
ACGT
>key4
ACGT
>key5
ACGT
6 changes: 6 additions & 0 deletions testBaseData/unitTestDummyDataset/nuc_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
primaryKey main testSecondSequence
key1 [] []
key2 [] []
key3 [] []
key4 [] []
key5 [] []
Loading

0 comments on commit 84e0d05

Please sign in to comment.