From b433d15968eb8a9f35cac9103de1af49a3feaad6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 6 Nov 2024 10:12:13 -0500 Subject: [PATCH] refact(ingestion): make indices on the fly --- src/api/services/ingestion.go | 58 ++++++----------------------------- 1 file changed, 10 insertions(+), 48 deletions(-) diff --git a/src/api/services/ingestion.go b/src/api/services/ingestion.go index 90577e7..fbd4c3f 100644 --- a/src/api/services/ingestion.go +++ b/src/api/services/ingestion.go @@ -23,8 +23,6 @@ import ( "os" "os/exec" "path" - "regexp" - "slices" "strconv" "strings" "sync" @@ -372,9 +370,8 @@ func (i *IngestionService) ProcessVcf( scanner := bufio.NewScanner(gr) - var contigs []string // To collect contigs as defined in VCF header + contigs := make(map[string]struct{}) // To collect contigs as defined in VCF header var contigMutex = sync.RWMutex{} - var preMadeContigIndices bool = false var discoveredHeaders bool = false var headers []string @@ -388,30 +385,12 @@ func (i *IngestionService) ProcessVcf( // - manage # of lines being concurrently processed per file at any given time lineProcessingQueue := make(chan bool, lineProcessingConcurrencyLevel) - // pattern for contig headers - // - sectioning off the chr prefix strips it from the contig name prior to ingestion, more or less preserving - // previous Gohan behaviour (which did a find and replace.) - var contig_pattern = regexp.MustCompile(`##contig= create required indices (one per contig) with mappings to ensure ES types are consistent and - // mitigate issues we've encountered with e.g., SIGNATURE, where a date field was detected for - // info.value. 
- contigMutex.Lock() - fmt.Printf("Got %d contigs: %v\n", len(contigs), contigs) - for _, c := range contigs { - i.MakeVariantIndex(c) - } - contigMutex.Unlock() - if len(contigs) > 0 { - // flag to prevent trying to make contig indices on-the-fly during ingestion - preMadeContigIndices = true - } - discoveredHeaders = true fmt.Printf("Found %d headers: %v\n", len(headers), headers) } @@ -486,17 +450,15 @@ func (i *IngestionService) ProcessVcf( // Strip out chr prefix value = strings.ReplaceAll(value, "chr", "") - if !preMadeContigIndices { - // If we have to make contig indices on the fly, check if we haven't created the contig yet. - // If we haven't, create the index and add it to the contigs slice. - // // A bit janky - O(n) lookup every time for whether contig exists - contigMutex.Lock() - if !slices.Contains(contigs, value) { - i.MakeVariantIndex(value) - contigs = append(contigs, value) - } - contigMutex.Unlock() + // We're making contig indices on the fly - check if we haven't created the contig yet. + // If we haven't, create the index and add it to the contigs "set" (map). + contigMutex.Lock() + _, indexExists := contigs[value] + if !indexExists { + i.MakeVariantIndex(value) + contigs[value] = struct{}{} // add contig to the "set" of created contigs } + contigMutex.Unlock() // ems if value is valid chromosome if chromosome.IsValidHumanChromosome(value) {