Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refact!: fixed mappings for variant indices #62

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
63 changes: 63 additions & 0 deletions src/api/models/indexes/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,69 @@ type Genotype struct {
Zygosity c.Zygosity `json:"zygosity"`
}

// Reusable Elasticsearch field-mapping fragments. Each one is a plain map that
// serialises to the JSON object Elasticsearch expects for a single field.
var (
	// MAPPING_FIELDS_KEYWORD_IG256 is a "fields" section holding a single
	// "keyword" sub-field whose ignore_above is 256.
	MAPPING_FIELDS_KEYWORD_IG256 = map[string]interface{}{
		"keyword": map[string]interface{}{
			"ignore_above": 256,
			"type":         "keyword",
		},
	}

	// MAPPING_TEXT is a "text" field that also carries the keyword sub-field
	// declared in MAPPING_FIELDS_KEYWORD_IG256 (the map is shared, not copied).
	MAPPING_TEXT = map[string]interface{}{
		"fields": MAPPING_FIELDS_KEYWORD_IG256,
		"type":   "text",
	}

	// MAPPING_LONG is a "long" field.
	MAPPING_LONG = map[string]interface{}{"type": "long"}

	// MAPPING_FLOAT64 is a "double" field.
	MAPPING_FLOAT64 = map[string]interface{}{"type": "double"}

	// MAPPING_BOOL is a "boolean" field.
	MAPPING_BOOL = map[string]interface{}{"type": "boolean"}

	// MAPPING_DATE is a "date" field.
	MAPPING_DATE = map[string]interface{}{"type": "date"}
)

// This mapping is derived from the one exported by Victor from the ICHANGE instance on 2024-11-01,
// using the following commands:
// ./bentoctl.bash shell gohan-api
// --> inside gohan-api container
// curl -u $GOHAN_ES_USERNAME:$GOHAN_ES_PASSWORD bentov2-gohan-elasticsearch:9200/_mapping
var VARIANT_INDEX_MAPPING = map[string]interface{}{
"properties": map[string]interface{}{
"chrom": MAPPING_TEXT,
"pos": MAPPING_LONG,
"id": MAPPING_TEXT,
"ref": MAPPING_TEXT,
"alt": MAPPING_TEXT,
"format": MAPPING_TEXT,
"qual": MAPPING_LONG,
"filter": MAPPING_TEXT,
"info": map[string]interface{}{
"properties": map[string]interface{}{
"id": MAPPING_TEXT,
"value": MAPPING_TEXT,
},
},
"sample": map[string]interface{}{
"properties": map[string]interface{}{
"id": MAPPING_TEXT,
"variation": map[string]interface{}{
"properties": map[string]interface{}{
"genotype": map[string]interface{}{
"properties": map[string]interface{}{
"phased": MAPPING_BOOL,
"zygosity": MAPPING_LONG,
},
},
"alleles": map[string]interface{}{
"properties": map[string]interface{}{
"left": MAPPING_TEXT,
"right": MAPPING_TEXT,
},
},
"phredScaleLikelyhood": MAPPING_LONG,
"genotypeProbability": MAPPING_FLOAT64,
},
},
},
},
"fileId": MAPPING_TEXT,
"dataset": MAPPING_TEXT,
"assemblyId": MAPPING_TEXT,
"createdTime": MAPPING_DATE,
},
}

type Gene struct {
Name string `json:"name"`
Chrom string `json:"chrom"`
Expand Down
56 changes: 51 additions & 5 deletions src/api/services/ingestion.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"os"
"os/exec"
"path"
"regexp"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -142,7 +143,7 @@ func (i *IngestionService) Init() {
esutil.BulkIndexerItem{
// Action field configures the operation to perform (index, create, delete, update)
Action: "index",
Index: fmt.Sprintf("variants-%s", strings.ToLower(queuedVariant.Chrom)),
Index: variantIndexName(queuedVariant.Chrom),

// Body is an `io.Reader` with the payload
Body: bytes.NewReader(variantData),
Expand Down Expand Up @@ -369,6 +370,7 @@ func (i *IngestionService) ProcessVcf(
defer gr.Close()

scanner := bufio.NewScanner(gr)
var contigs []string // To collect contigs as defined in VCF header
var discoveredHeaders bool = false
var headers []string
headerSampleIds := make(map[int]string)
Expand All @@ -381,12 +383,27 @@ func (i *IngestionService) ProcessVcf(
// - manage # of lines being concurrently processed per file at any given time
lineProcessingQueue := make(chan bool, lineProcessingConcurrencyLevel)

for scanner.Scan() {
//fmt.Println(scanner.Text())
// pattern for contig headers
// - sectioning off the chr prefix strips it from the contig name prior to ingestion, more or less preserving
// previous Gohan behaviour (which did a find and replace.)
var contig_pattern = regexp.MustCompile(`##contig=<ID=(chr)?([a-zA-Z0-9_\-.]+)(,.*)?`)

for scanner.Scan() {
// Gather Header row by seeking the CHROM string
// Collect contigs (chromosomes) to create indices
line := scanner.Text()
if !discoveredHeaders {
if line[0:8] == "##contig" {
var matches = contig_pattern.FindStringSubmatch(line)

if len(matches) == 0 || matches[2] == "" {
// Invalid contig name - error
fmt.Printf("Error: got invalid contig header '%s' (matches: %v)\n", line, matches)
} else {
// Valid
contigs = append(contigs, matches[2])
}
}
if line[0:6] == "#CHROM" {
// Split the string by tabs
headers = strings.Split(line, "\t")
Expand All @@ -400,10 +417,35 @@ func (i *IngestionService) ProcessVcf(
}
}

discoveredHeaders = true
// If we got to the VCF final header line, we've found all the contigs possible
// --> create required indices (one per contig) with mappings to ensure ES types are consistent and
// mitigate issues we've encountered with e.g., SIGNATURE, where a date field was detected for
// info.value.
fmt.Printf("Got %d contigs: %v\n", len(contigs), contigs)
for _, c := range contigs {
var client = i.ElasticsearchClient
var contigIndex = variantIndexName(c)

res, err := client.Indices.Exists([]string{contigIndex})
if res.StatusCode == 404 {
mappings, _ := json.Marshal(indexes.VARIANT_INDEX_MAPPING)
res, _ := client.Indices.Create(
contigIndex,
client.Indices.Create.WithBody(strings.NewReader(fmt.Sprintf(`{"mappings": %s}`, mappings))),
)

fmt.Printf("Creating contig index %s - got response: %s\n", c, res.String())
} else if err != nil {
// The actual check didn't work properly (e.g., couldn't contact ES).
fmt.Printf("Contig index %s existence-check got error: %s\n", c, err)
} else {
// The check worked and the index already exists, so we shouldn't try to recreate it.
fmt.Printf("Contig index %s already exists; skipping creation\n", c)
}
}

discoveredHeaders = true
fmt.Println("Found the headers: ", headers)
continue
}
continue
}
Expand Down Expand Up @@ -865,3 +907,7 @@ func (i *IngestionService) FilenameAlreadyRunning(filename string) bool {
}
return false
}

// variantIndexName returns the name of the variants index for a contig:
// the lower-cased contig name appended to the "variants-" prefix.
func variantIndexName(contig string) string {
	lowered := strings.ToLower(contig)
	name := fmt.Sprintf("variants-%s", lowered)
	return name
}
Loading