From 14d285626fe42bbee21b3366397f1d3936e65cd5 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 14:52:22 -0400 Subject: [PATCH 01/25] poc wikipedia gene scraping begins --- src/gene/main.go | 194 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 src/gene/main.go diff --git a/src/gene/main.go b/src/gene/main.go new file mode 100644 index 00000000..1f6915e8 --- /dev/null +++ b/src/gene/main.go @@ -0,0 +1,194 @@ +package main + +import ( + //"fmt" + "fmt" + "log" + "net/http" + "os" + "strings" + "sync" + + "github.com/PuerkitoBio/goquery" +) + +func main() { + // Setup + localDataDir := "data" + + // - create local data dirs + if _, err := os.Stat(localDataDir); os.IsNotExist(err) { + err := os.Mkdir(localDataDir, 0755) + if err != nil { + log.Fatal(err) + } + } + + baseUrl := "https://en.wikipedia.org" + + // Start here on chromosome 1 + res, err := http.Get(fmt.Sprintf("%s/wiki/Category:Genes_on_human_chromosome_1", baseUrl)) + if err != nil { + log.Fatal(err) + } + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + log.Fatal(err) + } + + // Pluck out the navigation bar with all the chromosomes + doc.Find("#mw-content-text > div.mw-parser-output > div.navbox > table > tbody > tr > td").Each(func(index int, item *goquery.Selection) { + + // Gather links for all chromosomes + //var chromosomeWg sync.WaitGroup + item.Find("div ul li").Each(func(index int, item *goquery.Selection) { + + // chromosomeWg.Add(1) + // go func(_cwg *sync.WaitGroup) { + // defer _cwg.Done() + + // link data + chromTitle := item.Text() + chromLinkTag := item.Find("a") + chromLink, _ := chromLinkTag.Attr("href") + + fmt.Printf("Chromosome #%d: %s - %s\n", index, chromTitle, chromLink) + + chromPageRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, chromLink)) + if err != nil { + log.Fatal(err) + } + defer chromPageRes.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + chromDoc, err := goquery.NewDocumentFromReader(chromPageRes.Body) + if err != nil { + log.Fatal(err) + } + + // Pluck out sections with the links to all the genes on this page alphabetically + var geneWg sync.WaitGroup + + chromDoc.Find(".mw-category-group").Each(func(index int, categorySectionItem *goquery.Selection) { + geneWg.Add(1) + go func(_gwg *sync.WaitGroup) { + defer _gwg.Done() + + // Skip this category if it's a wildcard + isWildcard := false + categorySectionItem.Find("h3").Each(func(index int, h3Item *goquery.Selection) { + if h3Item.Text() == "*" { + isWildcard = true + } + }) + if isWildcard { + return + } + + // Gather links for all chromosomes + categorySectionItem.Find("ul li").Each(func(index int, item *goquery.Selection) { + + // link data + geneTitle := item.Text() + geneLinkTag := item.Find("a") + geneLink, _ := geneLinkTag.Attr("href") + + // discover gene wiki page + geneRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, geneLink)) + if err != nil { + log.Fatal(err) + } + defer geneRes.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", geneRes.StatusCode, res.Status) + } + geneDoc, err := goquery.NewDocumentFromReader(geneRes.Body) + if err != nil { + log.Fatal(err) + } + + // find assembly + // TODO + + // find start and end positions + var humanGeneLocationTableElement *goquery.Selection + var startHeaderElement *goquery.Selection + var startValue string + var endHeaderElement *goquery.Selection + var endValue string + + geneDoc.Find("table").Each(func(index int, table *goquery.Selection) { + if strings.Contains(table.Text(), "Gene location (Human)") { + humanGeneLocationTableElement = table + return + } + }) + + if humanGeneLocationTableElement != nil { + humanGeneLocationTableElement.Find("th").Each(func(index int, rowItemHeader *goquery.Selection) { + if strings.Contains(rowItemHeader.Text(), "Start") { + startHeaderElement = rowItemHeader + return + } else if strings.Contains(rowItemHeader.Text(), "End") { + endHeaderElement = rowItemHeader + return + } + }) + + if startHeaderElement != nil { + valueELement := startHeaderElement.SiblingsFiltered("td").Last() + startValue = valueELement.Text() + } + if endHeaderElement != nil { + endValueELement := endHeaderElement.SiblingsFiltered("td").Last() + endValue = endValueELement.Text() + } + + } + + // store data + // (temp : store to disk) + chromosome := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) + + fmt.Printf("Chromosome #%s: Gene #%d: %s - %s\n", chromosome, index, geneTitle, geneLink) + fmt.Printf("Start: %s\n", startValue) + fmt.Printf("End: %s\n\n", endValue) + + var file *os.File + thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) + if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { + file, err = os.Create(thisGenePath) + if err != nil { + fmt.Println(err) + return + } + } else { + file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) + if err != nil { + log.Fatal(err) + } + } + defer file.Close() + + writeText := fmt.Sprintf("Chromosome: %s\nStart: %s\nEnd: %s\nPath: %s", chromosome, startValue, endValue, geneLink) + _, err = file.WriteString(writeText) + if err != nil { + fmt.Println(err) + return + } + }) + }(&geneWg) + }) + geneWg.Wait() + + // }(&chromosomeWg) + }) + // chromosomeWg.Wait() + }) + +} From 1da2b46b5203d4c16d4b7f73171a156de35b3897 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 16:37:51 -0400 Subject: [PATCH 02/25] pagination handling --- src/gene/main.go | 296 ++++++++++++++++++++++++++++------------------- 1 file changed, 175 insertions(+), 121 deletions(-) diff --git a/src/gene/main.go b/src/gene/main.go index 1f6915e8..76702860 100644 --- a/src/gene/main.go +++ b/src/gene/main.go @@ -8,14 +8,18 @@ import ( "os" "strings" "sync" + "time" "github.com/PuerkitoBio/goquery" ) -func main() { +const ( // Setup - localDataDir := "data" + localDataDir = "data" + baseUrl = "https://en.wikipedia.org" +) +func main() { // - create local data dirs if _, err := os.Stat(localDataDir); os.IsNotExist(err) { err := os.Mkdir(localDataDir, 0755) @@ -24,7 +28,7 @@ func main() { } } - baseUrl := "https://en.wikipedia.org" + startTime := time.Now() // Start here on chromosome 1 res, err := http.Get(fmt.Sprintf("%s/wiki/Category:Genes_on_human_chromosome_1", baseUrl)) @@ -58,137 +62,187 @@ func main() { fmt.Printf("Chromosome #%d: %s - %s\n", index, chromTitle, chromLink) - chromPageRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, chromLink)) - if err != nil { - log.Fatal(err) - } - defer chromPageRes.Body.Close() - if res.StatusCode != 200 { - log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) - } - chromDoc, err := goquery.NewDocumentFromReader(chromPageRes.Body) - if err != nil { - log.Fatal(err) - } - - // Pluck out sections with the links to all the genes on this page alphabetically - var geneWg sync.WaitGroup + // process: + // begin on an initial page, and verify the end of each page if + // a "next page" link exists. if so, query that and continue processing + // in the same manner + for { + chromPageRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, chromLink)) + if err != nil { + log.Fatal(err) + } + defer chromPageRes.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + chromDoc, err := goquery.NewDocumentFromReader(chromPageRes.Body) + if err != nil { + log.Fatal(err) + } + + processChromDoc(chromTitle, chromDoc) + + hasNextPage := false + chromDoc.Find("#mw-pages > a").EachWithBreak(func(index int, linkItem *goquery.Selection) bool { + + if strings.Contains(strings.ToLower(linkItem.Text()), "next page") { + chromLink, _ = linkItem.Attr("href") + hasNextPage = true + + // break + return false + } - chromDoc.Find(".mw-category-group").Each(func(index int, categorySectionItem *goquery.Selection) { - geneWg.Add(1) - go func(_gwg *sync.WaitGroup) { - defer _gwg.Done() + // continue loop + return true + }) - // Skip this category if it's a wildcard - isWildcard := false - categorySectionItem.Find("h3").Each(func(index int, h3Item *goquery.Selection) { - if h3Item.Text() == "*" { - isWildcard = true - } - }) - if isWildcard { - return - } + if !hasNextPage { + break + } + } - // Gather links for all chromosomes - categorySectionItem.Find("ul li").Each(func(index int, item *goquery.Selection) { + // gather "next page" link if available - // link data - geneTitle := item.Text() - geneLinkTag := item.Find("a") - geneLink, _ := geneLinkTag.Attr("href") + // }(&chromosomeWg) + }) + // chromosomeWg.Wait() + }) - // discover gene wiki page - geneRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, geneLink)) - if err != nil { - log.Fatal(err) - } - defer geneRes.Body.Close() - if res.StatusCode != 200 { - log.Fatalf("status code error: %d %s", geneRes.StatusCode, res.Status) - } - geneDoc, err := goquery.NewDocumentFromReader(geneRes.Body) - if err != nil { - log.Fatal(err) - } + // Done - display time lapse + fmt.Printf("Process duration %s\n", time.Since(startTime)) +} - // find assembly - // TODO - - // find start and end positions - var humanGeneLocationTableElement *goquery.Selection - var startHeaderElement *goquery.Selection - var startValue string - var endHeaderElement *goquery.Selection - var endValue string - - geneDoc.Find("table").Each(func(index int, table *goquery.Selection) { - if strings.Contains(table.Text(), "Gene location (Human)") { - humanGeneLocationTableElement = table - return - } - }) - - if humanGeneLocationTableElement != nil { - humanGeneLocationTableElement.Find("th").Each(func(index int, rowItemHeader *goquery.Selection) { - if strings.Contains(rowItemHeader.Text(), "Start") { - startHeaderElement = rowItemHeader - return - } else if strings.Contains(rowItemHeader.Text(), "End") { - endHeaderElement = rowItemHeader - return - } - }) - - if startHeaderElement != nil { - valueELement := startHeaderElement.SiblingsFiltered("td").Last() - startValue = valueELement.Text() - } - if endHeaderElement != nil { - endValueELement := endHeaderElement.SiblingsFiltered("td").Last() - endValue = endValueELement.Text() - } +func processChromDoc(chromTitle string, chromDoc *goquery.Document) { + // Pluck out sections with the links to all the genes on this page alphabetically + var geneWg sync.WaitGroup + chromDoc.Find(".mw-category-group").Each(func(index int, categorySectionItem *goquery.Selection) { + geneWg.Add(1) + go func(_gwg *sync.WaitGroup) { + defer _gwg.Done() + + // Skip this category if it's a wildcard + isWildcard := false + categorySectionItem.Find("h3").Each(func(index int, h3Item *goquery.Selection) { + if h3Item.Text() == "*" { + isWildcard = true + } + }) + if isWildcard { + return + } + // Gather links for all chromosomes + categorySectionItem.Find("ul li").Each(func(index int, item *goquery.Selection) { + + // link data + geneTitle := item.Text() + geneLinkTag := item.Find("a") + geneLink, _ := geneLinkTag.Attr("href") + + // discover gene wiki page + geneRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, geneLink)) + if err != nil { + log.Fatal(err) + } + defer geneRes.Body.Close() + if geneRes.StatusCode != 200 { + log.Fatalf("status code error: %d %s", geneRes.StatusCode, geneRes.Status) + } + geneDoc, err := goquery.NewDocumentFromReader(geneRes.Body) + if err != nil { + log.Fatal(err) + } + + // find assembly + // TODO + + // find start and end positions + var ( + aliasesRowElement *goquery.Selection + aliasesValue string + + humanGeneLocationTableElement *goquery.Selection + startHeaderElement *goquery.Selection + startValue string + endHeaderElement *goquery.Selection + endValue string + ) + + geneDoc.Find("tr").Each(func(index int, rowElement *goquery.Selection) { + if strings.Contains(rowElement.Text(), "Aliases") { + aliasesRowElement = rowElement + + aliasesElement := aliasesRowElement.Find("td span").First() + if aliasesElement != nil { + aliasesValue = aliasesElement.Text() } + } + }) - // store data - // (temp : store to disk) - chromosome := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) - - fmt.Printf("Chromosome #%s: Gene #%d: %s - %s\n", chromosome, index, geneTitle, geneLink) - fmt.Printf("Start: %s\n", startValue) - fmt.Printf("End: %s\n\n", endValue) - - var file *os.File - thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) - if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { - file, err = os.Create(thisGenePath) - if err != nil { - fmt.Println(err) - return - } - } else { - file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) - if err != nil { - log.Fatal(err) - } - } - defer file.Close() + geneDoc.Find("table").Each(func(index int, table *goquery.Selection) { + if strings.Contains(table.Text(), "Gene location (Human)") { + humanGeneLocationTableElement = table + return + } + }) - writeText := fmt.Sprintf("Chromosome: %s\nStart: %s\nEnd: %s\nPath: %s", chromosome, startValue, endValue, geneLink) - _, err = file.WriteString(writeText) - if err != nil { - fmt.Println(err) + if humanGeneLocationTableElement != nil { + humanGeneLocationTableElement.Find("th").Each(func(index int, rowItemHeader *goquery.Selection) { + if strings.Contains(rowItemHeader.Text(), "Start") { + startHeaderElement = rowItemHeader + return + } else if strings.Contains(rowItemHeader.Text(), "End") { + endHeaderElement = rowItemHeader return } }) - }(&geneWg) - }) - geneWg.Wait() - // }(&chromosomeWg) - }) - // chromosomeWg.Wait() - }) + if startHeaderElement != nil { + valueELement := startHeaderElement.SiblingsFiltered("td").Last() + startValue = valueELement.Text() + } + if endHeaderElement != nil { + endValueELement := endHeaderElement.SiblingsFiltered("td").Last() + endValue = endValueELement.Text() + } + + } + + // store data + // (temp : store to disk) + chromosome := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) + + fmt.Printf("Aliases: %s\n", aliasesValue) + fmt.Printf("Chromosome #%s: Gene #%d: %s - %s\n", chromosome, index, geneTitle, geneLink) + fmt.Printf("Start: %s\n", startValue) + fmt.Printf("End: %s\n\n", endValue) + var file *os.File + thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) + if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { + file, err = os.Create(thisGenePath) + if err != nil { + fmt.Println(err) + return + } + } else { + file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) + if err != nil { + log.Fatal(err) + } + } + defer file.Close() + + writeText := fmt.Sprintf("Aliases: %s\nChromosome: %s\nStart: %s\nEnd: %s\nPath: %s", aliasesValue, chromosome, startValue, endValue, geneLink) + _, err = file.WriteString(writeText) + if err != nil { + fmt.Println(err) + return + } + }) + }(&geneWg) + }) + geneWg.Wait() } From 4fb599585b466c8d04c182263026925c2b175662 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 17:35:39 -0400 Subject: [PATCH 03/25] gene elasticsearch index model --- src/api/models/elasticsearch.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/api/models/elasticsearch.go b/src/api/models/elasticsearch.go index 60bd6487..12ed2a2a 100644 --- a/src/api/models/elasticsearch.go +++ b/src/api/models/elasticsearch.go @@ -45,3 +45,13 @@ type Genotype struct { AlleleRight int `json:"alleleRight"` // -1 = no call (equivalent to a '.') Zygosity c.Zygosity `json:"zygosity"` } + +type Gene struct { + Name string `json:"name"` + Nomenclature []string `json:"nomenclature"` + Chrom int `json:"chrom"` + Start int `json:"start"` + End int `json:"end"` + AssemblyId c.AssemblyId `json:"assemblyId"` + SourceUrl string `json:"sourceUrl"` +} From e38329451c1b34bde9a7d4951530b1a6f8156d48 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 19:37:59 -0400 Subject: [PATCH 04/25] begin ingesting genes into elasticsearch --- src/api/models/ingest/structs/main.go | 16 +++ src/api/services/ingestion.go | 111 +++++++++++++---- src/gene/main.go | 168 +++++++++++++++++++------- 3 files changed, 224 insertions(+), 71 deletions(-) create mode 100644 src/api/models/ingest/structs/main.go diff --git a/src/api/models/ingest/structs/main.go b/src/api/models/ingest/structs/main.go new file mode 100644 index 00000000..d6010fd5 --- /dev/null +++ b/src/api/models/ingest/structs/main.go @@ -0,0 +1,16 @@ +package structs + +import ( + "api/models" + "sync" +) + +type IngestionQueueStructure struct { + Variant *models.Variant + WaitGroup *sync.WaitGroup +} + +type GeneIngestionQueueStructure struct { + Gene *models.Gene + WaitGroup *sync.WaitGroup +} diff --git a/src/api/services/ingestion.go b/src/api/services/ingestion.go index 282d971a..31b4b88c 100644 --- a/src/api/services/ingestion.go +++ b/src/api/services/ingestion.go @@ -5,6 +5,7 @@ import ( "api/models/constants" z "api/models/constants/zygosity" "api/models/ingest" + "api/models/ingest/structs" "api/utils" "bufio" "bytes" @@ -34,18 +35,15 @@ import ( type ( IngestionService struct { - Initialized bool - IngestRequestChan chan *ingest.IngestRequest - IngestRequestMap map[string]*ingest.IngestRequest - IngestionBulkIndexingCapacity int - IngestionBulkIndexingQueue chan *IngestionQueueStructure - ElasticsearchClient *elasticsearch.Client - IngestionBulkIndexer esutil.BulkIndexer - } - - IngestionQueueStructure struct { - Variant *models.Variant - WaitGroup *sync.WaitGroup + Initialized bool + IngestRequestChan chan *ingest.IngestRequest + IngestRequestMap map[string]*ingest.IngestRequest + IngestionBulkIndexingCapacity int + ElasticsearchClient *elasticsearch.Client + IngestionBulkIndexingQueue chan *structs.IngestionQueueStructure + IngestionBulkIndexer esutil.BulkIndexer + GeneIngestionBulkIndexingQueue chan *structs.GeneIngestionQueueStructure + GeneIngestionBulkIndexer esutil.BulkIndexer } ) @@ -54,29 +52,38 @@ const defaultBulkIndexingCap int = 10000 func NewIngestionService(es *elasticsearch.Client) *IngestionService { iz := &IngestionService{ - Initialized: false, - IngestRequestChan: make(chan *ingest.IngestRequest), - IngestRequestMap: map[string]*ingest.IngestRequest{}, - IngestionBulkIndexingCapacity: defaultBulkIndexingCap, - IngestionBulkIndexingQueue: make(chan *IngestionQueueStructure, defaultBulkIndexingCap), - ElasticsearchClient: es, + Initialized: false, + IngestRequestChan: make(chan *ingest.IngestRequest), + IngestRequestMap: map[string]*ingest.IngestRequest{}, + IngestionBulkIndexingCapacity: defaultBulkIndexingCap, + IngestionBulkIndexingQueue: make(chan *structs.IngestionQueueStructure, defaultBulkIndexingCap), + GeneIngestionBulkIndexingQueue: make(chan *structs.GeneIngestionQueueStructure, 10), + ElasticsearchClient: es, } - // see: https://www.elastic.co/blog/why-am-i-seeing-bulk-rejections-in-my-elasticsearch-cluster + //see: https://www.elastic.co/blog/why-am-i-seeing-bulk-rejections-in-my-elasticsearch-cluster var numWorkers = defaultBulkIndexingCap / 100 - // the lower the denominator (the number of documents per bulk upload). the higher - // the chances of 100% successful upload, though the longer it may take (negligible) + //the lower the denominator (the number of documents per bulk upload). the higher + //the chances of 100% successful upload, though the longer it may take (negligible) bi, _ := esutil.NewBulkIndexer(esutil.BulkIndexerConfig{ Index: "variants", Client: iz.ElasticsearchClient, NumWorkers: numWorkers, - // FlushBytes: int(flushBytes), // The flush threshold in bytes (default: 50MB ?) - FlushInterval: 30 * time.Second, // The periodic flush interval + // FlushBytes: int(flushBytes), // The flush threshold in bytes (default: 5MB ?) + FlushInterval: time.Second, // The periodic flush interval }) - iz.IngestionBulkIndexer = bi + gbi, _ := esutil.NewBulkIndexer(esutil.BulkIndexerConfig{ + Index: "genes", + Client: iz.ElasticsearchClient, + NumWorkers: 5, + FlushBytes: int(64), // The flush threshold in bytes (default: 5MB ?) + //FlushInterval: 30 * time.Second, // The periodic flush interval + }) + iz.GeneIngestionBulkIndexer = gbi + iz.Init() return iz @@ -100,7 +107,7 @@ func (i *IngestionService) Init() { } }() - // spin up a listener for bulk indexing + // spin up a listener for each bulk indexing go func() { for { select { @@ -117,6 +124,57 @@ func (i *IngestionService) Init() { // Add an item to the BulkIndexer err = i.IngestionBulkIndexer.Add( + context.Background(), + esutil.BulkIndexerItem{ + // Action field configures the operation to perform (index, create, delete, update) + Action: "index", + + // Body is an `io.Reader` with the payload + Body: bytes.NewReader(data), + + // OnSuccess is called for each successful operation + OnSuccess: func(ctx context.Context, item esutil.BulkIndexerItem, res esutil.BulkIndexerResponseItem) { + defer wg.Done() + //fmt.Printf("Successfully indexed: %s", item) + //atomic.AddUint64(&countSuccessful, 1) + }, + + // OnFailure is called for each failed operation + OnFailure: func(ctx context.Context, item esutil.BulkIndexerItem, res esutil.BulkIndexerResponseItem, err error) { + defer wg.Done() + //atomic.AddUint64(&countFailed, 1) + if err != nil { + fmt.Printf("ERROR: %s", err) + } else { + fmt.Printf("ERROR: %s: %s", res.Error.Type, res.Error.Reason) + } + }, + }, + ) + if err != nil { + defer wg.Done() + fmt.Printf("Unexpected error: %s", err) + } + } + } + }() + + go func() { + for { + select { + case queuedItem := <-i.GeneIngestionBulkIndexingQueue: + + g := queuedItem.Gene + wg := queuedItem.WaitGroup + + // Prepare the data payload: encode article to JSON + data, err := json.Marshal(g) + if err != nil { + log.Fatalf("Cannot encode gene %s: %s\n", g, err) + } + + // Add an item to the BulkIndexer + err = i.GeneIngestionBulkIndexer.Add( context.Background(), esutil.BulkIndexerItem{ // Action field configures the operation to perform (index, create, delete, update) @@ -134,6 +192,7 @@ func (i *IngestionService) Init() { // OnFailure is called for each failed operation OnFailure: func(ctx context.Context, item esutil.BulkIndexerItem, res esutil.BulkIndexerResponseItem, err error) { defer wg.Done() + fmt.Printf("Failure Repsonse: %s", res.Error) //atomic.AddUint64(&countFailed, 1) if err != nil { fmt.Printf("ERROR: %s", err) @@ -489,7 +548,7 @@ func (i *IngestionService) ProcessVcf(vcfFilePath string, drsFileId string, asse mapstructure.Decode(tmpVariant, &resultingVariant) // pass variant (along with a waitgroup) to the channel - i.IngestionBulkIndexingQueue <- &IngestionQueueStructure{ + i.IngestionBulkIndexingQueue <- &structs.IngestionQueueStructure{ Variant: &resultingVariant, WaitGroup: fileWg, } diff --git a/src/gene/main.go b/src/gene/main.go index 76702860..69319664 100644 --- a/src/gene/main.go +++ b/src/gene/main.go @@ -1,16 +1,23 @@ package main import ( - //"fmt" + "api/models" + assemblyId "api/models/constants/assembly-id" + "api/models/ingest/structs" + "api/services" + "api/utils" + "crypto/tls" "fmt" "log" "net/http" "os" + "strconv" "strings" "sync" "time" "github.com/PuerkitoBio/goquery" + "github.com/kelseyhightower/envconfig" ) const ( @@ -20,6 +27,24 @@ const ( ) func main() { + // Gather environment variables + var cfg models.Config + err := envconfig.Process("", &cfg) + if err != nil { + fmt.Println(err) + os.Exit(2) + } + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + + // Service Connections: + // -- Elasticsearch + es := utils.CreateEsConnection(&cfg) + iz := services.NewIngestionService(es) + iz.Init() + // - create local data dirs if _, err := os.Stat(localDataDir); os.IsNotExist(err) { err := os.Mkdir(localDataDir, 0755) @@ -60,7 +85,7 @@ func main() { chromLinkTag := item.Find("a") chromLink, _ := chromLinkTag.Attr("href") - fmt.Printf("Chromosome #%d: %s - %s\n", index, chromTitle, chromLink) + //fmt.Printf("Chromosome #%d: %s - %s\n", index, chromTitle, chromLink) // process: // begin on an initial page, and verify the end of each page if @@ -69,19 +94,22 @@ func main() { for { chromPageRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, chromLink)) if err != nil { - log.Fatal(err) + fmt.Println(err) + continue } defer chromPageRes.Body.Close() if res.StatusCode != 200 { - log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + fmt.Printf("status code error: %d %s\n", res.StatusCode, res.Status) + continue } chromDoc, err := goquery.NewDocumentFromReader(chromPageRes.Body) if err != nil { - log.Fatal(err) + fmt.Println(err) + continue } - processChromDoc(chromTitle, chromDoc) + processChromDoc(iz, chromTitle, chromDoc) hasNextPage := false chromDoc.Find("#mw-pages > a").EachWithBreak(func(index int, linkItem *goquery.Selection) bool { @@ -102,7 +130,6 @@ func main() { break } } - // gather "next page" link if available // }(&chromosomeWg) @@ -114,13 +141,12 @@ func main() { fmt.Printf("Process duration %s\n", time.Since(startTime)) } -func processChromDoc(chromTitle string, chromDoc *goquery.Document) { +func processChromDoc(iz *services.IngestionService, chromTitle string, chromDoc *goquery.Document) { // Pluck out sections with the links to all the genes on this page alphabetically var geneWg sync.WaitGroup chromDoc.Find(".mw-category-group").Each(func(index int, categorySectionItem *goquery.Selection) { - geneWg.Add(1) go func(_gwg *sync.WaitGroup) { - defer _gwg.Done() + //defer _gwg.Done() // Skip this category if it's a wildcard isWildcard := false @@ -135,6 +161,7 @@ func processChromDoc(chromTitle string, chromDoc *goquery.Document) { // Gather links for all chromosomes categorySectionItem.Find("ul li").Each(func(index int, item *goquery.Selection) { + _gwg.Add(1) // link data geneTitle := item.Text() @@ -144,15 +171,15 @@ func processChromDoc(chromTitle string, chromDoc *goquery.Document) { // discover gene wiki page geneRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, geneLink)) if err != nil { - log.Fatal(err) + fmt.Println(err) } defer geneRes.Body.Close() if geneRes.StatusCode != 200 { - log.Fatalf("status code error: %d %s", geneRes.StatusCode, geneRes.Status) + fmt.Printf("status code error: %d %s\n", geneRes.StatusCode, geneRes.Status) } geneDoc, err := goquery.NewDocumentFromReader(geneRes.Body) if err != nil { - log.Fatal(err) + fmt.Println(err) } // find assembly @@ -161,26 +188,39 @@ func processChromDoc(chromTitle string, chromDoc *goquery.Document) { // find start and end positions var ( aliasesRowElement *goquery.Selection - aliasesValue string + aliasesValue []string humanGeneLocationTableElement *goquery.Selection startHeaderElement *goquery.Selection - startValue string + startValue int endHeaderElement *goquery.Selection - endValue string + endValue int + + assemblyIdValue string ) + if geneDoc == nil { + return + } + + // Find nomenclature + // - aliases + // - symbol(s) geneDoc.Find("tr").Each(func(index int, rowElement *goquery.Selection) { if strings.Contains(rowElement.Text(), "Aliases") { aliasesRowElement = rowElement aliasesElement := aliasesRowElement.Find("td span").First() if aliasesElement != nil { - aliasesValue = aliasesElement.Text() + aliasesValue = strings.Split(aliasesElement.Text(), ",") } } }) + // TODO: symbol(s) + // Find gene location + // - from start/end table + // - "map position" geneDoc.Find("table").Each(func(index int, table *goquery.Selection) { if strings.Contains(table.Text(), "Gene location (Human)") { humanGeneLocationTableElement = table @@ -201,46 +241,84 @@ func processChromDoc(chromTitle string, chromDoc *goquery.Document) { if startHeaderElement != nil { valueELement := startHeaderElement.SiblingsFiltered("td").Last() - startValue = valueELement.Text() + startClean := strings.ReplaceAll(strings.ReplaceAll(strings.Split(valueELement.Text(), "bp")[0], ",", ""), " ", "") + startValue, _ = strconv.Atoi(startClean) } if endHeaderElement != nil { endValueELement := endHeaderElement.SiblingsFiltered("td").Last() - endValue = endValueELement.Text() + endClean := strings.ReplaceAll(strings.ReplaceAll(strings.Split(endValueELement.Text(), "bp")[0], ",", ""), " ", "") + endValue, _ = strconv.Atoi(endClean) } } + // TODO: "map position" + + // Find Assembly + // Assume the references provided, if any, containing an assembly id is + // the assembly corresponding to the gene and its position + geneDoc.Find("span.reference-text").EachWithBreak(func(index int, referenceListItem *goquery.Selection) bool { + if strings.Contains(strings.ToLower(referenceListItem.Text()), "grch38") || + strings.Contains(strings.ToLower(referenceListItem.Text()), "grch37") || + strings.Contains(strings.ToLower(referenceListItem.Text()), "ncbi36") { + + // pluck out the link containing the text containing the assembly id + // (usually the first one) + refText := referenceListItem.Find("a").First() + + // split by colon to retrieve the assembly id + assemblyIdValue = strings.Split(refText.Text(), ":")[0] + + // break + return false + } + + // keep looping + return true + }) // store data // (temp : store to disk) - chromosome := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) - - fmt.Printf("Aliases: %s\n", aliasesValue) - fmt.Printf("Chromosome #%s: Gene #%d: %s - %s\n", chromosome, index, geneTitle, geneLink) - fmt.Printf("Start: %s\n", startValue) - fmt.Printf("End: %s\n\n", endValue) - - var file *os.File - thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) - if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { - file, err = os.Create(thisGenePath) - if err != nil { - fmt.Println(err) - return - } - } else { - file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) - if err != nil { - log.Fatal(err) - } + chromosomeClean := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) + chromosome, _ := strconv.Atoi(chromosomeClean) + + discoveredGene := &models.Gene{ + Name: geneTitle, + Nomenclature: aliasesValue, + Chrom: chromosome, + AssemblyId: assemblyId.CastToAssemblyId(assemblyIdValue), + Start: startValue, + End: endValue, + SourceUrl: fmt.Sprintf("%s%s", baseUrl, geneLink), } - defer file.Close() - writeText := fmt.Sprintf("Aliases: %s\nChromosome: %s\nStart: %s\nEnd: %s\nPath: %s", aliasesValue, chromosome, startValue, endValue, geneLink) - _, err = file.WriteString(writeText) - if err != nil { - fmt.Println(err) - return + fmt.Println(discoveredGene) + + iz.GeneIngestionBulkIndexingQueue <- &structs.GeneIngestionQueueStructure{ + Gene: discoveredGene, + WaitGroup: _gwg, } + // var file *os.File + // thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) + // if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { + // file, err = os.Create(thisGenePath) + // if err != nil { + // fmt.Println(err) + // return + // } + // } else { + // file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) + // if err != nil { + // log.Fatal(err) + // } + // } + // defer file.Close() + + // writeText := fmt.Sprintf("Aliases: %s\nChromosome: %s\nStart: %s\nEnd: %s\nAssemblyId: %s\nPath: %s", aliasesValue, chromosome, startValue, endValue, assemblyIdValue, geneLink) + // _, err = file.WriteString(writeText) + // if err != nil { + // fmt.Println(err) + // return + // } }) }(&geneWg) }) From 8e586a2fb0174edd43abe73f694c1570c5ec5187 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 20:09:15 -0400 Subject: [PATCH 05/25] first api genes search PoC --- src/api/main.go | 3 + src/api/models/dtos.go | 8 +++ src/api/mvc/genes.go | 53 ++++++++++++++++++ src/api/repositories/elasticsearch/main.go | 64 ++++++++++++++++++++++ 4 files changed, 128 insertions(+) create mode 100644 src/api/mvc/genes.go diff --git a/src/api/main.go b/src/api/main.go index 74763465..a3a51218 100644 --- a/src/api/main.go +++ b/src/api/main.go @@ -155,6 +155,9 @@ func main() { gam.MandateAssemblyIdAttribute) e.GET("/variants/ingestion/requests", mvc.GetAllVariantIngestionRequests) + // -- Genes + e.GET("/genes/search", mvc.GenesGetByNomenclatureWildcard) + // Run e.Logger.Fatal(e.Start(":" + cfg.Api.Port)) } diff --git a/src/api/models/dtos.go b/src/api/models/dtos.go index bcd23ddd..35b18bfc 100644 --- a/src/api/models/dtos.go +++ b/src/api/models/dtos.go @@ -12,3 +12,11 @@ type VariantResponseDataModel struct { Count int `json:"count"` Results []Variant `json:"results"` // []Variant } + +type GenesResponseDTO struct { + Status int `json:"status"` + Message string `json:"message"` + Term string `json:"term"` + Count int `json:"count"` + Results []Gene `json:"results"` // []Gene +} diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go new file mode 100644 index 00000000..d81175b1 --- /dev/null +++ b/src/api/mvc/genes.go @@ -0,0 +1,53 @@ +package mvc + +import ( + "api/contexts" + "api/models" + esRepo "api/repositories/elasticsearch" + "fmt" + "net/http" + + "github.com/labstack/echo" + "github.com/mitchellh/mapstructure" +) + +func GenesGetByNomenclatureWildcard(c echo.Context) error { + cfg := c.(*contexts.GohanContext).Config + es := c.(*contexts.GohanContext).Es7Client + + geneResponseDTO := models.GenesResponseDTO{} + + term := c.QueryParam("term") + + fmt.Printf("Executing wildcard genes search for term %s\n", term) + + docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term) + + docsHits := docs["hits"].(map[string]interface{})["hits"] + allDocHits := []map[string]interface{}{} + mapstructure.Decode(docsHits, &allDocHits) + + // grab _source for each hit + var allSources []models.Gene + + for _, r := range allDocHits { + source := r["_source"].(map[string]interface{}) + + // cast map[string]interface{} to struct + var resultingVariant models.Gene + mapstructure.Decode(source, &resultingVariant) + + // accumulate structs + allSources = append(allSources, resultingVariant) + } + + fmt.Printf("Found %d docs!\n", len(allSources)) + + geneResponseDTO.Count = len(allSources) + geneResponseDTO.Results = allSources + + geneResponseDTO.Status = 200 + geneResponseDTO.Message = "Success" + + return c.JSON(http.StatusOK, geneResponseDTO) +} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 41cd120c..50a0db1c 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -516,3 +516,67 @@ func GetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword s return result } + +func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, term string) map[string]interface{} { + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + + // overall query structure + var buf bytes.Buffer + query := map[string]interface{}{ + "query": map[string]interface{}{ + "wildcard": map[string]interface{}{ + "nomenclature": map[string]interface{}{ + "value": fmt.Sprintf("*%s*", term), + "boost": 1.0, + "rewrite": "constant_score", + }, + }, + }, + "size": 25, // default + } + + // encode the query + if err := json.NewEncoder(&buf).Encode(query); err != nil { + log.Fatalf("Error encoding query: %s\n", err) + } + + if cfg.Debug { + // view the outbound elasticsearch query + myString := string(buf.Bytes()[:]) + fmt.Println(myString) + } + + // Perform the search request. + searchRes, searchErr := es.Search( + es.Search.WithContext(context.Background()), + es.Search.WithIndex("genes"), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + ) + if searchErr != nil { + fmt.Printf("Error getting response: %s\n", searchErr) + } + + defer searchRes.Body.Close() + + resultString := searchRes.String() + if cfg.Debug { + fmt.Println(resultString) + } + + // Prepare an empty interface + result := make(map[string]interface{}) + + // Unmarshal or Decode the JSON to the empty interface. + // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) + umErr := json.Unmarshal([]byte(resultString[9:]), &result) + if umErr != nil { + fmt.Printf("Error unmarshalling gene search response: %s\n", umErr) + } + + return result +} From 18cb3904248a31ef9fccb4c942da1f221e188438 Mon Sep 17 00:00:00 2001 From: brouillette Date: Mon, 27 Sep 2021 20:16:28 -0400 Subject: [PATCH 06/25] patch --- src/api/mvc/genes.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index d81175b1..5643422a 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -15,8 +15,6 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { cfg := c.(*contexts.GohanContext).Config es := c.(*contexts.GohanContext).Es7Client - geneResponseDTO := models.GenesResponseDTO{} - term := c.QueryParam("term") fmt.Printf("Executing wildcard genes search for term %s\n", term) @@ -43,11 +41,13 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { fmt.Printf("Found %d docs!\n", len(allSources)) - geneResponseDTO.Count = len(allSources) - geneResponseDTO.Results = allSources - - geneResponseDTO.Status = 200 - geneResponseDTO.Message = "Success" + geneResponseDTO := models.GenesResponseDTO{ + Term: term, + Count: len(allSources), + Results: allSources, + Status: 200, + Message: "Success", + } return c.JSON(http.StatusOK, geneResponseDTO) } From 837504791713e67af3382c10d90194eb6d9f4c00 Mon Sep 17 00:00:00 2001 From: brouillette Date: Tue, 28 Sep 2021 12:01:56 -0400 Subject: [PATCH 07/25] begin migrating over to using UCSC --- .gitignore | 4 +- src/gene/main.go | 23 +------- src/gota-poc/main.go | 121 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 23 deletions(-) create mode 100644 src/gota-poc/main.go diff --git a/.gitignore b/.gitignore index dd455cc7..ce3ddbed 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,6 @@ bin/* */*/tmp *.vcf -*.vcf.gz \ No newline at end of file +*.vcf.gz + +*/*/*.csv \ No newline at end of file diff --git a/src/gene/main.go b/src/gene/main.go index 69319664..d942c7be 100644 --- a/src/gene/main.go +++ b/src/gene/main.go @@ -297,28 +297,7 @@ func processChromDoc(iz *services.IngestionService, chromTitle string, chromDoc Gene: discoveredGene, WaitGroup: _gwg, } - // var file *os.File - // thisGenePath := fmt.Sprintf("%s/%s.txt", localDataDir, geneTitle) - // if _, err := os.Stat(thisGenePath); os.IsNotExist(err) { - // file, err = os.Create(thisGenePath) - // if err != nil { - // fmt.Println(err) - // return - // } - // } else { - // file, err = os.OpenFile(thisGenePath, os.O_RDWR, 0755) - // if err != nil { - // log.Fatal(err) - // } - // } - // defer file.Close() - - // writeText := fmt.Sprintf("Aliases: %s\nChromosome: %s\nStart: %s\nEnd: %s\nAssemblyId: %s\nPath: %s", aliasesValue, chromosome, startValue, endValue, assemblyIdValue, geneLink) - // _, err = file.WriteString(writeText) - // if err != nil { - // fmt.Println(err) - // return - // } + }) }(&geneWg) }) diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go new file mode 100644 index 00000000..9a107ca9 --- /dev/null +++ b/src/gota-poc/main.go @@ -0,0 +1,121 @@ +package main + +import ( + "fmt" + "io/ioutil" + "log" + "net/http" + "net/url" + "os" + "sync" +) + +func main() { + + client := http.DefaultClient + + ucscUrl := "https://genome.ucsc.edu/cgi-bin/hgTables" + + // make initial call to get hgid + req, err := client.Get(ucscUrl) + if err != nil { + fmt.Printf("err:%v\n", err) + } + fmt.Printf("%+v", req) + // TODO: get Origin-Trial from header + var originTrial string + for key, value := range req.Header { + if key == "Origin-Trial" { + fmt.Println("Got 'Origin-Trial' Header") + originTrial = value[0] + } + } + if originTrial == "" { + log.Fatal("Missing originTrial") + } + + // TODO: get hguid from cookie + var hguid string + for _, cookie := range req.Cookies() { + if cookie.Name == "hguid" { + fmt.Println("Got 'hguid' Cookie") + hguid = cookie.Value + } + } + if hguid == "" { + log.Fatal("Missing hguid") + } + + var dbWg sync.WaitGroup + allDBs := []string{"hg38", "hg19", "hg18", "hg17", "hg16"} + for _, db := range allDBs { + dbWg.Add(1) + + go func(_db string, _wg *sync.WaitGroup) { + defer _wg.Done() + fmt.Printf("Setting up %s..\n", _db) + + // begin mining + v := url.Values{} + // v.Add("hgsid", ) + v.Add("jsh_pageVertPos", "0") + v.Add("clade", "mammal") + v.Add("org", "Human") + v.Add("db", _db) + v.Add("hgta_group", "genes") + v.Add("hgta_track", "knownGene") + v.Add("hgta_table", "knownGene") + v.Add("hgta_regionType", "genome") + // v.Add("position", "chrM:5,904-7,445") + v.Add("hgta_outputType", "primaryTable") + v.Add("boolshad.sendToGalaxy", "0") + v.Add("boolshad.sendToGreat", "0") + v.Add("hgta_outFileName", "") + v.Add("hgta_compressType", "none") + v.Add("hgta_doTopSubmit", "get output") + + miningReq, _ := http.NewRequest("GET", ucscUrl, nil) + miningReq.Header.Set("Cookie", fmt.Sprintf("hguid=%s", hguid)) + miningReq.Header.Set("Host", "genome.ucsc.edu") + miningReq.Header.Set("Origin", "https://genome.ucsc.edu") + miningReq.Header.Set("Referer", ucscUrl) + + fmt.Printf("Downloading %s..\n", _db) + req, err = client.PostForm(ucscUrl, v) + if err != nil { + fmt.Printf("err:%v\n", err) + } + fmt.Printf("%+v", req) + body, err := ioutil.ReadAll(req.Body) + if err != nil { + fmt.Printf("Error reading body: %v", err) + return + } + + var file *os.File + dbPath := fmt.Sprintf("%s.csv", _db) + if _, err := os.Stat(dbPath); os.IsNotExist(err) { + file, err = os.Create(dbPath) + if err != nil { + fmt.Println(err) + return + } + } else { + file, err = os.OpenFile(dbPath, os.O_RDWR, 0755) + if err != nil { + log.Fatal(err) + } + } + defer file.Close() + + _, err = file.WriteString(string(body)) + if err != nil { + fmt.Println(err) + return + } + + fmt.Printf("Save to file %s\n", dbPath) + }(db, &dbWg) + } + dbWg.Wait() +} From 71237f2b1e7385c938069b04dfdfbbf0f9142e26 Mon Sep 17 00:00:00 2001 From: brouillette Date: Tue, 28 Sep 2021 16:32:30 -0400 Subject: [PATCH 08/25] upgrading gene ingestion: - using gota - better searching --- src/api/models/constants/assembly-id/main.go | 6 + src/api/models/elasticsearch.go | 9 +- src/api/repositories/elasticsearch/main.go | 9 +- src/api/services/ingestion.go | 6 +- src/gota-poc/main.go | 277 +++++++++++++++---- 5 files changed, 239 insertions(+), 68 deletions(-) diff --git a/src/api/models/constants/assembly-id/main.go b/src/api/models/constants/assembly-id/main.go index 0c396ff3..1885f6c9 100644 --- a/src/api/models/constants/assembly-id/main.go +++ b/src/api/models/constants/assembly-id/main.go @@ -11,6 +11,8 @@ const ( GRCh38 constants.AssemblyId = "GRCh38" GRCh37 constants.AssemblyId = "GRCh37" NCBI36 constants.AssemblyId = "NCBI36" + NCBI35 constants.AssemblyId = "NCBI35" + NCBI34 constants.AssemblyId = "NCBI34" Other constants.AssemblyId = "Other" ) @@ -22,6 +24,10 @@ func CastToAssemblyId(text string) constants.AssemblyId { return GRCh37 case "ncbi36": return NCBI36 + case "ncbi35": + return NCBI35 + case "ncbi34": + return NCBI34 case "other": return Other default: diff --git a/src/api/models/elasticsearch.go b/src/api/models/elasticsearch.go index 12ed2a2a..4c93d57e 100644 --- a/src/api/models/elasticsearch.go +++ b/src/api/models/elasticsearch.go @@ -47,11 +47,14 @@ type Genotype struct { } type Gene struct { - Name string `json:"name"` - Nomenclature []string `json:"nomenclature"` + Nomenclature Nomenclature `json:"nomenclature"` Chrom int `json:"chrom"` Start int `json:"start"` End int `json:"end"` AssemblyId c.AssemblyId `json:"assemblyId"` - SourceUrl string `json:"sourceUrl"` +} + +type Nomenclature struct { + Names []string `json:"names"` + GeneNames []string `json:"geneNames"` } diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 50a0db1c..377cdd55 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -527,12 +527,9 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client var buf bytes.Buffer query := map[string]interface{}{ "query": map[string]interface{}{ - "wildcard": map[string]interface{}{ - "nomenclature": map[string]interface{}{ - "value": fmt.Sprintf("*%s*", term), - "boost": 1.0, - "rewrite": "constant_score", - }, + "query_string": map[string]interface{}{ + "fields": []string{"nomenclature.names", "nomenclature.genes"}, + "query": fmt.Sprintf("*%s*", term), }, }, "size": 25, // default diff --git a/src/api/services/ingestion.go b/src/api/services/ingestion.go index 31b4b88c..7c1fd014 100644 --- a/src/api/services/ingestion.go +++ b/src/api/services/ingestion.go @@ -78,9 +78,9 @@ func NewIngestionService(es *elasticsearch.Client) *IngestionService { gbi, _ := esutil.NewBulkIndexer(esutil.BulkIndexerConfig{ Index: "genes", Client: iz.ElasticsearchClient, - NumWorkers: 5, - FlushBytes: int(64), // The flush threshold in bytes (default: 5MB ?) - //FlushInterval: 30 * time.Second, // The periodic flush interval + NumWorkers: numWorkers, + //FlushBytes: int(64), // The flush threshold in bytes (default: 5MB ?) + FlushInterval: 3 * time.Second, // The periodic flush interval }) iz.GeneIngestionBulkIndexer = gbi diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index 9a107ca9..a0ea4e5e 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -1,28 +1,58 @@ package main import ( + "api/models" + "api/models/constants" + assemblyId "api/models/constants/assembly-id" + "api/models/ingest/structs" + "api/services" + "api/utils" + "crypto/tls" "fmt" "io/ioutil" "log" "net/http" "net/url" "os" + "strconv" + "strings" "sync" + + "github.com/go-gota/gota/dataframe" + "github.com/kelseyhightower/envconfig" ) func main() { - client := http.DefaultClient + // Gather environment variables + var cfg models.Config + err := envconfig.Process("", &cfg) + if err != nil { + fmt.Println(err) + os.Exit(2) + } + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + + // Service Connections: + // -- Elasticsearch + es := utils.CreateEsConnection(&cfg) + iz := services.NewIngestionService(es) + iz.Init() ucscUrl := "https://genome.ucsc.edu/cgi-bin/hgTables" // make initial call to get hgid + client := http.DefaultClient req, err := client.Get(ucscUrl) if err != nil { fmt.Printf("err:%v\n", err) } fmt.Printf("%+v", req) - // TODO: get Origin-Trial from header + + // get Origin-Trial from header var originTrial string for key, value := range req.Header { if key == "Origin-Trial" { @@ -34,7 +64,7 @@ func main() { log.Fatal("Missing originTrial") } - // TODO: get hguid from cookie + // get hguid from cookie var hguid string for _, cookie := range req.Cookies() { if cookie.Name == "hguid" { @@ -46,76 +76,211 @@ func main() { log.Fatal("Missing hguid") } + assemblyIdMap := map[constants.AssemblyId]string{ + assemblyId.GRCh38: "hg38", + assemblyId.GRCh37: "hg19", + assemblyId.NCBI36: "hg18", + assemblyId.NCBI35: "hg17", + assemblyId.NCBI34: "hg16", + } + var dbWg sync.WaitGroup - allDBs := []string{"hg38", "hg19", "hg18", "hg17", "hg16"} - for _, db := range allDBs { + for _, db := range assemblyIdMap { dbWg.Add(1) go func(_db string, _wg *sync.WaitGroup) { defer _wg.Done() - fmt.Printf("Setting up %s..\n", _db) - - // begin mining - v := url.Values{} - // v.Add("hgsid", ) - v.Add("jsh_pageVertPos", "0") - v.Add("clade", "mammal") - v.Add("org", "Human") - v.Add("db", _db) - v.Add("hgta_group", "genes") - v.Add("hgta_track", "knownGene") - v.Add("hgta_table", "knownGene") - v.Add("hgta_regionType", "genome") - // v.Add("position", "chrM:5,904-7,445") - v.Add("hgta_outputType", "primaryTable") - v.Add("boolshad.sendToGalaxy", "0") - v.Add("boolshad.sendToGreat", "0") - v.Add("hgta_outFileName", "") - v.Add("hgta_compressType", "none") - v.Add("hgta_doTopSubmit", "get output") - - miningReq, _ := http.NewRequest("GET", ucscUrl, nil) - miningReq.Header.Set("Cookie", fmt.Sprintf("hguid=%s", hguid)) - miningReq.Header.Set("Host", "genome.ucsc.edu") - miningReq.Header.Set("Origin", "https://genome.ucsc.edu") - miningReq.Header.Set("Referer", ucscUrl) - - fmt.Printf("Downloading %s..\n", _db) - req, err = client.PostForm(ucscUrl, v) - if err != nil { - fmt.Printf("err:%v\n", err) - } - fmt.Printf("%+v", req) - body, err := ioutil.ReadAll(req.Body) - if err != nil { - fmt.Printf("Error reading body: %v", err) - return - } - var file *os.File dbPath := fmt.Sprintf("%s.csv", _db) + if _, err := os.Stat(dbPath); os.IsNotExist(err) { - file, err = os.Create(dbPath) + fmt.Printf("Setting up %s..\n", _db) + + // begin mining + v := url.Values{} + // v.Add("hgsid", ) + v.Add("jsh_pageVertPos", "0") + v.Add("clade", "mammal") + v.Add("org", "Human") + v.Add("db", _db) + v.Add("hgta_group", "genes") + v.Add("hgta_track", "knownGene") + v.Add("hgta_table", "knownGene") + v.Add("hgta_regionType", "genome") + // v.Add("position", "chrM:5,904-7,445") + v.Add("hgta_outputType", "primaryTable") + v.Add("boolshad.sendToGalaxy", "0") + v.Add("boolshad.sendToGreat", "0") + v.Add("hgta_outFileName", "") + v.Add("hgta_compressType", "none") + v.Add("hgta_doTopSubmit", "get output") + + miningReq, _ := http.NewRequest("GET", ucscUrl, nil) + miningReq.Header.Set("Cookie", fmt.Sprintf("hguid=%s", hguid)) + miningReq.Header.Set("Host", "genome.ucsc.edu") + miningReq.Header.Set("Origin", "https://genome.ucsc.edu") + miningReq.Header.Set("Referer", ucscUrl) + + fmt.Printf("Downloading %s..\n", _db) + req, err = client.PostForm(ucscUrl, v) if err != nil { - fmt.Println(err) + fmt.Printf("err:%v\n", err) + } + fmt.Printf("%+v", req) + body, err := ioutil.ReadAll(req.Body) + if err != nil { + fmt.Printf("Error reading body: %v", err) return } - } else { - file, err = os.OpenFile(dbPath, os.O_RDWR, 0755) + + var file *os.File + dbPath := fmt.Sprintf("%s.csv", _db) + if _, err := os.Stat(dbPath); os.IsNotExist(err) { + file, err = os.Create(dbPath) + if err != nil { + fmt.Println(err) + return + } + } else { + file, err = os.OpenFile(dbPath, os.O_RDWR, 0755) + if err != nil { + log.Fatal(err) + } + } + defer file.Close() + + _, err = file.WriteString(string(body)) if err != nil { - log.Fatal(err) + fmt.Println(err) + return } - } - defer file.Close() - _, err = file.WriteString(string(body)) - if err != nil { - fmt.Println(err) - return + fmt.Printf("Save to file %s\n", dbPath) + } else { + fmt.Printf("%s already downloaded!\n", dbPath) } - fmt.Printf("Save to file %s\n", dbPath) }(db, &dbWg) } dbWg.Wait() + + var geneWg sync.WaitGroup + + for assId, db := range assemblyIdMap { + // Read one file at a time + dbPath := fmt.Sprintf("%s.csv", db) + + content, err := ioutil.ReadFile(dbPath) // the file is inside the local directory + if err != nil { + fmt.Println("Err") + } + + // Gota + df := dataframe.ReadCSV(strings.NewReader(string(content)), + dataframe.WithDelimiter('\t'), + dataframe.HasHeader(true)) + fmt.Printf("%s :\n", dbPath) + fmt.Println(df) + + var chromHeaderKey int + var startKey int + var endKey int + var nameHeaderKeys []int + var geneNameHeaderKeys []int + + // discover name indexes + + for n, record := range df.Records() { + if n == 0 { + for m, r := range record { + if strings.Contains(r, "name") { + nameHeaderKeys = append(nameHeaderKeys, m) + } else if strings.Contains(r, "geneName") { + geneNameHeaderKeys = append(geneNameHeaderKeys, m) + } else if strings.Contains(r, "chromStart") || strings.Contains(r, "cdsStart") { + startKey = m + } else if strings.Contains(r, "chromEnd") || strings.Contains(r, "cdsEnd") { + endKey = m + } else if r == "chrom" || r == "#chrom" { + chromHeaderKey = m + } + } + continue + } + + geneWg.Add(1) + go func(_record []string, _chromHeaderKey int, + _startKey int, _endKey int, + _nameHeaderKeys []int, _geneNameHeaderKeys []int, + _assId constants.AssemblyId, + _gwg *sync.WaitGroup) { + // fmt.Printf("row : %s\n", row) + + // create instance of a Gene structure + var names, geneNames []string + var chromStart, chromEnd int + + // discover names + for _, nk := range _nameHeaderKeys { + names = append(names, _record[nk]) + } + for _, nk := range geneNameHeaderKeys { + geneNames = append(geneNames, _record[nk]) + } + + //clean chromosome + chromosomeClean := strings.ReplaceAll(strings.ReplaceAll(_record[_chromHeaderKey], "chr", ""), "#", "") + + // skip this record if the chromosome contians "scaffolding", i.e 'chr1_something_something' + if strings.Contains(chromosomeClean, "_") { + geneWg.Done() + return + } + + // TODO: fomarmalize + // if chromosome MT, set to 0 + // if chromosome X, set to -1 + // if chromosome Y, set to -2 + var chromosome int + if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { + chromosome = 0 + } else if strings.ToUpper(chromosomeClean) == "X" { + chromosome = -1 + } else if strings.ToUpper(chromosomeClean) == "Y" { + chromosome = -2 + } else { + chromosome, _ = strconv.Atoi(chromosomeClean) + } + + // clean start/end + chromStartClean := strings.ReplaceAll(strings.ReplaceAll(_record[_startKey], ",", ""), " ", "") + chromStart, _ = strconv.Atoi(chromStartClean) + + chromEndClean := strings.ReplaceAll(strings.ReplaceAll(_record[_endKey], ",", ""), " ", "") + chromEnd, _ = strconv.Atoi(chromEndClean) + + discoveredGene := &models.Gene{ + Nomenclature: models.Nomenclature{ + Names: names, + GeneNames: geneNames, + }, + Chrom: chromosome, + Start: chromStart, + End: chromEnd, + AssemblyId: _assId, + } + + fmt.Printf("Keys :%d, %d, %d, %d, %d -- %s\n", _chromHeaderKey, _startKey, _endKey, _nameHeaderKeys, _geneNameHeaderKeys, discoveredGene) + + iz.GeneIngestionBulkIndexingQueue <- &structs.GeneIngestionQueueStructure{ + Gene: discoveredGene, + WaitGroup: _gwg, + } + }(record, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, assId, &geneWg) + + fmt.Printf("Stats : %d\n", iz.GeneIngestionBulkIndexer.Stats()) + } + geneWg.Wait() + + } } From 78d660a691a2e667bb65b21be92ec26249ce8444 Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 10:57:46 -0400 Subject: [PATCH 09/25] patched gene ingestion: - better reflects assembly structures --- src/gota-poc/main.go | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index a0ea4e5e..a4407a8e 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -180,31 +180,33 @@ func main() { dataframe.WithDelimiter('\t'), dataframe.HasHeader(true)) fmt.Printf("%s :\n", dbPath) - fmt.Println(df) - var chromHeaderKey int - var startKey int - var endKey int - var nameHeaderKeys []int - var geneNameHeaderKeys []int + var ( + chromHeaderKey = 0 + startKey = 1 + endKey = 2 + nameHeaderKeys = []int{3} + geneNameHeaderKeys []int + ) + + var columnsToPrint []string + if assId == assemblyId.GRCh38 { + // GRCh38 dataset has multiple name fields (name, name2) and + // also includes gene name fields (geneName, geneName2) + columnsToPrint = append(columnsToPrint, "#chrom", "chromStart", "chromEnd", "name", "name2", "geneName", "geneName2") + nameHeaderKeys = append(nameHeaderKeys, 4) + geneNameHeaderKeys = append(geneNameHeaderKeys, 5, 6) + } else { + columnsToPrint = append(columnsToPrint, "chrom", "txStart", "txEnd", "#name") + } + + df = df.Select(columnsToPrint) + fmt.Println(df) // discover name indexes for n, record := range df.Records() { if n == 0 { - for m, r := range record { - if strings.Contains(r, "name") { - nameHeaderKeys = append(nameHeaderKeys, m) - } else if strings.Contains(r, "geneName") { - geneNameHeaderKeys = append(geneNameHeaderKeys, m) - } else if strings.Contains(r, "chromStart") || strings.Contains(r, "cdsStart") { - startKey = m - } else if strings.Contains(r, "chromEnd") || strings.Contains(r, "cdsEnd") { - endKey = m - } else if r == "chrom" || r == "#chrom" { - chromHeaderKey = m - } - } continue } @@ -278,7 +280,7 @@ func main() { } }(record, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, assId, &geneWg) - fmt.Printf("Stats : %d\n", iz.GeneIngestionBulkIndexer.Stats()) + // fmt.Printf("Stats : %d\n", iz.GeneIngestionBulkIndexer.Stats()) } geneWg.Wait() From fd53d3375a993e2b55941e89b6646cc50dd05ee5 Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 14:15:29 -0400 Subject: [PATCH 10/25] better api querying for genes: - clearer chom values (MT, X, Y) - assemblyId query param - sorting --- src/api/mvc/genes.go | 14 ++++++- src/api/repositories/elasticsearch/main.go | 48 +++++++++++++++++++--- src/gota-poc/main.go | 15 +++---- 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index 5643422a..a6d3fc2a 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -3,6 +3,7 @@ package mvc import ( "api/contexts" "api/models" + assemblyId "api/models/constants/assembly-id" esRepo "api/repositories/elasticsearch" "fmt" "net/http" @@ -17,9 +18,18 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { term := c.QueryParam("term") - fmt.Printf("Executing wildcard genes search for term %s\n", term) + // perform wildcard search if empty/random parameter is passed + // - set to Unknown to trigger it + assId := assemblyId.Unknown + assIdQP := c.QueryParam("assemblyId") + if assemblyId.CastToAssemblyId(assIdQP) != assemblyId.Unknown { + // retrieve passed parameter if is valid + assId = assemblyId.CastToAssemblyId(assIdQP) + } + + fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s\n", term, assId) - docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term) + docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term, assId) docsHits := docs["hits"].(map[string]interface{})["hits"] allDocHits := []map[string]interface{}{} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 377cdd55..06774151 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -11,7 +11,9 @@ import ( "time" "api/models" + "api/models/constants" c "api/models/constants" + assemblyId "api/models/constants/assembly-id" gq "api/models/constants/genotype-query" s "api/models/constants/sort" z "api/models/constants/zygosity" @@ -517,22 +519,58 @@ func GetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword s return result } -func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, term string) map[string]interface{} { +func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, term string, assId constants.AssemblyId) map[string]interface{} { // TEMP: SECURITY RISK http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} // - // overall query structure + // Nomenclature Search Term + nomenclatureStringTerm := fmt.Sprintf("*%s*", term) + + // Assembly Id Search Term (wildcard by default) + assemblyIdStringTerm := "*" + if assId != assemblyId.Unknown { + assemblyIdStringTerm = string(assId) + } + var buf bytes.Buffer query := map[string]interface{}{ "query": map[string]interface{}{ - "query_string": map[string]interface{}{ - "fields": []string{"nomenclature.names", "nomenclature.genes"}, - "query": fmt.Sprintf("*%s*", term), + "bool": map[string]interface{}{ + "filter": []map[string]interface{}{{ + "bool": map[string]interface{}{ + "must": []map[string]interface{}{ + { + "query_string": map[string]interface{}{ + "fields": []string{"nomenclature.names", "nomenclature.genes"}, + "query": nomenclatureStringTerm, + }, + }, + { + "query_string": map[string]interface{}{ + "fields": []string{"assemblyId"}, + "query": assemblyIdStringTerm, + }, + }, + }, + }, + }}, }, }, "size": 25, // default + "sort": []map[string]interface{}{ + { + "chrom": map[string]interface{}{ + "order": "asc", + }, + }, + { + "start": map[string]interface{}{ + "order": "asc", + }, + }, + }, } // encode the query diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index a4407a8e..c6e9a525 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -79,9 +79,10 @@ func main() { assemblyIdMap := map[constants.AssemblyId]string{ assemblyId.GRCh38: "hg38", assemblyId.GRCh37: "hg19", - assemblyId.NCBI36: "hg18", - assemblyId.NCBI35: "hg17", - assemblyId.NCBI34: "hg16", + // SKIP + // assemblyId.NCBI36: "hg18", + // assemblyId.NCBI35: "hg17", + // assemblyId.NCBI34: "hg16", } var dbWg sync.WaitGroup @@ -241,15 +242,15 @@ func main() { // TODO: fomarmalize // if chromosome MT, set to 0 - // if chromosome X, set to -1 - // if chromosome Y, set to -2 + // if chromosome X, set to 101 + // if chromosome Y, set to 102 var chromosome int if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { chromosome = 0 } else if strings.ToUpper(chromosomeClean) == "X" { - chromosome = -1 + chromosome = 101 } else if strings.ToUpper(chromosomeClean) == "Y" { - chromosome = -2 + chromosome = 102 } else { chromosome, _ = strconv.Atoi(chromosomeClean) } From ebbb9a052f212ae47ba65d682aa6c75ca0622279 Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 16:02:48 -0400 Subject: [PATCH 11/25] patched size parameter --- src/api/mvc/genes.go | 26 +++++++++++++++++----- src/api/repositories/elasticsearch/main.go | 5 +++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index a6d3fc2a..435f4c83 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -7,6 +7,7 @@ import ( esRepo "api/repositories/elasticsearch" "fmt" "net/http" + "strconv" "github.com/labstack/echo" "github.com/mitchellh/mapstructure" @@ -16,20 +17,35 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { cfg := c.(*contexts.GohanContext).Config es := c.(*contexts.GohanContext).Es7Client + // Name search term term := c.QueryParam("term") + // Assembly ID // perform wildcard search if empty/random parameter is passed // - set to Unknown to trigger it assId := assemblyId.Unknown - assIdQP := c.QueryParam("assemblyId") - if assemblyId.CastToAssemblyId(assIdQP) != assemblyId.Unknown { + if assemblyId.CastToAssemblyId(c.QueryParam("assemblyId")) != assemblyId.Unknown { // retrieve passed parameter if is valid - assId = assemblyId.CastToAssemblyId(assIdQP) + assId = assemblyId.CastToAssemblyId(c.QueryParam("assemblyId")) } - fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s\n", term, assId) + // Size + var ( + size int = 25 + sizeCastErr error + ) + if len(c.QueryParam("size")) > 0 { + sizeQP := c.QueryParam("size") + size, sizeCastErr = strconv.Atoi(sizeQP) + if sizeCastErr != nil { + size = 25 + } + } + + fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s (max size: %d)\n", term, assId, size) - docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term, assId) + // Execute + docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term, assId, size) docsHits := docs["hits"].(map[string]interface{})["hits"] allDocHits := []map[string]interface{}{} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 06774151..66d0f063 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -519,7 +519,8 @@ func GetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword s return result } -func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, term string, assId constants.AssemblyId) map[string]interface{} { +func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, + term string, assId constants.AssemblyId, size int) map[string]interface{} { // TEMP: SECURITY RISK http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} @@ -558,7 +559,7 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client }}, }, }, - "size": 25, // default + "size": size, "sort": []map[string]interface{}{ { "chrom": map[string]interface{}{ From 8d199ba153b022da38bb835962c0d4867795874a Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 16:05:54 -0400 Subject: [PATCH 12/25] removed wikipedia scraper --- src/gene/main.go | 305 ----------------------------------------------- 1 file changed, 305 deletions(-) delete mode 100644 src/gene/main.go diff --git a/src/gene/main.go b/src/gene/main.go deleted file mode 100644 index d942c7be..00000000 --- a/src/gene/main.go +++ /dev/null @@ -1,305 +0,0 @@ -package main - -import ( - "api/models" - assemblyId "api/models/constants/assembly-id" - "api/models/ingest/structs" - "api/services" - "api/utils" - "crypto/tls" - "fmt" - "log" - "net/http" - "os" - "strconv" - "strings" - "sync" - "time" - - "github.com/PuerkitoBio/goquery" - "github.com/kelseyhightower/envconfig" -) - -const ( - // Setup - localDataDir = "data" - baseUrl = "https://en.wikipedia.org" -) - -func main() { - // Gather environment variables - var cfg models.Config - err := envconfig.Process("", &cfg) - if err != nil { - fmt.Println(err) - os.Exit(2) - } - - // TEMP: SECURITY RISK - http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} - // - - // Service Connections: - // -- Elasticsearch - es := utils.CreateEsConnection(&cfg) - iz := services.NewIngestionService(es) - iz.Init() - - // - create local data dirs - if _, err := os.Stat(localDataDir); os.IsNotExist(err) { - err := os.Mkdir(localDataDir, 0755) - if err != nil { - log.Fatal(err) - } - } - - startTime := time.Now() - - // Start here on chromosome 1 - res, err := http.Get(fmt.Sprintf("%s/wiki/Category:Genes_on_human_chromosome_1", baseUrl)) - if err != nil { - log.Fatal(err) - } - defer res.Body.Close() - if res.StatusCode != 200 { - log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) - } - doc, err := goquery.NewDocumentFromReader(res.Body) - if err != nil { - log.Fatal(err) - } - - // Pluck out the navigation bar with all the chromosomes - doc.Find("#mw-content-text > div.mw-parser-output > div.navbox > table > tbody > tr > td").Each(func(index int, item *goquery.Selection) { - - // Gather links for all chromosomes - //var chromosomeWg sync.WaitGroup - item.Find("div ul li").Each(func(index int, item *goquery.Selection) { - - // chromosomeWg.Add(1) - // go func(_cwg *sync.WaitGroup) { - // defer _cwg.Done() - - // link data - chromTitle := item.Text() - chromLinkTag := item.Find("a") - chromLink, _ := chromLinkTag.Attr("href") - - //fmt.Printf("Chromosome #%d: %s - %s\n", index, chromTitle, chromLink) - - // process: - // begin on an initial page, and verify the end of each page if - // a "next page" link exists. if so, query that and continue processing - // in the same manner - for { - chromPageRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, chromLink)) - if err != nil { - fmt.Println(err) - continue - } - defer chromPageRes.Body.Close() - if res.StatusCode != 200 { - fmt.Printf("status code error: %d %s\n", res.StatusCode, res.Status) - continue - } - - chromDoc, err := goquery.NewDocumentFromReader(chromPageRes.Body) - if err != nil { - fmt.Println(err) - continue - } - - processChromDoc(iz, chromTitle, chromDoc) - - hasNextPage := false - chromDoc.Find("#mw-pages > a").EachWithBreak(func(index int, linkItem *goquery.Selection) bool { - - if strings.Contains(strings.ToLower(linkItem.Text()), "next page") { - chromLink, _ = linkItem.Attr("href") - hasNextPage = true - - // break - return false - } - - // continue loop - return true - }) - - if !hasNextPage { - break - } - } - // gather "next page" link if available - - // }(&chromosomeWg) - }) - // chromosomeWg.Wait() - }) - - // Done - display time lapse - fmt.Printf("Process duration %s\n", time.Since(startTime)) -} - -func processChromDoc(iz *services.IngestionService, chromTitle string, chromDoc *goquery.Document) { - // Pluck out sections with the links to all the genes on this page alphabetically - var geneWg sync.WaitGroup - chromDoc.Find(".mw-category-group").Each(func(index int, categorySectionItem *goquery.Selection) { - go func(_gwg *sync.WaitGroup) { - //defer _gwg.Done() - - // Skip this category if it's a wildcard - isWildcard := false - categorySectionItem.Find("h3").Each(func(index int, h3Item *goquery.Selection) { - if h3Item.Text() == "*" { - isWildcard = true - } - }) - if isWildcard { - return - } - - // Gather links for all chromosomes - categorySectionItem.Find("ul li").Each(func(index int, item *goquery.Selection) { - _gwg.Add(1) - - // link data - geneTitle := item.Text() - geneLinkTag := item.Find("a") - geneLink, _ := geneLinkTag.Attr("href") - - // discover gene wiki page - geneRes, err := http.Get(fmt.Sprintf("%s%s", baseUrl, geneLink)) - if err != nil { - fmt.Println(err) - } - defer geneRes.Body.Close() - if geneRes.StatusCode != 200 { - fmt.Printf("status code error: %d %s\n", geneRes.StatusCode, geneRes.Status) - } - geneDoc, err := goquery.NewDocumentFromReader(geneRes.Body) - if err != nil { - fmt.Println(err) - } - - // find assembly - // TODO - - // find start and end positions - var ( - aliasesRowElement *goquery.Selection - aliasesValue []string - - humanGeneLocationTableElement *goquery.Selection - startHeaderElement *goquery.Selection - startValue int - endHeaderElement *goquery.Selection - endValue int - - assemblyIdValue string - ) - - if geneDoc == nil { - return - } - - // Find nomenclature - // - aliases - // - symbol(s) - geneDoc.Find("tr").Each(func(index int, rowElement *goquery.Selection) { - if strings.Contains(rowElement.Text(), "Aliases") { - aliasesRowElement = rowElement - - aliasesElement := aliasesRowElement.Find("td span").First() - if aliasesElement != nil { - aliasesValue = strings.Split(aliasesElement.Text(), ",") - } - } - }) - // TODO: symbol(s) - - // Find gene location - // - from start/end table - // - "map position" - geneDoc.Find("table").Each(func(index int, table *goquery.Selection) { - if strings.Contains(table.Text(), "Gene location (Human)") { - humanGeneLocationTableElement = table - return - } - }) - - if humanGeneLocationTableElement != nil { - humanGeneLocationTableElement.Find("th").Each(func(index int, rowItemHeader *goquery.Selection) { - if strings.Contains(rowItemHeader.Text(), "Start") { - startHeaderElement = rowItemHeader - return - } else if strings.Contains(rowItemHeader.Text(), "End") { - endHeaderElement = rowItemHeader - return - } - }) - - if startHeaderElement != nil { - valueELement := startHeaderElement.SiblingsFiltered("td").Last() - startClean := strings.ReplaceAll(strings.ReplaceAll(strings.Split(valueELement.Text(), "bp")[0], ",", ""), " ", "") - startValue, _ = strconv.Atoi(startClean) - } - if endHeaderElement != nil { - endValueELement := endHeaderElement.SiblingsFiltered("td").Last() - endClean := strings.ReplaceAll(strings.ReplaceAll(strings.Split(endValueELement.Text(), "bp")[0], ",", ""), " ", "") - endValue, _ = strconv.Atoi(endClean) - } - - } - // TODO: "map position" - - // Find Assembly - // Assume the references provided, if any, containing an assembly id is - // the assembly corresponding to the gene and its position - geneDoc.Find("span.reference-text").EachWithBreak(func(index int, referenceListItem *goquery.Selection) bool { - if strings.Contains(strings.ToLower(referenceListItem.Text()), "grch38") || - strings.Contains(strings.ToLower(referenceListItem.Text()), "grch37") || - strings.Contains(strings.ToLower(referenceListItem.Text()), "ncbi36") { - - // pluck out the link containing the text containing the assembly id - // (usually the first one) - refText := referenceListItem.Find("a").First() - - // split by colon to retrieve the assembly id - assemblyIdValue = strings.Split(refText.Text(), ":")[0] - - // break - return false - } - - // keep looping - return true - }) - - // store data - // (temp : store to disk) - chromosomeClean := strings.Replace(strings.Replace(chromTitle, ")", "", -1), "(", "", -1) - chromosome, _ := strconv.Atoi(chromosomeClean) - - discoveredGene := &models.Gene{ - Name: geneTitle, - Nomenclature: aliasesValue, - Chrom: chromosome, - AssemblyId: assemblyId.CastToAssemblyId(assemblyIdValue), - Start: startValue, - End: endValue, - SourceUrl: fmt.Sprintf("%s%s", baseUrl, geneLink), - } - - fmt.Println(discoveredGene) - - iz.GeneIngestionBulkIndexingQueue <- &structs.GeneIngestionQueueStructure{ - Gene: discoveredGene, - WaitGroup: _gwg, - } - - }) - }(&geneWg) - }) - geneWg.Wait() -} From be54ab0bb950bcd683e86ded3b6100a9f2ccd0dd Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 17:45:20 -0400 Subject: [PATCH 13/25] migrating to use gencode from ucsc : - simplified gene data model --- src/api/models/elasticsearch.go | 15 +- src/api/repositories/elasticsearch/main.go | 2 +- src/gota-poc/main.go | 225 ++++++--------------- 3 files changed, 63 insertions(+), 179 deletions(-) diff --git a/src/api/models/elasticsearch.go b/src/api/models/elasticsearch.go index 4c93d57e..774f94cd 100644 --- a/src/api/models/elasticsearch.go +++ b/src/api/models/elasticsearch.go @@ -47,14 +47,9 @@ type Genotype struct { } type Gene struct { - Nomenclature Nomenclature `json:"nomenclature"` - Chrom int `json:"chrom"` - Start int `json:"start"` - End int `json:"end"` - AssemblyId c.AssemblyId `json:"assemblyId"` -} - -type Nomenclature struct { - Names []string `json:"names"` - GeneNames []string `json:"geneNames"` + Name string `json:"name"` + Chrom int `json:"chrom"` + Start int `json:"start"` + End int `json:"end"` + AssemblyId c.AssemblyId `json:"assemblyId"` } diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 66d0f063..37a06c5b 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -544,7 +544,7 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client "must": []map[string]interface{}{ { "query_string": map[string]interface{}{ - "fields": []string{"nomenclature.names", "nomenclature.genes"}, + "fields": []string{"name"}, "query": nomenclatureStringTerm, }, }, diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index c6e9a525..66bac53b 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -7,18 +7,16 @@ import ( "api/models/ingest/structs" "api/services" "api/utils" + "bufio" "crypto/tls" "fmt" - "io/ioutil" "log" "net/http" - "net/url" "os" "strconv" "strings" "sync" - "github.com/go-gota/gota/dataframe" "github.com/kelseyhightower/envconfig" ) @@ -42,150 +40,36 @@ func main() { iz := services.NewIngestionService(es) iz.Init() - ucscUrl := "https://genome.ucsc.edu/cgi-bin/hgTables" - - // make initial call to get hgid - client := http.DefaultClient - req, err := client.Get(ucscUrl) - if err != nil { - fmt.Printf("err:%v\n", err) - } - fmt.Printf("%+v", req) - - // get Origin-Trial from header - var originTrial string - for key, value := range req.Header { - if key == "Origin-Trial" { - fmt.Println("Got 'Origin-Trial' Header") - originTrial = value[0] - } - } - if originTrial == "" { - log.Fatal("Missing originTrial") - } - - // get hguid from cookie - var hguid string - for _, cookie := range req.Cookies() { - if cookie.Name == "hguid" { - fmt.Println("Got 'hguid' Cookie") - hguid = cookie.Value - } - } - if hguid == "" { - log.Fatal("Missing hguid") - } - assemblyIdMap := map[constants.AssemblyId]string{ - assemblyId.GRCh38: "hg38", - assemblyId.GRCh37: "hg19", + assemblyId.GRCh38: "gencode.v38.annotation.gtf", + assemblyId.GRCh37: "gencode.v19.annotation.gtf_withproteinids", // SKIP // assemblyId.NCBI36: "hg18", // assemblyId.NCBI35: "hg17", // assemblyId.NCBI34: "hg16", } - var dbWg sync.WaitGroup - for _, db := range assemblyIdMap { - dbWg.Add(1) - - go func(_db string, _wg *sync.WaitGroup) { - defer _wg.Done() - - dbPath := fmt.Sprintf("%s.csv", _db) - - if _, err := os.Stat(dbPath); os.IsNotExist(err) { - fmt.Printf("Setting up %s..\n", _db) - - // begin mining - v := url.Values{} - // v.Add("hgsid", ) - v.Add("jsh_pageVertPos", "0") - v.Add("clade", "mammal") - v.Add("org", "Human") - v.Add("db", _db) - v.Add("hgta_group", "genes") - v.Add("hgta_track", "knownGene") - v.Add("hgta_table", "knownGene") - v.Add("hgta_regionType", "genome") - // v.Add("position", "chrM:5,904-7,445") - v.Add("hgta_outputType", "primaryTable") - v.Add("boolshad.sendToGalaxy", "0") - v.Add("boolshad.sendToGreat", "0") - v.Add("hgta_outFileName", "") - v.Add("hgta_compressType", "none") - v.Add("hgta_doTopSubmit", "get output") - - miningReq, _ := http.NewRequest("GET", ucscUrl, nil) - miningReq.Header.Set("Cookie", fmt.Sprintf("hguid=%s", hguid)) - miningReq.Header.Set("Host", "genome.ucsc.edu") - miningReq.Header.Set("Origin", "https://genome.ucsc.edu") - miningReq.Header.Set("Referer", ucscUrl) - - fmt.Printf("Downloading %s..\n", _db) - req, err = client.PostForm(ucscUrl, v) - if err != nil { - fmt.Printf("err:%v\n", err) - } - fmt.Printf("%+v", req) - body, err := ioutil.ReadAll(req.Body) - if err != nil { - fmt.Printf("Error reading body: %v", err) - return - } - - var file *os.File - dbPath := fmt.Sprintf("%s.csv", _db) - if _, err := os.Stat(dbPath); os.IsNotExist(err) { - file, err = os.Create(dbPath) - if err != nil { - fmt.Println(err) - return - } - } else { - file, err = os.OpenFile(dbPath, os.O_RDWR, 0755) - if err != nil { - log.Fatal(err) - } - } - defer file.Close() - - _, err = file.WriteString(string(body)) - if err != nil { - fmt.Println(err) - return - } - - fmt.Printf("Save to file %s\n", dbPath) - } else { - fmt.Printf("%s already downloaded!\n", dbPath) - } - - }(db, &dbWg) - } - dbWg.Wait() - var geneWg sync.WaitGroup - for assId, db := range assemblyIdMap { + for assId, fileName := range assemblyIdMap { // Read one file at a time - dbPath := fmt.Sprintf("%s.csv", db) - content, err := ioutil.ReadFile(dbPath) // the file is inside the local directory + gtfFile, err := os.Open(fileName) if err != nil { - fmt.Println("Err") + log.Fatalf("failed to open file: %s", err) } - // Gota - df := dataframe.ReadCSV(strings.NewReader(string(content)), - dataframe.WithDelimiter('\t'), - dataframe.HasHeader(true)) - fmt.Printf("%s :\n", dbPath) + defer gtfFile.Close() + + fileScanner := bufio.NewScanner(gtfFile) + fileScanner.Split(bufio.ScanLines) + + fmt.Printf("%s :\n", fileName) var ( chromHeaderKey = 0 - startKey = 1 - endKey = 2 + startKey = 3 + endKey = 4 nameHeaderKeys = []int{3} geneNameHeaderKeys []int ) @@ -201,50 +85,43 @@ func main() { columnsToPrint = append(columnsToPrint, "chrom", "txStart", "txEnd", "#name") } - df = df.Select(columnsToPrint) - fmt.Println(df) - - // discover name indexes - - for n, record := range df.Records() { - if n == 0 { + for fileScanner.Scan() { + rowText := fileScanner.Text() + if rowText[:2] == "##" { + // Skip header rows continue } geneWg.Add(1) - go func(_record []string, _chromHeaderKey int, + go func(rowText string, _chromHeaderKey int, _startKey int, _endKey int, _nameHeaderKeys []int, _geneNameHeaderKeys []int, _assId constants.AssemblyId, _gwg *sync.WaitGroup) { // fmt.Printf("row : %s\n", row) - // create instance of a Gene structure - var names, geneNames []string - var chromStart, chromEnd int - - // discover names - for _, nk := range _nameHeaderKeys { - names = append(names, _record[nk]) - } - for _, nk := range geneNameHeaderKeys { - geneNames = append(geneNames, _record[nk]) - } + var ( + chromosome int + start int + end int + geneName string + ) - //clean chromosome - chromosomeClean := strings.ReplaceAll(strings.ReplaceAll(_record[_chromHeaderKey], "chr", ""), "#", "") + rowSplits := strings.Split(rowText, "\t") - // skip this record if the chromosome contians "scaffolding", i.e 'chr1_something_something' - if strings.Contains(chromosomeClean, "_") { - geneWg.Done() + // skip this row if it's not a gene row + // i.e, if it's an exon or transcript + if rowSplits[2] != "gene" { + defer _gwg.Done() return } + //clean chromosome + chromosomeClean := strings.ReplaceAll(rowSplits[_chromHeaderKey], "chr", "") // TODO: fomarmalize // if chromosome MT, set to 0 // if chromosome X, set to 101 // if chromosome Y, set to 102 - var chromosome int if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { chromosome = 0 } else if strings.ToUpper(chromosomeClean) == "X" { @@ -256,30 +133,42 @@ func main() { } // clean start/end - chromStartClean := strings.ReplaceAll(strings.ReplaceAll(_record[_startKey], ",", ""), " ", "") - chromStart, _ = strconv.Atoi(chromStartClean) - - chromEndClean := strings.ReplaceAll(strings.ReplaceAll(_record[_endKey], ",", ""), " ", "") - chromEnd, _ = strconv.Atoi(chromEndClean) + chromStartClean := strings.ReplaceAll(strings.ReplaceAll(rowSplits[_startKey], ",", ""), " ", "") + start, _ = strconv.Atoi(chromStartClean) + + chromEndClean := strings.ReplaceAll(strings.ReplaceAll(rowSplits[_endKey], ",", ""), " ", "") + end, _ = strconv.Atoi(chromEndClean) + + dataClumpSplits := strings.Split(rowSplits[len(rowSplits)-1], ";") + for _, v := range dataClumpSplits { + if strings.Contains(v, "gene_name") { + cleanedItemSplits := strings.Split(strings.TrimSpace(strings.ReplaceAll(v, "\"", "")), " ") + if len(cleanedItemSplits) > 0 { + geneName = cleanedItemSplits[len(cleanedItemSplits)-1] + } + break + } + } + if len(geneName) == 0 { + fmt.Printf("No gene found in row %s\n", rowText) + return + } discoveredGene := &models.Gene{ - Nomenclature: models.Nomenclature{ - Names: names, - GeneNames: geneNames, - }, + Name: geneName, Chrom: chromosome, - Start: chromStart, - End: chromEnd, + Start: start, + End: end, AssemblyId: _assId, } - fmt.Printf("Keys :%d, %d, %d, %d, %d -- %s\n", _chromHeaderKey, _startKey, _endKey, _nameHeaderKeys, _geneNameHeaderKeys, discoveredGene) + //fmt.Printf("Keys :%d, %d, %d, %d, %d -- %s\n", _chromHeaderKey, _startKey, _endKey, _nameHeaderKeys, _geneNameHeaderKeys, discoveredGene) iz.GeneIngestionBulkIndexingQueue <- &structs.GeneIngestionQueueStructure{ Gene: discoveredGene, WaitGroup: _gwg, } - }(record, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, assId, &geneWg) + }(rowText, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, assId, &geneWg) // fmt.Printf("Stats : %d\n", iz.GeneIngestionBulkIndexer.Stats()) } From 3dc02880346b4d3ef498832b8db58e248e093258 Mon Sep 17 00:00:00 2001 From: brouillette Date: Wed, 29 Sep 2021 17:49:45 -0400 Subject: [PATCH 14/25] gitignoring data files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ce3ddbed..3a45bee7 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,5 @@ bin/* *.vcf *.vcf.gz -*/*/*.csv \ No newline at end of file +*/*/*.csv +*/*/*.gtf* From 91bbe33dd103adb824635ed9483bc571e72e9e60 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 11:07:38 -0400 Subject: [PATCH 15/25] added genes overview endpoint --- src/api/main.go | 1 + src/api/mvc/genes.go | 56 +++++++++ src/api/mvc/variants.go | 2 +- src/api/repositories/elasticsearch/main.go | 135 +++++++++++---------- 4 files changed, 129 insertions(+), 65 deletions(-) diff --git a/src/api/main.go b/src/api/main.go index a3a51218..f6962d81 100644 --- a/src/api/main.go +++ b/src/api/main.go @@ -156,6 +156,7 @@ func main() { e.GET("/variants/ingestion/requests", mvc.GetAllVariantIngestionRequests) // -- Genes + e.GET("/genes/overview", mvc.GetGenesOverview) e.GET("/genes/search", mvc.GenesGetByNomenclatureWildcard) // Run diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index 435f4c83..f2b6190c 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -8,6 +8,7 @@ import ( "fmt" "net/http" "strconv" + "sync" "github.com/labstack/echo" "github.com/mitchellh/mapstructure" @@ -77,3 +78,58 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { return c.JSON(http.StatusOK, geneResponseDTO) } + +func GetGenesOverview(c echo.Context) error { + + resultsMap := map[string]interface{}{} + resultsMux := sync.RWMutex{} + + var wg sync.WaitGroup + es := c.(*contexts.GohanContext).Es7Client + cfg := c.(*contexts.GohanContext).Config + + callGetBucketsByKeyword := func(key string, keyword string, _wg *sync.WaitGroup) { + defer _wg.Done() + + results := esRepo.GetGeneBucketsByKeyword(cfg, es, keyword) + + // retrieve aggregations.items.buckets + bucketsMapped := []interface{}{} + if aggs, ok := results["aggregations"]; ok { + aggsMapped := aggs.(map[string]interface{}) + + if items, ok := aggsMapped["items"]; ok { + itemsMapped := items.(map[string]interface{}) + + if buckets := itemsMapped["buckets"]; ok { + bucketsMapped = buckets.([]interface{}) + } + } + } + + individualKeyMap := map[string]interface{}{} + // push results bucket to slice + for _, bucket := range bucketsMapped { + doc_key := fmt.Sprint(bucket.(map[string]interface{})["key"]) // ensure strings and numbers are expressed as strings + doc_count := bucket.(map[string]interface{})["doc_count"] + + individualKeyMap[doc_key] = doc_count + } + + resultsMux.Lock() + resultsMap[key] = individualKeyMap + resultsMux.Unlock() + } + + // get distribution of chromosomes + wg.Add(1) + go callGetBucketsByKeyword("chromosomes", "chrom", &wg) + + // get distribution of variant IDs + wg.Add(1) + go callGetBucketsByKeyword("assemblyIDs", "assemblyId.keyword", &wg) + + wg.Wait() + + return c.JSON(http.StatusOK, resultsMap) +} diff --git a/src/api/mvc/variants.go b/src/api/mvc/variants.go index 6ffe7045..85c9c6ab 100644 --- a/src/api/mvc/variants.go +++ b/src/api/mvc/variants.go @@ -249,7 +249,7 @@ func GetVariantsOverview(c echo.Context) error { callGetBucketsByKeyword := func(key string, keyword string, _wg *sync.WaitGroup) { defer _wg.Done() - results := esRepo.GetBucketsByKeyword(cfg, es, keyword) + results := esRepo.GetVariantsBucketsByKeyword(cfg, es, keyword) // retrieve aggregations.items.buckets bucketsMapped := []interface{}{} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 37a06c5b..edb29144 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -455,70 +455,6 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, return result } -func GetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { - - // begin building the request body. - var buf bytes.Buffer - aggMap := map[string]interface{}{ - "size": "0", - "aggs": map[string]interface{}{ - "items": map[string]interface{}{ - "terms": map[string]interface{}{ - "field": keyword, - "size": "10000", // increases the number of buckets returned (default is 10) - }, - }, - }, - } - - // encode the query - if err := json.NewEncoder(&buf).Encode(aggMap); err != nil { - log.Fatalf("Error encoding aggMap: %s\n", err) - } - - if cfg.Debug { - // view the outbound elasticsearch query - myString := string(buf.Bytes()[:]) - fmt.Println(myString) - } - - // TEMP: SECURITY RISK - http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} - // - // Perform the search request. - res, searchErr := es.Search( - es.Search.WithContext(context.Background()), - es.Search.WithIndex("variants"), - es.Search.WithBody(&buf), - es.Search.WithTrackTotalHits(true), - es.Search.WithPretty(), - ) - if searchErr != nil { - fmt.Printf("Error getting response: %s\n", searchErr) - } - - defer res.Body.Close() - - resultString := res.String() - if cfg.Debug { - fmt.Println(resultString) - } - - // Declared an empty interface - result := make(map[string]interface{}) - - // Unmarshal or Decode the JSON to the interface. - // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) - umErr := json.Unmarshal([]byte(resultString[9:]), &result) - if umErr != nil { - fmt.Printf("Error unmarshalling response: %s\n", umErr) - } - - fmt.Printf("Query End: %s\n", time.Now()) - - return result -} - func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, term string, assId constants.AssemblyId, size int) map[string]interface{} { @@ -616,3 +552,74 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client return result } + +func GetVariantsBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { + return executeGetBucketsByKeyword(cfg, es, keyword, "variants") +} + +func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { + return executeGetBucketsByKeyword(cfg, es, keyword, "genes") +} + +func executeGetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string, index string) map[string]interface{} { + // begin building the request body. + var buf bytes.Buffer + aggMap := map[string]interface{}{ + "size": "0", + "aggs": map[string]interface{}{ + "items": map[string]interface{}{ + "terms": map[string]interface{}{ + "field": keyword, + "size": "10000", // increases the number of buckets returned (default is 10) + }, + }, + }, + } + + // encode the query + if err := json.NewEncoder(&buf).Encode(aggMap); err != nil { + log.Fatalf("Error encoding aggMap: %s\n", err) + } + + if cfg.Debug { + // view the outbound elasticsearch query + myString := string(buf.Bytes()[:]) + fmt.Println(myString) + } + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + // Perform the search request. + res, searchErr := es.Search( + es.Search.WithContext(context.Background()), + es.Search.WithIndex(index), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + ) + if searchErr != nil { + fmt.Printf("Error getting response: %s\n", searchErr) + } + + defer res.Body.Close() + + resultString := res.String() + if cfg.Debug { + fmt.Println(resultString) + } + + // Declared an empty interface + result := make(map[string]interface{}) + + // Unmarshal or Decode the JSON to the interface. + // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) + umErr := json.Unmarshal([]byte(resultString[9:]), &result) + if umErr != nil { + fmt.Printf("Error unmarshalling response: %s\n", umErr) + } + + fmt.Printf("Query End: %s\n", time.Now()) + + return result +} From bb7633ee508742f68cff87faa9bd33453445a277 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 11:44:01 -0400 Subject: [PATCH 16/25] updating chrom model: - as string instead of int -- allows for 'M', 'X' and 'Y' --- src/api/models/elasticsearch.go | 4 ++-- src/api/mvc/genes.go | 2 +- src/api/repositories/elasticsearch/main.go | 5 +++- src/gota-poc/main.go | 27 +++++++++++----------- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/api/models/elasticsearch.go b/src/api/models/elasticsearch.go index 774f94cd..2a3548ca 100644 --- a/src/api/models/elasticsearch.go +++ b/src/api/models/elasticsearch.go @@ -7,7 +7,7 @@ import ( var VcfHeaders = []string{"chrom", "pos", "id", "ref", "alt", "qual", "filter", "info", "format"} type Variant struct { - Chrom int `json:"chrom"` + Chrom string `json:"chrom"` Pos int `json:"pos"` Id string `json:"id"` Ref []string `json:"ref"` @@ -48,7 +48,7 @@ type Genotype struct { type Gene struct { Name string `json:"name"` - Chrom int `json:"chrom"` + Chrom string `json:"chrom"` Start int `json:"start"` End int `json:"end"` AssemblyId c.AssemblyId `json:"assemblyId"` diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index f2b6190c..c9c651dd 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -123,7 +123,7 @@ func GetGenesOverview(c echo.Context) error { // get distribution of chromosomes wg.Add(1) - go callGetBucketsByKeyword("chromosomes", "chrom", &wg) + go callGetBucketsByKeyword("chromosomes", "chrom.keyword", &wg) // get distribution of variant IDs wg.Add(1) diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index edb29144..4fdcfddc 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -498,7 +498,7 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client "size": size, "sort": []map[string]interface{}{ { - "chrom": map[string]interface{}{ + "chrom.keyword": map[string]interface{}{ "order": "asc", }, }, @@ -571,6 +571,9 @@ func executeGetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, ke "terms": map[string]interface{}{ "field": keyword, "size": "10000", // increases the number of buckets returned (default is 10) + "order": map[string]string{ + "_key": "asc", + }, }, }, }, diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index 66bac53b..97e5a0b2 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -101,10 +101,9 @@ func main() { // fmt.Printf("row : %s\n", row) var ( - chromosome int - start int - end int - geneName string + start int + end int + geneName string ) rowSplits := strings.Split(rowText, "\t") @@ -122,15 +121,15 @@ func main() { // if chromosome MT, set to 0 // if chromosome X, set to 101 // if chromosome Y, set to 102 - if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { - chromosome = 0 - } else if strings.ToUpper(chromosomeClean) == "X" { - chromosome = 101 - } else if strings.ToUpper(chromosomeClean) == "Y" { - chromosome = 102 - } else { - chromosome, _ = strconv.Atoi(chromosomeClean) - } + // if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { + // chromosome = 0 + // } else if strings.ToUpper(chromosomeClean) == "X" { + // chromosome = 101 + // } else if strings.ToUpper(chromosomeClean) == "Y" { + // chromosome = 102 + // } else { + // chromosome, _ = strconv.Atoi(chromosomeClean) + // } // clean start/end chromStartClean := strings.ReplaceAll(strings.ReplaceAll(rowSplits[_startKey], ",", ""), " ", "") @@ -156,7 +155,7 @@ func main() { discoveredGene := &models.Gene{ Name: geneName, - Chrom: chromosome, + Chrom: chromosomeClean, Start: start, End: end, AssemblyId: _assId, From 56476c2172e5f6fc22cb39830fdd64edfc929db0 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 12:22:21 -0400 Subject: [PATCH 17/25] revamped chomosome handling during ingestion --- src/api/middleware/chromosomeMiddleware.go | 13 +++----- src/api/models/constants/chromosome/main.go | 37 +++++++++++++++++++++ src/api/models/constants/main.go | 1 + src/api/mvc/variants.go | 2 +- src/api/services/ingestion.go | 22 ++++++++---- 5 files changed, 58 insertions(+), 17 deletions(-) create mode 100644 src/api/models/constants/chromosome/main.go diff --git a/src/api/middleware/chromosomeMiddleware.go b/src/api/middleware/chromosomeMiddleware.go index 67b63766..abd1324f 100644 --- a/src/api/middleware/chromosomeMiddleware.go +++ b/src/api/middleware/chromosomeMiddleware.go @@ -1,8 +1,8 @@ package middleware import ( + "api/models/constants/chromosome" "net/http" - "strconv" "github.com/labstack/echo" ) @@ -20,14 +20,9 @@ func MandateChromosomeAttribute(next echo.HandlerFunc) echo.HandlerFunc { } // verify: - i, conversionErr := strconv.Atoi(chromQP) - if conversionErr != nil { - // if invalid chromosome - return echo.NewHTTPError(http.StatusBadRequest, "Error converting 'chromosome' query parameter! Check your input") - } - - if i <= 0 { - // if chromosome less than 0 + if !chromosome.IsValidHumanChromosome(chromQP) { + // if chromosome less than 1 or greater than 23 + // and not 'x', 'y' or 'm' return echo.NewHTTPError(http.StatusBadRequest, "Please provide a 'chromosome' greater than 0!") } diff --git a/src/api/models/constants/chromosome/main.go b/src/api/models/constants/chromosome/main.go new file mode 100644 index 00000000..81f20ecb --- /dev/null +++ b/src/api/models/constants/chromosome/main.go @@ -0,0 +1,37 @@ +package chromosome + +import ( + "strconv" + "strings" +) + +func IsValidHumanChromosome(text string) bool { + + // Check if number can be represented as an int as is non-zero + chromNumber, _ := strconv.Atoi(text) + if chromNumber > 0 { + // It can.. + // Check if it in range 1-23 + if chromNumber < 24 { + return true + } + } else { + // No it can't.. + // Check if it is an X, Y.. + loweredText := strings.ToLower(text) + switch loweredText { + case "x": + return true + case "y": + return true + } + + // ..or M (MT) + switch strings.Contains(loweredText, "m") { + case true: + return true + } + } + + return false +} diff --git a/src/api/models/constants/main.go b/src/api/models/constants/main.go index 119f0663..a86e3849 100644 --- a/src/api/models/constants/main.go +++ b/src/api/models/constants/main.go @@ -7,6 +7,7 @@ package constants associated services. */ type AssemblyId string +type Chromosome string type GenotypeQuery string type SearchOperation string type SortDirection string diff --git a/src/api/mvc/variants.go b/src/api/mvc/variants.go index 85c9c6ab..8eb312f6 100644 --- a/src/api/mvc/variants.go +++ b/src/api/mvc/variants.go @@ -281,7 +281,7 @@ func GetVariantsOverview(c echo.Context) error { // get distribution of chromosomes wg.Add(1) - go callGetBucketsByKeyword("chromosomes", "chrom", &wg) + go callGetBucketsByKeyword("chromosomes", "chrom.keyword", &wg) // get distribution of variant IDs wg.Add(1) diff --git a/src/api/services/ingestion.go b/src/api/services/ingestion.go index 7c1fd014..0ed969e3 100644 --- a/src/api/services/ingestion.go +++ b/src/api/services/ingestion.go @@ -3,6 +3,7 @@ package services import ( "api/models" "api/models/constants" + "api/models/constants/chromosome" z "api/models/constants/zygosity" "api/models/ingest" "api/models/ingest/structs" @@ -21,7 +22,6 @@ import ( "net/http" "os" "path/filepath" - "regexp" "strconv" "strings" "sync" @@ -300,8 +300,6 @@ func (i *IngestionService) ProcessVcf(vcfFilePath string, drsFileId string, asse var discoveredHeaders bool = false var headers []string - nonNumericRegexp := regexp.MustCompile("[^.0-9]") - var _fileWG sync.WaitGroup for scanner.Scan() { @@ -351,11 +349,21 @@ func (i *IngestionService) ProcessVcf(vcfFilePath string, drsFileId string, asse if utils.StringInSlice(key, models.VcfHeaders) { // filter field type by column name - if key == "chrom" || key == "pos" || key == "qual" { - if key == "chrom" { - // Strip out all non-numeric characters - value = nonNumericRegexp.ReplaceAllString(value, "") + if key == "chrom" { + // Strip out all non-numeric characters + value = strings.ReplaceAll(value, "chr", "") + + // ems if value is valid chromosome + if chromosome.IsValidHumanChromosome(value) { + tmpVariantMapMutex.Lock() + tmpVariant[key] = value + tmpVariantMapMutex.Unlock() + } else { + tmpVariantMapMutex.Lock() + tmpVariant[key] = "err" + tmpVariantMapMutex.Unlock() } + } else if key == "pos" || key == "qual" { // // Convert string's to int's, if possible value, err := strconv.ParseInt(value, 10, 0) From de43206f5ea217bcaa7c0aadab84ff178d00b19f Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 12:28:55 -0400 Subject: [PATCH 18/25] begin genes testing --- src/tests/integration/api/api_test.go | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/tests/integration/api/api_test.go b/src/tests/integration/api/api_test.go index 6e9c2f43..a9097b33 100644 --- a/src/tests/integration/api/api_test.go +++ b/src/tests/integration/api/api_test.go @@ -26,6 +26,8 @@ const ( VariantsOverviewPath string = "%s/variants/overview" VariantsGetBySampleIdsPathWithQueryString string = "%s/variants/get/by/sampleId%s" IngestionRequestsPath string = "%s/variants/ingestion/requests" + + GenesOverviewPath string = "%s/genes/overview" ) func TestWithInvalidAuthenticationToken(t *testing.T) { @@ -58,6 +60,12 @@ func TestVariantsOverview(t *testing.T) { overviewJson := getVariantsOverview(t, cfg) assert.NotNil(t, overviewJson) } +func TestGenesOverview(t *testing.T) { + cfg := common.InitConfig() + + overviewJson := getGenesOverview(t, cfg) + assert.NotNil(t, overviewJson) +} func TestGetIngestionRequests(t *testing.T) { cfg := common.InitConfig() @@ -467,6 +475,43 @@ func getVariantsOverview(_t *testing.T, _cfg *models.Config) map[string]interfac return overviewRespJson } +func getGenesOverview(_t *testing.T, _cfg *models.Config) map[string]interface{} { + request, _ := http.NewRequest("GET", fmt.Sprintf(GenesOverviewPath, _cfg.Api.Url), nil) + + client := &http.Client{} + response, responseErr := client.Do(request) + assert.Nil(_t, responseErr) + + defer response.Body.Close() + + // this test (at the time of writing) will only work if authorization is disabled + shouldBe := 200 + assert.Equal(_t, shouldBe, response.StatusCode, fmt.Sprintf("Error -- Api GET / Status: %s ; Should be %d", response.Status, shouldBe)) + + // -- interpret array of ingestion requests from response + overviewRespBody, overviewRespBodyErr := ioutil.ReadAll(response.Body) + assert.Nil(_t, overviewRespBodyErr) + + // --- transform body bytes to string + overviewRespBodyString := string(overviewRespBody) + + // -- check for json error + var overviewRespJson map[string]interface{} + overviewJsonUnmarshallingError := json.Unmarshal([]byte(overviewRespBodyString), &overviewRespJson) + assert.Nil(_t, overviewJsonUnmarshallingError) + + // -- insure it's an empty array + chromosomesKey, ckOk := overviewRespJson["chromosomes"] + assert.True(_t, ckOk) + assert.NotNil(_t, chromosomesKey) + + variantIDsKey, vidkOk := overviewRespJson["assemblyIDs"] + assert.True(_t, vidkOk) + assert.NotNil(_t, variantIDsKey) + + return overviewRespJson +} + func getOverviewResultCombinations(chromosomeStruct interface{}, sampleIdsStruct interface{}, assemblyIdsStruct interface{}) [][]string { var allCombinations = [][]string{} From 485a60a19ac30c8fdea4ff4b3880631223bf6682 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 14:08:42 -0400 Subject: [PATCH 19/25] reorganizing api tests --- src/tests/integration/api/api_gene_test.go | 19 +++++++++++++++++++ .../api/{api_test.go => api_variant_test.go} | 8 -------- 2 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 src/tests/integration/api/api_gene_test.go rename src/tests/integration/api/{api_test.go => api_variant_test.go} (99%) diff --git a/src/tests/integration/api/api_gene_test.go b/src/tests/integration/api/api_gene_test.go new file mode 100644 index 00000000..e387d41f --- /dev/null +++ b/src/tests/integration/api/api_gene_test.go @@ -0,0 +1,19 @@ +package api + +import ( + "testing" + common "tests/common" + + "github.com/stretchr/testify/assert" +) + +const ( + GenesOverviewPath string = "%s/genes/overview" +) + +func TestGenesOverview(t *testing.T) { + cfg := common.InitConfig() + + overviewJson := getGenesOverview(t, cfg) + assert.NotNil(t, overviewJson) +} diff --git a/src/tests/integration/api/api_test.go b/src/tests/integration/api/api_variant_test.go similarity index 99% rename from src/tests/integration/api/api_test.go rename to src/tests/integration/api/api_variant_test.go index a9097b33..a2858f07 100644 --- a/src/tests/integration/api/api_test.go +++ b/src/tests/integration/api/api_variant_test.go @@ -26,8 +26,6 @@ const ( VariantsOverviewPath string = "%s/variants/overview" VariantsGetBySampleIdsPathWithQueryString string = "%s/variants/get/by/sampleId%s" IngestionRequestsPath string = "%s/variants/ingestion/requests" - - GenesOverviewPath string = "%s/genes/overview" ) func TestWithInvalidAuthenticationToken(t *testing.T) { @@ -60,12 +58,6 @@ func TestVariantsOverview(t *testing.T) { overviewJson := getVariantsOverview(t, cfg) assert.NotNil(t, overviewJson) } -func TestGenesOverview(t *testing.T) { - cfg := common.InitConfig() - - overviewJson := getGenesOverview(t, cfg) - assert.NotNil(t, overviewJson) -} func TestGetIngestionRequests(t *testing.T) { cfg := common.InitConfig() From e0820624e159e6e3eb73daf04de6e9c9a375f1f5 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 14:09:17 -0400 Subject: [PATCH 20/25] added chromosome q param to genes search --- src/api/mvc/genes.go | 5 ++++- src/api/repositories/elasticsearch/main.go | 14 +++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index c9c651dd..120d8047 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -18,6 +18,9 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { cfg := c.(*contexts.GohanContext).Config es := c.(*contexts.GohanContext).Es7Client + // Chromosome search term + chromosome := c.QueryParam("chromosome") + // Name search term term := c.QueryParam("term") @@ -46,7 +49,7 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s (max size: %d)\n", term, assId, size) // Execute - docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, term, assId, size) + docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, chromosome, term, assId, size) docsHits := docs["hits"].(map[string]interface{})["hits"] allDocHits := []map[string]interface{}{} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 4fdcfddc..d139d5a7 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -456,12 +456,18 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, } func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, - term string, assId constants.AssemblyId, size int) map[string]interface{} { + chromosome string, term string, assId constants.AssemblyId, size int) map[string]interface{} { // TEMP: SECURITY RISK http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} // + // Chromosome Search Term + // (wildcard by default) + chromosomeStringTerm := "*" + if chromosome != "" { + chromosomeStringTerm = chromosome + } // Nomenclature Search Term nomenclatureStringTerm := fmt.Sprintf("*%s*", term) @@ -478,6 +484,12 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client "filter": []map[string]interface{}{{ "bool": map[string]interface{}{ "must": []map[string]interface{}{ + { + "query_string": map[string]interface{}{ + "fields": []string{"chrom"}, + "query": chromosomeStringTerm, + }, + }, { "query_string": map[string]interface{}{ "fields": []string{"name"}, From 412577399b4d53337b621e5ff1ca04d8a7823792 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 15:46:01 -0400 Subject: [PATCH 21/25] enriched genes overview endpoint --- src/api/mvc/genes.go | 70 ++++++++++-------- src/api/repositories/elasticsearch/main.go | 85 ++++++++++++++++++++-- 2 files changed, 117 insertions(+), 38 deletions(-) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index 120d8047..da1567c5 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -87,52 +87,62 @@ func GetGenesOverview(c echo.Context) error { resultsMap := map[string]interface{}{} resultsMux := sync.RWMutex{} - var wg sync.WaitGroup es := c.(*contexts.GohanContext).Es7Client cfg := c.(*contexts.GohanContext).Config - callGetBucketsByKeyword := func(key string, keyword string, _wg *sync.WaitGroup) { - defer _wg.Done() + // retrieve aggregation of genes/chromosomes by assembly id + results := esRepo.GetGeneBucketsByKeyword(cfg, es) - results := esRepo.GetGeneBucketsByKeyword(cfg, es, keyword) + // begin mapping results + geneChromosomeGroupBucketsMapped := []map[string]interface{}{} - // retrieve aggregations.items.buckets - bucketsMapped := []interface{}{} - if aggs, ok := results["aggregations"]; ok { - aggsMapped := aggs.(map[string]interface{}) + // loop over top level aggregation and + // accumulated nested aggregations + if aggs, ok := results["aggregations"]; ok { + aggsMapped := aggs.(map[string]interface{}) - if items, ok := aggsMapped["items"]; ok { - itemsMapped := items.(map[string]interface{}) + if items, ok := aggsMapped["genes_assembly_id_group"]; ok { + itemsMapped := items.(map[string]interface{}) - if buckets := itemsMapped["buckets"]; ok { - bucketsMapped = buckets.([]interface{}) + if buckets := itemsMapped["buckets"]; ok { + arrayMappedBuckets := buckets.([]interface{}) + + for _, mappedBucket := range arrayMappedBuckets { + geneChromosomeGroupBucketsMapped = append(geneChromosomeGroupBucketsMapped, mappedBucket.(map[string]interface{})) } } } + } - individualKeyMap := map[string]interface{}{} - // push results bucket to slice - for _, bucket := range bucketsMapped { - doc_key := fmt.Sprint(bucket.(map[string]interface{})["key"]) // ensure strings and numbers are expressed as strings - doc_count := bucket.(map[string]interface{})["doc_count"] + individualAssemblyIdKeyMap := map[string]interface{}{} - individualKeyMap[doc_key] = doc_count - } + // iterated over each assemblyId bucket + for _, chromGroupBucketMap := range geneChromosomeGroupBucketsMapped { - resultsMux.Lock() - resultsMap[key] = individualKeyMap - resultsMux.Unlock() - } + assemblyIdKey := fmt.Sprint(chromGroupBucketMap["key"]) + + numGenesPerChromMap := map[string]interface{}{} + bucketsMapped := map[string]interface{}{} - // get distribution of chromosomes - wg.Add(1) - go callGetBucketsByKeyword("chromosomes", "chrom.keyword", &wg) + if chromGroupItem, ok := chromGroupBucketMap["genes_chromosome_group"]; ok { + chromGroupItemMapped := chromGroupItem.(map[string]interface{}) - // get distribution of variant IDs - wg.Add(1) - go callGetBucketsByKeyword("assemblyIDs", "assemblyId.keyword", &wg) + for _, chromBucket := range chromGroupItemMapped["buckets"].([]interface{}) { + doc_key := fmt.Sprint(chromBucket.(map[string]interface{})["key"]) // ensure strings and numbers are expressed as strings + doc_count := chromBucket.(map[string]interface{})["doc_count"] + + // add to list of buckets by chromosome + bucketsMapped[doc_key] = doc_count + } + } + + numGenesPerChromMap["numberOfGenesPerChromosome"] = bucketsMapped + individualAssemblyIdKeyMap[assemblyIdKey] = numGenesPerChromMap + } - wg.Wait() + resultsMux.Lock() + resultsMap["assemblyIDs"] = individualAssemblyIdKeyMap + resultsMux.Unlock() return c.JSON(http.StatusOK, resultsMap) } diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index d139d5a7..72593afd 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -566,27 +566,96 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client } func GetVariantsBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { - return executeGetBucketsByKeyword(cfg, es, keyword, "variants") -} + // begin building the request body. + var buf bytes.Buffer + aggMap := map[string]interface{}{ + "size": "0", + "aggs": map[string]interface{}{ + "items": map[string]interface{}{ + "terms": map[string]interface{}{ + "field": keyword, + "size": "10000", // increases the number of buckets returned (default is 10) + "order": map[string]string{ + "_key": "asc", + }, + }, + }, + }, + } + + // encode the query + if err := json.NewEncoder(&buf).Encode(aggMap); err != nil { + log.Fatalf("Error encoding aggMap: %s\n", err) + } + + if cfg.Debug { + // view the outbound elasticsearch query + myString := string(buf.Bytes()[:]) + fmt.Println(myString) + } + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + // Perform the search request. + res, searchErr := es.Search( + es.Search.WithContext(context.Background()), + es.Search.WithIndex("variants"), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + ) + if searchErr != nil { + fmt.Printf("Error getting response: %s\n", searchErr) + } + + defer res.Body.Close() + + resultString := res.String() + if cfg.Debug { + fmt.Println(resultString) + } + + // Declared an empty interface + result := make(map[string]interface{}) -func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { - return executeGetBucketsByKeyword(cfg, es, keyword, "genes") + // Unmarshal or Decode the JSON to the interface. + // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) + umErr := json.Unmarshal([]byte(resultString[9:]), &result) + if umErr != nil { + fmt.Printf("Error unmarshalling response: %s\n", umErr) + } + + fmt.Printf("Query End: %s\n", time.Now()) + + return result } -func executeGetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string, index string) map[string]interface{} { +func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client) map[string]interface{} { // begin building the request body. var buf bytes.Buffer aggMap := map[string]interface{}{ "size": "0", "aggs": map[string]interface{}{ - "items": map[string]interface{}{ + "genes_assembly_id_group": map[string]interface{}{ "terms": map[string]interface{}{ - "field": keyword, + "field": "assemblyId.keyword", "size": "10000", // increases the number of buckets returned (default is 10) "order": map[string]string{ "_key": "asc", }, }, + "aggs": map[string]interface{}{ + "genes_chromosome_group": map[string]interface{}{ + "terms": map[string]interface{}{ + "field": "chrom.keyword", + "size": "10000", // increases the number of buckets returned (default is 10) + "order": map[string]string{ + "_key": "asc", + }, + }, + }, + }, }, }, } @@ -608,7 +677,7 @@ func executeGetBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, ke // Perform the search request. res, searchErr := es.Search( es.Search.WithContext(context.Background()), - es.Search.WithIndex(index), + es.Search.WithIndex("genes"), es.Search.WithBody(&buf), es.Search.WithTrackTotalHits(true), es.Search.WithPretty(), From 5f5f962bf4b883a7aa27f377ca112b547d697ce8 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 16:26:06 -0400 Subject: [PATCH 22/25] chromosome query paramter cleanup: - now optional (handles wildcard) --- src/api/main.go | 12 +++++++----- src/api/middleware/chromosomeMiddleware.go | 10 +++------- src/api/mvc/genes.go | 8 ++++++-- src/api/repositories/elasticsearch/main.go | 10 ++-------- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/api/main.go b/src/api/main.go index f6962d81..9ab54a98 100644 --- a/src/api/main.go +++ b/src/api/main.go @@ -124,13 +124,13 @@ func main() { e.GET("/variants/get/by/variantId", mvc.VariantsGetByVariantId, // middleware - gam.MandateChromosomeAttribute, + gam.ValidateOptionalChromosomeAttribute, gam.MandateCalibratedBounds, gam.MandateAssemblyIdAttribute, gam.ValidatePotentialGenotypeQueryParameter) e.GET("/variants/get/by/sampleId", mvc.VariantsGetBySampleId, // middleware - gam.MandateChromosomeAttribute, + gam.ValidateOptionalChromosomeAttribute, gam.MandateCalibratedBounds, gam.MandateAssemblyIdAttribute, gam.MandateSampleIdsPluralAttribute, @@ -138,13 +138,13 @@ func main() { e.GET("/variants/count/by/variantId", mvc.VariantsCountByVariantId, // middleware - gam.MandateChromosomeAttribute, + gam.ValidateOptionalChromosomeAttribute, gam.MandateCalibratedBounds, gam.MandateAssemblyIdAttribute, gam.ValidatePotentialGenotypeQueryParameter) e.GET("/variants/count/by/sampleId", mvc.VariantsCountBySampleId, // middleware - gam.MandateChromosomeAttribute, + gam.ValidateOptionalChromosomeAttribute, gam.MandateCalibratedBounds, gam.MandateAssemblyIdAttribute, gam.MandateSampleIdsSingularAttribute, @@ -157,7 +157,9 @@ func main() { // -- Genes e.GET("/genes/overview", mvc.GetGenesOverview) - e.GET("/genes/search", mvc.GenesGetByNomenclatureWildcard) + e.GET("/genes/search", mvc.GenesGetByNomenclatureWildcard, + // middleware + gam.ValidateOptionalChromosomeAttribute) // Run e.Logger.Fatal(e.Start(":" + cfg.Api.Port)) diff --git a/src/api/middleware/chromosomeMiddleware.go b/src/api/middleware/chromosomeMiddleware.go index abd1324f..b55c4986 100644 --- a/src/api/middleware/chromosomeMiddleware.go +++ b/src/api/middleware/chromosomeMiddleware.go @@ -10,20 +10,16 @@ import ( /* Echo middleware to ensure a valid `chromosome` HTTP query parameter was provided */ -func MandateChromosomeAttribute(next echo.HandlerFunc) echo.HandlerFunc { +func ValidateOptionalChromosomeAttribute(next echo.HandlerFunc) echo.HandlerFunc { return func(c echo.Context) error { // check for chromosome query parameter chromQP := c.QueryParam("chromosome") - if len(chromQP) == 0 { - // if no id was provided return an error - return echo.NewHTTPError(http.StatusBadRequest, "Missing 'chromosome' query parameter for querying!") - } // verify: - if !chromosome.IsValidHumanChromosome(chromQP) { + if len(chromQP) > 0 && !chromosome.IsValidHumanChromosome(chromQP) { // if chromosome less than 1 or greater than 23 // and not 'x', 'y' or 'm' - return echo.NewHTTPError(http.StatusBadRequest, "Please provide a 'chromosome' greater than 0!") + return echo.NewHTTPError(http.StatusBadRequest, "Please provide a valid 'chromosome' (either 1-23, X, Y, or M)") } return next(c) diff --git a/src/api/mvc/genes.go b/src/api/mvc/genes.go index da1567c5..2f8036eb 100644 --- a/src/api/mvc/genes.go +++ b/src/api/mvc/genes.go @@ -19,7 +19,11 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { es := c.(*contexts.GohanContext).Es7Client // Chromosome search term - chromosome := c.QueryParam("chromosome") + chromosomeSearchTerm := c.QueryParam("chromosome") + if len(chromosomeSearchTerm) == 0 { + // if no chromosome is provided, assume "wildcard" search + chromosomeSearchTerm = "*" + } // Name search term term := c.QueryParam("term") @@ -49,7 +53,7 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error { fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s (max size: %d)\n", term, assId, size) // Execute - docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, chromosome, term, assId, size) + docs := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, chromosomeSearchTerm, term, assId, size) docsHits := docs["hits"].(map[string]interface{})["hits"] allDocHits := []map[string]interface{}{} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/main.go index 72593afd..d7873ec6 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/main.go @@ -456,18 +456,12 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, } func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, - chromosome string, term string, assId constants.AssemblyId, size int) map[string]interface{} { + chromosomeSearchTerm string, term string, assId constants.AssemblyId, size int) map[string]interface{} { // TEMP: SECURITY RISK http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} // - // Chromosome Search Term - // (wildcard by default) - chromosomeStringTerm := "*" - if chromosome != "" { - chromosomeStringTerm = chromosome - } // Nomenclature Search Term nomenclatureStringTerm := fmt.Sprintf("*%s*", term) @@ -487,7 +481,7 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client { "query_string": map[string]interface{}{ "fields": []string{"chrom"}, - "query": chromosomeStringTerm, + "query": chromosomeSearchTerm, }, }, { From 2936436a1fd437d2015875c46ae92c0186660543 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 16:33:34 -0400 Subject: [PATCH 23/25] separating elasticsearch repo functions --- src/api/repositories/elasticsearch/genes.go | 201 ++++++++++++++++++ .../elasticsearch/{main.go => variants.go} | 191 +---------------- 2 files changed, 206 insertions(+), 186 deletions(-) create mode 100644 src/api/repositories/elasticsearch/genes.go rename src/api/repositories/elasticsearch/{main.go => variants.go} (71%) diff --git a/src/api/repositories/elasticsearch/genes.go b/src/api/repositories/elasticsearch/genes.go new file mode 100644 index 00000000..d7133ef8 --- /dev/null +++ b/src/api/repositories/elasticsearch/genes.go @@ -0,0 +1,201 @@ +package elasticsearch + +import ( + "bytes" + "context" + "crypto/tls" + "encoding/json" + "fmt" + "log" + "net/http" + "time" + + "api/models" + "api/models/constants" + assemblyId "api/models/constants/assembly-id" + + "github.com/elastic/go-elasticsearch" +) + +const genesIndex = "genes" + +func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client) map[string]interface{} { + // begin building the request body. + var buf bytes.Buffer + aggMap := map[string]interface{}{ + "size": "0", + "aggs": map[string]interface{}{ + "genes_assembly_id_group": map[string]interface{}{ + "terms": map[string]interface{}{ + "field": "assemblyId.keyword", + "size": "10000", // increases the number of buckets returned (default is 10) + "order": map[string]string{ + "_key": "asc", + }, + }, + "aggs": map[string]interface{}{ + "genes_chromosome_group": map[string]interface{}{ + "terms": map[string]interface{}{ + "field": "chrom.keyword", + "size": "10000", // increases the number of buckets returned (default is 10) + "order": map[string]string{ + "_key": "asc", + }, + }, + }, + }, + }, + }, + } + + // encode the query + if err := json.NewEncoder(&buf).Encode(aggMap); err != nil { + log.Fatalf("Error encoding aggMap: %s\n", err) + } + + if cfg.Debug { + // view the outbound elasticsearch query + myString := string(buf.Bytes()[:]) + fmt.Println(myString) + } + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + // Perform the search request. + res, searchErr := es.Search( + es.Search.WithContext(context.Background()), + es.Search.WithIndex(genesIndex), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + ) + if searchErr != nil { + fmt.Printf("Error getting response: %s\n", searchErr) + } + + defer res.Body.Close() + + resultString := res.String() + if cfg.Debug { + fmt.Println(resultString) + } + + // Declared an empty interface + result := make(map[string]interface{}) + + // Unmarshal or Decode the JSON to the interface. + // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) + umErr := json.Unmarshal([]byte(resultString[9:]), &result) + if umErr != nil { + fmt.Printf("Error unmarshalling response: %s\n", umErr) + } + + fmt.Printf("Query End: %s\n", time.Now()) + + return result +} + +func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, + chromosomeSearchTerm string, term string, assId constants.AssemblyId, size int) map[string]interface{} { + + // TEMP: SECURITY RISK + http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + // + + // Nomenclature Search Term + nomenclatureStringTerm := fmt.Sprintf("*%s*", term) + + // Assembly Id Search Term (wildcard by default) + assemblyIdStringTerm := "*" + if assId != assemblyId.Unknown { + assemblyIdStringTerm = string(assId) + } + + var buf bytes.Buffer + query := map[string]interface{}{ + "query": map[string]interface{}{ + "bool": map[string]interface{}{ + "filter": []map[string]interface{}{{ + "bool": map[string]interface{}{ + "must": []map[string]interface{}{ + { + "query_string": map[string]interface{}{ + "fields": []string{"chrom"}, + "query": chromosomeSearchTerm, + }, + }, + { + "query_string": map[string]interface{}{ + "fields": []string{"name"}, + "query": nomenclatureStringTerm, + }, + }, + { + "query_string": map[string]interface{}{ + "fields": []string{"assemblyId"}, + "query": assemblyIdStringTerm, + }, + }, + }, + }, + }}, + }, + }, + "size": size, + "sort": []map[string]interface{}{ + { + "chrom.keyword": map[string]interface{}{ + "order": "asc", + }, + }, + { + "start": map[string]interface{}{ + "order": "asc", + }, + }, + }, + } + + // encode the query + if err := json.NewEncoder(&buf).Encode(query); err != nil { + log.Fatalf("Error encoding query: %s\n", err) + } + + if cfg.Debug { + // view the outbound elasticsearch query + myString := string(buf.Bytes()[:]) + fmt.Println(myString) + } + + // Perform the search request. + searchRes, searchErr := es.Search( + es.Search.WithContext(context.Background()), + es.Search.WithIndex(genesIndex), + es.Search.WithBody(&buf), + es.Search.WithTrackTotalHits(true), + es.Search.WithPretty(), + ) + if searchErr != nil { + fmt.Printf("Error getting response: %s\n", searchErr) + } + + defer searchRes.Body.Close() + + resultString := searchRes.String() + if cfg.Debug { + fmt.Println(resultString) + } + + // Prepare an empty interface + result := make(map[string]interface{}) + + // Unmarshal or Decode the JSON to the empty interface. + // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) + umErr := json.Unmarshal([]byte(resultString[9:]), &result) + if umErr != nil { + fmt.Printf("Error unmarshalling gene search response: %s\n", umErr) + } + + return result +} diff --git a/src/api/repositories/elasticsearch/main.go b/src/api/repositories/elasticsearch/variants.go similarity index 71% rename from src/api/repositories/elasticsearch/main.go rename to src/api/repositories/elasticsearch/variants.go index d7873ec6..7da69be4 100644 --- a/src/api/repositories/elasticsearch/main.go +++ b/src/api/repositories/elasticsearch/variants.go @@ -11,9 +11,7 @@ import ( "time" "api/models" - "api/models/constants" c "api/models/constants" - assemblyId "api/models/constants/assembly-id" gq "api/models/constants/genotype-query" s "api/models/constants/sort" z "api/models/constants/zygosity" @@ -21,6 +19,8 @@ import ( "github.com/elastic/go-elasticsearch" ) +const variantsIndex = "variants" + func GetDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, es *elasticsearch.Client, chromosome string, lowerBound int, upperBound int, variantId string, sampleId string, @@ -215,7 +215,7 @@ func GetDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, e // Perform the search request. res, searchErr := es.Search( es.Search.WithContext(context.Background()), - es.Search.WithIndex("variants"), + es.Search.WithIndex(variantsIndex), es.Search.WithBody(&buf), es.Search.WithTrackTotalHits(true), es.Search.WithPretty(), @@ -425,7 +425,7 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, // Perform the search request. res, searchErr := es.Count( es.Count.WithContext(context.Background()), - es.Count.WithIndex("variants"), + es.Count.WithIndex(variantsIndex), es.Count.WithBody(&buf), es.Count.WithPretty(), ) @@ -455,110 +455,6 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, return result } -func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client, - chromosomeSearchTerm string, term string, assId constants.AssemblyId, size int) map[string]interface{} { - - // TEMP: SECURITY RISK - http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} - // - - // Nomenclature Search Term - nomenclatureStringTerm := fmt.Sprintf("*%s*", term) - - // Assembly Id Search Term (wildcard by default) - assemblyIdStringTerm := "*" - if assId != assemblyId.Unknown { - assemblyIdStringTerm = string(assId) - } - - var buf bytes.Buffer - query := map[string]interface{}{ - "query": map[string]interface{}{ - "bool": map[string]interface{}{ - "filter": []map[string]interface{}{{ - "bool": map[string]interface{}{ - "must": []map[string]interface{}{ - { - "query_string": map[string]interface{}{ - "fields": []string{"chrom"}, - "query": chromosomeSearchTerm, - }, - }, - { - "query_string": map[string]interface{}{ - "fields": []string{"name"}, - "query": nomenclatureStringTerm, - }, - }, - { - "query_string": map[string]interface{}{ - "fields": []string{"assemblyId"}, - "query": assemblyIdStringTerm, - }, - }, - }, - }, - }}, - }, - }, - "size": size, - "sort": []map[string]interface{}{ - { - "chrom.keyword": map[string]interface{}{ - "order": "asc", - }, - }, - { - "start": map[string]interface{}{ - "order": "asc", - }, - }, - }, - } - - // encode the query - if err := json.NewEncoder(&buf).Encode(query); err != nil { - log.Fatalf("Error encoding query: %s\n", err) - } - - if cfg.Debug { - // view the outbound elasticsearch query - myString := string(buf.Bytes()[:]) - fmt.Println(myString) - } - - // Perform the search request. - searchRes, searchErr := es.Search( - es.Search.WithContext(context.Background()), - es.Search.WithIndex("genes"), - es.Search.WithBody(&buf), - es.Search.WithTrackTotalHits(true), - es.Search.WithPretty(), - ) - if searchErr != nil { - fmt.Printf("Error getting response: %s\n", searchErr) - } - - defer searchRes.Body.Close() - - resultString := searchRes.String() - if cfg.Debug { - fmt.Println(resultString) - } - - // Prepare an empty interface - result := make(map[string]interface{}) - - // Unmarshal or Decode the JSON to the empty interface. - // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) - umErr := json.Unmarshal([]byte(resultString[9:]), &result) - if umErr != nil { - fmt.Printf("Error unmarshalling gene search response: %s\n", umErr) - } - - return result -} - func GetVariantsBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, keyword string) map[string]interface{} { // begin building the request body. var buf bytes.Buffer @@ -594,84 +490,7 @@ func GetVariantsBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client, k // Perform the search request. res, searchErr := es.Search( es.Search.WithContext(context.Background()), - es.Search.WithIndex("variants"), - es.Search.WithBody(&buf), - es.Search.WithTrackTotalHits(true), - es.Search.WithPretty(), - ) - if searchErr != nil { - fmt.Printf("Error getting response: %s\n", searchErr) - } - - defer res.Body.Close() - - resultString := res.String() - if cfg.Debug { - fmt.Println(resultString) - } - - // Declared an empty interface - result := make(map[string]interface{}) - - // Unmarshal or Decode the JSON to the interface. - // Known bug: response comes back with a preceding '[200 OK] ' which needs trimming (hence the [9:]) - umErr := json.Unmarshal([]byte(resultString[9:]), &result) - if umErr != nil { - fmt.Printf("Error unmarshalling response: %s\n", umErr) - } - - fmt.Printf("Query End: %s\n", time.Now()) - - return result -} - -func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client) map[string]interface{} { - // begin building the request body. - var buf bytes.Buffer - aggMap := map[string]interface{}{ - "size": "0", - "aggs": map[string]interface{}{ - "genes_assembly_id_group": map[string]interface{}{ - "terms": map[string]interface{}{ - "field": "assemblyId.keyword", - "size": "10000", // increases the number of buckets returned (default is 10) - "order": map[string]string{ - "_key": "asc", - }, - }, - "aggs": map[string]interface{}{ - "genes_chromosome_group": map[string]interface{}{ - "terms": map[string]interface{}{ - "field": "chrom.keyword", - "size": "10000", // increases the number of buckets returned (default is 10) - "order": map[string]string{ - "_key": "asc", - }, - }, - }, - }, - }, - }, - } - - // encode the query - if err := json.NewEncoder(&buf).Encode(aggMap); err != nil { - log.Fatalf("Error encoding aggMap: %s\n", err) - } - - if cfg.Debug { - // view the outbound elasticsearch query - myString := string(buf.Bytes()[:]) - fmt.Println(myString) - } - - // TEMP: SECURITY RISK - http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} - // - // Perform the search request. - res, searchErr := es.Search( - es.Search.WithContext(context.Background()), - es.Search.WithIndex("genes"), + es.Search.WithIndex(variantsIndex), es.Search.WithBody(&buf), es.Search.WithTrackTotalHits(true), es.Search.WithPretty(), From 2e376d68aa20a00495eb746974585dcd474adfe8 Mon Sep 17 00:00:00 2001 From: brouillette Date: Thu, 30 Sep 2021 17:33:08 -0400 Subject: [PATCH 24/25] upgrading genes endpoint testing --- src/tests/integration/api/api_gene_test.go | 169 +++++++++++++++++- src/tests/integration/api/api_variant_test.go | 37 ---- 2 files changed, 168 insertions(+), 38 deletions(-) diff --git a/src/tests/integration/api/api_gene_test.go b/src/tests/integration/api/api_gene_test.go index e387d41f..4f2dcf3c 100644 --- a/src/tests/integration/api/api_gene_test.go +++ b/src/tests/integration/api/api_gene_test.go @@ -1,14 +1,26 @@ package api import ( + "api/models" + c "api/models/constants" + a "api/models/constants/assembly-id" + "api/models/constants/chromosome" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "sync" "testing" common "tests/common" + . "github.com/ahmetb/go-linq" + "github.com/stretchr/testify/assert" ) const ( - GenesOverviewPath string = "%s/genes/overview" + GenesOverviewPath string = "%s/genes/overview" + GenesSearchPathWithQueryString string = "%s/genes/search%s" ) func TestGenesOverview(t *testing.T) { @@ -17,3 +29,158 @@ func TestGenesOverview(t *testing.T) { overviewJson := getGenesOverview(t, cfg) assert.NotNil(t, overviewJson) } + +func TestCanGetGenesByAssemblyIdAndChromosome(t *testing.T) { + // retrieve all possible combinations of responses + allDtoResponses := getAllDtosOfVariousCombinationsOfGenesAndAssemblyIDs(t) + + // assert the dto response slice is plentiful + assert.NotNil(t, allDtoResponses) + + From(allDtoResponses).ForEachT(func(dto models.GenesResponseDTO) { + // ensure there are results in the response + assert.NotNil(t, dto.Results) + + // check the resulting data + From(dto.Results).ForEachT(func(gene models.Gene) { + // ensure the gene is legit + assert.NotNil(t, gene.Name) + assert.NotNil(t, gene.AssemblyId) + assert.True(t, chromosome.IsValidHumanChromosome(gene.Chrom)) + assert.Greater(t, gene.End, gene.Start) + }) + }) +} + +func getAllDtosOfVariousCombinationsOfGenesAndAssemblyIDs(_t *testing.T) []models.GenesResponseDTO { + cfg := common.InitConfig() + + // retrieve the overview + overviewJson := getGenesOverview(_t, cfg) + + // ensure the response is valid + // TODO: error check instead of nil check + assert.NotNil(_t, overviewJson) + + // initialize a common slice in which to + // accumulate al responses asynchronously + allDtoResponses := []models.GenesResponseDTO{} + allDtoResponsesMux := sync.RWMutex{} + + var combWg sync.WaitGroup + for _, assemblyIdOverviewBucket := range overviewJson { + + // range over all assembly IDs + for assemblyIdString, genesPerChromosomeBucket := range assemblyIdOverviewBucket.(map[string]interface{}) { + + fmt.Println(assemblyIdString) + fmt.Println(genesPerChromosomeBucket) + + castedBucket := genesPerChromosomeBucket.(map[string]interface{})["numberOfGenesPerChromosome"].(map[string]interface{}) + + for chromosomeString, _ := range castedBucket { // _ = number of genes (unused) + + combWg.Add(1) + go func(_wg *sync.WaitGroup, _assemblyIdString string, _chromosomeString string) { + defer _wg.Done() + + assemblyId := a.CastToAssemblyId(_assemblyIdString) + + // make the call + dto := buildQueryAndMakeGetGenesCall(_chromosomeString, "", assemblyId, _t, cfg) + + // ensure there is data returned + // (we'd be making a bad query, otherwise) + assert.True(_t, len(dto.Results) > 0) + + // accumulate all response objects + // to a common slice in an + // asynchronous-safe manner + allDtoResponsesMux.Lock() + allDtoResponses = append(allDtoResponses, dto) + allDtoResponsesMux.Unlock() + }(&combWg, assemblyIdString, chromosomeString) + } + + } + + } + combWg.Wait() + + return allDtoResponses +} + +func getGenesOverview(_t *testing.T, _cfg *models.Config) map[string]interface{} { + request, _ := http.NewRequest("GET", fmt.Sprintf(GenesOverviewPath, _cfg.Api.Url), nil) + + client := &http.Client{} + response, responseErr := client.Do(request) + assert.Nil(_t, responseErr) + + defer response.Body.Close() + + // this test (at the time of writing) will only work if authorization is disabled + shouldBe := 200 + assert.Equal(_t, shouldBe, response.StatusCode, fmt.Sprintf("Error -- Api GET / Status: %s ; Should be %d", response.Status, shouldBe)) + + // -- interpret array of ingestion requests from response + overviewRespBody, overviewRespBodyErr := ioutil.ReadAll(response.Body) + assert.Nil(_t, overviewRespBodyErr) + + // --- transform body bytes to string + overviewRespBodyString := string(overviewRespBody) + + // -- check for json error + var overviewRespJson map[string]interface{} + overviewJsonUnmarshallingError := json.Unmarshal([]byte(overviewRespBodyString), &overviewRespJson) + assert.Nil(_t, overviewJsonUnmarshallingError) + + // -- insure it's an empty array + // chromosomesKey, ckOk := overviewRespJson["chromosomes"] + // assert.True(_t, ckOk) + // assert.NotNil(_t, chromosomesKey) + + assemblyIDsKey, assidkOk := overviewRespJson["assemblyIDs"] + assert.True(_t, assidkOk) + assert.NotNil(_t, assemblyIDsKey) + + return overviewRespJson +} + +func buildQueryAndMakeGetGenesCall(chromosome string, term string, assemblyId c.AssemblyId, _t *testing.T, _cfg *models.Config) models.GenesResponseDTO { + + queryString := fmt.Sprintf("?chromosome=%s&assemblyId=%s", chromosome, assemblyId) + + url := fmt.Sprintf(GenesSearchPathWithQueryString, _cfg.Api.Url, queryString) + + return getGetGenesCall(url, _t) +} + +func getGetGenesCall(url string, _t *testing.T) models.GenesResponseDTO { + fmt.Printf("Calling %s\n", url) + request, _ := http.NewRequest("GET", url, nil) + + client := &http.Client{} + response, responseErr := client.Do(request) + assert.Nil(_t, responseErr) + + defer response.Body.Close() + + // this test (at the time of writing) will only work if authorization is disabled + shouldBe := 200 + assert.Equal(_t, shouldBe, response.StatusCode, fmt.Sprintf("Error -- Api GET %s Status: %s ; Should be %d", url, response.Status, shouldBe)) + + // -- interpret array of ingestion requests from response + respBody, respBodyErr := ioutil.ReadAll(response.Body) + assert.Nil(_t, respBodyErr) + + // --- transform body bytes to string + respBodyString := string(respBody) + + // -- convert to json and check for error + var respDto models.GenesResponseDTO + jsonUnmarshallingError := json.Unmarshal([]byte(respBodyString), &respDto) + assert.Nil(_t, jsonUnmarshallingError) + + return respDto +} diff --git a/src/tests/integration/api/api_variant_test.go b/src/tests/integration/api/api_variant_test.go index a2858f07..6e9c2f43 100644 --- a/src/tests/integration/api/api_variant_test.go +++ b/src/tests/integration/api/api_variant_test.go @@ -467,43 +467,6 @@ func getVariantsOverview(_t *testing.T, _cfg *models.Config) map[string]interfac return overviewRespJson } -func getGenesOverview(_t *testing.T, _cfg *models.Config) map[string]interface{} { - request, _ := http.NewRequest("GET", fmt.Sprintf(GenesOverviewPath, _cfg.Api.Url), nil) - - client := &http.Client{} - response, responseErr := client.Do(request) - assert.Nil(_t, responseErr) - - defer response.Body.Close() - - // this test (at the time of writing) will only work if authorization is disabled - shouldBe := 200 - assert.Equal(_t, shouldBe, response.StatusCode, fmt.Sprintf("Error -- Api GET / Status: %s ; Should be %d", response.Status, shouldBe)) - - // -- interpret array of ingestion requests from response - overviewRespBody, overviewRespBodyErr := ioutil.ReadAll(response.Body) - assert.Nil(_t, overviewRespBodyErr) - - // --- transform body bytes to string - overviewRespBodyString := string(overviewRespBody) - - // -- check for json error - var overviewRespJson map[string]interface{} - overviewJsonUnmarshallingError := json.Unmarshal([]byte(overviewRespBodyString), &overviewRespJson) - assert.Nil(_t, overviewJsonUnmarshallingError) - - // -- insure it's an empty array - chromosomesKey, ckOk := overviewRespJson["chromosomes"] - assert.True(_t, ckOk) - assert.NotNil(_t, chromosomesKey) - - variantIDsKey, vidkOk := overviewRespJson["assemblyIDs"] - assert.True(_t, vidkOk) - assert.NotNil(_t, variantIDsKey) - - return overviewRespJson -} - func getOverviewResultCombinations(chromosomeStruct interface{}, sampleIdsStruct interface{}, assemblyIdsStruct interface{}) [][]string { var allCombinations = [][]string{} From 8b8f15b62e61e4c17ec78f98ff1f29c57362ab56 Mon Sep 17 00:00:00 2001 From: brouillette Date: Fri, 1 Oct 2021 17:49:19 -0400 Subject: [PATCH 25/25] downloading gene source files upon ingest --- src/api/utils/connections.go | 2 +- src/gota-poc/main.go | 115 ++++++++++++++++++--- src/tests/integration/api/api_gene_test.go | 4 - 3 files changed, 100 insertions(+), 21 deletions(-) diff --git a/src/api/utils/connections.go b/src/api/utils/connections.go index 79d2e789..1e350cec 100644 --- a/src/api/utils/connections.go +++ b/src/api/utils/connections.go @@ -39,7 +39,7 @@ func CreateEsConnection(cfg *models.Config) *es7.Client { es7Client, _ := es7.NewClient(esCfg) - fmt.Printf("Using ES7 Client Version %s", es7.Version) + fmt.Printf("Using ES7 Client Version %s\n", es7.Version) return es7Client } diff --git a/src/gota-poc/main.go b/src/gota-poc/main.go index 97e5a0b2..ae812f4b 100644 --- a/src/gota-poc/main.go +++ b/src/gota-poc/main.go @@ -4,14 +4,18 @@ import ( "api/models" "api/models/constants" assemblyId "api/models/constants/assembly-id" + "api/models/constants/chromosome" "api/models/ingest/structs" "api/services" "api/utils" "bufio" + "compress/gzip" "crypto/tls" "fmt" + "io" "log" "net/http" + "net/url" "os" "strconv" "strings" @@ -42,12 +46,20 @@ func main() { assemblyIdMap := map[constants.AssemblyId]string{ assemblyId.GRCh38: "gencode.v38.annotation.gtf", - assemblyId.GRCh37: "gencode.v19.annotation.gtf_withproteinids", + assemblyId.GRCh37: "gencode.v19.annotation.gtf", // SKIP // assemblyId.NCBI36: "hg18", // assemblyId.NCBI35: "hg17", // assemblyId.NCBI34: "hg16", } + assemblyIdGTFUrlMap := map[constants.AssemblyId]string{ + assemblyId.GRCh38: "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz", + assemblyId.GRCh37: "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz", + // SKIP + // assemblyId.NCBI36: "", + // assemblyId.NCBI35: "", + // assemblyId.NCBI34: "", + } var geneWg sync.WaitGroup @@ -56,7 +68,85 @@ func main() { gtfFile, err := os.Open(fileName) if err != nil { - log.Fatalf("failed to open file: %s", err) + // log.Fatalf("failed to open file: %s", err) + // Download the file + fullURLFile := assemblyIdGTFUrlMap[assId] + + // Build fileName from fullPath + fileURL, err := url.Parse(fullURLFile) + if err != nil { + log.Fatal(err) + } + path := fileURL.Path + segments := strings.Split(path, "/") + fileName = segments[len(segments)-1] + + // Create blank file + file, err := os.Create(fileName) + if err != nil { + log.Fatal(err) + } + client := http.Client{ + CheckRedirect: func(r *http.Request, via []*http.Request) error { + r.URL.Opaque = r.URL.Path + return nil + }, + } + fmt.Printf("Downloading file %s ...\n", fileName) + + // Put content on file + resp, err := client.Get(fullURLFile) + if err != nil { + log.Fatal(err) + } + defer resp.Body.Close() + + size, err := io.Copy(file, resp.Body) + if err != nil { + log.Fatal(err) + } + defer file.Close() + + fmt.Printf("Downloaded a file %s with size %d\n", fileName, size) + + fmt.Printf("Unzipping %s...\n", fileName) + gzipfile, err := os.Open(fileName) + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + reader, err := gzip.NewReader(gzipfile) + if err != nil { + fmt.Println(err) + os.Exit(1) + } + defer reader.Close() + + newfilename := strings.TrimSuffix(fileName, ".gz") + + writer, err := os.Create(newfilename) + + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + defer writer.Close() + + if _, err = io.Copy(writer, reader); err != nil { + fmt.Println(err) + os.Exit(1) + } + + fmt.Printf("Opening %s\n", newfilename) + gtfFile, _ = os.Open(newfilename) + + fmt.Printf("Deleting %s\n", fileName) + err = os.Remove(fileName) + if err != nil { + fmt.Println(err) + } } defer gtfFile.Close() @@ -64,7 +154,7 @@ func main() { fileScanner := bufio.NewScanner(gtfFile) fileScanner.Split(bufio.ScanLines) - fmt.Printf("%s :\n", fileName) + fmt.Printf("Ingesting %s\n", string(assId)) var ( chromHeaderKey = 0 @@ -117,19 +207,12 @@ func main() { //clean chromosome chromosomeClean := strings.ReplaceAll(rowSplits[_chromHeaderKey], "chr", "") - // TODO: fomarmalize - // if chromosome MT, set to 0 - // if chromosome X, set to 101 - // if chromosome Y, set to 102 - // if strings.Contains(strings.ToUpper(chromosomeClean), "MT") { - // chromosome = 0 - // } else if strings.ToUpper(chromosomeClean) == "X" { - // chromosome = 101 - // } else if strings.ToUpper(chromosomeClean) == "Y" { - // chromosome = 102 - // } else { - // chromosome, _ = strconv.Atoi(chromosomeClean) - // } + + if !chromosome.IsValidHumanChromosome(chromosomeClean) { + defer _gwg.Done() + return + } + // http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz // clean start/end chromStartClean := strings.ReplaceAll(strings.ReplaceAll(rowSplits[_startKey], ",", ""), " ", "") diff --git a/src/tests/integration/api/api_gene_test.go b/src/tests/integration/api/api_gene_test.go index 4f2dcf3c..103f3bc2 100644 --- a/src/tests/integration/api/api_gene_test.go +++ b/src/tests/integration/api/api_gene_test.go @@ -136,10 +136,6 @@ func getGenesOverview(_t *testing.T, _cfg *models.Config) map[string]interface{} assert.Nil(_t, overviewJsonUnmarshallingError) // -- insure it's an empty array - // chromosomesKey, ckOk := overviewRespJson["chromosomes"] - // assert.True(_t, ckOk) - // assert.NotNil(_t, chromosomesKey) - assemblyIDsKey, assidkOk := overviewRespJson["assemblyIDs"] assert.True(_t, assidkOk) assert.NotNil(_t, assemblyIDsKey)