From 38f71b3452228aef493c7bdf80abcc40db9fc2d7 Mon Sep 17 00:00:00 2001 From: Jeremy Jay Date: Tue, 19 Mar 2019 12:38:45 -0400 Subject: [PATCH] refactor data package --- data/data.go | 80 +++++++++++++ data/fetch_uniprot.go | 149 ------------------------ data/pfam.go | 71 ++++++++++++ data/{fetch_data.go => uniprot.go} | 180 +++++++++++++++++------------ 4 files changed, 260 insertions(+), 220 deletions(-) create mode 100644 data/data.go delete mode 100644 data/fetch_uniprot.go create mode 100644 data/pfam.go rename data/{fetch_data.go => uniprot.go} (53%) diff --git a/data/data.go b/data/data.go new file mode 100644 index 0000000..000f614 --- /dev/null +++ b/data/data.go @@ -0,0 +1,80 @@ +// +// Lollipops diagram generation framework for genetic variations. +// Copyright (C) 2015 Jeremy Jay +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +package data + +import ( + "encoding/json" + "os" + "strings" +) + +// MotifNames has human-readable names +// - mostly from http://pfam.xfam.org/help#tabview=tab9 +var MotifNames = map[string]string{ + "disorder": "Disordered region", + "low_complexity": "Low complexity region", + "sig_p": "Signal peptide region", + "coiled_coil": "Coiled-coil motif", + "transmembrane": "Transmembrane region", +} + +// GraphicFeature is a generic representation of various feature responses +type GraphicFeature struct { + Color string `json:"colour"` + Text string `json:"text"` + Type string `json:"type"` + Start json.Number `json:"start"` + End json.Number `json:"end"` + Link string `json:"href"` + Metadata GraphicMetadata `json:"metadata"` +} + +type GraphicMetadata struct { + Description string `json:"description"` + Identifier string `json:"identifier"` +} + +type GraphicResponse struct { + Length json.Number `json:"length"` + Metadata GraphicMetadata `json:"metadata"` + Motifs []GraphicFeature `json:"motifs"` + Regions []GraphicFeature `json:"regions"` +} + +func GetLocalGraphicData(filename string) (*GraphicResponse, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + pf := &GraphicResponse{} + err = json.NewDecoder(f).Decode(pf) + f.Close() + for i, x := range pf.Motifs { + if x.Link != "" && !strings.Contains(x.Link, "://") { + x.Link = "http://pfam.xfam.org" + x.Link + pf.Motifs[i] = x + } + } + for i, x := range pf.Regions { + if x.Link != "" && !strings.Contains(x.Link, "://") { + x.Link = "http://pfam.xfam.org" + x.Link + pf.Regions[i] = x + } + } + return pf, err +} diff --git a/data/fetch_uniprot.go b/data/fetch_uniprot.go deleted file mode 100644 index fda6ed1..0000000 --- a/data/fetch_uniprot.go +++ /dev/null @@ -1,149 +0,0 @@ -package data - -import ( - "bytes" - "encoding/json" - "fmt" - "io/ioutil" - "log" - "net" - "net/http" - "os" - "regexp" - "strings" -) - -const UniprotDataURL = "https://www.uniprot.org/uniprot/%s.txt" - -var 
defaultUniprotFeatures = map[string][]string{ - "COILED": {"motif", "coiled_coil", "#9cff00"}, - "SIGNAL": {"motif", "sig_p", "#ff9c00"}, - "TRANSMEM": {"motif", "transmembrane", "#ff0000"}, - "COMPBIAS": {"motif", "low_complexity", "#00ffff"}, - - "DNA_BIND": {"region", "dna_bind", "#ff5353"}, - "ZN_FING": {"region", "zn_fing", "#2dcf00"}, - "CA_BIND": {"region", "ca_bind", "#86bcff"}, - - "MOTIF": {"region", "motif", "#1fc01f"}, - "REPEAT": {"region", "repeat", "#1fc01f"}, - "DOMAIN": {"region", "domain", "#9999ff"}, -} - -func getValueForKey(line, key string) string { - parts := strings.Split(line, ";") - for _, s := range parts { - p := strings.SplitN(s, "=", 2) - if p[0] == key { - return strings.TrimSpace(p[1]) - } - } - return "" -} - -func GetUniprotGraphicData(accession string) (*GraphicResponse, error) { - queryURL := fmt.Sprintf(UniprotDataURL, accession) - resp, err := http.Get(queryURL) - if err != nil { - if err, ok := err.(net.Error); ok && err.Timeout() { - fmt.Fprintf(os.Stderr, "Unable to connect to Uniprot. 
Check your internet connection or try again later.") - os.Exit(1) - } - return nil, err - } - respBytes, err := ioutil.ReadAll(resp.Body) - if err != nil { - return nil, err - } - if resp.StatusCode != 200 { - return nil, fmt.Errorf("pfam error: %s", resp.Status) - } - - nouncertain := regexp.MustCompile("[?<>]") - trimTags := regexp.MustCompile("[{][^}]*[}]") - minisplit := regexp.MustCompile("[;.]") - - gd := &GraphicResponse{} - for _, bline := range bytes.Split(respBytes, []byte("\n")) { - if len(bline) < 5 { - continue - } - key := string(bytes.TrimSpace(bline[:5])) - line := string(bline[5:]) - switch key { - case "GN": - // GN Name=CTNNB1; Synonyms=CTNNB; ORFNames=OK/SW-cl.35, PRO2286; - sym := getValueForKey(line, "Name") - if sym != "" { - gd.Metadata.Identifier = sym - } - case "DE": - // DE RecName: Full=Catenin beta-1; - if !strings.HasPrefix(line, "RecName: ") { - continue - } - desc := getValueForKey(line[9:], "Full") - if desc != "" { - gd.Metadata.Description = desc - } - case "SQ": - // SQ SEQUENCE 781 AA; 85497 MW; CB78F165A3EEF86E CRC64; - parts := strings.Split(line, ";") - for _, p := range parts { - if !strings.HasPrefix(p, "SEQUENCE") { - continue - } - seqLen := strings.TrimSpace(strings.TrimSuffix(p[8:], "AA")) - gd.Length = json.Number(seqLen) - break - } - //////////////////////////// - case "FT": - /// https://web.expasy.org/docs/userman.html#FT_line - if strings.TrimSpace(line[:29]) == "" { - // continuation of previous line's description (ignored) - continue - } - featureType := strings.TrimSpace(line[:8]) // Key name - fromPos := strings.TrimSpace(nouncertain.ReplaceAllString(line[9:15], "")) // 'From' endpoint - toPos := strings.TrimSpace(nouncertain.ReplaceAllString(line[16:22], "")) // 'To' endpoint - desc := strings.TrimSpace(line[29:]) // Description - - if fromPos == "" || toPos == "" || fromPos == toPos { - // skip any unknown positions or point features - continue - } - fdata, ok := defaultUniprotFeatures[featureType] - if !ok 
{ - continue - } - - desc = strings.TrimSpace(trimTags.ReplaceAllString(desc, "")) - shortDesc := desc - if p := minisplit.Split(desc, 2); len(p) == 2 { - shortDesc = strings.TrimSpace(p[0]) - } - - feat := GraphicFeature{ - Color: fdata[2], - Text: strings.Trim(shortDesc, ". "), - Type: fdata[1], - Start: json.Number(fromPos), - End: json.Number(toPos), - Metadata: GraphicMetadata{ - Description: strings.Trim(shortDesc, ". "), - }, - } - switch fdata[0] { - case "region": - gd.Regions = append(gd.Regions, feat) - case "motif": - gd.Motifs = append(gd.Motifs, feat) - default: - log.Println("unknown feature set", fdata[0]) - } - } - } - - return gd, nil -} diff --git a/data/pfam.go b/data/pfam.go new file mode 100644 index 0000000..00562d9 --- /dev/null +++ b/data/pfam.go @@ -0,0 +1,71 @@ +// +// Lollipops diagram generation framework for genetic variations. +// Copyright (C) 2015 Jeremy Jay +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package data + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net" + "net/http" + "os" +) + +const PfamGraphicURL = "http://pfam.xfam.org/protein/%s/graphic" + +func GetPfamGraphicData(accession string) (*GraphicResponse, error) { + queryURL := fmt.Sprintf(PfamGraphicURL, accession) + resp, err := http.Get(queryURL) + if err != nil { + if err, ok := err.(net.Error); ok && err.Timeout() { + fmt.Fprintf(os.Stderr, "Unable to connect to Pfam. 
Check your internet connection or try again later.") + os.Exit(1) + } + return nil, err + } + respBytes, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, err + } + if resp.StatusCode != 200 { + return nil, fmt.Errorf("pfam error: %s", resp.Status) + } + + data := []GraphicResponse{} + err = json.Unmarshal(respBytes, &data) + //if err != nil { + // return nil, err + //} + if len(data) != 1 { + return nil, fmt.Errorf("pfam returned invalid result") + } + r := data[0] + for i, x := range r.Motifs { + if x.Link != "" { + x.Link = "http://pfam.xfam.org" + x.Link + r.Motifs[i] = x + } + } + for i, x := range r.Regions { + if x.Link != "" { + x.Link = "http://pfam.xfam.org" + x.Link + r.Regions[i] = x + } + } + return &r, nil +} diff --git a/data/fetch_data.go b/data/uniprot.go similarity index 53% rename from data/fetch_data.go rename to data/uniprot.go index fec401f..c471cd6 100644 --- a/data/fetch_data.go +++ b/data/uniprot.go @@ -18,80 +18,53 @@ package data import ( + "bytes" "encoding/json" "fmt" "io/ioutil" + "log" "net" "net/http" "net/url" "os" + "regexp" "strings" ) -const PfamGraphicURL = "http://pfam.xfam.org/protein/%s/graphic" +const UniprotDataURL = "https://www.uniprot.org/uniprot/%s.txt" -// MotifNames has human-readable names -// - mostly from http://pfam.xfam.org/help#tabview=tab9 -var MotifNames = map[string]string{ - "disorder": "Disordered region", - "low_complexity": "Low complexity region", - "sig_p": "Signal peptide region", - "coiled_coil": "Coiled-coil motif", - "transmembrane": "Transmembrane region", -} +var defaultUniprotFeatures = map[string][]string{ + "COILED": {"motif", "coiled_coil", "#9cff00"}, + "SIGNAL": {"motif", "sig_p", "#ff9c00"}, + "TRANSMEM": {"motif", "transmembrane", "#ff0000"}, + "COMPBIAS": {"motif", "low_complexity", "#00ffff"}, -// GraphicFeature is a generic representation of various feature responses -type GraphicFeature struct { - Color string `json:"colour"` - Text string `json:"text"` - Type string 
`json:"type"` - Start json.Number `json:"start"` - End json.Number `json:"end"` - Link string `json:"href"` - Metadata GraphicMetadata `json:"metadata"` -} + "DNA_BIND": {"region", "dna_bind", "#ff5353"}, + "ZN_FING": {"region", "zn_fing", "#2dcf00"}, + "CA_BIND": {"region", "ca_bind", "#86bcff"}, -type GraphicMetadata struct { - Description string `json:"description"` - Identifier string `json:"identifier"` + "MOTIF": {"region", "motif", "#1fc01f"}, + "REPEAT": {"region", "repeat", "#1fc01f"}, + "DOMAIN": {"region", "domain", "#9999ff"}, } -type GraphicResponse struct { - Length json.Number `json:"length"` - Metadata GraphicMetadata `json:"metadata"` - Motifs []GraphicFeature `json:"motifs"` - Regions []GraphicFeature `json:"regions"` -} - -func GetLocalGraphicData(filename string) (*GraphicResponse, error) { - f, err := os.Open(filename) - if err != nil { - return nil, err - } - pf := &GraphicResponse{} - err = json.NewDecoder(f).Decode(pf) - f.Close() - for i, x := range pf.Motifs { - if x.Link != "" && !strings.Contains(x.Link, "://") { - x.Link = "http://pfam.xfam.org" + x.Link - pf.Motifs[i] = x - } - } - for i, x := range pf.Regions { - if x.Link != "" && !strings.Contains(x.Link, "://") { - x.Link = "http://pfam.xfam.org" + x.Link - pf.Regions[i] = x +func getValueForKey(line, key string) string { + parts := strings.Split(line, ";") + for _, s := range parts { + p := strings.SplitN(s, "=", 2) + if p[0] == key { + return strings.TrimSpace(p[1]) } } - return pf, err + return "" } -func GetPfamGraphicData(accession string) (*GraphicResponse, error) { - queryURL := fmt.Sprintf(PfamGraphicURL, accession) +func GetUniprotGraphicData(accession string) (*GraphicResponse, error) { + queryURL := fmt.Sprintf(UniprotDataURL, accession) resp, err := http.Get(queryURL) if err != nil { if err, ok := err.(net.Error); ok && err.Timeout() { - fmt.Fprintf(os.Stderr, "Unable to connect to Pfam. 
Check your internet connection or try again later.") + fmt.Fprintf(os.Stderr, "Unable to connect to Uniprot. Check your internet connection or try again later.") os.Exit(1) } return nil, err @@ -104,28 +77,93 @@ func GetPfamGraphicData(accession string) (*GraphicResponse, error) { return nil, fmt.Errorf("pfam error: %s", resp.Status) } - data := []GraphicResponse{} - err = json.Unmarshal(respBytes, &data) - //if err != nil { - // return nil, err - //} - if len(data) != 1 { - return nil, fmt.Errorf("pfam returned invalid result") - } - r := data[0] - for i, x := range r.Motifs { - if x.Link != "" { - x.Link = "http://pfam.xfam.org" + x.Link - r.Motifs[i] = x + nouncertain := regexp.MustCompile("[?<>]") + trimTags := regexp.MustCompile("[{][^}]*[}]") + minisplit := regexp.MustCompile("[;.]") + + gd := &GraphicResponse{} + for _, bline := range bytes.Split(respBytes, []byte("\n")) { + if len(bline) < 5 { + continue } - } - for i, x := range r.Regions { - if x.Link != "" { - x.Link = "http://pfam.xfam.org" + x.Link - r.Regions[i] = x + key := string(bytes.TrimSpace(bline[:5])) + line := string(bline[5:]) + switch key { + case "GN": + // GN Name=CTNNB1; Synonyms=CTNNB; ORFNames=OK/SW-cl.35, PRO2286; + sym := getValueForKey(line, "Name") + if sym != "" { + gd.Metadata.Identifier = sym + } + case "DE": + // DE RecName: Full=Catenin beta-1; + if !strings.HasPrefix(line, "RecName: ") { + continue + } + desc := getValueForKey(line[9:], "Full") + if desc != "" { + gd.Metadata.Description = desc + } + case "SQ": + // SQ SEQUENCE 781 AA; 85497 MW; CB78F165A3EEF86E CRC64; + parts := strings.Split(line, ";") + for _, p := range parts { + if !strings.HasPrefix(p, "SEQUENCE") { + continue + } + seqLen := strings.TrimSpace(strings.TrimSuffix(p[8:], "AA")) + gd.Length = json.Number(seqLen) + break + } + //////////////////////////// + case "FT": + /// https://web.expasy.org/docs/userman.html#FT_line + if strings.TrimSpace(line[:29]) == "" { + // continuation of previous line's 
description (ignored) + continue + } + featureType := strings.TrimSpace(line[:8]) // Key name + fromPos := strings.TrimSpace(nouncertain.ReplaceAllString(line[9:15], "")) // 'From' endpoint + toPos := strings.TrimSpace(nouncertain.ReplaceAllString(line[16:22], "")) // 'To' endpoint + desc := strings.TrimSpace(line[29:]) // Description + + if fromPos == "" || toPos == "" || fromPos == toPos { + // skip any unknown positions or point features + continue + } + fdata, ok := defaultUniprotFeatures[featureType] + if !ok { + continue + } + + desc = strings.TrimSpace(trimTags.ReplaceAllString(desc, "")) + shortDesc := desc + if p := minisplit.Split(desc, 2); len(p) == 2 { + shortDesc = strings.TrimSpace(p[0]) + } + + feat := GraphicFeature{ + Color: fdata[2], + Text: strings.Trim(shortDesc, ". "), + Type: fdata[1], + Start: json.Number(fromPos), + End: json.Number(toPos), + Metadata: GraphicMetadata{ + Description: strings.Trim(shortDesc, ". "), + }, + } + switch fdata[0] { + case "region": + gd.Regions = append(gd.Regions, feat) + case "motif": + gd.Motifs = append(gd.Motifs, feat) + default: + log.Println("unknown feature set", fdata[0]) + } } } - return &r, nil + + return gd, nil } func GetProtID(symbol string) (string, error) {