updates from chimi

kdm9 · Feb 15, 2024 · 1815671 · 1815671
1 parent b358b99
commit 1815671
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 27 deletions.
diff --git a/blsl/genigvjs.py b/blsl/genigvjs.py
@@ -9,6 +9,9 @@
 from pathlib import Path
 import subprocess
 from base64 import b64encode 
+import shutil
+
+
 def get_data_uri(data):
     if isinstance(data, str) or isinstance(data, Path):
         data = Path(data)
@@ -52,21 +55,27 @@ def get_data_uri(data):
 </body>
 """
 
-def link(linkpath, target):
+def link(linkpath, target, copy=False):
     linkpath = Path(linkpath)
     target = Path(target).absolute()
-    try:
-        if linkpath.samefile(target):
-            return
-    except:
-        pass
-    linkpath.symlink_to(target)
+    if copy:
+        linkpath.parent.mkdir(exist_ok=True)
+        shutil.copyfile(target, linkpath)
+    else:
+        try:
+            if linkpath.samefile(target):
+                return
+        except:
+            pass
+        linkpath.symlink_to(target)
 
 def genigvjs_main(argv=None):
     """Generate a simple IGV.js visualisation of some bioinf files."""
     ap = argparse.ArgumentParser()
     ap.add_argument("--template", "-T", required=False,
             help="Alternative HTML template")
+    ap.add_argument("--copy", "--cp", action="store_true",
+            help="Copy, don't link, data to output dir")
     ap.add_argument("--embed", "-e", action="store_true",
             help="Encode data as base64 within html (almost always a bad idea on non-toy datasets).")
     ap.add_argument("--prefix", "-p", default="./",
@@ -113,8 +122,8 @@ def genigvjs_main(argv=None):
         data["reference"].update({
             "fastaURL": f"{args.prefix}{ref.name}",
         })
-        link(outdir / ref.name, ref)
-        link(outdir / (ref.name + ".fai"), reffai)
+        link(outdir / ref.name, ref, copy=args.copy)
+        link(outdir / (ref.name + ".fai"), reffai, copy=args.copy)
     cbpaired = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c',
                 '#fdbf6f','#ff7f00','#cab2d6','#6a3d9a', '#b15928']
     for i, track in enumerate(args.tracks):
@@ -129,8 +138,8 @@ def genigvjs_main(argv=None):
             index = Path(str(track) + ".bai")
         else:
             index = Path(str(track) + ".tbi")
-            if not index.exists():
-                index = None
+        if not index.exists():
+            index = None
         trackdat = {
             "name": base,
             "format": format,
@@ -145,10 +154,10 @@ def genigvjs_main(argv=None):
                 trackdat["indexURL"] = get_data_uri(index)
         else:
             trackdat["url"] =  f"{args.prefix}{track.name}"
-            link(outdir / track.name, track)
+            link(outdir / track.name, track, copy=args.copy)
             if index is not None:
                 trackdat["indexURL"] =  f"{args.prefix}{index.name}"
-                link(outdir / index.name, index)
+                link(outdir / index.name, index, copy=args.copy)
         data["tracks"].append(trackdat)
     with open(outdir / "index.html", "w") as fh:
         html = template \

diff --git a/blsl/gffparse.py b/blsl/gffparse.py
@@ -17,6 +17,7 @@
 import urllib.request, urllib.parse, urllib.error
 from sys import stderr, stdout
 from copy import deepcopy
+import re
 
 from tqdm.auto import tqdm
 
@@ -33,22 +34,27 @@
     "exon", "pseudogenic_exon", "five_prime_UTR", "three_prime_UTR", "CDS", "pseudogenic_CDS",
 ]
 
-def parseGFFAttributes(attributeString):
+def parseGFFAttributes(attributeString, lax=False):
     """Parse the GFF3 attribute column and return a dict"""#
     if attributeString == ".": return {}
     ret = {}
-    for attribute in attributeString.split(";"):
-        if "=" in attribute:
-            key, value = attribute.split("=")
-        else:
-            key = attribute
-            value = "True"
+    if lax:
+        # The idea here is to split not just on ;, but on whole ;key=XXXXXX
+        # sections. This allows e.g. unescaped ; etc in the attrs column
+        attrs = re.findall(r"(^|(?<=;))([\w_-]+)=(.+?)((?=;[\w_-]+=)|$)", attributeString)
+        attrs = [(x[1], x[2]) for x in attrs]
+    else:
+        attrs = [x.partition("=") for x in attributeString.split(";")]
+        attrs = [(x[0], x[2]) for x in attrs]
+    for key, value in attrs:
         if not key and not value:
             continue
+        if not value:
+            value = "True"
         ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
     return ret
 
-def parseGFF3(filename, return_as=dict):
+def parseGFF3(filename, return_as=dict, lax=False):
     """
     A minimalistic GFF3 format parser.
     Yields objects that contain info about a single GFF3 feature.
@@ -77,15 +83,14 @@ def parseGFF3(filename, return_as=dict):
                 "score": None if parts[5] == "." else float(parts[5]),
                 "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
                 "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
-                "attributes": parseGFFAttributes(parts[8])
+                "attributes": parseGFFAttributes(parts[8], lax=True)
             }
             #Alternatively, you can emit the dictionary here, if you need mutability:
             #    yield normalizedInfo
             yield return_as(**normalizedInfo)
 
 
 def gff_heirarchy(filename, progress=None, make_missing_genes=False):
-    last = {"l1": None, "l2": None, "l3": None}
     level_canonicaliser = {
         "transcript": "mRNA",
     }
@@ -106,9 +111,11 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False):
     ignore = {
         "source",
         "stop_codon",
+        "start_codon",
     }
     records = {}
     l2l1 = {}
+    last = {"l1": None, "l2": None, "l3": None}
     i = 0
     warned = set()
     recordsrc =  parseGFF3(filename, return_as=dict)
@@ -163,7 +170,7 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False):
                 print(f"L2 entry {id} parent {parent} not in records? {record}")
         else:
             try:
-                parent = record["attributes"]["Parent"]
+                parent = record["attributes"].get("Parent", records["attributes"].get("transcript_id", None))
                 top = l2l1[parent]
                 if id in records[top]["children"][parent]["children"]:
                     i += 1
@@ -218,7 +225,7 @@ def prefix_name(entry, sid):
 
 
 def write_gff_line(line, file=None):
-    attr = ";".join(f"{k}={v}" for k, v in line["attributes"].items())
+    attr = ";".join(f"{k}={urllib.parse.quote(v)}" for k, v in line["attributes"].items())
     cols = [line.get(f, ".") for f in gffInfoFields[:-1]] + [attr]
     cols = ["." if x is None else x for x in cols]
     print(*cols, file=file, sep="\t")
@@ -291,6 +298,8 @@ def gffparse_main(argv=None):
             help="Convert gff including attributes to simple TSV")
     ap.add_argument("-c", "--prefix-chrom", action="store_true",
             help="Prefix gene names with chromsome name (e.g. useful for concatenating augustus results)")
+    ap.add_argument("-g", "--make-missing-genes", action="store_true",
+            help="Invent Fake L1 'gene' entries for any L2 lacking an L1 parent")
     ap.add_argument("input", help="Input GFF")
     args = ap.parse_args(argv)
 
@@ -306,10 +315,10 @@ def gffparse_main(argv=None):
                 cols = [x if x is not None else "" for x in cols]
                 print(*cols, sep="\t", file=fh)
         else:
-            gff = gff_heirarchy(args.input, progress="Parse   ")
+            gff = gff_heirarchy(args.input, progress="Parse   ", make_missing_genes=args.make_missing_genes)
             for _, gene in tqdm(gff.items(), desc="Process "):
                 if args.prefix_chrom:
-                    newgid = f"{gene['seqid']}_{gene['attributes']['ID']}"
+                    newgid = f"{gene['seqid'].replace('#', '_')}_{gene['attributes']['ID']}"
                     reformat_names(gene, geneid=newgid, changenames=False)
                 write_gene(gene, file=fh)