From 181567134de5760c4f9b95cef13f59aed2f1f448 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 15 Feb 2024 11:53:37 +0100 Subject: [PATCH] updates from chimi --- blsl/genigvjs.py | 35 ++++++++++++++++++++++------------- blsl/gffparse.py | 37 +++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/blsl/genigvjs.py b/blsl/genigvjs.py index 544727d..a9c10cb 100644 --- a/blsl/genigvjs.py +++ b/blsl/genigvjs.py @@ -9,6 +9,9 @@ from pathlib import Path import subprocess from base64 import b64encode +import shutil + + def get_data_uri(data): if isinstance(data, str) or isinstance(data, Path): data = Path(data) @@ -52,21 +55,27 @@ def get_data_uri(data): """ -def link(linkpath, target): +def link(linkpath, target, copy=False): linkpath = Path(linkpath) target = Path(target).absolute() - try: - if linkpath.samefile(target): - return - except: - pass - linkpath.symlink_to(target) + if copy: + linkpath.parent.mkdir(exist_ok=True) + shutil.copyfile(target, linkpath) + else: + try: + if linkpath.samefile(target): + return + except: + pass + linkpath.symlink_to(target) def genigvjs_main(argv=None): """Generate a simple IGV.js visualisation of some bioinf files.""" ap = argparse.ArgumentParser() ap.add_argument("--template", "-T", required=False, help="Alternative HTML template") + ap.add_argument("--copy", "--cp", action="store_true", + help="Copy, don't link, data to output dir") ap.add_argument("--embed", "-e", action="store_true", help="Encode data as base64 within html (almost always a bad idea on non-toy datasets).") ap.add_argument("--prefix", "-p", default="./", @@ -113,8 +122,8 @@ def genigvjs_main(argv=None): data["reference"].update({ "fastaURL": f"{args.prefix}{ref.name}", }) - link(outdir / ref.name, ref) - link(outdir / (ref.name + ".fai"), reffai) + link(outdir / ref.name, ref, copy=args.copy) + link(outdir / (ref.name + ".fai"), reffai, copy=args.copy) cbpaired = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c', '#fdbf6f','#ff7f00','#cab2d6','#6a3d9a', '#b15928'] for i, track in enumerate(args.tracks): @@ -129,8 +138,8 @@ def genigvjs_main(argv=None): index = Path(str(track) + ".bai") else: index = Path(str(track) + ".tbi") - if not index.exists(): - index = None + if not index.exists(): + index = None trackdat = { "name": base, "format": format, @@ -145,10 +154,10 @@ def genigvjs_main(argv=None): trackdat["indexURL"] = get_data_uri(index) else: trackdat["url"] = f"{args.prefix}{track.name}" - link(outdir / track.name, track) + link(outdir / track.name, track, copy=args.copy) if index is not None: trackdat["indexURL"] = f"{args.prefix}{index.name}" - link(outdir / index.name, index) + link(outdir / index.name, index, copy=args.copy) data["tracks"].append(trackdat) with open(outdir / "index.html", "w") as fh: html = template \ diff --git a/blsl/gffparse.py b/blsl/gffparse.py index 7b0605f..418de24 100644 --- a/blsl/gffparse.py +++ b/blsl/gffparse.py @@ -17,6 +17,7 @@ import urllib.request, urllib.parse, urllib.error from sys import stderr, stdout from copy import deepcopy +import re from tqdm.auto import tqdm @@ -33,22 +34,27 @@ "exon", "pseudogenic_exon", "five_prime_UTR", "three_prime_UTR", "CDS", "pseudogenic_CDS", ] -def parseGFFAttributes(attributeString): +def parseGFFAttributes(attributeString, lax=False): """Parse the GFF3 attribute column and return a dict"""# if attributeString == ".": return {} ret = {} - for attribute in attributeString.split(";"): - if "=" in attribute: - key, value = attribute.split("=") - else: - key = attribute - value = "True" + if lax: + # The idea here is to split not just on ;, but on whole ;key=XXXXXX + # sections. This allows e.g. unescaped ; etc in the attrs column + attrs = re.findall(r"(^|(?<=;))([\w_-]+)=(.+?)((?=;[\w_-]+=)|$)", attributeString) + attrs = [(x[1], x[2]) for x in attrs] + else: + attrs = [x.partition("=") for x in attributeString.split(";")] + attrs = [(x[0], x[2]) for x in attrs] + for key, value in attrs: if not key and not value: continue + if not value: + value = "True" ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value) return ret -def parseGFF3(filename, return_as=dict): +def parseGFF3(filename, return_as=dict, lax=False): """ A minimalistic GFF3 format parser. Yields objects that contain info about a single GFF3 feature. @@ -77,7 +83,7 @@ def parseGFF3(filename, return_as=dict): "score": None if parts[5] == "." else float(parts[5]), "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]), "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]), - "attributes": parseGFFAttributes(parts[8]) + "attributes": parseGFFAttributes(parts[8], lax=True) } #Alternatively, you can emit the dictionary here, if you need mutability: # yield normalizedInfo @@ -85,7 +91,6 @@ def parseGFF3(filename, return_as=dict): def gff_heirarchy(filename, progress=None, make_missing_genes=False): - last = {"l1": None, "l2": None, "l3": None} level_canonicaliser = { "transcript": "mRNA", } @@ -106,9 +111,11 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False): ignore = { "source", "stop_codon", + "start_codon", } records = {} l2l1 = {} + last = {"l1": None, "l2": None, "l3": None} i = 0 warned = set() recordsrc = parseGFF3(filename, return_as=dict) @@ -163,7 +170,7 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False): print(f"L2 entry {id} parent {parent} not in records? {record}") else: try: - parent = record["attributes"]["Parent"] + parent = record["attributes"].get("Parent", records["attributes"].get("transcript_id", None)) top = l2l1[parent] if id in records[top]["children"][parent]["children"]: i += 1 @@ -218,7 +225,7 @@ def prefix_name(entry, sid): def write_gff_line(line, file=None): - attr = ";".join(f"{k}={v}" for k, v in line["attributes"].items()) + attr = ";".join(f"{k}={urllib.parse.quote(v)}" for k, v in line["attributes"].items()) cols = [line.get(f, ".") for f in gffInfoFields[:-1]] + [attr] cols = ["." if x is None else x for x in cols] print(*cols, file=file, sep="\t") @@ -291,6 +298,8 @@ def gffparse_main(argv=None): help="Convert gff including attributes to simple TSV") ap.add_argument("-c", "--prefix-chrom", action="store_true", help="Prefix gene names with chromsome name (e.g. useful for concatenating augustus results)") + ap.add_argument("-g", "--make-missing-genes", action="store_true", + help="Invent Fake L1 'gene' entries for any L2 lacking an L1 parent") ap.add_argument("input", help="Input GFF") args = ap.parse_args(argv) @@ -306,10 +315,10 @@ def gffparse_main(argv=None): cols = [x if x is not None else "" for x in cols] print(*cols, sep="\t", file=fh) else: - gff = gff_heirarchy(args.input, progress="Parse ") + gff = gff_heirarchy(args.input, progress="Parse ", make_missing_genes=args.make_missing_genes) for _, gene in tqdm(gff.items(), desc="Process "): if args.prefix_chrom: - newgid = f"{gene['seqid']}_{gene['attributes']['ID']}" + newgid = f"{gene['seqid'].replace('#', '_')}_{gene['attributes']['ID']}" reformat_names(gene, geneid=newgid, changenames=False) write_gene(gene, file=fh)