Skip to content

Commit

Permalink
updates from chimi
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Feb 15, 2024
1 parent b358b99 commit 1815671
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 27 deletions.
35 changes: 22 additions & 13 deletions blsl/genigvjs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from pathlib import Path
import subprocess
from base64 import b64encode
import shutil


def get_data_uri(data):
if isinstance(data, str) or isinstance(data, Path):
data = Path(data)
Expand Down Expand Up @@ -52,21 +55,27 @@ def get_data_uri(data):
</body>
"""

def link(linkpath, target):
def link(linkpath, target, copy=False):
linkpath = Path(linkpath)
target = Path(target).absolute()
try:
if linkpath.samefile(target):
return
except:
pass
linkpath.symlink_to(target)
if copy:
linkpath.parent.mkdir(exist_ok=True)
shutil.copyfile(target, linkpath)
else:
try:
if linkpath.samefile(target):
return
except:
pass
linkpath.symlink_to(target)

def genigvjs_main(argv=None):
"""Generate a simple IGV.js visualisation of some bioinf files."""
ap = argparse.ArgumentParser()
ap.add_argument("--template", "-T", required=False,
help="Alternative HTML template")
ap.add_argument("--copy", "--cp", action="store_true",
help="Copy, don't link, data to output dir")
ap.add_argument("--embed", "-e", action="store_true",
help="Encode data as base64 within html (almost always a bad idea on non-toy datasets).")
ap.add_argument("--prefix", "-p", default="./",
Expand Down Expand Up @@ -113,8 +122,8 @@ def genigvjs_main(argv=None):
data["reference"].update({
"fastaURL": f"{args.prefix}{ref.name}",
})
link(outdir / ref.name, ref)
link(outdir / (ref.name + ".fai"), reffai)
link(outdir / ref.name, ref, copy=args.copy)
link(outdir / (ref.name + ".fai"), reffai, copy=args.copy)
cbpaired = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c',
'#fdbf6f','#ff7f00','#cab2d6','#6a3d9a', '#b15928']
for i, track in enumerate(args.tracks):
Expand All @@ -129,8 +138,8 @@ def genigvjs_main(argv=None):
index = Path(str(track) + ".bai")
else:
index = Path(str(track) + ".tbi")
if not index.exists():
index = None
if not index.exists():
index = None
trackdat = {
"name": base,
"format": format,
Expand All @@ -145,10 +154,10 @@ def genigvjs_main(argv=None):
trackdat["indexURL"] = get_data_uri(index)
else:
trackdat["url"] = f"{args.prefix}{track.name}"
link(outdir / track.name, track)
link(outdir / track.name, track, copy=args.copy)
if index is not None:
trackdat["indexURL"] = f"{args.prefix}{index.name}"
link(outdir / index.name, index)
link(outdir / index.name, index, copy=args.copy)
data["tracks"].append(trackdat)
with open(outdir / "index.html", "w") as fh:
html = template \
Expand Down
37 changes: 23 additions & 14 deletions blsl/gffparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import urllib.request, urllib.parse, urllib.error
from sys import stderr, stdout
from copy import deepcopy
import re

from tqdm.auto import tqdm

Expand All @@ -33,22 +34,27 @@
"exon", "pseudogenic_exon", "five_prime_UTR", "three_prime_UTR", "CDS", "pseudogenic_CDS",
]

def parseGFFAttributes(attributeString):
def parseGFFAttributes(attributeString, lax=False):
"""Parse the GFF3 attribute column and return a dict"""#
if attributeString == ".": return {}
ret = {}
for attribute in attributeString.split(";"):
if "=" in attribute:
key, value = attribute.split("=")
else:
key = attribute
value = "True"
if lax:
# The idea here is to split not just on ;, but on whole ;key=XXXXXX
# sections. This allows e.g. unescaped ; etc in the attrs column
attrs = re.findall(r"(^|(?<=;))([\w_-]+)=(.+?)((?=;[\w_-]+=)|$)", attributeString)
attrs = [(x[1], x[2]) for x in attrs]
else:
attrs = [x.partition("=") for x in attributeString.split(";")]
attrs = [(x[0], x[2]) for x in attrs]
for key, value in attrs:
if not key and not value:
continue
if not value:
value = "True"
ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
return ret

def parseGFF3(filename, return_as=dict):
def parseGFF3(filename, return_as=dict, lax=False):
"""
A minimalistic GFF3 format parser.
Yields objects that contain info about a single GFF3 feature.
Expand Down Expand Up @@ -77,15 +83,14 @@ def parseGFF3(filename, return_as=dict):
"score": None if parts[5] == "." else float(parts[5]),
"strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
"phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
"attributes": parseGFFAttributes(parts[8])
"attributes": parseGFFAttributes(parts[8], lax=True)
}
#Alternatively, you can emit the dictionary here, if you need mutability:
# yield normalizedInfo
yield return_as(**normalizedInfo)


def gff_heirarchy(filename, progress=None, make_missing_genes=False):
last = {"l1": None, "l2": None, "l3": None}
level_canonicaliser = {
"transcript": "mRNA",
}
Expand All @@ -106,9 +111,11 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False):
ignore = {
"source",
"stop_codon",
"start_codon",
}
records = {}
l2l1 = {}
last = {"l1": None, "l2": None, "l3": None}
i = 0
warned = set()
recordsrc = parseGFF3(filename, return_as=dict)
Expand Down Expand Up @@ -163,7 +170,7 @@ def gff_heirarchy(filename, progress=None, make_missing_genes=False):
print(f"L2 entry {id} parent {parent} not in records? {record}")
else:
try:
parent = record["attributes"]["Parent"]
parent = record["attributes"].get("Parent", records["attributes"].get("transcript_id", None))
top = l2l1[parent]
if id in records[top]["children"][parent]["children"]:
i += 1
Expand Down Expand Up @@ -218,7 +225,7 @@ def prefix_name(entry, sid):


def write_gff_line(line, file=None):
attr = ";".join(f"{k}={v}" for k, v in line["attributes"].items())
attr = ";".join(f"{k}={urllib.parse.quote(v)}" for k, v in line["attributes"].items())
cols = [line.get(f, ".") for f in gffInfoFields[:-1]] + [attr]
cols = ["." if x is None else x for x in cols]
print(*cols, file=file, sep="\t")
Expand Down Expand Up @@ -291,6 +298,8 @@ def gffparse_main(argv=None):
help="Convert gff including attributes to simple TSV")
ap.add_argument("-c", "--prefix-chrom", action="store_true",
help="Prefix gene names with chromsome name (e.g. useful for concatenating augustus results)")
ap.add_argument("-g", "--make-missing-genes", action="store_true",
help="Invent Fake L1 'gene' entries for any L2 lacking an L1 parent")
ap.add_argument("input", help="Input GFF")
args = ap.parse_args(argv)

Expand All @@ -306,10 +315,10 @@ def gffparse_main(argv=None):
cols = [x if x is not None else "" for x in cols]
print(*cols, sep="\t", file=fh)
else:
gff = gff_heirarchy(args.input, progress="Parse ")
gff = gff_heirarchy(args.input, progress="Parse ", make_missing_genes=args.make_missing_genes)
for _, gene in tqdm(gff.items(), desc="Process "):
if args.prefix_chrom:
newgid = f"{gene['seqid']}_{gene['attributes']['ID']}"
newgid = f"{gene['seqid'].replace('#', '_')}_{gene['attributes']['ID']}"
reformat_names(gene, geneid=newgid, changenames=False)
write_gene(gene, file=fh)

Expand Down

0 comments on commit 1815671

Please sign in to comment.