Skip to content

Commit

Permalink
gffparse: tabular output
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Feb 13, 2024
1 parent 2ca4e95 commit b358b99
Showing 1 changed file with 21 additions and 7 deletions.
28 changes: 21 additions & 7 deletions blsl/gffparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

__author__ = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"
__version__ = "2.1"

# Initialize the GeneInfo named tuple. Note: a namedtuple is immutable.
gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
def gffparse_main(argv=None):
    """Format a GFF sanely.

    Command-line entry point with two output modes:

    * default: load the input through ``gff_heirarchy`` and write every gene
      with ``write_gene``; with ``-c/--prefix-chrom`` each gene is first
      passed to ``reformat_names`` with a new ID of the form
      ``<seqid>_<original ID>``.
    * ``-t/--tabular``: read the records with ``parseGFF3`` and write a
      tab-separated table holding the fixed GFF columns (every entry of
      ``gffInfoFields`` except the trailing ``attributes``) followed by one
      column per attribute key seen anywhere in the file. Attribute columns
      appear in the order their keys are first encountered, so the layout is
      reproducible between runs. ``--prefix-chrom`` is not applied in this
      mode.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-o", "--output", default="/dev/stdout",
                    help="Output file")
    ap.add_argument("-t", "--tabular", action="store_true",
                    help="Convert gff including attributes to simple TSV")
    ap.add_argument("-c", "--prefix-chrom", action="store_true",
                    help="Prefix gene names with chromosome name (e.g. useful for concatenating augustus results)")
    ap.add_argument("input", help="Input GFF")
    args = ap.parse_args(argv)

    with open(args.output, "w") as fh:
        if args.tabular:
            # Every record is needed twice (once to discover the attribute
            # columns, once to emit the rows), so materialise them up front.
            records = list(tqdm(parseGFF3(args.input, return_as=dict), desc="Parse GFF"))
            fixed_fields = gffInfoFields[:-1]

            # A dict is used as an insertion-ordered set: a plain set would
            # give an arbitrary column order that differs from run to run.
            seen_attrs = {}
            for record in records:
                seen_attrs.update(dict.fromkeys(record["attributes"].keys()))
            attr_fields = list(seen_attrs)

            print(*fixed_fields, *attr_fields, sep="\t", file=fh)
            for record in tqdm(records, desc="Tabularise"):
                attrs = record["attributes"]
                cols = [record[x] for x in fixed_fields]
                cols.extend(attrs.get(x) for x in attr_fields)
                # Missing values (None) become empty cells.
                cols = [x if x is not None else "" for x in cols]
                print(*cols, sep="\t", file=fh)
        else:
            gff = gff_heirarchy(args.input, progress="Parse ")
            for _, gene in tqdm(gff.items(), desc="Process "):
                if args.prefix_chrom:
                    newgid = f"{gene['seqid']}_{gene['attributes']['ID']}"
                    reformat_names(gene, geneid=newgid, changenames=False)
                write_gene(gene, file=fh)

# Run the command-line entry point when this file is executed as a script.
if __name__ == "__main__":
gffparse_main()

0 comments on commit b358b99

Please sign in to comment.