Skip to content

Commit

Permalink
Merge pull request #124 from broadinstitute/dp-annot
Browse files Browse the repository at this point in the history
code for NCBI submissions
  • Loading branch information
dpark01 committed Apr 13, 2015
2 parents 2061965 + cd7dce4 commit 2da9d08
Show file tree
Hide file tree
Showing 4 changed files with 379 additions and 8 deletions.
256 changes: 254 additions & 2 deletions ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,264 @@
__author__ = "PLACEHOLDER"
__commands__ = []

import argparse, logging
import util.cmd, util.file, util.vcf, util.misc
import argparse, logging, collections, shutil, os, os.path
import Bio.SeqIO
import util.cmd, util.file, util.version
import tools.tbl2asn
import interhost

log = logging.getLogger(__name__)



def fasta_chrlens(fasta):
out = collections.OrderedDict()
with open(fasta, 'rt') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
out[seq.id] = len(seq)
return out

def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False):
''' This function takes an NCBI TBL file describing features on a genome
(genes, etc) and transfers them to a new genome.
'''
cmap = interhost.CoordMapper(ref_fasta, alt_fasta)
alt_chrlens = fasta_chrlens(alt_fasta)

with open(ref_tbl, 'rt') as inf:
with open(out_tbl, 'wt') as outf:
for line in inf:
line = line.rstrip('\r\n')
if not line:
pass
elif line.startswith('>'):
# sequence identifier
if not line.startswith('>Feature '):
raise Exception("not sure how to handle a non-Feature record")
seqid = line[len('>Feature '):].strip()
if not (seqid.startswith('gb|') and seqid.endswith('|') and len(seqid)>4):
raise Exception("reference annotation does not refer to a GenBank accession")
seqid = seqid[3:-1]
altid = cmap.mapAtoB(seqid)
line = '>Feature ' + altid
feature_keep = True
elif line[0] != '\t':
# feature with numeric coordinates (map them)
row = line.split('\t')
if not len(row)>=2:
raise Exception("this line has only one column?")
row[0] = int(row[0])
row[1] = int(row[1])
if row[1] >= row[0]:
row[0] = cmap.mapAtoB(seqid, row[0], -1)[1]
row[1] = cmap.mapAtoB(seqid, row[1], 1)[1]
else:
# negative strand feature
row[0] = cmap.mapAtoB(seqid, row[0], 1)[1]
row[1] = cmap.mapAtoB(seqid, row[1], -1)[1]

if row[0] and row[1]:
feature_keep = True
elif row[0]==None and row[1]==None:
# feature completely out of bounds
feature_keep = False
continue
else:
# feature overhangs end of sequence
if oob_clip:
if row[0]==None:
row[0] = '<1'
if row[1]==None:
row[1] = '>{}'.format(alt_chrlens[altid])
else:
feature_keep = False
continue
line = '\t'.join(map(str,row))
else:
# feature notes
if not feature_keep:
# skip any lines that follow a skipped feature
continue
elif 'protein_id' in line:
# skip any lines that refer to an explicit protein_id
continue
outf.write(line+'\n')

def parser_tbl_transfer(parser=argparse.ArgumentParser()):
parser.add_argument("ref_fasta", help="Input sequence of reference genome")
parser.add_argument("ref_tbl", help="Input reference annotations (NCBI TBL format)")
parser.add_argument("alt_fasta", help="Input sequence of new genome")
parser.add_argument("out_tbl", help="Output file with transferred annotations")
parser.add_argument('--oob_clip', default=False, action='store_true',
help='''Out of bounds feature behavior.
False: drop all features that are completely or partly out of bounds
True: drop all features completely out of bounds
but truncate any features that are partly out of bounds''')
util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None)))
util.cmd.attach_main(parser, tbl_transfer, split_args=True)
return parser
__commands__.append(('tbl_transfer', parser_tbl_transfer))


def fasta2fsa(infname, outdir, biosample=None):
''' copy a fasta file to a new directory and change its filename to end in .fsa
for NCBI's sake. '''
outfname = os.path.basename(infname)
if outfname.endswith('.fasta'):
outfname = outfname[:-6]
elif outfname.endswith('.fa'):
outfname = outfname[:-3]
if not outfname.endswith('.fsa'):
outfname = outfname + '.fsa'
outfname = os.path.join(outdir, outfname)
with open(infname, 'rU') as inf:
with open(outfname, 'wt') as outf:
for line in inf:
if line.startswith('>') and biosample:
line = line.rstrip('\n\r')
line = '{} [biosample={}]\n'.format(line, biosample)
outf.write(line)
return outfname

def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
with open(cmt_fname, 'wt') as outf:
outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(
util.version.get_version()))
if name:
outf.write("Assembly Name\t{}\n".format(name))
if coverage:
coverage = str(coverage)
if not coverage.endswith('x') or coverage.endswith('X'):
coverage = coverage + 'x'
outf.write("Genome Coverage\t{}\n".format(coverage))
if seq_tech:
outf.write("Sequencing Technology\t{}\n".format(seq_tech))
outf.write("StructuredCommentSuffix\t##Genome-Assembly-Data-END##\n")

def prep_genbank_files(templateFile, fasta_files, annotDir,
master_source_table=None, comment=None, sequencing_tech=None,
coverage_table=None, biosample_map=None):
''' Prepare genbank submission files. Requires .fasta and .tbl files as input,
as well as numerous other metadata files for the submission. Creates a
directory full of files (.sqn in particular) that can be sent to GenBank.
'''
# get coverage map
coverage = {}
if coverage_table:
for row in util.file.read_tabfile_dict(coverage_table):
if row.get('sample') and row.get('aln2self_cov_median'):
coverage[row['sample']] = row['aln2self_cov_median']

# get biosample id map
biosample = {}
if biosample_map:
for row in util.file.read_tabfile_dict(biosample_map):
if row.get('sample') and row.get('BioSample'):
biosample[row['sample']] = row['BioSample']

# make output directory
util.file.mkdir_p(annotDir)
for fn in fasta_files:
if not fn.endswith('.fasta'):
raise Exception("fasta files must end in .fasta")
sample = os.path.basename(fn)[:-6]
# make .fsa files
fasta2fsa(fn, annotDir, biosample=biosample.get(sample))
# make .src files
if master_source_table:
shutil.copy(master_source_table, os.path.join(annotDir, sample+'.src'))
# make .cmt files
make_structured_comment_file(os.path.join(annotDir, sample+'.cmt'),
name=sample, coverage=coverage.get(sample), seq_tech=sequencing_tech)

# run tbl2asn
tbl2asn = tools.tbl2asn.Tbl2AsnTool()
tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)

def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
parser.add_argument('templateFile',
help='Submission template file (.sbt) including author and contact info')
parser.add_argument("fasta_files", nargs='+',
help="Input fasta files")
parser.add_argument("annotDir",
help="Output directory with genbank submission files (.tbl files must already be there)")
parser.add_argument('--comment', default=None,
help='comment field')
parser.add_argument('--sequencing_tech', default=None,
help='sequencing technology (e.g. Illumina HiSeq 2500)')
parser.add_argument('--master_source_table', default=None,
help='source modifier table')
parser.add_argument("--biosample_map",
help="""A file with two columns and a header: sample and BioSample.
This file may refer to samples that are not included in this submission.""")
parser.add_argument('--coverage_table', default=None,
help='''A genome coverage report file with a header row. The table must
have at least two columns named sample and aln2self_cov_median. All other
columns are ignored. Rows referring to samples not in this submission are
ignored.''')
util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None)))
util.cmd.attach_main(parser, prep_genbank_files, split_args=True)
return parser
__commands__.append(('prep_genbank_files', parser_prep_genbank_files))


def prep_sra_table(lib_fname, biosampleFile, md5_fname, outFile):
''' This is a very lazy hack that creates a basic table that can be
pasted into various columns of an SRA submission spreadsheet. It probably
doesn't work in all cases.
'''
metadata = {}

with open(biosampleFile, 'rU') as inf:
header = inf.readline()
for line in inf:
row = line.rstrip('\n\r').split('\t')
metadata.setdefault(row[0], {})
metadata[row[0]]['biosample_accession'] = row[1]

with open(md5_fname, 'rU') as inf:
for line in inf:
row = line.rstrip('\n\r').split()
s = os.path.basename(row[1])
assert s.endswith('.cleaned.bam')
s = s[:-len('.cleaned.bam')]
metadata.setdefault(s, {})
metadata[s]['filename'] = row[1]
metadata[s]['MD5_checksum'] = row[0]

with open(outFile, 'wt') as outf:
header = ['biosample_accession', 'sample_name', 'library_ID', 'filename', 'MD5_checksum']
outf.write('\t'.join(header)+'\n')
with open(lib_fname, 'rU') as inf:
for line in inf:
lib = line.rstrip('\n\r')
parts = lib.split('.')
assert len(parts)>1 and parts[-1].startswith('l')
s = '.'.join(parts[:-1])
metadata.setdefault(s, {})
metadata[s]['library_ID'] = lib
metadata[s]['sample_name'] = s
outf.write('\t'.join(metadata[s].get(h,'') for h in header)+'\n')

def parser_prep_sra_table(parser=argparse.ArgumentParser()):
parser.add_argument('lib_fname',
help='A file that lists all of the library IDs that will be submitted in this batch')
parser.add_argument("biosampleFile",
help="""A file with two columns and a header: sample and BioSample.
This file may refer to samples that are not included in this submission.""")
parser.add_argument("md5_fname",
help="""A file with two columns and no header. Two columns are MD5 checksum and filename.
Should contain an entry for every bam file being submitted in this batch.
This is typical output from "md5sum *.cleaned.bam".""")
parser.add_argument("outFile",
help="Output table that contains most of the variable columns needed for SRA submission.")
util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
util.cmd.attach_main(parser, prep_sra_table, split_args=True)
return parser
__commands__.append(('prep_sra_table', parser_prep_sra_table))


def full_parser():
return util.cmd.make_parser(__commands__, __doc__)
if __name__ == '__main__':
Expand Down
15 changes: 10 additions & 5 deletions pipes/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,18 @@
"lastal_refDb": "/idi/sabeti-scratch/dpark/ebov/lastal_db/ebola",
"spikeinsDb": "/idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.fasta",
"ref_genome": "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/genome.fasta",
"ref_annot": "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/features.tbl",
"snpEff_genome": "KJ660346.2",

"assembly_min_length": [15000],
"assembly_min_unambig": 0.95,

"align_outgroups": {
"ebov_2014": "",
"ebov": ""

"genbank": {
"author_template": "NCBI/authors.sbt",
"source_modifier_table": "NCBI/sample_meta.src",
"biosample_map": "NCBI/biosample-map.txt",
"sequencing_technology": "Illumina HiSeq 2500; Nextera LC",
"comment": "Please be aware that the annotation is done automatically with little or no manual curation."
},

"seq_center": "BI",
Expand All @@ -50,7 +54,8 @@
"align_self": "02_align_to_self",
"align_ref": "03_align_to_ref",
"interhost": "03_interhost",
"intrahost": "04_intrahost"
"intrahost": "04_intrahost",
"annot": "05_genbank"
},
"dataDir": "data",
"tmpDir": "tmp",
Expand Down
43 changes: 42 additions & 1 deletion pipes/rules/interhost.rules
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,48 @@ rule ref_guided_diversity:
merge_vcfs(params.inVcfs, params.refGenome, output[1])



rule annot_transfer:
input: config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
output: config["dataDir"]+'/'+config["subdirs"]["annot"] +'/{sample}.tbl'
resources: mem=4
params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
logid="{sample}",
refGenome=config["ref_genome"],
refAnnot=config["ref_annot"]
shell: "{config[binDir]}/ncbi.py tbl_transfer {params.refGenome} {params.refAnnot} {input} {output} --oob_clip"

rule prepare_genbank:
input:
config["reportsDir"]+"/summary.assembly.txt",
expand("{dir}/{subdir}/{sample}.tbl",
dir=[config["dataDir"]], subdir=[config["subdirs"]["annot"]],
sample=read_samples_file(config["samples_assembly"]))
output:
config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'
params:
LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
fasta_files=expand("{dir}/{subdir}/{sample}.fasta",
dir=[config["dataDir"]], subdir=[config["subdirs"]["assembly"]],
sample=read_samples_file(config["samples_assembly"])),
genbank_template=config.get('genbank',{}).get('author_template', ''),
genbank_source_table=config.get('genbank',{}).get('source_modifier_table', ''),
biosample_map=config.get('genbank',{}).get('biosample_map', ''),
seq_tech=config.get('genbank',{}).get('sequencing_technology', 'unknown'),
comment=config.get('genbank',{}).get('comment', ''),
logid="all"
shell:
' '.join(["{config[binDir]}/ncbi.py prep_genbank_files",
"{params.genbank_template} {params.fasta_files}",
"{config[dataDir]}/{config[subdirs][annot]}",
"--master_source_table {params.genbank_source_table}",
"--sequencing_tech '{params.seq_tech}'",
"--biosample_map {params.biosample_map}",
"--coverage_table {config[reportsDir]}/summary.assembly.txt",
"--comment '{params.comment}'"])

rule all_annot:
input:
config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'



Expand Down
Loading

0 comments on commit 2da9d08

Please sign in to comment.