diff --git a/ncbi.py b/ncbi.py
index ad7af8c8a..d10af11c8 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -6,12 +6,264 @@
 __author__ = "PLACEHOLDER"
 __commands__ = []
 
-import argparse, logging
-import util.cmd, util.file, util.vcf, util.misc
+import argparse, logging, collections, shutil, os, os.path
+import Bio.SeqIO
+import util.cmd, util.file, util.version
+import tools.tbl2asn
+import interhost
 
 log = logging.getLogger(__name__)
 
+
+def fasta_chrlens(fasta):
+    out = collections.OrderedDict()
+    with open(fasta, 'rt') as inf:
+        for seq in Bio.SeqIO.parse(inf, 'fasta'):
+            out[seq.id] = len(seq)
+    return out
+
+def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False):
+    ''' This function takes an NCBI TBL file describing features on a genome
+        (genes, etc) and transfers them to a new genome.
+    '''
+    cmap = interhost.CoordMapper(ref_fasta, alt_fasta)
+    alt_chrlens = fasta_chrlens(alt_fasta)
+
+    with open(ref_tbl, 'rt') as inf:
+        with open(out_tbl, 'wt') as outf:
+            for line in inf:
+                line = line.rstrip('\r\n')
+                if not line:
+                    pass
+                elif line.startswith('>'):
+                    # sequence identifier
+                    if not line.startswith('>Feature '):
+                        raise Exception("not sure how to handle a non-Feature record")
+                    seqid = line[len('>Feature '):].strip()
+                    if not (seqid.startswith('gb|') and seqid.endswith('|') and len(seqid) > 4):
+                        raise Exception("reference annotation does not refer to a GenBank accession")
+                    seqid = seqid[3:-1]
+                    altid = cmap.mapAtoB(seqid)
+                    line = '>Feature ' + altid
+                    feature_keep = True
+                elif line[0] != '\t':
+                    # feature with numeric coordinates (map them)
+                    row = line.split('\t')
+                    if len(row) < 2:
+                        raise Exception("this line has only one column?")
+                    row[0] = int(row[0])
+                    row[1] = int(row[1])
+                    if row[1] >= row[0]:
+                        row[0] = cmap.mapAtoB(seqid, row[0], -1)[1]
+                        row[1] = cmap.mapAtoB(seqid, row[1], 1)[1]
+                    else:
+                        # negative strand feature
+                        row[0] = cmap.mapAtoB(seqid, row[0], 1)[1]
+                        row[1] = cmap.mapAtoB(seqid, row[1], -1)[1]
+
+                    if row[0] and row[1]:
+                        feature_keep = True
+                    elif row[0] is None and row[1] is None:
+                        # feature completely out of bounds
+                        feature_keep = False
+                        continue
+                    else:
+                        # feature overhangs end of sequence
+                        if oob_clip:
+                            if row[0] is None:
+                                row[0] = '<1'
+                            if row[1] is None:
+                                row[1] = '>{}'.format(alt_chrlens[altid])
+                        else:
+                            feature_keep = False
+                            continue
+                    line = '\t'.join(map(str, row))
+                else:
+                    # feature notes
+                    if not feature_keep:
+                        # skip any lines that follow a skipped feature
+                        continue
+                    elif 'protein_id' in line:
+                        # skip any lines that refer to an explicit protein_id
+                        continue
+                outf.write(line + '\n')
+
+def parser_tbl_transfer(parser=argparse.ArgumentParser()):
+    parser.add_argument("ref_fasta", help="Input sequence of reference genome")
+    parser.add_argument("ref_tbl", help="Input reference annotations (NCBI TBL format)")
+    parser.add_argument("alt_fasta", help="Input sequence of new genome")
+    parser.add_argument("out_tbl", help="Output file with transferred annotations")
+    parser.add_argument('--oob_clip', default=False, action='store_true',
+        help='''Out of bounds feature behavior.
+            False: drop all features that are completely or partly out of bounds
+            True:  drop all features that are completely out of bounds,
+                   but truncate (clip) any features that are partly out of bounds''')
+    util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None)))
+    util.cmd.attach_main(parser, tbl_transfer, split_args=True)
+    return parser
+__commands__.append(('tbl_transfer', parser_tbl_transfer))
+
+
+def fasta2fsa(infname, outdir, biosample=None):
+    ''' copy a fasta file to a new directory and change its filename to end in .fsa
+        for NCBI's sake. '''
+    outfname = os.path.basename(infname)
+    if outfname.endswith('.fasta'):
+        outfname = outfname[:-6]
+    elif outfname.endswith('.fa'):
+        outfname = outfname[:-3]
+    if not outfname.endswith('.fsa'):
+        outfname = outfname + '.fsa'
+    outfname = os.path.join(outdir, outfname)
+    with open(infname, 'rU') as inf:
+        with open(outfname, 'wt') as outf:
+            for line in inf:
+                if line.startswith('>') and biosample:
+                    line = line.rstrip('\n\r')
+                    line = '{} [biosample={}]\n'.format(line, biosample)
+                outf.write(line)
+    return outfname
+
+def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
+    with open(cmt_fname, 'wt') as outf:
+        outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
+        outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(
+            util.version.get_version()))
+        if name:
+            outf.write("Assembly Name\t{}\n".format(name))
+        if coverage:
+            coverage = str(coverage)
+            if not coverage.endswith(('x', 'X')):
+                coverage = coverage + 'x'
+            outf.write("Genome Coverage\t{}\n".format(coverage))
+        if seq_tech:
+            outf.write("Sequencing Technology\t{}\n".format(seq_tech))
+        outf.write("StructuredCommentSuffix\t##Genome-Assembly-Data-END##\n")
+
+def prep_genbank_files(templateFile, fasta_files, annotDir,
+        master_source_table=None, comment=None, sequencing_tech=None,
+        coverage_table=None, biosample_map=None):
+    ''' Prepare genbank submission files. Requires .fasta and .tbl files as input,
+        as well as numerous other metadata files for the submission. Creates a
+        directory full of files (.sqn in particular) that can be sent to GenBank.
+    '''
+    # get coverage map
+    coverage = {}
+    if coverage_table:
+        for row in util.file.read_tabfile_dict(coverage_table):
+            if row.get('sample') and row.get('aln2self_cov_median'):
+                coverage[row['sample']] = row['aln2self_cov_median']
+
+    # get biosample id map
+    biosample = {}
+    if biosample_map:
+        for row in util.file.read_tabfile_dict(biosample_map):
+            if row.get('sample') and row.get('BioSample'):
+                biosample[row['sample']] = row['BioSample']
+
+    # make output directory
+    util.file.mkdir_p(annotDir)
+    for fn in fasta_files:
+        if not fn.endswith('.fasta'):
+            raise Exception("fasta files must end in .fasta")
+        sample = os.path.basename(fn)[:-6]
+        # make .fsa files
+        fasta2fsa(fn, annotDir, biosample=biosample.get(sample))
+        # make .src files
+        if master_source_table:
+            shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src'))
+        # make .cmt files
+        make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'),
+            name=sample, coverage=coverage.get(sample), seq_tech=sequencing_tech)
+
+    # run tbl2asn
+    tbl2asn = tools.tbl2asn.Tbl2AsnTool()
+    tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)
+
+def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
+    parser.add_argument('templateFile',
+        help='Submission template file (.sbt) including author and contact info')
+    parser.add_argument("fasta_files", nargs='+',
+        help="Input fasta files")
+    parser.add_argument("annotDir",
+        help="Output directory with genbank submission files (.tbl files must already be there)")
+    parser.add_argument('--comment', default=None,
+        help='comment field')
+    parser.add_argument('--sequencing_tech', default=None,
+        help='sequencing technology (e.g. Illumina HiSeq 2500)')
+    parser.add_argument('--master_source_table', default=None,
+        help='source modifier table')
+    parser.add_argument("--biosample_map",
+        help="""A file with two columns and a header: sample and BioSample.
+            This file may refer to samples that are not included in this submission.""")
+    parser.add_argument('--coverage_table', default=None,
+        help='''A genome coverage report file with a header row. The table must
+            have at least two columns named sample and aln2self_cov_median. All other
+            columns are ignored. Rows referring to samples not in this submission are
+            ignored.''')
+    util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None)))
+    util.cmd.attach_main(parser, prep_genbank_files, split_args=True)
+    return parser
+__commands__.append(('prep_genbank_files', parser_prep_genbank_files))
+
+
+def prep_sra_table(lib_fname, biosampleFile, md5_fname, outFile):
+    ''' This is a very lazy hack that creates a basic table that can be
+        pasted into various columns of an SRA submission spreadsheet. It probably
+        doesn't work in all cases.
+    '''
+    metadata = {}
+
+    with open(biosampleFile, 'rU') as inf:
+        header = inf.readline()
+        for line in inf:
+            row = line.rstrip('\n\r').split('\t')
+            metadata.setdefault(row[0], {})
+            metadata[row[0]]['biosample_accession'] = row[1]
+
+    with open(md5_fname, 'rU') as inf:
+        for line in inf:
+            row = line.rstrip('\n\r').split()
+            s = os.path.basename(row[1])
+            assert s.endswith('.cleaned.bam')
+            s = s[:-len('.cleaned.bam')]
+            metadata.setdefault(s, {})
+            metadata[s]['filename'] = row[1]
+            metadata[s]['MD5_checksum'] = row[0]
+
+    with open(outFile, 'wt') as outf:
+        header = ['biosample_accession', 'sample_name', 'library_ID', 'filename', 'MD5_checksum']
+        outf.write('\t'.join(header) + '\n')
+        with open(lib_fname, 'rU') as inf:
+            for line in inf:
+                lib = line.rstrip('\n\r')
+                parts = lib.split('.')
+                assert len(parts) > 1 and parts[-1].startswith('l')
+                s = '.'.join(parts[:-1])
+                metadata.setdefault(s, {})
+                metadata[s]['library_ID'] = lib
+                metadata[s]['sample_name'] = s
+                outf.write('\t'.join(metadata[s].get(h, '') for h in header) + '\n')
+
+def parser_prep_sra_table(parser=argparse.ArgumentParser()):
+    parser.add_argument('lib_fname',
+        help='A file that lists all of the library IDs that will be submitted in this batch')
+    parser.add_argument("biosampleFile",
+        help="""A file with two columns and a header: sample and BioSample.
+            This file may refer to samples that are not included in this submission.""")
+    parser.add_argument("md5_fname",
+        help="""A file with two columns and no header. The two columns are MD5 checksum and filename.
+            Should contain an entry for every bam file being submitted in this batch.
+            This is typical output from "md5sum *.cleaned.bam".""")
+    parser.add_argument("outFile",
+        help="Output table that contains most of the variable columns needed for SRA submission.")
+    util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
+    util.cmd.attach_main(parser, prep_sra_table, split_args=True)
+    return parser
+__commands__.append(('prep_sra_table', parser_prep_sra_table))
+
+
 def full_parser():
     return util.cmd.make_parser(__commands__, __doc__)
 if __name__ == '__main__':
diff --git a/pipes/config.json b/pipes/config.json
index 99a319dd9..2098d32be 100644
--- a/pipes/config.json
+++ b/pipes/config.json
@@ -18,14 +18,18 @@
     "lastal_refDb": "/idi/sabeti-scratch/dpark/ebov/lastal_db/ebola",
     "spikeinsDb": "/idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.fasta",
     "ref_genome": "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/genome.fasta",
+    "ref_annot": "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/features.tbl",
     "snpEff_genome": "KJ660346.2",
     "assembly_min_length": [15000],
     "assembly_min_unambig": 0.95,
-
-    "align_outgroups": {
-        "ebov_2014": "",
-        "ebov": ""
+
+    "genbank": {
+        "author_template": "NCBI/authors.sbt",
+        "source_modifier_table": "NCBI/sample_meta.src",
+        "biosample_map": "NCBI/biosample-map.txt",
+        "sequencing_technology": "Illumina HiSeq 2500; Nextera LC",
+        "comment": "Please be aware that the annotation is done automatically with little or no manual curation."
     },
 
     "seq_center": "BI",
@@ -50,7 +54,8 @@
         "align_self": "02_align_to_self",
         "align_ref": "03_align_to_ref",
         "interhost": "03_interhost",
-        "intrahost": "04_intrahost"
+        "intrahost": "04_intrahost",
+        "annot": "05_genbank"
     },
     "dataDir": "data",
     "tmpDir": "tmp",
diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules
index bc8e5a9e8..13eed4401 100644
--- a/pipes/rules/interhost.rules
+++ b/pipes/rules/interhost.rules
@@ -72,7 +72,48 @@ rule ref_guided_diversity:
             merge_vcfs(params.inVcfs, params.refGenome, output[1])
 
-
+rule annot_transfer:
+    input:  config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
+    output: config["dataDir"]+'/'+config["subdirs"]["annot"]+'/{sample}.tbl'
+    resources: mem=4
+    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
+            logid="{sample}",
+            refGenome=config["ref_genome"],
+            refAnnot=config["ref_annot"]
+    shell:  "{config[binDir]}/ncbi.py tbl_transfer {params.refGenome} {params.refAnnot} {input} {output} --oob_clip"
+
+rule prepare_genbank:
+    input:
+            config["reportsDir"]+"/summary.assembly.txt",
+            expand("{dir}/{subdir}/{sample}.tbl",
+                dir=[config["dataDir"]], subdir=[config["subdirs"]["annot"]],
+                sample=read_samples_file(config["samples_assembly"]))
+    output:
+            config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'
+    params:
+            LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
+            fasta_files=expand("{dir}/{subdir}/{sample}.fasta",
+                dir=[config["dataDir"]], subdir=[config["subdirs"]["assembly"]],
+                sample=read_samples_file(config["samples_assembly"])),
+            genbank_template=config.get('genbank', {}).get('author_template', ''),
+            genbank_source_table=config.get('genbank', {}).get('source_modifier_table', ''),
+            biosample_map=config.get('genbank', {}).get('biosample_map', ''),
+            seq_tech=config.get('genbank', {}).get('sequencing_technology', 'unknown'),
+            comment=config.get('genbank', {}).get('comment', ''),
+            logid="all"
+    shell:
+            ' '.join(["{config[binDir]}/ncbi.py prep_genbank_files",
+                "{params.genbank_template} {params.fasta_files}",
+                "{config[dataDir]}/{config[subdirs][annot]}",
+                "--master_source_table {params.genbank_source_table}",
+                "--sequencing_tech '{params.seq_tech}'",
+                "--biosample_map {params.biosample_map}",
+                "--coverage_table {config[reportsDir]}/summary.assembly.txt",
+                "--comment '{params.comment}'"])
+
+rule all_annot:
+    input:
+            config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'
diff --git a/tools/tbl2asn.py b/tools/tbl2asn.py
new file mode 100644
index 000000000..e3ea72298
--- /dev/null
+++ b/tools/tbl2asn.py
@@ -0,0 +1,73 @@
+'''
+    tbl2asn - from NCBI
+    http://www.ncbi.nlm.nih.gov/genbank/tbl2asn2/
+'''
+
+__author__ = "dpark@broadinstitute.org"
+
+import logging, tools, util.file
+import os, os.path, subprocess, gzip
+
+url = 'ftp://ftp.ncbi.nih.gov/toolbox/ncbi_tools/converters/by_program/tbl2asn/{os}.tbl2asn.gz'
+
+log = logging.getLogger(__name__)
+
+class Tbl2AsnTool(tools.Tool):
+    def __init__(self, install_methods=None):
+        if install_methods is None:
+            install_methods = [DownloadGzipBinary(url.format(os=get_bintype()), 'tbl2asn')]
+        tools.Tool.__init__(self, install_methods=install_methods)
+
+    def version(self):
+        return None
+
+    def execute(self, templateFile, inputDir, outputDir=None,
+            source_quals=[], comment=None, verification='vb',
+            file_type='s', structured_comment_file=None,
+            per_genome_comment=False):
+
+        toolCmd = [self.install_and_get_path(), '-t', templateFile]
+
+        if inputDir:
+            toolCmd += ['-p', inputDir]
+        if outputDir:
+            toolCmd += ['-r', outputDir]
+        if source_quals:
+            toolCmd.append('-j')
+            toolCmd.append(' '.join("[{}={}]".format(k, v) for k, v in source_quals))
+        if comment:
+            toolCmd += ['-y', comment]
+        if structured_comment_file:
+            toolCmd += ['-w', structured_comment_file]
+        if verification:
+            toolCmd += ['-V', verification]
+        if file_type:
+            toolCmd += ['-a', file_type]
+        if per_genome_comment:
+            toolCmd += ['-X', 'C']
+
+        log.debug(' '.join(toolCmd))
+        subprocess.check_call(toolCmd)
+
+
+def get_bintype():
+    uname = os.uname()
+    if uname[0] == 'Darwin':
+        return 'mac'
+    elif uname[0] == 'Linux':
+        if uname[4] == 'x86_64':
+            return 'linux64'
+        else:
+            return 'linux'
+    else:
+        raise Exception('unsupported OS')
+
+class DownloadGzipBinary(tools.DownloadPackage):
+    def unpack(self, download_dir):
+        util.file.mkdir_p(self.destination_dir)
+        if (self.download_file.endswith('.gz')
+                and not self.download_file.endswith('.tar.gz')):
+            with gzip.open(os.path.join(download_dir, self.download_file)) as inf:
+                with open(self.targetpath, 'wb') as outf:
+                    outf.write(inf.read())
+            os.chmod(self.targetpath, 0o755)
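
Usage sketch (illustration only, not part of the patch): besides the Snakemake rules above, the
two main new ncbi.py functions can be driven directly from Python. All paths below (ref.fasta,
ref.tbl, sample1.fasta, the NCBI/ metadata files, reports/summary.assembly.txt) are hypothetical
placeholders.

    # hypothetical driver script at the repo root, alongside ncbi.py
    import os
    import ncbi

    os.makedirs('annot', exist_ok=True)

    # 1. Map the reference TBL annotations onto a new assembly, clipping any
    #    features that overhang the ends of the assembled sequence.
    ncbi.tbl_transfer('ref.fasta', 'ref.tbl', 'sample1.fasta',
                      'annot/sample1.tbl', oob_clip=True)

    # 2. Build .fsa/.src/.cmt files in the annotation directory (which already
    #    holds the .tbl from step 1) and run tbl2asn to produce .sqn files.
    ncbi.prep_genbank_files('NCBI/authors.sbt', ['sample1.fasta'], 'annot',
        master_source_table='NCBI/sample_meta.src',
        comment='automated annotation with little or no manual curation',
        sequencing_tech='Illumina HiSeq 2500',
        coverage_table='reports/summary.assembly.txt',
        biosample_map='NCBI/biosample-map.txt')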