Merge pull request #124 from broadinstitute/dp-annot

code for NCBI submissions
broadinstitute · Apr 13, 2015 · 2da9d08 · 2da9d08
2 parents 2061965 + cd7dce4
commit 2da9d08
Show file tree

Hide file tree

Showing 4 changed files with 379 additions and 8 deletions.
diff --git a/ncbi.py b/ncbi.py
@@ -6,12 +6,264 @@
 __author__ = "PLACEHOLDER"
 __commands__ = []
 
-import argparse, logging
-import util.cmd, util.file, util.vcf, util.misc
+import argparse, logging, collections, shutil, os, os.path
+import Bio.SeqIO
+import util.cmd, util.file, util.version
+import tools.tbl2asn
+import interhost
 
 log = logging.getLogger(__name__)
 
 
+
+def fasta_chrlens(fasta):
+    out = collections.OrderedDict()
+    with open(fasta, 'rt') as inf:
+        for seq in Bio.SeqIO.parse(inf, 'fasta'):
+            out[seq.id] = len(seq)
+    return out
+
+def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False):
+    ''' This function takes an NCBI TBL file describing features on a genome
+        (genes, etc) and transfers them to a new genome.
+    '''
+    cmap = interhost.CoordMapper(ref_fasta, alt_fasta)
+    alt_chrlens = fasta_chrlens(alt_fasta)
+
+    with open(ref_tbl, 'rt') as inf:
+        with open(out_tbl, 'wt') as outf:
+            for line in inf:
+                line = line.rstrip('\r\n')
+                if not line:
+                    pass
+                elif line.startswith('>'):
+                    # sequence identifier
+                    if not line.startswith('>Feature '):
+                        raise Exception("not sure how to handle a non-Feature record")
+                    seqid = line[len('>Feature '):].strip()
+                    if not (seqid.startswith('gb|') and seqid.endswith('|') and len(seqid)>4):
+                        raise Exception("reference annotation does not refer to a GenBank accession")
+                    seqid = seqid[3:-1]
+                    altid = cmap.mapAtoB(seqid)
+                    line = '>Feature ' + altid
+                    feature_keep = True
+                elif line[0] != '\t':
+                    # feature with numeric coordinates (map them)
+                    row = line.split('\t')
+                    if not len(row)>=2:
+                        raise Exception("this line has only one column?")
+                    row[0] = int(row[0])
+                    row[1] = int(row[1])
+                    if row[1] >= row[0]:
+                        row[0] = cmap.mapAtoB(seqid, row[0], -1)[1]
+                        row[1] = cmap.mapAtoB(seqid, row[1],  1)[1]
+                    else:
+                        # negative strand feature
+                        row[0] = cmap.mapAtoB(seqid, row[0],  1)[1]
+                        row[1] = cmap.mapAtoB(seqid, row[1], -1)[1]
+
+                    if row[0] and row[1]:
+                        feature_keep = True
+                    elif row[0]==None and row[1]==None:
+                        # feature completely out of bounds
+                        feature_keep = False
+                        continue
+                    else:
+                        # feature overhangs end of sequence
+                        if oob_clip:
+                            if row[0]==None:
+                                row[0] = '<1'
+                            if row[1]==None:
+                                row[1] = '>{}'.format(alt_chrlens[altid])
+                        else:
+                            feature_keep = False
+                            continue
+                    line = '\t'.join(map(str,row))
+                else:
+                    # feature notes
+                    if not feature_keep:
+                        # skip any lines that follow a skipped feature
+                        continue
+                    elif 'protein_id' in line:
+                        # skip any lines that refer to an explicit protein_id
+                        continue
+                outf.write(line+'\n')
+
+def parser_tbl_transfer(parser=argparse.ArgumentParser()):
+    parser.add_argument("ref_fasta", help="Input sequence of reference genome")
+    parser.add_argument("ref_tbl", help="Input reference annotations (NCBI TBL format)")
+    parser.add_argument("alt_fasta", help="Input sequence of new genome")
+    parser.add_argument("out_tbl", help="Output file with transferred annotations")
+    parser.add_argument('--oob_clip', default=False, action='store_true',
+        help='''Out of bounds feature behavior.
+        False: drop all features that are completely or partly out of bounds
+        True:  drop all features completely out of bounds
+               but truncate any features that are partly out of bounds''')
+    util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None)))
+    util.cmd.attach_main(parser, tbl_transfer, split_args=True)
+    return parser
+__commands__.append(('tbl_transfer', parser_tbl_transfer))
+
+
+def fasta2fsa(infname, outdir, biosample=None):
+    ''' copy a fasta file to a new directory and change its filename to end in .fsa
+        for NCBI's sake. '''
+    outfname = os.path.basename(infname)
+    if outfname.endswith('.fasta'):
+        outfname = outfname[:-6]
+    elif outfname.endswith('.fa'):
+        outfname = outfname[:-3]
+    if not outfname.endswith('.fsa'):
+        outfname = outfname + '.fsa'
+    outfname = os.path.join(outdir, outfname)
+    with open(infname, 'rU') as inf:
+        with open(outfname, 'wt') as outf:
+            for line in inf:
+                if line.startswith('>') and biosample:
+                    line = line.rstrip('\n\r')
+                    line = '{} [biosample={}]\n'.format(line, biosample)
+                outf.write(line)
+    return outfname
+
+def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
+    with open(cmt_fname, 'wt') as outf:
+        outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
+        outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(
+            util.version.get_version()))
+        if name:
+            outf.write("Assembly Name\t{}\n".format(name))
+        if coverage:
+            coverage = str(coverage)
+            if not coverage.endswith('x') or coverage.endswith('X'):
+                coverage = coverage + 'x'
+            outf.write("Genome Coverage\t{}\n".format(coverage))
+        if seq_tech:
+            outf.write("Sequencing Technology\t{}\n".format(seq_tech))
+        outf.write("StructuredCommentSuffix\t##Genome-Assembly-Data-END##\n")
+
+def prep_genbank_files(templateFile, fasta_files, annotDir,
+        master_source_table=None, comment=None, sequencing_tech=None,
+        coverage_table=None, biosample_map=None):
+    ''' Prepare genbank submission files.  Requires .fasta and .tbl files as input,
+        as well as numerous other metadata files for the submission.  Creates a
+        directory full of files (.sqn in particular) that can be sent to GenBank.
+    '''
+    # get coverage map
+    coverage = {}
+    if coverage_table:
+        for row in util.file.read_tabfile_dict(coverage_table):
+            if row.get('sample') and row.get('aln2self_cov_median'):
+                coverage[row['sample']] = row['aln2self_cov_median']
+
+    # get biosample id map
+    biosample = {}
+    if biosample_map:
+        for row in util.file.read_tabfile_dict(biosample_map):
+            if row.get('sample') and row.get('BioSample'):
+                biosample[row['sample']] = row['BioSample']
+
+    # make output directory
+    util.file.mkdir_p(annotDir)
+    for fn in fasta_files:
+        if not fn.endswith('.fasta'):
+            raise Exception("fasta files must end in .fasta")
+        sample = os.path.basename(fn)[:-6]
+        # make .fsa files
+        fasta2fsa(fn, annotDir, biosample=biosample.get(sample))
+        # make .src files
+        if master_source_table:
+            shutil.copy(master_source_table, os.path.join(annotDir, sample+'.src'))
+        # make .cmt files
+        make_structured_comment_file(os.path.join(annotDir, sample+'.cmt'),
+            name=sample, coverage=coverage.get(sample), seq_tech=sequencing_tech)
+
+    # run tbl2asn
+    tbl2asn = tools.tbl2asn.Tbl2AsnTool()
+    tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)
+
+def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
+    parser.add_argument('templateFile',
+        help='Submission template file (.sbt) including author and contact info')
+    parser.add_argument("fasta_files", nargs='+',
+        help="Input fasta files")
+    parser.add_argument("annotDir",
+        help="Output directory with genbank submission files (.tbl files must already be there)")
+    parser.add_argument('--comment', default=None,
+        help='comment field')
+    parser.add_argument('--sequencing_tech', default=None,
+        help='sequencing technology (e.g. Illumina HiSeq 2500)')
+    parser.add_argument('--master_source_table', default=None,
+        help='source modifier table')
+    parser.add_argument("--biosample_map",
+        help="""A file with two columns and a header: sample and BioSample.
+        This file may refer to samples that are not included in this submission.""")
+    parser.add_argument('--coverage_table', default=None,
+        help='''A genome coverage report file with a header row.  The table must
+        have at least two columns named sample and aln2self_cov_median.  All other
+        columns are ignored. Rows referring to samples not in this submission are
+        ignored.''')
+    util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None)))
+    util.cmd.attach_main(parser, prep_genbank_files, split_args=True)
+    return parser
+__commands__.append(('prep_genbank_files', parser_prep_genbank_files))
+
+
+def prep_sra_table(lib_fname, biosampleFile, md5_fname, outFile):
+    ''' This is a very lazy hack that creates a basic table that can be
+        pasted into various columns of an SRA submission spreadsheet.  It probably
+        doesn't work in all cases.
+    '''
+    metadata = {}
+
+    with open(biosampleFile, 'rU') as inf:
+        header = inf.readline()
+        for line in inf:
+            row = line.rstrip('\n\r').split('\t')
+            metadata.setdefault(row[0], {})
+            metadata[row[0]]['biosample_accession'] = row[1]
+
+    with open(md5_fname, 'rU') as inf:
+        for line in inf:
+            row = line.rstrip('\n\r').split()
+            s = os.path.basename(row[1])
+            assert s.endswith('.cleaned.bam')
+            s = s[:-len('.cleaned.bam')]
+            metadata.setdefault(s, {})
+            metadata[s]['filename'] = row[1]
+            metadata[s]['MD5_checksum'] = row[0]
+
+    with open(outFile, 'wt') as outf:
+        header = ['biosample_accession', 'sample_name', 'library_ID', 'filename', 'MD5_checksum']
+        outf.write('\t'.join(header)+'\n')
+        with open(lib_fname, 'rU') as inf:
+            for line in inf:
+                lib = line.rstrip('\n\r')
+                parts = lib.split('.')
+                assert len(parts)>1 and parts[-1].startswith('l')
+                s = '.'.join(parts[:-1])
+                metadata.setdefault(s, {})
+                metadata[s]['library_ID'] = lib
+                metadata[s]['sample_name'] = s
+                outf.write('\t'.join(metadata[s].get(h,'') for h in header)+'\n')
+
+def parser_prep_sra_table(parser=argparse.ArgumentParser()):
+    parser.add_argument('lib_fname',
+        help='A file that lists all of the library IDs that will be submitted in this batch')
+    parser.add_argument("biosampleFile",
+        help="""A file with two columns and a header: sample and BioSample.
+        This file may refer to samples that are not included in this submission.""")
+    parser.add_argument("md5_fname",
+        help="""A file with two columns and no header.  Two columns are MD5 checksum and filename.
+        Should contain an entry for every bam file being submitted in this batch.
+        This is typical output from "md5sum *.cleaned.bam".""")
+    parser.add_argument("outFile",
+        help="Output table that contains most of the variable columns needed for SRA submission.")
+    util.cmd.common_args(parser, (('loglevel',None), ('version',None)))
+    util.cmd.attach_main(parser, prep_sra_table, split_args=True)
+    return parser
+__commands__.append(('prep_sra_table', parser_prep_sra_table))
+
+
 def full_parser():
     return util.cmd.make_parser(__commands__, __doc__)
 if __name__ == '__main__':

diff --git a/pipes/config.json b/pipes/config.json
@@ -18,14 +18,18 @@
   "lastal_refDb":   "/idi/sabeti-scratch/dpark/ebov/lastal_db/ebola",
   "spikeinsDb":     "/idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.fasta",
   "ref_genome":     "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/genome.fasta",
+  "ref_annot":      "/idi/sabeti-scratch/genomes/ebov/KJ660346.2/features.tbl",
   "snpEff_genome":  "KJ660346.2",
 
   "assembly_min_length":    [15000],
   "assembly_min_unambig":      0.95,
-
-  "align_outgroups": {
-    "ebov_2014":    "",
-    "ebov":         ""
+
+  "genbank": {
+    "author_template":        "NCBI/authors.sbt",
+    "source_modifier_table":  "NCBI/sample_meta.src",
+    "biosample_map":          "NCBI/biosample-map.txt",
+    "sequencing_technology":  "Illumina HiSeq 2500; Nextera LC",
+    "comment":                "Please be aware that the annotation is done automatically with little or no manual curation."
   },
 
   "seq_center": "BI",
@@ -50,7 +54,8 @@
     "align_self":   "02_align_to_self",
     "align_ref":    "03_align_to_ref",
     "interhost":    "03_interhost",
-    "intrahost":    "04_intrahost"
+    "intrahost":    "04_intrahost",
+    "annot":        "05_genbank"
   },
   "dataDir":    "data",
   "tmpDir":     "tmp",

diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules
@@ -72,7 +72,48 @@ rule ref_guided_diversity:
             merge_vcfs(params.inVcfs, params.refGenome, output[1])
 
 
-
+rule annot_transfer:
+    input:      config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
+    output:     config["dataDir"]+'/'+config["subdirs"]["annot"]   +'/{sample}.tbl'
+    resources:  mem=4
+    params:     LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
+                logid="{sample}",
+                refGenome=config["ref_genome"],
+                refAnnot=config["ref_annot"]
+    shell:      "{config[binDir]}/ncbi.py tbl_transfer {params.refGenome} {params.refAnnot} {input} {output} --oob_clip"
+
+rule prepare_genbank:
+    input:
+                config["reportsDir"]+"/summary.assembly.txt",
+                expand("{dir}/{subdir}/{sample}.tbl",
+                    dir=[config["dataDir"]], subdir=[config["subdirs"]["annot"]],
+                    sample=read_samples_file(config["samples_assembly"]))
+    output:
+                config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'
+    params:
+                LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
+                fasta_files=expand("{dir}/{subdir}/{sample}.fasta",
+                    dir=[config["dataDir"]], subdir=[config["subdirs"]["assembly"]],
+                    sample=read_samples_file(config["samples_assembly"])),
+                genbank_template=config.get('genbank',{}).get('author_template', ''),
+                genbank_source_table=config.get('genbank',{}).get('source_modifier_table', ''),
+                biosample_map=config.get('genbank',{}).get('biosample_map', ''),
+                seq_tech=config.get('genbank',{}).get('sequencing_technology', 'unknown'),
+                comment=config.get('genbank',{}).get('comment', ''),
+                logid="all"
+    shell:
+                ' '.join(["{config[binDir]}/ncbi.py prep_genbank_files",
+                    "{params.genbank_template} {params.fasta_files}",
+                    "{config[dataDir]}/{config[subdirs][annot]}",
+                    "--master_source_table {params.genbank_source_table}",
+                    "--sequencing_tech '{params.seq_tech}'",
+                    "--biosample_map {params.biosample_map}",
+                    "--coverage_table {config[reportsDir]}/summary.assembly.txt",
+                    "--comment '{params.comment}'"])
+
+rule all_annot:
+    input:
+                config["dataDir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'