
Commit

Merge pull request #2 from rbutleriii/batch_fix
Batch fix released
Robert Butler authored Mar 28, 2018
2 parents 9f3a274 + b88baef commit 14b942b
Showing 9 changed files with 122 additions and 110,806 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -15,7 +15,7 @@ These and other stats are returned on a per variant basis, in a table, and addit
## Code Example

```bash
usage: clinotator.py [-h] [--log] [-o prefix] [--version] -e EMAIL -t {vid,rsid,vcf} file [file ...]
usage: clinotator.py [-h] [--log] [--long-log] [-o prefix] [--version] -e EMAIL -t {vid,rsid,vcf} file [file ...]

Clinical interpretation of ambiguous ClinVar annotations

@@ -25,6 +25,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
--log create logfile
--long-log create detailed logfile
-o prefix choose an alternate prefix for outfiles
--version show program's version number and exit
@@ -72,7 +73,7 @@ WARNING:root:ClinVar significance for 3521 does not include B,B/LB,LB,US,LP,LP/P
WARNING:root:VID: 55794 does not have valid clinical assertions!
```
The warnings, as well as some additional information, can be stored in the log file with `--log`
The warnings, as well as some additional information, can be stored in the log file with `--log`. `--long-log` will store detailed debugging information, but the file will be larger than the output tsv file. Both log files append information, so batch runs or especially large lists of variants may result in large file sizes.
### Dependencies
30 changes: 20 additions & 10 deletions clinotator/clinotator.py
@@ -48,6 +48,7 @@ def getargs():
description='Clinical interpretation of ambiguous'
' ClinVar annotations')
parser.add_argument('--log', action='store_true', help='create logfile')
parser.add_argument('--long-log', action='store_true', help='create detailed logfile')
parser.add_argument('-o', metavar='prefix', dest='outprefix',
default='clinotator',
help='choose an alternate prefix for outfiles')
@@ -67,6 +68,17 @@ def getargs():
'vcf - vcf file (output vcf generated)')
return parser.parse_args()

# choosing log options
def log_opts(log, long_log, outprefix):
if log:
logging.basicConfig(level=logging.WARN,
filename="{}.log".format(outprefix))
elif long_log:
logging.basicConfig(level=logging.DEBUG,
filename="{}.log".format(outprefix))
else:
logging.basicConfig(level=logging.WARN)
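
A side note on the new `log_opts` helper: `logging.basicConfig` opens its file in append mode by default, which is why the README change above warns that batch runs grow the log. A minimal sketch of that behavior, assuming the same prefix-based filename pattern:

```python
import logging

# basicConfig's default filemode is 'a' (append), so every run with the same
# prefix keeps writing to the end of the same file; pass filemode='w' to
# truncate instead. The filename mirrors the outprefix pattern used above.
logging.basicConfig(level=logging.WARNING, filename="clinotator.log")
logging.warning("this line is appended on every run")
```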

# how to handle file types, returns vcf_tbl or False for output
def input_selection(file_type, file, outprefix, query_results):
try:
@@ -89,13 +101,14 @@ def input_selection(file_type, file, outprefix, query_results):
return vcf_tbl

except IOError as e:
logging.fatal(error_handling())
logging.critical(error_handling())
print('Unable to open file, {}'.format(error_handling()))

except:
logging.fatal(error_handling())
logging.critical(error_handling())

# exploding list cells in dataframe into separate rows.
# function courtesy of @MaxU on stackoverflow question-12680754
def explode(df, lst_cols, fill_value=''):
# make sure `lst_cols` is a list
if lst_cols and not isinstance(lst_cols, list):
@@ -147,19 +160,16 @@ def output_files(vcf_tbl, variant_objects, outprefix):
index=False, na_rep='.')
return


def main():
args = getargs()

if args.log:
logging.basicConfig(level=logging.DEBUG, filename="clinotator.log")
else:
logging.basicConfig(level=logging.WARN)

Entrez.email = args.email
log_opts(args.log, args.long_log, args.outprefix)
logging.debug('CLI inputs are {} {} {} {}'
.format(args.type, args.email, args.input, args.outprefix))
Entrez.email = args.email

for file in args.input:
logging.warning('Starting on {}'.format(file))
query_results = []
variant_objects = []
base = os.path.basename(file)
@@ -175,7 +185,7 @@ def main():

output_files(vcf_tbl, variant_objects, outprefix)
logging.debug('file written to {}.tsv'.format(outprefix))
sys.exit()
sys.exit()

if __name__ == '__main__':
main()
134 changes: 89 additions & 45 deletions clinotator/getncbi.py
@@ -28,37 +28,8 @@ def timing_tool():
except:
return time.time()

# getting xml variation files for query_results list,
def get_ncbi_xml(file_type, id_list, query_results):
logging.debug('{} list -> {}'.format(file_type, id_list))
start = timing_tool()
staging_list = []
id_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list

if file_type == 'rsid':
webenv2, query_key2 = post_ncbi(file_type, 'epost', db='snp',
id=",".join(id_list))
webenv1, query_key1 = post_ncbi(file_type, 'elink', db='clinvar',
webenv=webenv2, query_key=query_key2,
dbfrom='snp', linkname='snp_clinvar',
cmd='neighbor_history')
elif file_type == 'vid':
webenv1, query_key1 = post_ncbi(file_type, 'epost', db='clinvar',
id=",".join(id_list))
else:
logging.fatal('Error: Incorrect file_type argument in get_rsid_xml ->'
' {}'.format(file_type))

batch_ncbi('efetch', staging_list, id_list, db='clinvar',
rettype='variation', retmax=g.batch_size, webenv=webenv1,
query_key=query_key1)
[query_results.append(batch) for batch in staging_list]
stop = timing_tool()
logging.debug('Download time: {} min, Batches run -> {}'
.format(((stop-start)/60), len(query_results)))
return

# epost or elink list to e-utilities history server for target db.
# epost list to e-utilities history server for target db.
# Can't elink over 1000, no max on epost
def post_ncbi(file_type, query_type, **kwargs):
logging.debug('{} to ncbi using kwargs: {}'.format(query_type, kwargs))
handle = getattr(Entrez, query_type)(**kwargs)
@@ -77,39 +48,41 @@ def post_ncbi(file_type, query_type, **kwargs):
.format(webenv, query_key))
return webenv, query_key

# Utilized for batch efetch/esummary from ncbi. HTTPError and retry
# Utilized for batch efetch/esummary from ncbi using history server.
def batch_ncbi(query_type, query_results, id_list, **kwargs):
count = len(id_list)

for start in range(0, count, g.batch_size):
end = min(count, start + g.batch_size)
for start in range(0, count, g.efetch_batch):
end = min(count, start + g.efetch_batch)
logging.debug('{} run with {}'
.format(query_type, dict(retstart=start, **kwargs)))
print("Going to download record %i to %i" % (start+1, end))
attempt = 0
while attempt < 3:
attempt += 1
print("Going to download record {} to {}".format(start + 1, end))

for attempt in range(4): # HTTPError and retry three times
try:
fetch_handle = getattr(Entrez, query_type) \
(**dict(retstart=start, **kwargs))

except ValueError as oops:
logging.warning('Likely total = batch size')
logging.warning('Empty set: Likely total number = batch size')
break

except HTTPError as err:
if 500 <= err.code <= 599:
print("Received error from server %s" % err)
print("Attempt %i of 3" % attempt)
logging.debug(err)
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
time.sleep(15)
elif err.code == 400:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
sys.tracebacklimit = None
break
time.sleep(15)
else:
raise
else:
break
else:
logging.critical('Failed to connect to NCBI 4 times exiting...')
sys.exit()

# Ideally Entrez.read would import as python object, and validate with
# ClinVar, but ClinVar no longer declares DOCTYPE for validation in
# their xml returns. They have .xsd for this purpose, but users
@@ -121,3 +94,74 @@ def batch_ncbi(query_type, query_results, id_list, **kwargs):
# fetch_handle.close()
query_results.append(fetch_handle.read())
return
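
The retry loop above relies on Python's for/else: the else branch runs only when the loop finishes without hitting a break, i.e. all four attempts failed. A stripped-down sketch of the same pattern, with a generic callable standing in for the Entrez efetch/elink call:

```python
import time

def fetch_with_retries(fetch, retries=4, wait=1):
    """Toy version of the retry loop; `fetch` is any callable that may raise."""
    for attempt in range(retries):
        try:
            result = fetch()
        except IOError as err:   # the real code catches urllib's HTTPError
            print("Attempt {} of {}: {}".format(attempt + 1, retries, err))
            time.sleep(wait)
        else:
            break                # success: leave the retry loop early
    else:
        # reached only when no break happened, i.e. every attempt raised
        raise RuntimeError("all {} attempts failed".format(retries))
    return result
```
For example, `fetch_with_retries(lambda: 42)` returns 42 on the first pass, while a callable that always raises IOError exhausts the attempts and raises RuntimeError.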

# E-utilities batch queries w/o history server, useful for elink.
# Unreliable over batch size of 1000
def batch_nohist(query_type, id_list, **kwargs):
count = len(id_list)
result_list = []

for start in range(0, count, g.elink_batch):
end = min(count, start + g.elink_batch)
logging.debug('{} run with {}'
.format(query_type,
dict(id=",".join(id_list[start:end]), **kwargs)))
print("Looking up VIDs for rsIDs {} to {}".format(start + 1, end))

for attempt in range(4): # HTTPError and retry three times
try:
fetch_handle = getattr(Entrez, query_type) \
(**dict(id=",".join(id_list[start:end]), **kwargs))
except ValueError as oops:
logging.warning('Empty set: Likely total number = batch size')
break
except HTTPError as err:
if 500 <= err.code <= 599:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
time.sleep(15)
elif err.code == 400:
print("Attempt {} of 4".format(attempt + 1))
logging.warning(err)
sys.tracebacklimit = None
time.sleep(15)
else:
raise
else:
break
else:
logging.critical('Failed to connect to NCBI 4 times, exiting...')
sys.exit()

record = Entrez.read(fetch_handle)
result_list.extend(
[link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
return result_list
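
For context on the list comprehension that closes batch_nohist: Biopython's Entrez.read on an elink response yields a list of link-set records, each with a 'LinkSetDb' list whose 'Link' entries carry the linked IDs. A hand-built record of that shape, with invented IDs, shows why the double indexing works:

```python
# Stand-in for one parsed elink result; the IDs here are made up.
record = [{'LinkSetDb': [{'DbTo': 'clinvar', 'LinkName': 'snp_clinvar',
                          'Link': [{'Id': '12345'}, {'Id': '67890'}]}]}]

vid_list = [link['Id'] for link in record[0]['LinkSetDb'][0]['Link']]
print(vid_list)  # ['12345', '67890']
```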

# getting xml variation files for query_results list,
# main
def get_ncbi_xml(file_type, id_list, query_results):
logging.debug('{} list -> {}'.format(file_type, id_list))
start = timing_tool()
staging_list = []

if file_type == 'rsid':
id_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
vid_list = batch_nohist('elink', id_list, db='clinvar',
dbfrom='snp', linkname='snp_clinvar')
elif file_type == 'vid':
vid_list = list(filter(None, id_list)) # costs ~ 6 sec per 50k list
else:
logging.critical('Error: Incorrect file_type argument in get_rsid_xml ->'
' {}'.format(file_type))

webenv1, query_key1 = post_ncbi(file_type, 'epost', db='clinvar',
id=",".join(vid_list))
batch_ncbi('efetch', staging_list, vid_list, db='clinvar',
rettype='variation', retmax=g.efetch_batch, webenv=webenv1,
query_key=query_key1)
[query_results.append(batch) for batch in staging_list]
stop = timing_tool()
logging.info('Download time: {} min, Batches run -> {}'
.format(((stop-start)/60), len(query_results)))
return
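
To make the batching arithmetic concrete: with efetch_batch = 4500 (defined in global_vars.py below), a hypothetical list of 10,000 posted VIDs comes back in three efetch windows. A toy illustration of the retstart ranges:

```python
efetch_batch = 4500   # value from global_vars.py
count = 10000         # hypothetical number of posted VIDs

for start in range(0, count, efetch_batch):
    end = min(count, start + efetch_batch)
    print("Going to download record {} to {}".format(start + 1, end))
# Going to download record 1 to 4500
# Going to download record 4501 to 9000
# Going to download record 9001 to 10000
```
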
5 changes: 3 additions & 2 deletions clinotator/global_vars.py
@@ -10,13 +10,14 @@
See main, eventually tests will be added for this module
'''

__version__ = "0.2.4"
__version__ = "0.3.0"


### getncbi.py global variables

# batch size for querying NCBI, not above 5000
batch_size = 4500
elink_batch = 1000
efetch_batch = 4500

# name of software for ncbi query tagging (preferred by NCBI)
etool = 'Clinotator'
14 changes: 7 additions & 7 deletions clinotator/variation.py
@@ -59,7 +59,7 @@ def average_list_age(vid, age_list):
return np.nanmean(age_list)

except:
logging.warn('VID: {} does not have valid clinical assertions!'
logging.warning('VID: {} does not have valid clinical assertions!'
.format(vid))
return None

@@ -69,7 +69,7 @@ def key_test(test_dict, test_key):
try:
return test_dict[test_key]
except KeyError:
logging.warn('{} is not an expected key'.format(test_key))
logging.warning('{} is not an expected key'.format(test_key))

# evaluates reclassification recommendation for CTRR
def reclassification_tree(ctps_index, cvcs_index):
@@ -172,7 +172,7 @@ def observation_parse(self, variationreport):

elif (observation.get('VariationID') == self.VID and
run_already):
logging.warn('{} has multiple observation fields in its recor'
logging.warning('{} has multiple observation fields in its recor'
'd omitting as an annotation error. Check rsid(s'
') {} manually'.format(self.VID, self.RSID))
continue
@@ -199,7 +199,7 @@ def assertion_table_stats(self, variationreport):
.find('./ClinicalSignificance')
.get('DateLastEvaluated'))
except:
logging.warn('{} has a missing assertion date!'
logging.warning('{} has a missing assertion date!'
.format(self.VID))
continue

@@ -247,11 +247,11 @@ def analysis_stats(self):
run_already = True
cvcs_index = 3
elif clinsig in [x[0] for x in g.ctps_cutoffs] and run_already:
logging.warn('multiple CVCS statements for {}, only using fir'
'st one!'.format(self.VID))
logging.warning('multiple CVCS statements for {}, only using '
'first one!'.format(self.VID))

if cvcs_index is None:
logging.warn('ClinVar significance for {} does not include B,B/LB'
logging.warning('ClinVar significance for {} does not include B,B/LB'
',LB,US,LP,LP/P,P'.format(self.VID))
self.CTPS = None
self.CTRR = 0
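
Since getncbi.py keeps the raw XML text (fetch_handle.read()) rather than a parsed Entrez object, variation.py parses the records itself with ElementTree-style calls such as .find('./ClinicalSignificance').get('DateLastEvaluated'). A minimal sketch of that lookup, using an invented fragment in place of a real ClinVar variation report:

```python
import xml.etree.ElementTree as ET

# Invented fragment for illustration only; the real ClinVar variation report
# XML is far larger and its exact layout is not shown in this diff.
xml_text = """<Assertion>
  <ClinicalSignificance DateLastEvaluated="2017-06-01">
    <Description>Uncertain significance</Description>
  </ClinicalSignificance>
</Assertion>"""

assertion = ET.fromstring(xml_text)
date = assertion.find('./ClinicalSignificance').get('DateLastEvaluated')
print(date)  # 2017-06-01
```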
