diff --git a/README.md b/README.md index c977212..acfc3c6 100644 --- a/README.md +++ b/README.md @@ -87,10 +87,10 @@ The warnings, as well as some additional information can be stored in the log fi ### Dependencies -The following from my pipenv: +The following dependencies: ``` -biopython==1.70 +biopython==1.73 - numpy [required: Any, installed: 1.14.0] pandas==0.22.0 - numpy [required: >=1.9.0, installed: 1.14.0] @@ -99,7 +99,7 @@ pandas==0.22.0 - pytz [required: >=2011k, installed: 2018.3] ``` -Numpy *should* work >= 1.9.0 and pandas >= 0.20.0, but install more recent versions if possible. For python 3.4 versions, a pip installation of pandas may give error [#20723](https://github.com/pandas-dev/pandas/issues/20723). If so, specify 'pandas<0.21'. +Numpy *should* work >= 1.9.0 and pandas >= 0.20.0, but install more recent versions if possible. For python 3.4 versions, a pip installation of pandas may give error [#20723](https://github.com/pandas-dev/pandas/issues/20723). If so, specify 'pandas<0.21'. As of December 2018, the batch query limits appear to have tightened; upgrade to biopython 1.73 (see issue [#1867](https://github.com/biopython/biopython/issues/1867)). 
### Memory/System requirements diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py index 927f1a6..ff523f2 100644 --- a/clinotator/global_vars.py +++ b/clinotator/global_vars.py @@ -10,7 +10,7 @@ See main, eventually tests will be added for this module ''' -__version__ = "1.2.2" +__version__ = "1.2.3" ### getncbi.py global variables diff --git a/clinotator/vcf.py b/clinotator/vcf.py index 630fdb7..8d83562 100644 --- a/clinotator/vcf.py +++ b/clinotator/vcf.py @@ -29,8 +29,7 @@ def parse_header(file_object, outprefix): with open('{}.anno.vcf'.format(outprefix), 'w') as outfile: header = [] info_list = [] - for index, line in enumerate( - next(file_object) for x in range(g.max_vcf_header_size)): + for index, line in zip(range(g.max_vcf_header_size), file_object): m = re.match('##([\w\-\.]+)=', line) if m and m.group(1) == 'INFO': @@ -63,7 +62,8 @@ def parse_header(file_object, outprefix): # process input vcf file, return rsids for query and vcf_tbl for output def vcf_prep(file_object, outprefix): header_count = parse_header(file_object, outprefix) - vcf_tbl = pd.read_table(file_object, skiprows=header_count, dtype=str) + vcf_tbl = pd.read_csv(file_object, sep='\t', skiprows=header_count, + dtype=str) logging.debug('vcf_tbl shape -> {}'.format(vcf_tbl.shape)) vcf_list = vcf_tbl.ID.values[vcf_tbl.ID.values != '.'].tolist() return vcf_list, vcf_tbl @@ -84,7 +84,7 @@ def cat_info_column(info, rsid, alt, out_tbl): 'CVDS': {',': '%2C', ';': '%3B'}}, regex=True, inplace=True) new_info = ['{}={}'.format(x, info_tbl[x] - .to_csv(header=None, index=False, na_rep='.') + .to_csv(header=False, index=False, na_rep='.') .strip('\n')) for x in info_columns] new_info = [string.replace('\n', ',') for string in new_info] logging.debug('{} had a match: {}'.format(rsid, new_info)) @@ -102,7 +102,7 @@ def cat_info_column(info, rsid, alt, out_tbl): with open('../test/test.vcf', 'r') as file_object: vcf_list, vcf_tbl = vcf_prep(file_object, 'test_header') 
logging.debug('vcf_list -> {}'.format(vcf_list)) - sample_tbl = pd.read_table('../test/test.tbl', dtype=str) + sample_tbl = pd.read_csv('../test/test.tbl', sep='\t', dtype=str) info_list = cat_info_column('NS=3;DP=11;AF=0.017', 'rs34376836', 'A', sample_tbl) info_list = cat_info_column('NS=3;DP=11;AF=0.017', '.', 'A', diff --git a/requirements.txt b/requirements.txt index e0e74fb..09bfcbb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ pandas -biopython +biopython>=1.73 diff --git a/test/test.tbl b/test/test.tbl new file mode 100644 index 0000000..4de742c --- /dev/null +++ b/test/test.tbl @@ -0,0 +1,8 @@ +VID CVVT rsID CVAL vcfmatch CVCS CVSZ CVNA CVDS CVLE CTRS CTAA CTPS CTRR +7 Haplotype 200401432 A ['200401432|A', '118161496|C'] Pathogenic 1 1 Mitochondrial complex I deficiency(P) 2017-09-01 . 1.0 . . +7 Haplotype 118161496 C ['200401432|A', '118161496|C'] Pathogenic 1 1 Mitochondrial complex I deficiency(P) 2017-09-01 . 1.0 . . +13979 Simple 180177040 C . Pathogenic 3 7 Cardio-facio-cutaneous syndrome(P);Cardio-facio-cutaneous syndrome(P);Inborn genetic diseases(P);Cardiofaciocutaneous syndrome 1(P);not provided(P);Cardio-facio-cutaneous syndrome(P);Rasopathy(P) 2017-04-03 37.74 2.0 Pathogenic 0 +50317 Simple 118161496 C . Conflicting interpretations of pathogenicity 1 5 not provided(P);Inborn genetic diseases(P);not provided(US);not provided(US);Mitochondrial complex I deficiency(LP) 2017-08-14 12.36 2.2 Pathogenic/Likely pathogenic 3 +55794 Simple 180177040 G . Likely pathogenic 0 0 . 2016-05-31 . . . . +211895 Simple 766877230 T . Uncertain significance 2 2 not specified(US);not specified(US) 2016-11-02 -0.48 3.0 Uncertain significance 0 +262232 Simple 34376836 A . 
Benign 2 2 Brown-Vialetto-Van Laere syndrome 1(B);not specified(B) 2017-11-08 -12.0 1.0 Benign/Likely benign 1 diff --git a/test/test_header.anno.vcf b/test/test_header.anno.vcf new file mode 100644 index 0000000..0a03e9e --- /dev/null +++ b/test/test_header.anno.vcf @@ -0,0 +1,31 @@ +##fileformat=VCFv4.3 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##annotation=CLINOTATORv1.2.3_run_2019-05-08 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT=