Merge pull request #5 from rbutleriii/pandas_update

bugfixes for dependencies py3.7
rbutleriii · May 9, 2019 · 3ce3ee1 · 3ce3ee1
2 parents a3ad494 + 6e1828e
commit 3ce3ee1
Show file tree

Hide file tree

Showing 6 changed files with 49 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -87,10 +87,10 @@ The warnings, as well as some additional information can be stored in the log fi
 
 ### Dependencies
 
-The following from my pipenv:
+The following dependencies:
 
 ```
-biopython==1.70
+biopython==1.73
   - numpy [required: Any, installed: 1.14.0]
 pandas==0.22.0
   - numpy [required: >=1.9.0, installed: 1.14.0]
@@ -99,7 +99,7 @@ pandas==0.22.0
   - pytz [required: >=2011k, installed: 2018.3]
 ```
 
-Numpy *should* work >= 1.9.0 and pandas >= 0.20.0, but install more recent versions if possible. For python 3.4 versions, a pip installation of pandas may give error [#20723](https://github.com/pandas-dev/pandas/issues/20723). If so, specify 'pandas<0.21'.
+Numpy *should* work >= 1.9.0 and pandas >= 0.20.0, but install more recent versions if possible. For python 3.4 versions, a pip installation of pandas may give error [#20723](https://github.com/pandas-dev/pandas/issues/20723). If so, specify 'pandas<0.21'. As of December 2018, the batch query limits appear to have tightened; upgrade to biopython 1.73 (see issue [#1867](https://github.com/biopython/biopython/issues/1867)).
 
 ### Memory/System requirements
 

diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py
@@ -10,7 +10,7 @@
 See main, eventually tests will be added for this module
 '''
 
-__version__ = "1.2.2"
+__version__ = "1.2.3"
 
 
 ### getncbi.py global variables 

diff --git a/clinotator/vcf.py b/clinotator/vcf.py
@@ -29,8 +29,7 @@ def parse_header(file_object, outprefix):
         with open('{}.anno.vcf'.format(outprefix), 'w') as outfile:
             header = []
             info_list = []
-            for index, line in enumerate(
-                    next(file_object) for x in range(g.max_vcf_header_size)):
+            for index, line in zip(range(g.max_vcf_header_size), file_object):
                 m = re.match('##([\w\-\.]+)=', line)
 
                 if m and m.group(1) == 'INFO':
@@ -63,7 +62,8 @@ def parse_header(file_object, outprefix):
 # process input vcf file, return rsids for query and vcf_tbl for output
 def vcf_prep(file_object, outprefix):
     header_count = parse_header(file_object, outprefix)
-    vcf_tbl = pd.read_table(file_object, skiprows=header_count, dtype=str)
+    vcf_tbl = pd.read_csv(file_object, sep='\t', skiprows=header_count,
+                          dtype=str)
     logging.debug('vcf_tbl shape -> {}'.format(vcf_tbl.shape))
     vcf_list = vcf_tbl.ID.values[vcf_tbl.ID.values != '.'].tolist()
     return vcf_list, vcf_tbl
@@ -84,7 +84,7 @@ def cat_info_column(info, rsid, alt, out_tbl):
                           'CVDS': {',': '%2C', ';': '%3B'}},
                          regex=True, inplace=True)
         new_info = ['{}={}'.format(x, info_tbl[x]
-                    .to_csv(header=None, index=False, na_rep='.')
+                    .to_csv(header=False, index=False, na_rep='.')
                     .strip('\n')) for x in info_columns]
         new_info = [string.replace('\n', ',') for string in new_info]
         logging.debug('{} had a match: {}'.format(rsid, new_info))
@@ -102,7 +102,7 @@ def cat_info_column(info, rsid, alt, out_tbl):
     with open('../test/test.vcf', 'r') as file_object:
         vcf_list, vcf_tbl = vcf_prep(file_object, 'test_header')
         logging.debug('vcf_list -> {}'.format(vcf_list))
-        sample_tbl = pd.read_table('../test/test.tbl', dtype=str)
+        sample_tbl = pd.read_csv('../test/test.tbl', sep='\t', dtype=str)
         info_list = cat_info_column('NS=3;DP=11;AF=0.017', 'rs34376836', 'A',
                                     sample_tbl)
         info_list = cat_info_column('NS=3;DP=11;AF=0.017', '.', 'A',

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
 pandas
-biopython
+biopython>=1.73
diff --git a/test/test.tbl b/test/test.tbl
@@ -0,0 +1,8 @@
+VID	CVVT	rsID	CVAL	vcfmatch	CVCS	CVSZ	CVNA	CVDS	CVLE	CTRS	CTAA	CTPS	CTRR
+7	Haplotype	200401432	A	['200401432|A', '118161496|C']	Pathogenic	1	1	Mitochondrial complex I deficiency(P)	2017-09-01	.	1.0	.	.
+7	Haplotype	118161496	C	['200401432|A', '118161496|C']	Pathogenic	1	1	Mitochondrial complex I deficiency(P)	2017-09-01	.	1.0	.	.
+13979	Simple	180177040	C	.	Pathogenic	3	7	Cardio-facio-cutaneous syndrome(P);Cardio-facio-cutaneous syndrome(P);Inborn genetic diseases(P);Cardiofaciocutaneous syndrome 1(P);not provided(P);Cardio-facio-cutaneous syndrome(P);Rasopathy(P)	2017-04-03	37.74	2.0	Pathogenic	0
+50317	Simple	118161496	C	.	Conflicting interpretations of pathogenicity	1	5	not provided(P);Inborn genetic diseases(P);not provided(US);not provided(US);Mitochondrial complex I deficiency(LP)	2017-08-14	12.36	2.2	Pathogenic/Likely pathogenic	3
+55794	Simple	180177040	G	.	Likely pathogenic	0	0	.	2016-05-31	.	.	.	.
+211895	Simple	766877230	T	.	Uncertain significance	2	2	not specified(US);not specified(US)	2016-11-02	-0.48	3.0	Uncertain significance	0
+262232	Simple	34376836	A	.	Benign	2	2	Brown-Vialetto-Van Laere syndrome 1(B);not specified(B)	2017-11-08	-12.0	1.0	Benign/Likely benign	1
diff --git a/test/test_header.anno.vcf b/test/test_header.anno.vcf
@@ -0,0 +1,31 @@
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##annotation=CLINOTATORv1.2.3_run_2019-05-08
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=VID,Number=1,Type=Integer,Description="ClinVar variation ID">
+##INFO=<ID=CVVT,Number=A,Type=String,Description="ClinVar variant type">
+##INFO=<ID=CVAL,Number=A,Type=String,Description="ClinVar alternate allele">
+##INFO=<ID=CVCS,Number=A,Type=String,Description="ClinVar clinical significance">
+##INFO=<ID=CVSZ,Number=A,Type=Integer,Description="ClinVar stars">
+##INFO=<ID=CVNA,Number=A,Type=Integer,Description="ClinVar number of clinical assertions">
+##INFO=<ID=CVDS,Number=A,Type=String,Description="ClinVar conditions">
+##INFO=<ID=CVLE,Number=A,Type=String,Description="ClinVar last evaluated">
+##INFO=<ID=CTRS,Number=A,Type=Float,Description="Clinotator raw score">
+##INFO=<ID=CTAA,Number=A,Type=Float,Description="Clinotator average clinical assertion age">
+##INFO=<ID=CTPS,Number=A,Type=String,Description="Clinotator predicted significance">
+##INFO=<ID=CTRR,Number=A,Type=String,Description="Clinotator reclassification recommendation">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">