diff --git a/test/data/tcga_ov.head.xychr.maf b/test/data/tcga_ov.head.xychr.maf new file mode 100644 index 0000000..06b6125 --- /dev/null +++ b/test/data/tcga_ov.head.xychr.maf @@ -0,0 +1,5 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error +CDK11A 0 - 37 X 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors +GNPAT 0 - 37 Y 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors +E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 
TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors +VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors diff --git a/test/test_maf.py b/test/test_maf.py index 0fa4b4f..780e30e 100644 --- a/test/test_maf.py +++ b/test/test_maf.py @@ -96,3 +96,25 @@ def test_load_maf_dataframe(): data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error) eq_(len(variants_df), 5) + +def test_xy_contigs(): + """ + Test that MAFs with X and Y chromosomes (not just numerical ones) load in both raise_on_error modes. + """ + for raise_on_error in [True, False]: + variants = load_maf( + data_path("tcga_ov.head.xychr.maf"), raise_on_error=raise_on_error) + eq_(len(variants), 4) + + +def test_load_utf8(): + """ + Test that MAFs load with utf-8 encoding in both raise_on_error modes. + """ + for raise_on_error in [True, False]: + variants = load_maf( + data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error, encoding="utf-8") + eq_(len(variants), 5) + # Make sure we avoid "TypeError: character mapping must return integer, None or unicode" + # from Bio.Seq. 
+ _ = variants.effects() diff --git a/varcode/maf.py b/varcode/maf.py index 02d85d5..1a74749 100644 --- a/varcode/maf.py +++ b/varcode/maf.py @@ -17,7 +17,7 @@ import pandas from typechecks import require_string -from numpy import isnan +from pandas import isnull from .reference import infer_genome from .variant import Variant, variant_ascending_position_sort_key @@ -48,7 +48,7 @@ ] -def load_maf_dataframe(path, nrows=None, raise_on_error=True): +def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None): """ Load the guaranteed columns of a TCGA MAF file into a DataFrame @@ -62,6 +62,9 @@ def load_maf_dataframe(path, nrows=None, raise_on_error=True): raise_on_error : bool Raise an exception upon encountering an error or log an error + + encoding : str, optional + Encoding to use for UTF when reading MAF file. """ require_string(path, "Path to MAF") @@ -75,7 +78,8 @@ def load_maf_dataframe(path, nrows=None, raise_on_error=True): sep="\t", low_memory=False, skip_blank_lines=True, - header=0) + header=0, + encoding=encoding) if len(df.columns) < n_basic_columns: error_message = ( @@ -112,7 +116,8 @@ def load_maf( optional_cols=[], sort_key=variant_ascending_position_sort_key, distinct=True, - raise_on_error=True): + raise_on_error=True, + encoding=None): """ Load reference name and Variant objects from MAF filename. @@ -121,7 +126,7 @@ def load_maf( path : str Path to MAF (*.maf). - + optional_cols : list, optional A list of MAF columns to include as metadata if they are present in the MAF. Does not result in an error if those columns are not present. @@ -135,10 +140,13 @@ def load_maf( raise_on_error : bool Raise an exception upon encountering an error or just log a warning. + + encoding : str, optional + Encoding to use for UTF when reading MAF file. 
""" # pylint: disable=no-member # pylint gets confused by read_csv inside load_maf_dataframe - maf_df = load_maf_dataframe(path, raise_on_error=raise_on_error) + maf_df = load_maf_dataframe(path, raise_on_error=raise_on_error, encoding=encoding) if len(maf_df) == 0 and raise_on_error: raise ValueError("Empty MAF file %s" % path) @@ -148,7 +156,7 @@ def load_maf( metadata = {} for _, x in maf_df.iterrows(): contig = x.Chromosome - if not contig or isnan(contig): + if isnull(contig): error_message = "Invalid contig name: %s" % (contig,) if raise_on_error: raise ValueError(error_message) @@ -192,8 +200,8 @@ def load_maf( variant = Variant( contig, start_pos, - ref, - alt, + str(ref), + str(alt), ensembl=ensembl) # keep metadata about the variant and its TCGA annotation