Skip to content

Commit

Permalink
Merge pull request #223 from hammerlab/maf_loading_fixes
Browse files Browse the repository at this point in the history
Fixes to load_maf
  • Loading branch information
tavinathanson authored Apr 28, 2017
2 parents f70f3e3 + 9f692f3 commit a9b7fbc
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 9 deletions.
5 changes: 5 additions & 0 deletions test/data/tcga_ov.head.xychr.maf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error
CDK11A 0 - 37 X 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors
GNPAT 0 - 37 Y 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors
E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors
VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors
22 changes: 22 additions & 0 deletions test/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,25 @@ def test_load_maf_dataframe():
data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error)
eq_(len(variants_df), 5)


def test_xy_contigs():
    """
    Test MAFs with X and Y chromosomes rather than just numerical chromosomes.

    The fixture contains four variant rows whose Chromosome values are
    X, Y, 1 and 11; all four must load in both error-handling modes.
    """
    for raise_on_error in [True, False]:
        # Pass the loop variable through so that both the raising and the
        # logging-only code paths of load_maf are actually exercised.
        variants = load_maf(
            data_path("tcga_ov.head.xychr.maf"),
            raise_on_error=raise_on_error)
        eq_(len(variants), 4)


def test_load_utf8():
    """
    Test MAFs loaded with utf-8 encoding.

    Loads the five-variant fixture with an explicit encoding and then
    predicts effects for the loaded variants, in both error-handling modes.
    """
    for raise_on_error in [True, False]:
        # Pass the loop variable through so that both the raising and the
        # logging-only code paths of load_maf are actually exercised.
        variants = load_maf(
            data_path("ov.wustle.subset5.maf"),
            raise_on_error=raise_on_error,
            encoding="utf-8")
        eq_(len(variants), 5)
        # Make sure we avoid "TypeError: character mapping must return
        # integer, None or unicode" from Bio.Seq. The return value is not
        # inspected; the call only has to complete without raising.
        variants.effects()
26 changes: 17 additions & 9 deletions varcode/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import pandas
from typechecks import require_string
from numpy import isnan
from pandas import isnull

from .reference import infer_genome
from .variant import Variant, variant_ascending_position_sort_key
Expand Down Expand Up @@ -48,7 +48,7 @@
]


def load_maf_dataframe(path, nrows=None, raise_on_error=True):
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
"""
Load the guaranteed columns of a TCGA MAF file into a DataFrame
Expand All @@ -62,6 +62,9 @@ def load_maf_dataframe(path, nrows=None, raise_on_error=True):
raise_on_error : bool
Raise an exception upon encountering an error or log an error
encoding : str, optional
Text encoding of the MAF file (e.g. "utf-8"), passed through to pandas.read_csv.
"""
require_string(path, "Path to MAF")

Expand All @@ -75,7 +78,8 @@ def load_maf_dataframe(path, nrows=None, raise_on_error=True):
sep="\t",
low_memory=False,
skip_blank_lines=True,
header=0)
header=0,
encoding=encoding)

if len(df.columns) < n_basic_columns:
error_message = (
Expand Down Expand Up @@ -112,7 +116,8 @@ def load_maf(
optional_cols=[],
sort_key=variant_ascending_position_sort_key,
distinct=True,
raise_on_error=True):
raise_on_error=True,
encoding=None):
"""
Load reference name and Variant objects from MAF filename.
Expand All @@ -121,7 +126,7 @@ def load_maf(
path : str
Path to MAF (*.maf).
optional_cols : list, optional
A list of MAF columns to include as metadata if they are present in the MAF.
Does not result in an error if those columns are not present.
Expand All @@ -135,10 +140,13 @@ def load_maf(
raise_on_error : bool
Raise an exception upon encountering an error or just log a warning.
encoding : str, optional
Text encoding of the MAF file (e.g. "utf-8"), passed through to load_maf_dataframe.
"""
# pylint: disable=no-member
# pylint gets confused by read_csv inside load_maf_dataframe
maf_df = load_maf_dataframe(path, raise_on_error=raise_on_error)
maf_df = load_maf_dataframe(path, raise_on_error=raise_on_error, encoding=encoding)

if len(maf_df) == 0 and raise_on_error:
raise ValueError("Empty MAF file %s" % path)
Expand All @@ -148,7 +156,7 @@ def load_maf(
metadata = {}
for _, x in maf_df.iterrows():
contig = x.Chromosome
if not contig or isnan(contig):
if isnull(contig):
error_message = "Invalid contig name: %s" % (contig,)
if raise_on_error:
raise ValueError(error_message)
Expand Down Expand Up @@ -192,8 +200,8 @@ def load_maf(
variant = Variant(
contig,
start_pos,
ref,
alt,
str(ref),
str(alt),
ensembl=ensembl)

# keep metadata about the variant and its TCGA annotation
Expand Down

0 comments on commit a9b7fbc

Please sign in to comment.