Skip to content

Commit

Permalink
Merge pull request #505 from lschaffer2/master
Browse files Browse the repository at this point in the history
Option for only allowing common/known PTMs in identifications
  • Loading branch information
Leah Schaffer authored Apr 23, 2018
2 parents 16c4c18 + 7f8d9d7 commit 6858f2a
Show file tree
Hide file tree
Showing 16 changed files with 226 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ public string Theoretical
{
get
{
return proteoform.linked_proteoform_references != null ? proteoform.linked_proteoform_references.First().accession.Split('_')[0] : "";
return proteoform as TopDownProteoform != null ? proteoform.accession.Split('_')[0] :
proteoform.linked_proteoform_references != null ? proteoform.linked_proteoform_references.First().accession.Split('_')[0] : "";
}
}

Expand Down
5 changes: 5 additions & 0 deletions ProteoformSuiteGUI/DisplayObjects/DisplayTopDownProteoform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ public string Name
get { return t.name; }
}

/// <summary>
/// Read-only view of the <c>geneID</c> value held by the backing object <c>t</c>.
/// </summary>
public string geneID
{
    get
    {
        // Pure pass-through: no formatting or null handling is applied here.
        return t.geneID;
    }
}

public string Sequence
{
get { return t.sequence; }
Expand Down
14 changes: 14 additions & 0 deletions ProteoformSuiteGUI/ProteoformFamilies.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ProteoformSuiteGUI/ProteoformFamilies.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ public void InitializeParameterSet()

cb_count_adducts_as_id.Checked = Sweet.lollipop.count_adducts_as_identifications;

cb_only_assign_common_known_mods.Checked = Sweet.lollipop.only_assign_common_or_known_mods;


initialize_every_time();
}

Expand Down Expand Up @@ -380,5 +383,10 @@ private void cmbx_empty_TextChanged(object sender, EventArgs e)
}

#endregion Private Methods

/// <summary>
/// CheckedChanged handler: stores the current <c>Checked</c> state of
/// <c>cb_only_assign_common_known_mods</c> in
/// <c>Sweet.lollipop.only_assign_common_or_known_mods</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event arguments; not used.</param>
private void cb_only_assign_common_known_mods_CheckedChanged(object sender, EventArgs e)
{
    // Read the checkbox once, then push the value into the shared settings object.
    bool restrict_to_common_or_known = cb_only_assign_common_known_mods.Checked;
    Sweet.lollipop.only_assign_common_or_known_mods = restrict_to_common_or_known;
}
}
}
2 changes: 1 addition & 1 deletion ProteoformSuiteInternal/Lollipop.cs
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,7 @@ public void change_peak_acceptance(DeltaMassPeak peak, bool accepted, bool add_a
public static bool gene_centric_families = false;
public static string preferred_gene_label = "";
public int deltaM_edge_display_rounding = 2;

public bool only_assign_common_or_known_mods = true;
public static string[] node_positioning = new string[]
{
"Arbitrary Circle",
Expand Down
10 changes: 5 additions & 5 deletions ProteoformSuiteInternal/Mods/custom_mods.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,35 @@ ID Sulfate Adduct
TG Any
PP Anywhere.
MM 97.967377
MT Unlocalized
MT Common
CF H2 O4 S1
//
ID Acetone Artifact (Unconfirmed)
TG Any
PP Anywhere.
MM 98.0732
MT Unlocalized
MT Common
CF C6 H10 O1
//
ID Hydrogen Dodecyl Sulfate
TG Any
PP Anywhere.
MM 266.15516
MT Unlocalized
MT Common
CF H26 C12 S1 O4
//
ID N-terminal Acetyl
TG K
PP N-terminal.
MM 42.010565
MT Unlocalized
MT Common
CF H2 C2 O1
//
ID GPI anchor
TG Any
PP Anywhere.
MM 123.00853
MT Unlocalized
MT Common
CF H6 C2 N1 O3 P1
DR RESID; AA0161.
DR RESID; AA0162.
Expand Down
26 changes: 6 additions & 20 deletions ProteoformSuiteInternal/Mods/intact_mods.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,59 +3,45 @@ ID Water Adduct
TG Any
PP Anywhere.
MM 18.010565
MT Unlocalized
MT Common
CF H2O
//
ID Water loss
TG Any
PP Anywhere.
MM -18.010565
MT Unlocalized
MT Common
CF H-2 O-1
//
ID Oxidation
TG Any
PP Anywhere.
MM 15.994915
MT Unlocalized
MT Common
CF O1
//
ID Acetyl
TG K
PP Anywhere.
MM 42.010565
MT Unlocalized
MT Common
CF H2 C2 O1
//
ID Carbamidomethyl of C
TG C
PP Anywhere.
MM 57.021464
MT Unlocalized
CF H3 C2 N1 O1
//
ID Methyl
TG E or V or H or R or K or Q or N or S or D
PP Anywhere.
MM 14.015650
MT Unlocalized
MT Common
CF H2 C
//
ID Phospho
TG S or T or Y
PP Anywhere.
MM 79.966331
NL 0 or 97.976896
MT Unlocalized
MT Common
CF H1 O3 P1
//
ID Phospho, Phospho, Phospho
TG S or T or Y
PP Anywhere.
MM 239.898993
MT Unlocalized
CF H3 O9 P3
//
ID Missed Monoisotopic (-1)
TG Any
PP Anywhere.
Expand Down
8 changes: 4 additions & 4 deletions ProteoformSuiteInternal/Mods/stored_mods.modnames
Original file line number Diff line number Diff line change
Expand Up @@ -377,9 +377,9 @@ Methyl on Q Methyl 1 False
Methyl on N Methyl 1 False
Methyl on S Methyl 1 False
Methyl on D Methyl 1 False
Phospho on S Phospho on S 1 False
Phospho on T Phospho on T 1 False
Phospho on Y Phospho on Y 1 False
Phospho on S Phospho 1 False
Phospho on T Phospho 1 False
Phospho on Y Phospho 1 False
Phospho, Phospho, Phospho on S Phospho, Phospho, Phospho 3 False
Phospho, Phospho, Phospho on T Phospho, Phospho, Phospho 3 False
Phospho, Phospho, Phospho on Y Phospho, Phospho, Phospho 3 False
Expand Down Expand Up @@ -407,4 +407,4 @@ Missing Serine (S) Missing Serine (S) 1 False
Missing Alanine (A) Missing Alanine (A) 1 False
Missing Glycine (G) Missing Glycine (G) 1 False
Oxidation of M Ox 1 False
Unmodified Unmodified 1 False
Unmodified Unmodified 0 False
75 changes: 44 additions & 31 deletions ProteoformSuiteInternal/Proteoform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ public List<ExperimentalProteoform> identify_connected_experimentals(List<PtmSet
null); //Experimental without theoretical reference

List<PtmSet> possible_additions = r.peak.possiblePeakAssignments.Where(p => Math.Abs(p.mass - deltaM) <= 1).ToList(); // EE relations have PtmSets around both positive and negative deltaM, so remove the ones around the opposite of the deltaM of interest
PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, 1)
PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, 1, true)
.OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
.FirstOrDefault();

Expand Down Expand Up @@ -151,7 +151,7 @@ public List<ExperimentalProteoform> identify_connected_experimentals(List<PtmSet
}

public List<PtmSet> generate_possible_added_ptmsets(List<PtmSet> possible_peak_assignments, double deltaM, double mass_tolerance, List<ModificationWithMass> all_mods_with_mass,
TheoreticalProteoform theoretical_base, int additional_ptm_penalty)
TheoreticalProteoform theoretical_base, int additional_ptm_penalty, bool final_assignment)
{
List<ModificationWithMass> known_mods = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications.ToList()).SelectMany(kv => kv.Value).OfType<ModificationWithMass>().ToList();
List<PtmSet> possible_ptmsets = new List<PtmSet>();
Expand All @@ -161,20 +161,13 @@ public List<PtmSet> generate_possible_added_ptmsets(List<PtmSet> possible_peak_a
List<ModificationWithMass> mods_in_set = set.ptm_combination.Select(ptm => ptm.modification).ToList();

int rank_sum = additional_ptm_penalty * (set.ptm_combination.Sum(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m.modification, out UnlocalizedModification x) ? x.ptm_count : 1) - 1); // penalize additional PTMs

foreach (ModificationWithMass m in mods_in_set)
{
int mod_rank = Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out UnlocalizedModification u) ? u.ptm_rank : Sweet.lollipop.modification_ranks.TryGetValue(m.monoisotopicMass, out int x) ? x : Sweet.lollipop.mod_rank_sum_threshold;

if (m.monoisotopicMass == 0)
{
rank_sum += mod_rank;
continue;
}

bool could_be_m_retention = m.modificationType == "AminoAcid" && m.motif.ToString() == "M" && theoretical_base.begin == 2 && this.begin == 2 && !ptm_set.ptm_combination.Any(p => p.modification.Equals(m));
bool motif_matches_n_terminus = begin >= 1 && begin - 1 < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[begin - 1].ToString();
bool motif_matches_c_terminus = end >= 1 && end - 1 < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[end - 1].ToString();
bool motif_matches_n_terminus = this.begin - theoretical_base.begin >= 0 && this.begin - theoretical_base.begin < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[this.begin - theoretical_base.begin].ToString();
bool motif_matches_c_terminus = this.end - this.begin >= 0 && this.end - this.begin < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[this.end - this.begin].ToString();

bool cannot_be_degradation = !motif_matches_n_terminus && !motif_matches_c_terminus;
if (m.modificationType == "Missing" && cannot_be_degradation
Expand All @@ -187,11 +180,28 @@ public List<PtmSet> generate_possible_added_ptmsets(List<PtmSet> possible_peak_a

bool could_be_n_term_degradation = m.modificationType == "Missing" && motif_matches_n_terminus;
bool could_be_c_term_degradation = m.modificationType == "Missing" && motif_matches_c_terminus;
rank_sum -= Convert.ToInt32(Sweet.lollipop.theoretical_database.variableModifications.Contains(m)); // favor variable modifications over regular modifications of the same mass

//if selected, going to only allow mods in Mods folder (type "Common"), Missing, Missed Monoisotopic, known mods for that protein, or Unmodified
if (Sweet.lollipop.only_assign_common_or_known_mods && final_assignment)
{
if (!(m.monoisotopicMass == 0 || m.modificationType == "Common" || could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation || m.modificationType == "Deconvolution Error" || known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m)))
{
rank_sum = Int32.MaxValue;
break;
}
}

// In order of likelihood:
// 1. First, we observe I/L/A cleavage to be the most common, other degradations and methionine cleavage are weighted mid-level
// 2. Missed monoisotopic errors are considered, but weighted towards the bottom. This should allow missed monoisotopics with common modifications like oxidation, but not rare ones. (handled in unlocalized modification)
if (m.monoisotopicMass == 0)
{
rank_sum += mod_rank;
continue;
}

rank_sum -= Convert.ToInt32(Sweet.lollipop.theoretical_database.variableModifications.Contains(m)); // favor variable modifications over regular modifications of the same mass

if (could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation)
{
rank_sum += Sweet.lollipop.mod_rank_first_quartile / 2;
Expand All @@ -204,7 +214,9 @@ public List<PtmSet> generate_possible_added_ptmsets(List<PtmSet> possible_peak_a
}
else
{
rank_sum += known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m) ?
rank_sum += known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m) ||
known_mods.Select(mod => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(mod, out UnlocalizedModification um) ? um.id : mod.id).Contains(Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out UnlocalizedModification um2) ? um2.id : m.id)
?
mod_rank :
mod_rank + Sweet.lollipop.mod_rank_first_quartile / 2; // Penalize modifications that aren't known for this protein and push really rare ones out of the running if they're not in the protein entry
}
Expand Down Expand Up @@ -268,8 +280,26 @@ private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, Proteoform
e.ptm_set.ptm_combination.Remove(ptm);
}
e.ptm_set = new PtmSet(e.ptm_set.ptm_combination);


e.uniprot_mods = "";
foreach (string mod in e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).ToList().Distinct().OrderBy(m => m))
{
//positions with mod
List<int> theo_ptms = theoretical_base.ExpandedProteinList.First().OneBasedPossibleLocalizedModifications.Where(p => p.Key >= e.begin && p.Key <= e.end &&
p.Value.Where(m => m as ModificationWithMass != null).Select(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m as ModificationWithMass, out UnlocalizedModification x) ? x.id : m.id).Contains(mod)).Select(m => m.Key).ToList();
if (theo_ptms.Count > 0)
{
e.uniprot_mods += mod + " @ " + String.Join(", ", theo_ptms) + "; ";
}
if (e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).Count(m => m == mod)
> theo_ptms.Count)
{
e.novel_mods = true;
}
}
}
//if already been assigned -- check if gene name != this gene name ==> ambiguous and same length path
//if already been assigned -- check if gene name != this gene name ==> ambiguous
else if (!e.topdown_id && (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label)))
{
e.ambiguous = true;
Expand All @@ -283,23 +313,6 @@ private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, Proteoform
{
e.gene_name.gene_names.Concat(this.gene_name.gene_names);
}

e.uniprot_mods = "";
foreach (string mod in e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).ToList().Distinct().OrderBy(m => m))
{
//positions with mod
List<int> theo_ptms = theoretical_base.ExpandedProteinList.First().OneBasedPossibleLocalizedModifications.Where(p => p.Key >= e.begin && p.Key <= e.end &&
p.Value.Where(m => m as ModificationWithMass != null).Select(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m as ModificationWithMass, out UnlocalizedModification x) ? x.id : m.id).Contains(mod)).Select(m => m.Key).ToList();
if (theo_ptms.Count > 0)
{
e.uniprot_mods += mod + " @ " + String.Join(", ", theo_ptms) + "; ";
}
if (e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).Count(m => m == mod)
> theo_ptms.Count)
{
e.novel_mods = true;
}
}
}

#endregion Private Methods
Expand Down
2 changes: 1 addition & 1 deletion ProteoformSuiteInternal/ProteoformRelation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison r
TheoreticalProteoform t = pf2 as TheoreticalProteoform;
double mass_tolerance = t.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
List<PtmSet> narrower_range_of_candidates = candidate_sets.Where(s => Math.Abs(s.mass - delta_mass) < Sweet.lollipop.peak_width_base_et).ToList();
candidate_ptmset = pf1.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, Sweet.lollipop.theoretical_database.all_mods_with_mass, t, Sweet.lollipop.mod_rank_first_quartile)
candidate_ptmset = pf1.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, Sweet.lollipop.theoretical_database.all_mods_with_mass, t, Sweet.lollipop.mod_rank_first_quartile, false)
.OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
.FirstOrDefault();
}
Expand Down
Loading

0 comments on commit 6858f2a

Please sign in to comment.