diff --git a/ProteoformSuiteGUI/DisplayObjects/DisplayQuantitativeValues.cs b/ProteoformSuiteGUI/DisplayObjects/DisplayQuantitativeValues.cs index 05ba3f35..442cc383 100644 --- a/ProteoformSuiteGUI/DisplayObjects/DisplayQuantitativeValues.cs +++ b/ProteoformSuiteGUI/DisplayObjects/DisplayQuantitativeValues.cs @@ -51,7 +51,8 @@ public string Theoretical { get { - return proteoform.linked_proteoform_references != null ? proteoform.linked_proteoform_references.First().accession.Split('_')[0] : ""; + return proteoform as TopDownProteoform != null ? proteoform.accession.Split('_')[0] : + proteoform.linked_proteoform_references != null ? proteoform.linked_proteoform_references.First().accession.Split('_')[0] : ""; } } diff --git a/ProteoformSuiteGUI/DisplayObjects/DisplayTopDownProteoform.cs b/ProteoformSuiteGUI/DisplayObjects/DisplayTopDownProteoform.cs index 15785127..f1886a3f 100755 --- a/ProteoformSuiteGUI/DisplayObjects/DisplayTopDownProteoform.cs +++ b/ProteoformSuiteGUI/DisplayObjects/DisplayTopDownProteoform.cs @@ -38,6 +38,11 @@ public string Name get { return t.name; } } + public string geneID + { + get { return t.geneID; } + } + public string Sequence { get { return t.sequence; } diff --git a/ProteoformSuiteGUI/ProteoformFamilies.Designer.cs b/ProteoformSuiteGUI/ProteoformFamilies.Designer.cs index ce5fae6b..b09411c1 100644 --- a/ProteoformSuiteGUI/ProteoformFamilies.Designer.cs +++ b/ProteoformSuiteGUI/ProteoformFamilies.Designer.cs @@ -67,6 +67,7 @@ private void InitializeComponent() this.tb_familyBuildFolder = new System.Windows.Forms.TextBox(); this.btn_browseTempFolder = new System.Windows.Forms.Button(); this.Families_update = new System.Windows.Forms.Button(); + this.cb_only_assign_common_known_mods = new System.Windows.Forms.CheckBox(); ((System.ComponentModel.ISupportInitialize)(this.splitContainer1)).BeginInit(); this.splitContainer1.Panel1.SuspendLayout(); this.splitContainer1.Panel2.SuspendLayout(); @@ -329,6 +330,7 @@ private void InitializeComponent() // splitContainer3.Panel2 // this.splitContainer3.Panel2.AutoScroll = true; + this.splitContainer3.Panel2.Controls.Add(this.cb_only_assign_common_known_mods); this.splitContainer3.Panel2.Controls.Add(this.rtb_proteoformFamilyResults); this.splitContainer3.Panel2.Controls.Add(this.cb_geneCentric); this.splitContainer3.Panel2.Controls.Add(this.cb_count_adducts_as_id); @@ -534,6 +536,17 @@ private void InitializeComponent() this.Families_update.UseVisualStyleBackColor = true; this.Families_update.Click += new System.EventHandler(this.Families_update_Click); // + // cb_only_assign_common_known_mods + // + this.cb_only_assign_common_known_mods.Checked = true; + this.cb_only_assign_common_known_mods.CheckState = System.Windows.Forms.CheckState.Checked; + this.cb_only_assign_common_known_mods.Location = new System.Drawing.Point(301, 46); + this.cb_only_assign_common_known_mods.Name = "cb_only_assign_common_known_mods"; + this.cb_only_assign_common_known_mods.Size = new System.Drawing.Size(272, 24); + this.cb_only_assign_common_known_mods.TabIndex = 64; + this.cb_only_assign_common_known_mods.Text = "Only Assign Common/Known Mods"; + this.cb_only_assign_common_known_mods.CheckedChanged += new System.EventHandler(this.cb_only_assign_common_known_mods_CheckedChanged); + // // ProteoformFamilies // this.AutoScaleDimensions = new System.Drawing.SizeF(96F, 96F); @@ -607,5 +620,6 @@ private void InitializeComponent() private System.Windows.Forms.ComboBox cmbx_nodeLabel; private System.Windows.Forms.RichTextBox rtb_proteoformFamilyResults; private System.Windows.Forms.CheckBox cb_count_adducts_as_id; + private System.Windows.Forms.CheckBox cb_only_assign_common_known_mods; } } \ No newline at end of file diff --git a/ProteoformSuiteGUI/ProteoformFamilies.cs b/ProteoformSuiteGUI/ProteoformFamilies.cs index 7179abb7..89f1061d 100644 --- a/ProteoformSuiteGUI/ProteoformFamilies.cs +++ b/ProteoformSuiteGUI/ProteoformFamilies.cs @@ -72,6 +72,9 @@ public void InitializeParameterSet() cb_count_adducts_as_id.Checked = Sweet.lollipop.count_adducts_as_identifications; + cb_only_assign_common_known_mods.Checked = Sweet.lollipop.only_assign_common_or_known_mods; + + initialize_every_time(); } @@ -380,5 +383,10 @@ private void cmbx_empty_TextChanged(object sender, EventArgs e) } #endregion Private Methods + + private void cb_only_assign_common_known_mods_CheckedChanged(object sender, EventArgs e) + { + Sweet.lollipop.only_assign_common_or_known_mods = cb_only_assign_common_known_mods.Checked; + } } } \ No newline at end of file diff --git a/ProteoformSuiteInternal/Lollipop.cs b/ProteoformSuiteInternal/Lollipop.cs index e8534314..37b87195 100644 --- a/ProteoformSuiteInternal/Lollipop.cs +++ b/ProteoformSuiteInternal/Lollipop.cs @@ -948,7 +948,7 @@ public void change_peak_acceptance(DeltaMassPeak peak, bool accepted, bool add_a public static bool gene_centric_families = false; public static string preferred_gene_label = ""; public int deltaM_edge_display_rounding = 2; - + public bool only_assign_common_or_known_mods = true; public static string[] node_positioning = new string[] { "Arbitrary Circle", diff --git a/ProteoformSuiteInternal/Mods/custom_mods.txt b/ProteoformSuiteInternal/Mods/custom_mods.txt index 4b97c4d7..e40b0307 100644 --- a/ProteoformSuiteInternal/Mods/custom_mods.txt +++ b/ProteoformSuiteInternal/Mods/custom_mods.txt @@ -2,35 +2,35 @@ ID Sulfate Adduct TG Any PP Anywhere. MM 97.967377 -MT Unlocalized +MT Common CF H2 O4 S1 // ID Acetone Artifact (Unconfirmed) TG Any PP Anywhere. MM 98.0732 -MT Unlocalized +MT Common CF C6 H10 O1 // ID Hydrogen Dodecyl Sulfate TG Any PP Anywhere. MM 266.15516 -MT Unlocalized +MT Common CF H26 C12 S1 O4 // ID N-terminal Acetyl TG K PP N-terminal. MM 42.010565 -MT Unlocalized +MT Common CF H2 C2 O1 // ID GPI anchor TG Any PP Anywhere. MM 123.00853 -MT Unlocalized +MT Common CF H6 C2 N1 O3 P1 DR RESID; AA0161. DR RESID; AA0162. diff --git a/ProteoformSuiteInternal/Mods/intact_mods.txt b/ProteoformSuiteInternal/Mods/intact_mods.txt index 3502e957..8f335654 100644 --- a/ProteoformSuiteInternal/Mods/intact_mods.txt +++ b/ProteoformSuiteInternal/Mods/intact_mods.txt @@ -3,42 +3,35 @@ ID Water Adduct TG Any PP Anywhere. MM 18.010565 -MT Unlocalized +MT Common CF H2O // ID Water loss TG Any PP Anywhere. MM -18.010565 -MT Unlocalized +MT Common CF H-2 O-1 // ID Oxidation TG Any PP Anywhere. MM 15.994915 -MT Unlocalized +MT Common CF O1 // ID Acetyl TG K PP Anywhere. MM 42.010565 -MT Unlocalized +MT Common CF H2 C2 O1 // -ID Carbamidomethyl of C -TG C -PP Anywhere. -MM 57.021464 -MT Unlocalized -CF H3 C2 N1 O1 -// ID Methyl TG E or V or H or R or K or Q or N or S or D PP Anywhere. MM 14.015650 -MT Unlocalized +MT Common CF H2 C // ID Phospho @@ -46,16 +39,9 @@ TG S or T or Y PP Anywhere. MM 79.966331 NL 0 or 97.976896 -MT Unlocalized +MT Common CF H1 O3 P1 // -ID Phospho, Phospho, Phospho -TG S or T or Y -PP Anywhere. -MM 239.898993 -MT Unlocalized -CF H3 O9 P3 -// ID Missed Monoisotopic (-1) TG Any PP Anywhere. diff --git a/ProteoformSuiteInternal/Mods/stored_mods.modnames b/ProteoformSuiteInternal/Mods/stored_mods.modnames index 0b9d5ad0..55f246b9 100644 --- a/ProteoformSuiteInternal/Mods/stored_mods.modnames +++ b/ProteoformSuiteInternal/Mods/stored_mods.modnames @@ -377,9 +377,9 @@ Methyl on Q Methyl 1 False Methyl on N Methyl 1 False Methyl on S Methyl 1 False Methyl on D Methy 1 False -Phospho on S Phospho on S 1 False -Phospho on T Phospho on T 1 False -Phospho on Y Phospho on Y 1 False +Phospho on S Phospho 1 False +Phospho on T Phospho 1 False +Phospho on Y Phospho 1 False Phospho, Phospho, Phospho on S Phospho, Phospho, Phospho 3 False Phospho, Phospho, Phospho on T Phospho, Phospho, Phospho 3 False Phospho, Phospho, Phospho on Y Phospho, Phospho, Phospho 3 False @@ -407,4 +407,4 @@ Missing Serine (S) Missing Serine (S) 1 False Missing Alanine (A) Missing Alanine (A) 1 False Missing Glycine (G) Missing Glycine (G) 1 False Oxidation of M Ox 1 False -Unmodified Unmodified 1 False +Unmodified Unmodified 0 False diff --git a/ProteoformSuiteInternal/Proteoform.cs b/ProteoformSuiteInternal/Proteoform.cs index 915bd2a7..22afdc55 100644 --- a/ProteoformSuiteInternal/Proteoform.cs +++ b/ProteoformSuiteInternal/Proteoform.cs @@ -97,7 +97,7 @@ public List identify_connected_experimentals(List possible_additions = r.peak.possiblePeakAssignments.Where(p => Math.Abs(p.mass - deltaM) <= 1).ToList(); // EE relations have PtmSets around both positive and negative deltaM, so remove the ones around the opposite of the deltaM of interest - PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, 1) + PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, 1, true) .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1 .FirstOrDefault(); @@ -151,7 +151,7 @@ public List identify_connected_experimentals(List generate_possible_added_ptmsets(List possible_peak_assignments, double deltaM, double mass_tolerance, List all_mods_with_mass, - TheoreticalProteoform theoretical_base, int additional_ptm_penalty) + TheoreticalProteoform theoretical_base, int additional_ptm_penalty, bool final_assignment) { List known_mods = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications.ToList()).SelectMany(kv => kv.Value).OfType().ToList(); List possible_ptmsets = new List(); @@ -161,20 +161,13 @@ public List generate_possible_added_ptmsets(List possible_peak_a List mods_in_set = set.ptm_combination.Select(ptm => ptm.modification).ToList(); int rank_sum = additional_ptm_penalty * (set.ptm_combination.Sum(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m.modification, out UnlocalizedModification x) ? x.ptm_count : 1) - 1); // penalize additional PTMs - foreach (ModificationWithMass m in mods_in_set) { int mod_rank = Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out UnlocalizedModification u) ? u.ptm_rank : Sweet.lollipop.modification_ranks.TryGetValue(m.monoisotopicMass, out int x) ? x : Sweet.lollipop.mod_rank_sum_threshold; - if (m.monoisotopicMass == 0) - { - rank_sum += mod_rank; - continue; - } - bool could_be_m_retention = m.modificationType == "AminoAcid" && m.motif.ToString() == "M" && theoretical_base.begin == 2 && this.begin == 2 && !ptm_set.ptm_combination.Any(p => p.modification.Equals(m)); - bool motif_matches_n_terminus = begin >= 1 && begin - 1 < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[begin - 1].ToString(); - bool motif_matches_c_terminus = end >= 1 && end - 1 < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[end - 1].ToString(); + bool motif_matches_n_terminus = this.begin - theoretical_base.begin >= 0 && this.begin - theoretical_base.begin < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[this.begin - theoretical_base.begin].ToString(); + bool motif_matches_c_terminus = this.end - this.begin >= 0 && this.end - this.begin < theoretical_base.sequence.Length && m.motif.ToString() == theoretical_base.sequence[this.end - this.begin].ToString(); bool cannot_be_degradation = !motif_matches_n_terminus && !motif_matches_c_terminus; if (m.modificationType == "Missing" && cannot_be_degradation @@ -187,11 +180,28 @@ public List generate_possible_added_ptmsets(List possible_peak_a bool could_be_n_term_degradation = m.modificationType == "Missing" && motif_matches_n_terminus; bool could_be_c_term_degradation = m.modificationType == "Missing" && motif_matches_c_terminus; - rank_sum -= Convert.ToInt32(Sweet.lollipop.theoretical_database.variableModifications.Contains(m)); // favor variable modifications over regular modifications of the same mass + + //if selected, going to only allow mods in Mods folder (type "Common"), Missing, Missed Monoisotopic, known mods for that protein, or Unmodified + if (Sweet.lollipop.only_assign_common_or_known_mods && final_assignment) + { + if (!(m.monoisotopicMass == 0 || m.modificationType == "Common" || could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation || m.modificationType == "Deconvolution Error" || known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m))) + { + rank_sum = Int32.MaxValue; + break; + } + } // In order of likelihood: // 1. First, we observe I/L/A cleavage to be the most common, other degradations and methionine cleavage are weighted mid-level // 2. Missed monoisotopic errors are considered, but weighted towards the bottom. This should allow missed monoisotopics with common modifications like oxidation, but not rare ones. (handled in unlocalized modification) + if (m.monoisotopicMass == 0) + { + rank_sum += mod_rank; + continue; + } + + rank_sum -= Convert.ToInt32(Sweet.lollipop.theoretical_database.variableModifications.Contains(m)); // favor variable modifications over regular modifications of the same mass + if (could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation) { rank_sum += Sweet.lollipop.mod_rank_first_quartile / 2; @@ -204,7 +214,9 @@ public List generate_possible_added_ptmsets(List possible_peak_a } else { - rank_sum += known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m) ? + rank_sum += known_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m) || + known_mods.Select(mod => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(mod, out UnlocalizedModification um) ? um.id : mod.id).Contains(Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out UnlocalizedModification um2) ? um2.id : m.id) + ? mod_rank : mod_rank + Sweet.lollipop.mod_rank_first_quartile / 2; // Penalize modifications that aren't known for this protein and push really rare ones out of the running if they're not in the protein entry } @@ -268,8 +280,26 @@ private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, Proteoform e.ptm_set.ptm_combination.Remove(ptm); } e.ptm_set = new PtmSet(e.ptm_set.ptm_combination); + + + e.uniprot_mods = ""; + foreach (string mod in e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).ToList().Distinct().OrderBy(m => m)) + { + //positions with mod + List theo_ptms = theoretical_base.ExpandedProteinList.First().OneBasedPossibleLocalizedModifications.Where(p => p.Key >= e.begin && p.Key <= e.end && + p.Value.Where(m => m as ModificationWithMass != null).Select(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m as ModificationWithMass, out UnlocalizedModification x) ? x.id : m.id).Contains(mod)).Select(m => m.Key).ToList(); + if (theo_ptms.Count > 0) + { + e.uniprot_mods += mod + " @ " + String.Join(", ", theo_ptms) + "; "; + } + if (e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).Count(m => m == mod) + > theo_ptms.Count) + { + e.novel_mods = true; + } + } } - //if already been assigned -- check if gene name != this gene name ==> ambiguous and same length path + //if already been assigned -- check if gene name != this gene name ==> ambiguous else if (!e.topdown_id && (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label))) { e.ambiguous = true; @@ -283,23 +313,6 @@ private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, Proteoform { e.gene_name.gene_names.Concat(this.gene_name.gene_names); } - - e.uniprot_mods = ""; - foreach (string mod in e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).ToList().Distinct().OrderBy(m => m)) - { - //positions with mod - List theo_ptms = theoretical_base.ExpandedProteinList.First().OneBasedPossibleLocalizedModifications.Where(p => p.Key >= e.begin && p.Key <= e.end && - p.Value.Where(m => m as ModificationWithMass != null).Select(m => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m as ModificationWithMass, out UnlocalizedModification x) ? x.id : m.id).Contains(mod)).Select(m => m.Key).ToList(); - if (theo_ptms.Count > 0) - { - e.uniprot_mods += mod + " @ " + String.Join(", ", theo_ptms) + "; "; - } - if (e.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).Count(m => m == mod) - > theo_ptms.Count) - { - e.novel_mods = true; - } - } } #endregion Private Methods diff --git a/ProteoformSuiteInternal/ProteoformRelation.cs b/ProteoformSuiteInternal/ProteoformRelation.cs index 05acfc75..dad1f818 100644 --- a/ProteoformSuiteInternal/ProteoformRelation.cs +++ b/ProteoformSuiteInternal/ProteoformRelation.cs @@ -87,7 +87,7 @@ public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison r TheoreticalProteoform t = pf2 as TheoreticalProteoform; double mass_tolerance = t.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance; List narrower_range_of_candidates = candidate_sets.Where(s => Math.Abs(s.mass - delta_mass) < Sweet.lollipop.peak_width_base_et).ToList(); - candidate_ptmset = pf1.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, Sweet.lollipop.theoretical_database.all_mods_with_mass, t, Sweet.lollipop.mod_rank_first_quartile) + candidate_ptmset = pf1.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, Sweet.lollipop.theoretical_database.all_mods_with_mass, t, Sweet.lollipop.mod_rank_first_quartile, false) .OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1 .FirstOrDefault(); } diff --git a/ProteoformSuiteInternal/ResultsSummaryGenerator.cs b/ProteoformSuiteInternal/ResultsSummaryGenerator.cs index ef83cc48..ad31c2f0 100644 --- a/ProteoformSuiteInternal/ResultsSummaryGenerator.cs +++ b/ProteoformSuiteInternal/ResultsSummaryGenerator.cs @@ -485,13 +485,16 @@ public static DataTable topdown_results_dataframe() { DataTable results = new DataTable(); results.Columns.Add("PFR Accession", typeof(string)); - results.Columns.Add("Theoretical Accession", typeof(string)); + results.Columns.Add("Proteoform Suite Accession", typeof(string)); results.Columns.Add("Top-Down Full Accession", typeof(string)); results.Columns.Add("Top-Down Accession", typeof(string)); - results.Columns.Add("Theoretical Description", typeof(string)); - results.Columns.Add("Theoretical Begin and End", typeof(string)); + results.Columns.Add("Proteoform Suite GeneID", typeof(string)); + results.Columns.Add("Top-Down GeneID", typeof(string)); + results.Columns.Add("Proteoform Suite Description", typeof(string)); + results.Columns.Add("Top-Down Description", typeof(string)); + results.Columns.Add("Proteoform Suite Begin and End", typeof(string)); results.Columns.Add("Top-Down Begin and End", typeof(string)); - results.Columns.Add("Theoretical PTM Type", typeof(string)); + results.Columns.Add("Proteoform Suite PTM Type", typeof(string)); results.Columns.Add("Top-Down PTM Type", typeof(string)); results.Columns.Add("Top-Down PTM Type Unlocalized", typeof(string)); results.Columns.Add("Proteoform Suite Mass Error", typeof(string)); @@ -513,7 +516,10 @@ public static DataTable topdown_results_dataframe() td.linked_proteoform_references == null ? "N/A" : (td.linked_proteoform_references.First() as TheoreticalProteoform).accession, td.accession, td.accession.Split('_')[0], + td.linked_proteoform_references == null ? "N/A" : String.Join("; ", (td.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.DatabaseReferences.Where(r => r.Type == "GeneID").Select(r => r.Id)).Distinct()), + td.geneID, td.linked_proteoform_references == null ? "N/A" : (td.linked_proteoform_references.First() as TheoreticalProteoform).description, + td.name, td.linked_proteoform_references == null ? "N/A" : td.begin + " to " + td.end, td.topdown_begin + " to " + td.topdown_end, td.linked_proteoform_references == null ? "N/A" : td.ptm_set.ptm_combination.Count == 0 ? "Unmodified" : String.Join("; ", td.ptm_set.ptm_combination.Select(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification x) ? x.id : ptm.modification.id).OrderBy(m => m)), diff --git a/ProteoformSuiteInternal/StatisticalAnalysis/Log2FoldChangeAnalysis.cs b/ProteoformSuiteInternal/StatisticalAnalysis/Log2FoldChangeAnalysis.cs index be7a46a4..adbd61b8 100644 --- a/ProteoformSuiteInternal/StatisticalAnalysis/Log2FoldChangeAnalysis.cs +++ b/ProteoformSuiteInternal/StatisticalAnalysis/Log2FoldChangeAnalysis.cs @@ -15,7 +15,7 @@ public class Log2FoldChangeAnalysis public Dictionary, double> conditionBiorep_stdevLog2I { get; set; } = new Dictionary, double>(); // used to impute bft-intensities public Dictionary, double> conditionBiorepIntensitySums { get; set; } = new Dictionary, double>(); // used to normalize columns public double benjiHoch_fdr { get; set; } = 0.05; - public double minFoldChange { get; set; } = 2.0; + public double minFoldChange { get; set; } = 1.0; public List inducedOrRepressedProteins { get; set; } = new List(); // This is the list of proteins from proteoforms that underwent significant induction or repression public GoAnalysis GoAnalysis { get; set; } = new GoAnalysis(); public QuantitativeDistributions QuantitativeDistributions { get; set; } diff --git a/ProteoformSuiteInternal/TheoreticalProteoformDatabase.cs b/ProteoformSuiteInternal/TheoreticalProteoformDatabase.cs index 21aad1d0..c1da3d0f 100644 --- a/ProteoformSuiteInternal/TheoreticalProteoformDatabase.cs +++ b/ProteoformSuiteInternal/TheoreticalProteoformDatabase.cs @@ -32,6 +32,10 @@ public class TheoreticalProteoformDatabase //Settings public bool limit_triples_and_greater = true; + //Constants + private double ptmset_max_number_of_a_kind = 3; + + public Dictionary aaIsotopeMassList; #endregion Public Fields @@ -84,7 +88,7 @@ public void get_theoretical_proteoforms(string current_directory) //this is for ptmsets --> used in RELATIONS all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList(); - for (int i = 2; i < Math.Max(2, Sweet.lollipop.max_ptms) + 1; i++) // the method above doesn't make 2 or more of a kind, so we make it here + for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here { all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2))); } @@ -104,7 +108,7 @@ public void make_theoretical_proteoforms() if (Sweet.lollipop.combine_identical_sequences) expanded_proteins = group_proteins_by_sequence(expanded_proteins); expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ThenBy(x => x.BaseSequence).ToArray(); // Take on harder problems first to use parallelization more effectively process_entries(expanded_proteins, variableModifications); - process_decoys(Sweet.lollipop.target_proteoform_community.theoretical_proteoforms); + process_decoys(Sweet.lollipop.target_proteoform_community.theoretical_proteoforms.OrderBy(x => x.modified_mass).ThenBy(x => x.ptm_description).ThenBy(x => x.sequence).ThenBy(x => x.name).ToArray()); Parallel.ForEach(new ProteoformCommunity[] { Sweet.lollipop.target_proteoform_community }.Concat(Sweet.lollipop.decoy_proteoform_communities.Values), community => { @@ -354,7 +358,8 @@ public void add_topdown_sequences() List candidate_theoreticals = expanded_proteins.Where(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown.accession.Split('_')[0].Split('-')[0])).ToList(); if (candidate_theoreticals.Count > 0) { - topdown.gene_name = new GeneName(candidate_theoreticals.First().GeneNames); + topdown.gene_name = new GeneName(candidate_theoreticals.SelectMany(t => t.GeneNames)); + topdown.geneID = String.Join("; ", candidate_theoreticals.SelectMany(p => p.DatabaseReferences.Where(r => r.Type == "GeneID").Select(r => r.Id)).Distinct()); if (!candidate_theoreticals.Any(p => p.BaseSequence == topdown.sequence) && !new_proteins.Any(p => p.AccessionList.Select(a => a.Split('_')[0]).Contains(topdown.accession.Split('_')[0].Split('-')[0]) && p.BaseSequence == topdown.sequence)) { int old_proteins_with_same_begin_end_diff_sequence = candidate_theoreticals.Count(t => t.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin && t.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end && t.BaseSequence != topdown.sequence); @@ -518,15 +523,13 @@ private void process_decoys(TheoreticalProteoform[] entries) { List decoy_proteoforms = new List(); StringBuilder sb = new StringBuilder(5000000); // this set-aside is autoincremented to larger values when necessary. - foreach (TheoreticalProteoform proteoform in entries) + foreach (TheoreticalProteoform proteoform in entries) // Take on harder problems first to use parallelization more effectively { sb.Append(proteoform.sequence); } string giantProtein = sb.ToString(); - TheoreticalProteoform[] shuffled_proteoforms = new TheoreticalProteoform[entries.Length]; - Array.Copy(entries, shuffled_proteoforms, entries.Length); Random decoy_rng = Sweet.lollipop.useRandomSeed_decoys ? new Random(decoyNumber + Sweet.lollipop.randomSeed_decoys) : new Random(); // each decoy database needs to have a new random number generator - decoy_rng.Shuffle(shuffled_proteoforms); //randomize order of protein array + var shuffled_proteoforms = entries.OrderBy(item => decoy_rng.Next()).ToList(); int prevLength = 0; foreach (var p in shuffled_proteoforms) { diff --git a/ProteoformSuiteInternal/TopDownProteoform.cs b/ProteoformSuiteInternal/TopDownProteoform.cs index 6a5182e6..1da9a053 100755 --- a/ProteoformSuiteInternal/TopDownProteoform.cs +++ b/ProteoformSuiteInternal/TopDownProteoform.cs @@ -36,7 +36,7 @@ public PtmSet topdown_ptm_set //the ptmset read in with td data public string topdown_ptm_description { get; set; } public ExperimentalProteoform matching_experimental { get; set; } //corresponding experimental public bool correct_id { get; set; } //true if the ID given by ProteoformSuite matches ID from topdown - + public string geneID { get; set; } public TopDownProteoform(string accession, List hits) : base(accession, null, true) diff --git a/ProteoformSuiteInternal/UnlocalizedModification.cs b/ProteoformSuiteInternal/UnlocalizedModification.cs index 80b8eddb..a653fc68 100644 --- a/ProteoformSuiteInternal/UnlocalizedModification.cs +++ b/ProteoformSuiteInternal/UnlocalizedModification.cs @@ -26,7 +26,7 @@ public UnlocalizedModification(ModificationWithMass m) ptm_count = 1; require_proteoform_without_mod = false; - if (m.modificationType == "Unlocalized") + if (m.modificationType == "Common") ptm_rank = Sweet.lollipop.mod_rank_first_quartile / 2; else if (m.modificationType == "Deconvolution Error") ptm_rank = Sweet.lollipop.mod_rank_first_quartile; diff --git a/Test/TestProteoformIdentification.cs b/Test/TestProteoformIdentification.cs index 7a0e03d4..9ae3b6d2 100644 --- a/Test/TestProteoformIdentification.cs +++ b/Test/TestProteoformIdentification.cs @@ -14,10 +14,11 @@ public class TestProteoformIdentification [Test] public void assign_missing_aa_identity() { + Sweet.lollipop = new Lollipop(); Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(false, true, false, false).AA_Masses; TheoreticalProteoform t = ConstructorsForTesting.make_a_theoretical("", 886.45, 0); // sequence with all serines t.sequence = "AAAAAAAAAAAS"; - t.gene_name = new GeneName(new List>() { new Tuple("Gene", "Gene") } ); + t.gene_name = new GeneName(new List>() { new Tuple("Gene", "Gene") }); ExperimentalProteoform e = ConstructorsForTesting.ExperimentalProteoform("", 886.46, 0, true); ExperimentalProteoform e2 = ConstructorsForTesting.ExperimentalProteoform("", 799.43, 0, true); ConstructorsForTesting.make_relation(e, e2, ProteoformComparison.ExperimentalExperimental, 87.03); @@ -48,8 +49,8 @@ public void assign_missing_aa_identity() t = ConstructorsForTesting.make_a_theoretical("", 886.45, 0); // sequence with all serines t.gene_name = new GeneName(new List>() { new Tuple("Gene", "Gene") }); t.sequence = "SAAAAAAAAAAA"; - e = ConstructorsForTesting.ExperimentalProteoform("", 886.47, 0, true); - e2 = ConstructorsForTesting.ExperimentalProteoform("", 799.44, 0, true); + e = ConstructorsForTesting.ExperimentalProteoform("", 886.47, 0, true); + e2 = ConstructorsForTesting.ExperimentalProteoform("", 799.44, 0, true); ConstructorsForTesting.make_relation(e, e2, ProteoformComparison.ExperimentalExperimental, 87.03); ConstructorsForTesting.make_relation(e, t, ProteoformComparison.ExperimentalTheoretical, 0); t.relationships.First().Accepted = true; @@ -73,11 +74,11 @@ public void assign_missing_aa_identity() t.gene_name = new GeneName(new List>() { new Tuple("Gene", "Gene") }); t.sequence = "SAAAAAAAAAA"; t.begin = 2; - e = ConstructorsForTesting.ExperimentalProteoform("", 815.41, 0, true); - e2 = ConstructorsForTesting.ExperimentalProteoform("", 946.45, 0, true); + e = ConstructorsForTesting.ExperimentalProteoform("", 815.41, 0, true); + e2 = ConstructorsForTesting.ExperimentalProteoform("", 946.45, 0, true); ConstructorsForTesting.make_relation(e, e2, ProteoformComparison.ExperimentalExperimental, 113); ModificationMotif.TryGetMotif("M", out motif); - set = new PtmSet(new List { new Ptm(0, new ModificationWithMass("M retention", "AminoAcid", motif, TerminusLocalization.Any, 113)) }); + set = new PtmSet(new List { new Ptm(0, new ModificationWithMass("M retention", "AminoAcid", motif, TerminusLocalization.Any, 113)) }); Sweet.lollipop.theoretical_database.possible_ptmset_dictionary[Math.Round(set.mass, 1)] = new List { set }; Sweet.lollipop.theoretical_database.possible_ptmset_dictionary[Math.Round(set_unmodified.mass, 1)] = new List { set_unmodified }; ConstructorsForTesting.make_relation(e, t, ProteoformComparison.ExperimentalTheoretical, 0); @@ -102,6 +103,8 @@ public void assign_missing_aa_identity() [Test] public void loss_of_ptm_set() { + Sweet.lollipop = new Lollipop(); + Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(false, true, false, false).AA_Masses; TheoreticalProteoform t = ConstructorsForTesting.make_a_theoretical("", 1106.40, 0); // sequence with all serines @@ -184,6 +187,8 @@ public void loss_of_ptm_set() [Test] public void unmodified_identification() { + Sweet.lollipop = new Lollipop(); + Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(false, true, false, false).AA_Masses; TheoreticalProteoform t = ConstructorsForTesting.make_a_theoretical("", 1106.40, 0); // sequence with all serines @@ -197,7 +202,7 @@ public void unmodified_identification() new Ptm(0, new ModificationWithMass("acetyl loss", "", motif, TerminusLocalization.Any, -42.01)), }); - PtmSet set_unmodified = new PtmSet(new List{ new Ptm() }); + PtmSet set_unmodified = new PtmSet(new List { new Ptm() }); // Proteoforms start without any modifications in the PtmSet Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> @@ -219,9 +224,11 @@ public void unmodified_identification() [Test] public void adduct_experimental() { + Sweet.lollipop = new Lollipop(); + Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, !Sweet.lollipop.neucode_labeled, Sweet.lollipop.neucode_labeled, false).AA_Masses; ModificationMotif.TryGetMotif("S", out ModificationMotif motif); - PtmSet set = new PtmSet(new List { new Ptm(0, new ModificationWithMass("Sulfate Adduct", "", motif, TerminusLocalization.Any, 97.97)) }); + PtmSet set = new PtmSet(new List { new Ptm(0, new ModificationWithMass("Sulfate Adduct", "Common", motif, TerminusLocalization.Any, 97.97)) }); PtmSet set_unmodified = new PtmSet(new List { new Ptm() }); Sweet.lollipop.theoretical_database.possible_ptmset_dictionary[Math.Round(set.mass, 1)] = new List { set }; Sweet.lollipop.theoretical_database.possible_ptmset_dictionary[Math.Round(set_unmodified.mass, 1)] = new List { set_unmodified }; @@ -264,7 +271,7 @@ public void ambiguous_experimentals() ModificationMotif.TryGetMotif("S", out ModificationMotif motif); PtmSet acetyl = new PtmSet(new List { - new Ptm(0, new ModificationWithMass("acetyl", "", motif, TerminusLocalization.Any, 42.011)) + new Ptm(0, new ModificationWithMass("acetyl", "Common", motif, TerminusLocalization.Any, 42.011)) }); PtmSet set_unmodified = new PtmSet(new List { new Ptm() }); @@ -275,14 +282,6 @@ public void ambiguous_experimentals() { Math.Round(acetyl.mass, 1), new List { acetyl } }, {Math.Round(set_unmodified.mass, 1), new List {set_unmodified}} }; - - acetyl = new PtmSet(new List - { - new Ptm(0, new ModificationWithMass("acetyl", "", motif, TerminusLocalization.Any, 42.011)) - }); - - set_unmodified = new PtmSet(new List { new Ptm() }); - // Proteoforms start without any modifications in the PtmSet Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> { @@ -306,5 +305,102 @@ public void ambiguous_experimentals() Assert.IsFalse(e1.ambiguous); Assert.IsTrue(e3.ambiguous); } + + [Test] + public void only_common_and_known_mods() + { + Sweet.lollipop = new Lollipop(); + Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.natural_lysine_isotope_abundance, Sweet.lollipop.neucode_light_lysine, Sweet.lollipop.neucode_heavy_lysine).AA_Masses; + Sweet.lollipop.only_assign_common_or_known_mods = true; + Lollipop.preferred_gene_label = "primary"; + TheoreticalProteoform t1 = ConstructorsForTesting.make_a_theoretical("T1", 100000, 0); // sequence with all serines + t1.gene_name = new GeneName(new List>() { new Tuple("gene1", "gene1") }); + ExperimentalProteoform e1 = ConstructorsForTesting.ExperimentalProteoform("E1", 10000, 0, true); + ModificationMotif.TryGetMotif("S", out ModificationMotif motif); + PtmSet acetyl = new PtmSet(new List + { + new Ptm(0, new ModificationWithMass("acetyl", "Common", motif, TerminusLocalization.Any, 42.011)) + }); + + PtmSet set_unmodified = new PtmSet(new List { new Ptm() }); + Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> + { + { Math.Round(acetyl.mass, 1), new List { acetyl } }, {Math.Round(set_unmodified.mass, 1), new List {set_unmodified}} + }; + + Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> + { + { Math.Round(acetyl.mass, 1), new List { acetyl } }, {Math.Round(set_unmodified.mass, 1), new List {set_unmodified}} + }; + + + //unmodified = OK + ConstructorsForTesting.make_relation(e1, t1, ProteoformComparison.ExperimentalTheoretical, 0); + e1.linked_proteoform_references = null; + e1.relationships.First().Accepted = true; + e1.relationships.First().peak = new DeltaMassPeak(e1.relationships.First(), new HashSet { e1.relationships.First() }); + ProteoformFamily fam = new ProteoformFamily(e1); + fam.construct_family(); + fam.identify_experimentals(); + Assert.IsNotNull(e1.linked_proteoform_references); + + //acetylated OK if common + t1.modified_mass = 1000; + e1.modified_mass = 1042; + e1.linked_proteoform_references = null; + e1.relationships.Clear(); + t1.relationships.Clear(); + ConstructorsForTesting.make_relation(e1, t1, ProteoformComparison.ExperimentalTheoretical, 42.01); + e1.relationships.First().Accepted = true; + e1.relationships.First().peak = new DeltaMassPeak(e1.relationships.First(), new HashSet { e1.relationships.First() }); + Assert.AreEqual(1, e1.relationships.First().peak.possiblePeakAssignments.Count); + fam = new ProteoformFamily(e1); + fam.construct_family(); + fam.identify_experimentals(); + Assert.IsNotNull(e1.linked_proteoform_references); + + //acetylated not ok if Uniprot ptm type... + e1.linked_proteoform_references = null; + acetyl = new PtmSet(new List + { + new Ptm(0, new ModificationWithMass("acetyl", "UniProt", motif, TerminusLocalization.Any, 42.011)) + }); + + Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> + { + { Math.Round(acetyl.mass, 1), new List { acetyl } }, {Math.Round(set_unmodified.mass, 1), new List {set_unmodified}} + }; + + Sweet.lollipop.theoretical_database.possible_ptmset_dictionary = new Dictionary> + { + { Math.Round(acetyl.mass, 1), new List { acetyl } }, {Math.Round(set_unmodified.mass, 1), new List {set_unmodified}} + }; + e1.linked_proteoform_references = null; + e1.relationships.Clear(); + t1.relationships.Clear(); + ConstructorsForTesting.make_relation(e1, t1, ProteoformComparison.ExperimentalTheoretical, 42.01); + e1.relationships.First().Accepted = true; + e1.relationships.First().peak = new DeltaMassPeak(e1.relationships.First(), new HashSet { e1.relationships.First() }); + Assert.AreEqual(1, e1.relationships.First().peak.possiblePeakAssignments.Count); + fam = new ProteoformFamily(e1); + fam.construct_family(); + fam.identify_experimentals(); + Assert.IsNull(e1.linked_proteoform_references); + + //acetylated ok if theo has in ptmset... + e1.linked_proteoform_references = null; + e1.relationships.Clear(); + t1.relationships.Clear(); + t1.ExpandedProteinList.First().OneBasedPossibleLocalizedModifications.Add(new KeyValuePair>(22, new List() { acetyl.ptm_combination.First().modification })); + ConstructorsForTesting.make_relation(e1, t1, ProteoformComparison.ExperimentalTheoretical, 42.01); + e1.relationships.First().Accepted = true; + e1.relationships.First().peak = new DeltaMassPeak(e1.relationships.First(), new HashSet { e1.relationships.First() }); + Assert.AreEqual(1, e1.relationships.First().peak.possiblePeakAssignments.Count); + fam = new ProteoformFamily(e1); + fam.construct_family(); + fam.identify_experimentals(); + Assert.IsNotNull(e1.linked_proteoform_references); + + } } }