chore: resolve merge conflict with latest develop branch

SysBioChalmers · Apr 23, 2024 · 189984a · 189984a
2 parents ba08540 + c5d4835
commit 189984a
Show file tree

Hide file tree

Showing 18 changed files with 2,226 additions and 1,856 deletions.
diff --git a/code/DepMapGeneEss/ConvertGeneEssInputData.R b/code/DepMapGeneEss/ConvertGeneEssInputData.R
@@ -0,0 +1,69 @@
+library(tidyverse)
+
+setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
+
+d = read_csv(paste0(rstudioapi::getActiveDocumentContext(), "/data/CCLE_expression_full.csv"))
+dim(d)#1377 52055
+
+#transpose
+ds = as_tibble(cbind(gene = names(d)[-1], t(as.matrix(d[,-1]))))
+colnames(ds)[-1] = d[[1]]
+
+#make sure the columns are numeric instead of chr
+ds2 = ds
+for (i in 2:ncol(ds)) {
+  ds2[[i]] = as.numeric(ds[[i]])
+
+}
+
+#convert the gene names
+
+#To get ensembl: 
+#pattern = ".*\\(([A-Z0-9]*)\\)"
+#newGenes = str_match(ds$gene, pattern)
+#ds2$gene = newGenes[,2]
+#fix the ERCC genes
+#ds2$gene[is.na(newGenes[,2])] = ds$gene[is.na(newGenes[,2])];
+
+#To get gene symbols:
+#It is a bit tricky, not all genes follow the pattern. Some are like LINC00328-2P (ENSG00000225016),
+#some just an ensembl id (we then take the assembl id), some ERCC 
+newGenes = ds2$gene
+x  = strsplit(ds2$gene[!ERCCGenesSel], " ")
+for(i in 1:length(x)) {
+  newGenes[i] = x[[i]][1] #handles all cases
+}
+length(newGenes)
+length(unique(newGenes)) #not the same, we need to merge a few rows, done later
+
+ds2$gene = newGenes
+
+ds2
+
+
+#now convert the data. It is currently as log2(TPM + 1)
+dsTPM = ds2
+dsTPM[,-1] = 2^ds2[,-1] - 1
+colSums(dsTPM[,-1])#very minor roundoff differences, ok
+
+#Sum up the rows that have the same gene name
+duplGenes = unique(dsTPM$gene[duplicated(dsTPM$gene)])
+length(duplGenes)#9
+dsTPM[dsTPM$gene %in% duplGenes,1:10]
+#CCDC39 is a good example to test
+#is 1.59 + 0.150 in the first row, those should be summed up
+
+rowsToRem = rep(FALSE, nrow(dsTPM))
+for (i in 1:length(duplGenes)) {
+  inds = which(dsTPM$gene == duplGenes[i])
+  dsTPM[inds[1], -1] = colSums(dsTPM[inds, -1]) #the first row now gets the sum of all rows
+  rowsToRem[inds[-1]] = TRUE #the other rows are marked for deletion
+}
+sum(rowsToRem) #9, looks good
+#test: 
+dsTPM[[2]][dsTPM$gene == "CCDC39"] #1.74 0.15, as expected, ok
+dsTPM$gene[rowsToRem] # CCDC39 is in there, ok
+dsTPM2 = dsTPM[!rowsToRem,]
+
+write_tsv(dsTPM2, paste0(dataFolder, "DepMap_tpm_gene_symbols.txt"))
+
diff --git a/code/DepMapGeneEss/PlotGeneEss.R b/code/DepMapGeneEss/PlotGeneEss.R
@@ -0,0 +1,62 @@
+library(ggplot2)
+library(tidyverse)
+
+setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
+
+
+figPath = "figures/"
+
+
+############################################
+# Fig 1B - Gene essentiality
+############################################
+
+# load data
+# fns = c("data/geneEss_model1.txt",
+#         "data/geneEss_newalg.txt",
+#         "data/geneEss_newalg2.txt"
+#         )
+fns = c("data/geneEss_model1.txt")
+
+
+#names = c("tINIT","ftINIT 1+0", "ftINIT 1+1")
+names = c("Human2")
+
+gea_res = NULL
+for (i in 1:length(fns)) {
+  x = read.delim(file = fns[i], sep='\t', stringsAsFactors=F)
+  x$model = names[i]
+  gea_res = rbind(gea_res, x)
+}
+gea_res$model = factor(gea_res$model, as.character(names)[1:length(fns)])  # to enforce the model order
+
+
+color_palette <- c('#B5D39B','#6B97BC','#E7B56C')  # light green, light blue, light yellow
+
+p1B = ggplot(gea_res, aes(x = model, y = MCC, fill = model)) +
+  geom_violin(trim=F, show.legend=F, scale='count') +
+  scale_fill_manual(values=color_palette) +
+  theme_classic() + 
+  ylab('MCC') +
+  xlab('') +
+  theme(text = element_text(size=14),
+        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5,
+                                   color='black', size=14),
+        axis.text.y = element_text(color='black', size=14),
+        axis.line.x = element_blank()) +
+  ylim(c(0.08,0.40)) # + #manipulate these numbers to include all data
+  #ylim(c(0,0.5)) # +
+p1B
+
+
+ggsave(
+  paste0(figPath, "FigGeneEss.png"),
+  plot = p1B,
+  width = 3.5, height = 3.2, dpi = 300)
+
+ggsave(
+  paste0(figPath, "FigGeneEss.eps"),
+  plot = p1B,
+  width = 3.5, height = 3.2, dpi = 300)
+
+
diff --git a/code/DepMapGeneEss/PrepDepMapData.m b/code/DepMapGeneEss/PrepDepMapData.m
@@ -0,0 +1,43 @@
+%we are using depmap version 2021_Q3
+
+%Prepares the DepMap data, specifically by filtering out RNA-Seq samples for which no CRISPR data exists
+%cd C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss %to be used if not running the whole file, may need to change this
+cd(fileparts(which(mfilename))); %to be used if running the whole file
+
+%% Load and prepare DepMap RNA-Seq data (cell lines)
+
+% load RNA-Seq data from txt file
+rna_data = readtable('data/DepMap_tpm_gene_symbols.txt');
+
+% load gene essentiality data (Achilles gene effect)
+ach_data = readtable('data/Achilles_gene_effect.csv');
+samples = ach_data.DepMap_ID;  % extract sample IDs
+
+% filter RNA-Seq data to only include samples for which we have
+% essentiality data
+cellLineNames = rna_data.Properties.VariableNames;
+%now replace '_' with '-'
+cellLineNames = strrep(cellLineNames, '_', '-');
+
+%{'Original column heading: 'ACH-001113''}
+
+keep = ismember(cellLineNames, samples);
+
+sum(keep) %891
+sum(keep)/length(keep) % 65%, seems reasonable
+
+% add RNA-Seq data to arrayData
+arrayDataDepMap.genes = rna_data.gene;
+arrayDataDepMap.tissues = cellLineNames(keep)';
+arrayDataDepMap.levels = table2array(rna_data(:, keep));
+arrayDataDepMap.threshold = 1;
+
+
+% save tINIT inputs
+save('data/arrayDataDepMap.mat','arrayDataDepMap');
+
+%Generate ftINIT prepData - only needs to be done once. Can take up to an hour to run
+model = importYaml('../../model/Human-GEM.yml');
+[model.grRules, skipped] = simplifyGrRules(model.grRules, true);%takes a few minutes to run
+prepData = prepHumanModelForftINIT(model, true, '../../data/metabolicTasks/metabolicTasks_Essential.txt', '../../model/reactions.tsv');
+save('data/prepDataGeneSymbols.mat', 'prepData')
diff --git a/code/DepMapGeneEss/data/DepMap data files go here.txt b/code/DepMapGeneEss/data/DepMap data files go here.txt
diff --git a/code/DepMapGeneEss/enrichment_test.m b/code/DepMapGeneEss/enrichment_test.m
@@ -0,0 +1,46 @@
+%This file is for convenience copied from the Human1 paper.
+function [penr,pdep] = enrichment_test(pop,sample,successes)
+%enrichment_test caculates p-value of enrichment of successes in a sample.
+%
+% [PENR,PDEP] = enrichment_test(POP,SAMPLE,SUCCESSES) evaluates the
+% significance of enrichment (and depletion) of SUCCESSES in a SAMPLE drawn
+% from population POP using the hypergeometric test.
+%
+%
+%--------------------------------- INPUTS ---------------------------------
+%
+% pop           Vector of genes comprising the population from
+%               which samples are drawn.
+%
+% sample        Vector of genes sampled from the population.
+%
+% successes     Vector of genes in the population that are defined as
+%               "successes". 
+%               
+%               The function will test if these "success" genes are 
+%               significantly depleted or enriched in the sample, given
+%               that they were drawn from the population.
+%
+% EXAMPLE: If a metabolite set enrichment analysis (MSEA) is being
+%          performed, then SAMPLE is the list of metabolites of interest,
+%          and SUCCESSES is a metabolite set (e.g., TCA cycle metabolites).
+%          POP is the list of all metabolites from which SAMPLE and
+%          SUCCESSES are drawn.
+%
+%--------------------------------- OUTPUTS --------------------------------
+%
+% penr          p-value associated with enrichment of successes in sample.
+%
+% pdep          p-value associated with depletion of successes in sample.
+%
+%
+% ***** WARNING: P-VALUES ARE NOT ADJUSTED FOR FALSE DISCOVERY RATE! *****
+%
+
+x = numel(intersect(successes,sample));  % calc # of successes in sample
+m = numel(pop);  % calc size of population
+k = numel(intersect(successes,pop));
+n = numel(sample);
+
+penr = hygecdf(x-1,m,k,n,'upper');
+pdep = hygecdf(x,m,k,n);