Skip to content

Commit

Permalink
chore: resolve merge conflict with latest develop branch
Browse files Browse the repository at this point in the history
  • Loading branch information
Devlin-Moyer committed Apr 23, 2024
2 parents ba08540 + c5d4835 commit 189984a
Show file tree
Hide file tree
Showing 18 changed files with 2,226 additions and 1,856 deletions.
69 changes: 69 additions & 0 deletions code/DepMapGeneEss/ConvertGeneEssInputData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
library(tidyverse)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

d = read_csv(paste0(rstudioapi::getActiveDocumentContext(), "/data/CCLE_expression_full.csv"))
dim(d)#1377 52055

#transpose
ds = as_tibble(cbind(gene = names(d)[-1], t(as.matrix(d[,-1]))))
colnames(ds)[-1] = d[[1]]

#make sure the columns are numeric instead of chr
ds2 = ds
for (i in 2:ncol(ds)) {
ds2[[i]] = as.numeric(ds[[i]])

}

#convert the gene names

#To get ensembl:
#pattern = ".*\\(([A-Z0-9]*)\\)"
#newGenes = str_match(ds$gene, pattern)
#ds2$gene = newGenes[,2]
#fix the ERCC genes
#ds2$gene[is.na(newGenes[,2])] = ds$gene[is.na(newGenes[,2])];

#To get gene symbols:
#It is a bit tricky, not all genes follow the pattern. Some are like LINC00328-2P (ENSG00000225016),
#some just an ensembl id (we then take the assembl id), some ERCC
newGenes = ds2$gene
x = strsplit(ds2$gene[!ERCCGenesSel], " ")
for(i in 1:length(x)) {
newGenes[i] = x[[i]][1] #handles all cases
}
length(newGenes)
length(unique(newGenes)) #not the same, we need to merge a few rows, done later

ds2$gene = newGenes

ds2


#now convert the data. It is currently as log2(TPM + 1)
dsTPM = ds2
dsTPM[,-1] = 2^ds2[,-1] - 1
colSums(dsTPM[,-1])#very minor roundoff differences, ok

#Sum up the rows that have the same gene name
duplGenes = unique(dsTPM$gene[duplicated(dsTPM$gene)])
length(duplGenes)#9
dsTPM[dsTPM$gene %in% duplGenes,1:10]
#CCDC39 is a good example to test
#is 1.59 + 0.150 in the first row, those should be summed up

rowsToRem = rep(FALSE, nrow(dsTPM))
for (i in 1:length(duplGenes)) {
inds = which(dsTPM$gene == duplGenes[i])
dsTPM[inds[1], -1] = colSums(dsTPM[inds, -1]) #the first row now gets the sum of all rows
rowsToRem[inds[-1]] = TRUE #the other rows are marked for deletion
}
sum(rowsToRem) #9, looks good
#test:
dsTPM[[2]][dsTPM$gene == "CCDC39"] #1.74 0.15, as expected, ok
dsTPM$gene[rowsToRem] # CCDC39 is in there, ok
dsTPM2 = dsTPM[!rowsToRem,]

write_tsv(dsTPM2, paste0(dataFolder, "DepMap_tpm_gene_symbols.txt"))

62 changes: 62 additions & 0 deletions code/DepMapGeneEss/PlotGeneEss.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
library(ggplot2)
library(tidyverse)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


figPath = "figures/"


############################################
# Fig 1B - Gene essentiality
############################################

# load data
# fns = c("data/geneEss_model1.txt",
# "data/geneEss_newalg.txt",
# "data/geneEss_newalg2.txt"
# )
fns = c("data/geneEss_model1.txt")


#names = c("tINIT","ftINIT 1+0", "ftINIT 1+1")
names = c("Human2")

gea_res = NULL
for (i in 1:length(fns)) {
x = read.delim(file = fns[i], sep='\t', stringsAsFactors=F)
x$model = names[i]
gea_res = rbind(gea_res, x)
}
gea_res$model = factor(gea_res$model, as.character(names)[1:length(fns)]) # to enforce the model order


color_palette <- c('#B5D39B','#6B97BC','#E7B56C') # light green, light blue, light yellow

p1B = ggplot(gea_res, aes(x = model, y = MCC, fill = model)) +
geom_violin(trim=F, show.legend=F, scale='count') +
scale_fill_manual(values=color_palette) +
theme_classic() +
ylab('MCC') +
xlab('') +
theme(text = element_text(size=14),
axis.text.x = element_text(angle=90, hjust=1, vjust=0.5,
color='black', size=14),
axis.text.y = element_text(color='black', size=14),
axis.line.x = element_blank()) +
ylim(c(0.08,0.40)) # + #manipulate these numbers to include all data
#ylim(c(0,0.5)) # +
p1B


ggsave(
paste0(figPath, "FigGeneEss.png"),
plot = p1B,
width = 3.5, height = 3.2, dpi = 300)

ggsave(
paste0(figPath, "FigGeneEss.eps"),
plot = p1B,
width = 3.5, height = 3.2, dpi = 300)


43 changes: 43 additions & 0 deletions code/DepMapGeneEss/PrepDepMapData.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%we are using depmap version 2021_Q3

%Prepares the DepMap data, specifically by filtering out RNA-Seq samples for which no CRISPR data exists
%cd C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss %to be used if not running the whole file, may need to change this
cd(fileparts(which(mfilename))); %to be used if running the whole file

%% Load and prepare DepMap RNA-Seq data (cell lines)

% load RNA-Seq data from txt file
rna_data = readtable('data/DepMap_tpm_gene_symbols.txt');

% load gene essentiality data (Achilles gene effect)
ach_data = readtable('data/Achilles_gene_effect.csv');
samples = ach_data.DepMap_ID; % extract sample IDs

% filter RNA-Seq data to only include samples for which we have
% essentiality data
cellLineNames = rna_data.Properties.VariableNames;
%now replace '_' with '-'
cellLineNames = strrep(cellLineNames, '_', '-');

%{'Original column heading: 'ACH-001113''}

keep = ismember(cellLineNames, samples);

sum(keep) %891
sum(keep)/length(keep) % 65%, seems reasonable

% add RNA-Seq data to arrayData
arrayDataDepMap.genes = rna_data.gene;
arrayDataDepMap.tissues = cellLineNames(keep)';
arrayDataDepMap.levels = table2array(rna_data(:, keep));
arrayDataDepMap.threshold = 1;


% save tINIT inputs
save('data/arrayDataDepMap.mat','arrayDataDepMap');

%Generate ftINIT prepData - only needs to be done once. Can take up to an hour to run
model = importYaml('../../model/Human-GEM.yml');
[model.grRules, skipped] = simplifyGrRules(model.grRules, true);%takes a few minutes to run
prepData = prepHumanModelForftINIT(model, true, '../../data/metabolicTasks/metabolicTasks_Essential.txt', '../../model/reactions.tsv');
save('data/prepDataGeneSymbols.mat', 'prepData')
Empty file.
46 changes: 46 additions & 0 deletions code/DepMapGeneEss/enrichment_test.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
%This file is for convenience copied from the Human1 paper.
function [penr,pdep] = enrichment_test(pop,sample,successes)
%enrichment_test caculates p-value of enrichment of successes in a sample.
%
% [PENR,PDEP] = enrichment_test(POP,SAMPLE,SUCCESSES) evaluates the
% significance of enrichment (and depletion) of SUCCESSES in a SAMPLE drawn
% from population POP using the hypergeometric test.
%
%
%--------------------------------- INPUTS ---------------------------------
%
% pop Vector of genes comprising the population from
% which samples are drawn.
%
% sample Vector of genes sampled from the population.
%
% successes Vector of genes in the population that are defined as
% "successes".
%
% The function will test if these "success" genes are
% significantly depleted or enriched in the sample, given
% that they were drawn from the population.
%
% EXAMPLE: If a metabolite set enrichment analysis (MSEA) is being
% performed, then SAMPLE is the list of metabolites of interest,
% and SUCCESSES is a metabolite set (e.g., TCA cycle metabolites).
% POP is the list of all metabolites from which SAMPLE and
% SUCCESSES are drawn.
%
%--------------------------------- OUTPUTS --------------------------------
%
% penr p-value associated with enrichment of successes in sample.
%
% pdep p-value associated with depletion of successes in sample.
%
%
% ***** WARNING: P-VALUES ARE NOT ADJUSTED FOR FALSE DISCOVERY RATE! *****
%

x = numel(intersect(successes,sample)); % calc # of successes in sample
m = numel(pop); % calc size of population
k = numel(intersect(successes,pop));
n = numel(sample);

penr = hygecdf(x-1,m,k,n,'upper');
pdep = hygecdf(x,m,k,n);
Loading

0 comments on commit 189984a

Please sign in to comment.