-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: resolve merge conflict with latest develop branch
- Loading branch information
Showing
18 changed files
with
2,226 additions
and
1,856 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
library(tidyverse) | ||
|
||
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) | ||
|
||
d = read_csv(paste0(rstudioapi::getActiveDocumentContext(), "/data/CCLE_expression_full.csv")) | ||
dim(d)#1377 52055 | ||
|
||
#transpose | ||
ds = as_tibble(cbind(gene = names(d)[-1], t(as.matrix(d[,-1])))) | ||
colnames(ds)[-1] = d[[1]] | ||
|
||
#make sure the columns are numeric instead of chr | ||
ds2 = ds | ||
for (i in 2:ncol(ds)) { | ||
ds2[[i]] = as.numeric(ds[[i]]) | ||
|
||
} | ||
|
||
#convert the gene names | ||
|
||
#To get ensembl: | ||
#pattern = ".*\\(([A-Z0-9]*)\\)" | ||
#newGenes = str_match(ds$gene, pattern) | ||
#ds2$gene = newGenes[,2] | ||
#fix the ERCC genes | ||
#ds2$gene[is.na(newGenes[,2])] = ds$gene[is.na(newGenes[,2])]; | ||
|
||
#To get gene symbols: | ||
#It is a bit tricky, not all genes follow the pattern. Some are like LINC00328-2P (ENSG00000225016), | ||
#some just an ensembl id (we then take the assembl id), some ERCC | ||
newGenes = ds2$gene | ||
x = strsplit(ds2$gene[!ERCCGenesSel], " ") | ||
for(i in 1:length(x)) { | ||
newGenes[i] = x[[i]][1] #handles all cases | ||
} | ||
length(newGenes) | ||
length(unique(newGenes)) #not the same, we need to merge a few rows, done later | ||
|
||
ds2$gene = newGenes | ||
|
||
ds2 | ||
|
||
|
||
#now convert the data. It is currently as log2(TPM + 1) | ||
dsTPM = ds2 | ||
dsTPM[,-1] = 2^ds2[,-1] - 1 | ||
colSums(dsTPM[,-1])#very minor roundoff differences, ok | ||
|
||
#Sum up the rows that have the same gene name | ||
duplGenes = unique(dsTPM$gene[duplicated(dsTPM$gene)]) | ||
length(duplGenes)#9 | ||
dsTPM[dsTPM$gene %in% duplGenes,1:10] | ||
#CCDC39 is a good example to test | ||
#is 1.59 + 0.150 in the first row, those should be summed up | ||
|
||
rowsToRem = rep(FALSE, nrow(dsTPM)) | ||
for (i in 1:length(duplGenes)) { | ||
inds = which(dsTPM$gene == duplGenes[i]) | ||
dsTPM[inds[1], -1] = colSums(dsTPM[inds, -1]) #the first row now gets the sum of all rows | ||
rowsToRem[inds[-1]] = TRUE #the other rows are marked for deletion | ||
} | ||
sum(rowsToRem) #9, looks good | ||
#test: | ||
dsTPM[[2]][dsTPM$gene == "CCDC39"] #1.74 0.15, as expected, ok | ||
dsTPM$gene[rowsToRem] # CCDC39 is in there, ok | ||
dsTPM2 = dsTPM[!rowsToRem,] | ||
|
||
write_tsv(dsTPM2, paste0(dataFolder, "DepMap_tpm_gene_symbols.txt")) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
library(ggplot2) | ||
library(tidyverse) | ||
|
||
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) | ||
|
||
|
||
figPath = "figures/" | ||
|
||
|
||
############################################ | ||
# Fig 1B - Gene essentiality | ||
############################################ | ||
|
||
# load data | ||
# fns = c("data/geneEss_model1.txt", | ||
# "data/geneEss_newalg.txt", | ||
# "data/geneEss_newalg2.txt" | ||
# ) | ||
fns = c("data/geneEss_model1.txt") | ||
|
||
|
||
#names = c("tINIT","ftINIT 1+0", "ftINIT 1+1") | ||
names = c("Human2") | ||
|
||
gea_res = NULL | ||
for (i in 1:length(fns)) { | ||
x = read.delim(file = fns[i], sep='\t', stringsAsFactors=F) | ||
x$model = names[i] | ||
gea_res = rbind(gea_res, x) | ||
} | ||
gea_res$model = factor(gea_res$model, as.character(names)[1:length(fns)]) # to enforce the model order | ||
|
||
|
||
color_palette <- c('#B5D39B','#6B97BC','#E7B56C') # light green, light blue, light yellow | ||
|
||
p1B = ggplot(gea_res, aes(x = model, y = MCC, fill = model)) + | ||
geom_violin(trim=F, show.legend=F, scale='count') + | ||
scale_fill_manual(values=color_palette) + | ||
theme_classic() + | ||
ylab('MCC') + | ||
xlab('') + | ||
theme(text = element_text(size=14), | ||
axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, | ||
color='black', size=14), | ||
axis.text.y = element_text(color='black', size=14), | ||
axis.line.x = element_blank()) + | ||
ylim(c(0.08,0.40)) # + #manipulate these numbers to include all data | ||
#ylim(c(0,0.5)) # + | ||
p1B | ||
|
||
|
||
ggsave( | ||
paste0(figPath, "FigGeneEss.png"), | ||
plot = p1B, | ||
width = 3.5, height = 3.2, dpi = 300) | ||
|
||
ggsave( | ||
paste0(figPath, "FigGeneEss.eps"), | ||
plot = p1B, | ||
width = 3.5, height = 3.2, dpi = 300) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
%we are using depmap version 2021_Q3 | ||
|
||
%Prepares the DepMap data, specifically by filtering out RNA-Seq samples for which no CRISPR data exists | ||
%cd C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss %to be used if not running the whole file, may need to change this | ||
cd(fileparts(which(mfilename))); %to be used if running the whole file | ||
|
||
%% Load and prepare DepMap RNA-Seq data (cell lines) | ||
|
||
% load RNA-Seq data from txt file | ||
rna_data = readtable('data/DepMap_tpm_gene_symbols.txt'); | ||
|
||
% load gene essentiality data (Achilles gene effect) | ||
ach_data = readtable('data/Achilles_gene_effect.csv'); | ||
samples = ach_data.DepMap_ID; % extract sample IDs | ||
|
||
% filter RNA-Seq data to only include samples for which we have | ||
% essentiality data | ||
cellLineNames = rna_data.Properties.VariableNames; | ||
%now replace '_' with '-' | ||
cellLineNames = strrep(cellLineNames, '_', '-'); | ||
|
||
%{'Original column heading: 'ACH-001113''} | ||
|
||
keep = ismember(cellLineNames, samples); | ||
|
||
sum(keep) %891 | ||
sum(keep)/length(keep) % 65%, seems reasonable | ||
|
||
% add RNA-Seq data to arrayData | ||
arrayDataDepMap.genes = rna_data.gene; | ||
arrayDataDepMap.tissues = cellLineNames(keep)'; | ||
arrayDataDepMap.levels = table2array(rna_data(:, keep)); | ||
arrayDataDepMap.threshold = 1; | ||
|
||
|
||
% save tINIT inputs | ||
save('data/arrayDataDepMap.mat','arrayDataDepMap'); | ||
|
||
%Generate ftINIT prepData - only needs to be done once. Can take up to an hour to run | ||
model = importYaml('../../model/Human-GEM.yml'); | ||
[model.grRules, skipped] = simplifyGrRules(model.grRules, true);%takes a few minutes to run | ||
prepData = prepHumanModelForftINIT(model, true, '../../data/metabolicTasks/metabolicTasks_Essential.txt', '../../model/reactions.tsv'); | ||
save('data/prepDataGeneSymbols.mat', 'prepData') |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
%This file is for convenience copied from the Human1 paper. | ||
function [penr,pdep] = enrichment_test(pop,sample,successes) | ||
%enrichment_test caculates p-value of enrichment of successes in a sample. | ||
% | ||
% [PENR,PDEP] = enrichment_test(POP,SAMPLE,SUCCESSES) evaluates the | ||
% significance of enrichment (and depletion) of SUCCESSES in a SAMPLE drawn | ||
% from population POP using the hypergeometric test. | ||
% | ||
% | ||
%--------------------------------- INPUTS --------------------------------- | ||
% | ||
% pop Vector of genes comprising the population from | ||
% which samples are drawn. | ||
% | ||
% sample Vector of genes sampled from the population. | ||
% | ||
% successes Vector of genes in the population that are defined as | ||
% "successes". | ||
% | ||
% The function will test if these "success" genes are | ||
% significantly depleted or enriched in the sample, given | ||
% that they were drawn from the population. | ||
% | ||
% EXAMPLE: If a metabolite set enrichment analysis (MSEA) is being | ||
% performed, then SAMPLE is the list of metabolites of interest, | ||
% and SUCCESSES is a metabolite set (e.g., TCA cycle metabolites). | ||
% POP is the list of all metabolites from which SAMPLE and | ||
% SUCCESSES are drawn. | ||
% | ||
%--------------------------------- OUTPUTS -------------------------------- | ||
% | ||
% penr p-value associated with enrichment of successes in sample. | ||
% | ||
% pdep p-value associated with depletion of successes in sample. | ||
% | ||
% | ||
% ***** WARNING: P-VALUES ARE NOT ADJUSTED FOR FALSE DISCOVERY RATE! ***** | ||
% | ||
|
||
x = numel(intersect(successes,sample)); % calc # of successes in sample | ||
m = numel(pop); % calc size of population | ||
k = numel(intersect(successes,pop)); | ||
n = numel(sample); | ||
|
||
penr = hygecdf(x-1,m,k,n,'upper'); | ||
pdep = hygecdf(x,m,k,n); |
Oops, something went wrong.