# pre-process.R
# Builds R/example_data.rds and test/mouse/mouse_Testis199_data.csv(.gz)
# from the GSM3102983_SPCImat.txt table.
library(Seurat)
library(R.utils)
# Download the data for the 199 mouse testis spermatocyte cells from
# GSE113293 (PMID:31237565).
# file name: GSM3102983_SPCImat.txt
# Rows are genes (their row names are used as gene names further down),
# columns are cells.
example_data <- read.table(
  file = 'GSM3102983_SPCImat.txt',
  header = TRUE,
  stringsAsFactors = FALSE
)
# Replace the original column names with sequential cell IDs: C_1, C_2, ...
# seq_len() (instead of 1:ncol) stays correct when the table has no columns.
cellname <- paste('C', seq_len(ncol(example_data)), sep = '_')
colnames(example_data) <- cellname
# Load the NCBI gene information table and keep only the mouse entries.
# `%in%` is used instead of `==` so that rows whose species is missing are
# dropped; with `==` an NA in the row index would produce all-NA rows.
geneinfo <- readRDS('R/geneinfo.rds')
geneinfo <- geneinfo[geneinfo$species %in% 'Mouse', ]
# Gene symbols are not revised at this point: the Seurat object below is built
# from the raw row names of example_data. The symbol revision (official Symbol
# first, then unambiguous Synonyms) is performed later in this script, on the
# normalized matrix read back from R/example_data.rds, and that pass assigns
# genename, genename1-4, genedata and genedata1 itself.
# Wrap the table in a Seurat object (passed as `counts`) and run
# NormalizeData() on it with default settings.
example_data <- NormalizeData(
  object = CreateSeuratObject(counts = example_data)
)
# generate example_data.rds
saveRDS(object = example_data, file = 'R/example_data.rds')
# R/example_data.rds holds the Seurat object after log-normalization.
# Read it back and keep only the `data` slot of its RNA assay.
# NOTE(review): direct slot access presumably requires an assay class that
# has a `data` slot -- confirm against the installed Seurat version.
mouse_Testis199 <- slot(readRDS('R/example_data.rds')[['RNA']], 'data')
# revising gene symbols

# Map raw gene names onto the symbols listed in `geneinfo`.
#
# Args:
#   gene_names: character vector of raw gene names (here: matrix row names).
#   geneinfo:   table with the columns `Symbol` and `Synonyms`.
#
# Returns:
#   A data.frame with the columns `raw_name` and `new_name`. Names that are
#   already a Symbol come first and map to themselves; they are followed by
#   names that equal the `Synonyms` value of exactly one geneinfo row, mapped
#   to that row's Symbol. Names that cannot be resolved, and every row whose
#   new_name would occur more than once, are left out.
revise_gene_symbols <- function(gene_names, geneinfo) {
  gene_names <- as.character(gene_names)
  symbols <- as.character(geneinfo$Symbol)
  synonyms <- as.character(geneinfo$Synonyms)

  is_official <- gene_names %in% symbols
  official <- gene_names[is_official]
  aliases <- gene_names[!is_official]
  aliases <- aliases[aliases %in% synonyms]

  # Resolve an alias only when a single geneinfo row carries it. `%in%` never
  # yields NA, so rows with a missing Synonyms value are not counted as hits.
  hit <- synonyms %in% aliases
  hit_synonyms <- synonyms[hit]
  hit_symbols <- symbols[hit]
  ambiguous <- hit_synonyms %in% hit_synonyms[duplicated(hit_synonyms)]
  lookup <- match(aliases, hit_synonyms[!ambiguous])
  resolved <- hit_symbols[!ambiguous][lookup]  # NA where unresolved

  genedata <- data.frame(
    raw_name = c(official, aliases),
    new_name = c(official, resolved),
    stringsAsFactors = FALSE
  )
  # A real NA marks "unresolved", so no gene name can collide with the marker.
  genedata <- genedata[!is.na(genedata$new_name), ]

  # Drop every symbol that more than one raw name would map to.
  repeated_symbols <- genedata$new_name[duplicated(genedata$new_name)]
  genedata[!genedata$new_name %in% repeated_symbols, ]
}

genedata <- revise_gene_symbols(rownames(mouse_Testis199), geneinfo)
# Keep only the genes that received a revised symbol, in genedata order.
# drop = FALSE keeps the result two-dimensional even if one gene remains.
mouse_Testis199 <- mouse_Testis199[genedata$raw_name, , drop = FALSE]
# Sanity checks are enforced with stopifnot() so the script halts instead of
# writing a mislabeled matrix; a bare all(...) result would only be printed.
stopifnot(all(rownames(mouse_Testis199) == genedata$raw_name))
rownames(mouse_Testis199) <- genedata$new_name
stopifnot(
  all(rownames(mouse_Testis199) == genedata$new_name),
  all(rownames(mouse_Testis199) %in% geneinfo$Symbol)
)
# for log-normalization, execute the following commands
# mouse_Testis199<- CreateSeuratObject(counts = mouse_Testis199)
# mouse_Testis199<- NormalizeData(object = mouse_Testis199)
# mouse_Testis199<- mouse_Testis199[['RNA']]@data
mouse_Testis199 <- as.matrix(mouse_Testis199)
write.csv(mouse_Testis199, file = 'test/mouse/mouse_Testis199_data.csv')
# gzip() from R.utils removes its input file by default (remove = TRUE).
# remove = FALSE keeps the CSV next to the archive, and overwrite = TRUE lets
# the script be re-run when the archive already exists.
gzip(
  'test/mouse/mouse_Testis199_data.csv',
  destname = 'test/mouse/mouse_Testis199_data.gz',
  remove = FALSE,
  overwrite = TRUE
)
# mouse_Testis199_data.csv or mouse_Testis199_data.gz can be used for running scDeepSort