-
Notifications
You must be signed in to change notification settings - Fork 17
/
STAARpipeline_Gene_Centric_ncRNA.r
132 lines (111 loc) · 3.58 KB
/
STAARpipeline_Gene_Centric_ncRNA.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#####################################################################
# Gene-centric analysis for noncoding rare variants of ncRNA
# genes using STAARpipeline
# Xihao Li, Zilin Li
# Initiate date: 11/04/2021
# Current date: 02/17/2024
#####################################################################
rm(list=ls())
gc()
## load required packages
library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(STAAR)
library(STAARpipeline)
###########################################################
# User Input
###########################################################
## aGDS directory
agds_dir <- get(load("/path_to_the_file/agds_dir.Rdata"))
## Null model
obj_nullmodel <- get(load("/path_to_the_file/obj_nullmodel.Rdata"))
## QC_label
QC_label <- "annotation/filter"
## variant_type
variant_type <- "SNV"
## geno_missing_imputation
geno_missing_imputation <- "mean"
## Annotation_dir
Annotation_dir <- "annotation/info/FunctionalAnnotation"
## Annotation channel
Annotation_name_catalog <- get(load("/path_to_the_file/Annotation_name_catalog.Rdata"))
# Or equivalently
# Annotation_name_catalog <- read.csv("/path_to_the_file/Annotation_name_catalog.csv")
## Use_annotation_weights
Use_annotation_weights <- TRUE
## Annotation name
Annotation_name <- c("CADD","LINSIGHT","FATHMM.XF","aPC.EpigeneticActive","aPC.EpigeneticRepressed","aPC.EpigeneticTranscription",
"aPC.Conservation","aPC.LocalDiversity","aPC.Mappability","aPC.TF","aPC.Protein")
## output path
output_path <- "/path_to_the_output_file/"
## output file name
output_file_name <- "TOPMed_F5_LDL_ncRNA"
## input array id from batch file
arrayid <- as.numeric(commandArgs(TRUE)[1])
###########################################################
# Main Function
###########################################################
## gene number in job
gene_num_in_array <- 100
group.num.allchr <- ceiling(table(ncRNA_gene[,1])/gene_num_in_array)
sum(group.num.allchr)
chr <- which.max(arrayid <= cumsum(group.num.allchr))
group.num <- group.num.allchr[chr]
if (chr == 1){
groupid <- arrayid
}else{
groupid <- arrayid - cumsum(group.num.allchr)[chr-1]
}
ncRNA_gene_chr <- ncRNA_gene[ncRNA_gene[,1]==chr,]
sub_seq_num <- dim(ncRNA_gene_chr)[1]
if(groupid < group.num)
{
sub_seq_id <- ((groupid - 1)*gene_num_in_array + 1):(groupid*gene_num_in_array)
}else
{
sub_seq_id <- ((groupid - 1)*gene_num_in_array + 1):sub_seq_num
}
## exclude large ncRNA masks
if(arrayid==117)
{
sub_seq_id <- setdiff(sub_seq_id,53)
}
if(arrayid==218)
{
sub_seq_id <- setdiff(sub_seq_id,19)
}
if(arrayid==220)
{
sub_seq_id <- setdiff(sub_seq_id,c(208,274))
}
if(arrayid==221)
{
sub_seq_id <- setdiff(sub_seq_id,311)
}
if(arrayid==156)
{
sub_seq_id <- setdiff(sub_seq_id,41)
}
if(arrayid==219)
{
sub_seq_id <- setdiff(sub_seq_id,103)
}
## aGDS file
agds.path <- agds_dir[chr]
genofile <- seqOpen(agds.path)
results_ncRNA <- c()
for(kk in sub_seq_id)
{
print(kk)
gene_name <- ncRNA_gene_chr[kk,2]
results <- c()
results <- try(ncRNA(chr=chr,gene_name=gene_name,genofile=genofile,obj_nullmodel=obj_nullmodel,
rare_maf_cutoff=0.01,rv_num_cutoff=2,
QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name))
results_ncRNA <- rbind(results_ncRNA,results)
}
save(results_ncRNA,file=paste0(output_path,output_file_name,"_",arrayid,".Rdata"))
seqClose(genofile)