-
Notifications
You must be signed in to change notification settings - Fork 3
/
configure.cfg
96 lines (75 loc) · 3.2 KB
/
configure.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
[generic]
# Input data locations and settings shared across the analysis.
# All paths below point at the bundled test data under tests/data.
# NOTE(review): the paths are relative; what they are resolved against
# (working directory vs. this file's location) is not stated here - confirm.
#
# Human reference genome FASTA file. This should be build b37.
human_fasta_ref=tests/data/ABCA4_SCN1A.fasta
# GTF annotation used to extract coding variants.
# It must be sorted and tabix-indexed.
gtf=tests/data/ABCA4_SCN1A.GRCh37.75.gtf.gz
# When extracting coding variants, you may also want to include variants that
# might cause an alternative splicing event. Phenogenon will include variants
# that are at most `exon_padding` away from a CDS region.
# (Unit is presumably base pairs - TODO confirm.)
exon_padding=5
# File listing the sample names that are deemed unrelated to each other.
# This can be inferred from a KING analysis.
unrelated_file=tests/data/test_unrelated.tsv
# Path to the gnomAD data.
gnomad_path=tests/data/gnomad_data
# A tabix-indexed file produced from CADD.
cadd_file=tests/data/CADD_ABCA4_SCN1A.vcf.gz
# Patient file listing each patient's minimised HPO terms and the cohorts
# the patient belongs to.
patient_mini_file=tests/data/test_patients_hpo_snapshot_mini.tsv
# Same as patient_mini_file, but listing the expanded HPO terms.
patient_info_file=tests/data/test_patients_hpo_snapshot.tsv
# HPO JSON file describing the relationships between HPO terms.
hpo_json=tests/data/new-hpo-hpo.json
[debug]
# Logging settings.
# log_file: file that log output is written to.
# log_level: logging verbosity.
# NOTE(review): presumably a Python `logging` level name
# (DEBUG, INFO, WARNING, ERROR) - confirm against the code that reads it.
log_file=phenogenon.log
log_level=INFO
[phenogenon]
# Analysis parameters for Phenogenon itself.
# List-valued settings (pop_flags, stouffer_weights, hpo_mask) are written
# as comma-separated values.
#
# zmax is used when producing the heatmap.
# (Presumably the upper limit of the colour scale - TODO confirm.)
zmax=13
# HPO terms with too few patients can produce a considerable level of noise
# (see paper), so only HPO terms that have at least `N` affected unrelated
# samples in the database are included.
N=60
# Whether to remove non-coding (nc) entries.
# (Presumably non-coding variants - TODO confirm.)
remove_nc=true
# If no phasing is done, cis_gap is used as a guideline to decide whether
# two variants are in cis. From personal experience, if two rare variants
# are very close together there is a high chance that they are in cis.
# (Unit is presumably base pairs - TODO confirm.)
cis_gap=100
# These two cutoffs are used to produce the cleaned VCF.
# A variant is removed if its call is missing in more than `v_cutoff`
# (presumably a fraction of patients - TODO confirm).
# Likewise, a patient is removed if more than `p_cutoff` of the variants
# have a missing call in that patient.
v_cutoff=0.4
p_cutoff=0.4
# This cutoff identifies poorly covered individuals for a given set of
# variants (from a grid bin) when building patient_map.
# Example: a bin holds five variants, a to e. With the cutoff at 0.5, an
# individual with missing calls on a, b and c (3 of 5) is removed from the
# analysis.
patient_missingness_cutoff=0.5
# Set up the Phenogenon grid.
# NOTE(review): presumably cadd_min/cadd_step define the CADD-score bins and
# gnomad_step/gnomad_cutoff define the gnomAD allele-frequency bins and their
# upper bound - confirm against the code that builds the grid.
cadd_step=5
cadd_min=0
gnomad_step=0.00025
gnomad_cutoff=0.01
# Population curse. The correlation between a genotype and an HPO term is
# often confounded by ethnicity. This can be the result of a disease cohort
# being predominantly sampled from a specific ethnic group.
# pop_flags is a pair `proportion, count`. The HPO term is flagged when
#   the number of variants in the bin >= pop_flags[1] (the count), and
#   the proportion of variants sharing the same highest-AF population
#   >= pop_flags[0] (the proportion).
# pop_check_p is not described in this file.
# (Presumably a p-value threshold for the population check - TODO confirm.)
pop_check_p=0.05
pop_flags=0.5, 3
# damage_cadd_ind is used to evaluate CADD efficiency.
damage_cadd_ind=3
# This coefficient is used to select the reportable HPO terms.
# The higher the coefficient, the fewer HPO terms you get per gene/mode.
coefficient=1
# Method used to combine p-values to get hgf.
# One of: scaled_stouffer, fisher, stouffer.
combine_pvalues_method=scaled_stouffer
# The scaled weights.
# (Presumably only used by the scaled_stouffer method - TODO confirm.)
stouffer_weights=0.1, 0.5, 0.75, 1., 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8
# HPO terms excluded from the analysis, such as inheritance modes,
# which are often missing from patient annotations.
hpo_mask=HP:0000007, HP:0000006, HP:0003745, HP:0000005