-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathluigi-template.cfg
209 lines (163 loc) · 7.3 KB
/
luigi-template.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# Shared settings. When this file is read by Python's configparser (as Luigi
# .cfg files are), keys under [DEFAULT] act as fallbacks for every other section.
# Tokens in angle brackets (<DR_TAG>, <PREV_DR_TAG>, <STATS_DATA_LOCATION>) look
# like template placeholders - presumably substituted before use; TODO confirm.
[DEFAULT]

# Root directory under which the tasks write their output
output_path=impc/<DR_TAG>/output/

# DCC input XML files (experiments and specimens)
dcc_experiment_xml_path=impc/<DR_TAG>/xml/
dcc_specimen_xml_path=impc/<DR_TAG>/xml/

# GenTar tracking reports
gentar_colonies_tsv_path=impc/<DR_TAG>/tracking/phenotyping_colonies.tsv
gentar_gene_status_path=impc/<DR_TAG>/tracking/gene_interest.tsv
product_report_tsv_path=impc/<DR_TAG>/tracking/gentar-products-latest.tsv

# iMits allele2 report
imits_allele2_tsv_path=impc/<DR_TAG>/misc/allele2Entries.tsv

# Base URL used when crawling IMPReSS
impress_api_url=https://api.mousephenotype.org/impress/

# Proxy (host:port) used to reach the Internet from the cluster
http_proxy=hh-wwwcache.ebi.ac.uk:3128

# MGI report files
mgi_strain_report_path=impc/<DR_TAG>/mgi/MGI_Strain.rpt
mgi_gene_pheno_report_path=impc/<DR_TAG>/mgi/MGI_GenePheno.rpt
mgi_homology_report_path=impc/<DR_TAG>/mgi/HGNC_AllianceHomology.rpt
mgi_phenotypic_allele_report_path=impc/<DR_TAG>/mgi/MGI_PhenotypicAllele.rpt
mgi_markers_report_path=impc/<DR_TAG>/mgi/MRK_List1.rpt

# Directory holding the OBO ontology files
obo_ontology_input_path=impc/<DR_TAG>/ontologies/

# Output of the IMPC ontology pipeline (see https://github.com/mpi2/impc-ontology-pipeline)
ontology_metadata_input_path=impc/<DR_TAG>/ontologies/
emap_emapa_csv_path=impc/<DR_TAG>/ontologies/EMAP-EMAPA.txt
emapa_metadata_csv_path=impc/<DR_TAG>/ontologies/emapa_metadata_table.csv
ma_metadata_csv_path=impc/<DR_TAG>/ontologies/ma_metadata_table.csv
mpath_metadata_path=impc/<DR_TAG>/ontologies/mpath_metadata_table.csv
impc_search_index_csv_path=impc/<DR_TAG>/ontologies/impc_search_index.csv
mp_relation_augmented_metadata_table_csv_path=impc/<DR_TAG>/ontologies/mp-relation-augmented_metadata_table.csv
mp_hp_matches_csv_path=impc/<DR_TAG>/ontologies/mp_hp_matches.csv

# Diff report: observations and tag of the previous data release
previous_dr_observations_parquet_path=/user/mi_hadoop/impc/<PREV_DR_TAG>/output/observations_parquet
previous_dr_tag=<PREV_DR_TAG>

# Statistical analysis output location and related switches
stats_analysis_out_path=<STATS_DATA_LOCATION>
raw_data_in_output=include
extract_windowed_data=true

# 3i statistical results file
threei_stats_csv_path=impc/<DR_TAG>/misc/flow_results_EBIexport_180119.csv

# PWG statistical results file
pwg_stats_csv_path=impc/<DR_TAG>/misc/pwg_stats_results.csv

# GenTar reference database connection
# NOTE(review): user and password are stored in plain text - avoid committing
# real credentials; consider supplying them from outside version control.
ref_db_jdbc_connection_str=jdbc:postgresql://hx-rke-wp-webadmin-20-worker-1.caas.ebi.ac.uk:32370/refdata
ref_database_user=ref_admin
ref_database_password=ref_admin

# CSV of observations annotated with Omero IDs
omero_ids_csv_path=impc/<DR_TAG>/misc/impc_images_input_with_omero_ids.csv

# Embryo data JSON file
embryo_data_json_path=impc/<DR_TAG>/misc/embryo_data.json

# MongoDB connection settings
# NOTE(review): the `impc:[email protected]` part of the URI looks like a
# masked user:password@host segment - confirm the real value before use.
mongodb_database=impc
mongodb_connection_uri=mongodb://impc:[email protected]:27017,mongodb-hlvm-046.ebi.ac.uk:27017
mongodb_genes_collection=genes
mongodb_replica_set=impcprors076

# Inputs for Solr core generation
solr_path=impc/<DR_TAG>/solr/
local_path=/nfs/production/tudor/komp/data-releases/<DR_TAG>/solr/
remote_host=hadoop-login-02

# Inputs for the IMPC Web API
gene_disease_association_csv_path=impc/<DR_TAG>/misc/gene_disease_associations.csv
gene_publications_json_path=impc/<DR_TAG>/misc/publications_references.json
disease_model_summary_csv_path=impc/<DR_TAG>/misc/disease_model_summary.csv
# Luigi core settings
[core]
# URL of the Luigi central scheduler the workers report to
default-scheduler-url=http://ves-ebi-d9.ebi.ac.uk:8082/
# Settings for submitting the Spark jobs
[spark]
# Comma-separated dependency coordinates in group:artifact:version form
# NOTE(review): this list names mongo-spark-connector_2.12:3.0.2, while
# spark.jars.packages inside `conf` below names mongo-spark-connector:10.0.1 -
# confirm which connector version is intended and align the two.
packages=com.databricks:spark-xml_2.12:0.12.0,mysql:mysql-connector-java:8.0.20,org.postgresql:postgresql:42.2.12,org.mongodb.spark:mongo-spark-connector_2.12:3.0.2
driver_memory=32G
driver_cores=5
master=yarn
# Path to the spark-submit executable
spark_submit=/hps/software/users/parkinso/spot/mousephenotype/spark-3.1.2-bin-hadoop2.7/bin/spark-submit
jars=lib/phenodcc-derived-parameters-2020.06.04.jar
driver_class_path=lib/phenodcc-derived-parameters-2020.06.04.jar
# Spark properties written as `name=value` entries separated by `|`, all on one line.
# NOTE(review): the MongoDB URIs embed credentials in plain text, and their
# `impc:[email protected]` part looks like a masked user:password@host segment -
# confirm the real value before use.
conf=spark.sql.legacy.parquet.datetimeRebaseModeInWrite=LEGACY | spark.sql.legacy.timeParserPolicy=LEGACY | spark.yarn.maxAppAttempts=1 | yarn.nodemanager.vmem-check-enabled=false | spark.executor.memoryOverhead=32g | spark.yarn.driver.memoryOverhead=5g | spark.blacklist.enabled=true | spark.executorEnv.PYSPARK_PYTHON=/usr/bin/python36 | spark.yarn.appMasterEnv.PYSPARK_PYTHON=/usr/bin/python36 | spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON=/usr/bin/python36 | spark.yarn.appMasterEnv.HTTP_PROXY=hh-wwwcache.ebi.ac.uk:3128 | spark.sql.session.timeZone=UTC | spark.local.dir=/tmp | spark.files.overwrite=true | spark.jars.packages=com.databricks:spark-xml_2.12:0.12.0,mysql:mysql-connector-java:8.0.20,org.postgresql:postgresql:42.2.12,org.mongodb.spark:mongo-spark-connector:10.0.1 | spark.mongodb.input.uri=mongodb://impc:[email protected]:27017,mongodb-hlvm-046.ebi.ac.uk:27017/admin?replicaSet=impcprors076 | spark.mongodb.output.uri=mongodb://impc:[email protected]:27017,mongodb-hlvm-046.ebi.ac.uk:27017/admin?replicaSet=impcprors076
deploy_mode=cluster
# NOTE(review): `num-executors` and `executor-cores` use dashes while the other
# keys in this section use underscores - confirm which spelling the consuming
# Luigi version expects before normalising.
num-executors=10
executor_memory=32G
executor-cores=5
py_files=dist/impc_etl.zip
# The `#environment` suffix is part of the value (configparser does not treat
# inline `#` as a comment by default); presumably the alias under which the
# archive is unpacked - TODO confirm.
archives=dist/pyspark_venv.tar.gz#environment
# HDFS access settings
[hdfs]
# Client implementation used to talk to HDFS
client=webhdfs
# Host name of the HDFS NameNode
namenode_host=hh-hdp-master-02.ebi.ac.uk
# Project-level configuration section
[ImpcConfig]
# NOTE(review): same value as deploy_mode under [spark] - keep the two in sync
# if both are read.
deploy_mode=cluster
# Per-task sections. Each one is declared without keys of its own, so under
# configparser it only sees the values inherited from [DEFAULT].
# Section names presumably match the Luigi task class names - TODO confirm.
# XML extraction jobs
[SpecimenLevelExperimentExtractor]
[LineLevelExperimentExtractor]
[MouseSpecimenExtractor]
[EmbryoSpecimenExtractor]
# GenTar extraction jobs
[ColonyTrackingExtractor]
[ProductReportExtractor]
[GeneProductionStatusExtractor]
[ExtractAlleleRef]
[ExtractGeneRef]
# IMPReSS extraction
[ImpressExtractor]
# MGI Extraction jobs
[MGIGenePhenoReportExtractor]
[MGIHomologyReportExtractor]
[MGIPhenotypicAlleleExtractor]
[MGIMarkerListReportExtractor]
[MGIStrainReportExtractor]
# Ontology data tasks
[OntologyTermHierarchyExtractor]
[OntologyMetadataExtractor]
# Data cleaning tasks
[ColonyCleaner]
[SpecimenLevelExperimentCleaner]
[LineLevelExperimentCleaner]
[MouseSpecimenCleaner]
[EmbryoSpecimenCleaner]
# Cross-reference tasks
[MouseSpecimenCrossRef]
[EmbryoSpecimenCrossRef]
[SpecimenLevelExperimentCrossRef]
[LineLevelExperimentCrossRef]
# Additional data calculation tasks
[SpecimenLevelExperimentParameterDerivator]
[LineLevelExperimentParameterDerivator]
[ExperimentBWAgeCalculator]
# Task to generate the Observation level representation of the data
[ExperimentToObservationMapper]
# Tasks to generate the Statistical Analysis inputs
[MPChooserGenerator]
[StatsPipelineInputMapper]
# Task to generate the Images pipeline input
[ImagesPipelineInputGenerator]
# Task to generate the diff report
[ImpcDrDiffReportGeneration]
# Task to group the pre-statistical analysis phase of the ETL
[ImpcPreStatisticalAnalysis]
# Task to load the statistical analysis output
[StatisticalAnalysisOutputMapper]
# Solr cores generator tasks
[PipelineCoreLoader]
[GenotypePhenotypeCoreLoader]
[ImpcImagesLoader]
[GenotypePhenotypeLoader]
[GeneLoader]
[MpLoader]
[MGIPhenotypeCoreLoader]
# Task for the mapping of the impress crawler output to parameter centered view
[ImpressToParameterMapper]
# Task for the mapping and gathering of all stats results
[StatsResultsMapper]
# Task for the post-statistical analysis phase of the ETL
[ImpcPostStatisticalAnalysis]
# EOSC Life API datasource tasks
[ImpcGeneBundleMapper]
# IMPC Web API datasource tasks
[ImpcGeneSummaryMapper]
[ImpcGeneStatsResultsMapper]
[ImpcGenePhenotypeHitsMapper]
[ImpcLacZExpressionMapper]
[ImpcPublicationsMapper]
[ImpcProductsMapper]
[ImpcGeneImagesMapper]
[ImpcGeneDiseasesMapper]
[ImpcGeneHistopathologyMapper]