Create PyTests for config file setups (#130)

* incorporate config for train-only setup
update quick start

* Update docs to reflect config updates

* add pytests for deeprvat_config generation

* add pretrained_models dir link to pytest

* bugfix smoke test base path

* bugfix smoke test base path

* fixup! Format Python code with psf/black pull_request

* update alpha param from config

* add in gene-file for train-only pipeline

* fixup! Format Python code with psf/black pull_request

---------

Co-authored-by: PMBio <[email protected]>
meyerkm and PMBio authored Aug 27, 2024
1 parent d02b759 commit 801fc32
Showing 6 changed files with 297 additions and 126 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/pipeline-tests.yml
@@ -3,6 +3,29 @@ run-name: DeepRVAT Pipeline Tests 🧬🧪💻🧑‍🔬
on: [ push ]

jobs:
# Config Setup
Smoke-GenerateConfig-Training:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/run_training.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_training_config.yaml ./example/

Smoke-GenerateConfig-Training-AssociationTesting:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/training_association_testing.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_config.yaml ./example/

Smoke-GenerateConfig-PreTrained:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/association_testing_pretrained.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_pretrained_models_config.yaml ./example/ && ln -s $GITHUB_WORKSPACE/pretrained_models ./example/


# Training Pipeline
Smoke-RunTraining:
uses: ./.github/workflows/run-pipeline.yml
244 changes: 139 additions & 105 deletions deeprvat/deeprvat/config.py
@@ -83,11 +83,23 @@ def create_main_config(
"regenie_options",
]

# Check if Training Only
if input_config.get("training_only", False):
train_only = True
to_remove = {
"phenotypes_for_association_testing",
"association_testing_data_thresholds",
"evaluation",
}
expected_input_keys = [
item for item in expected_input_keys if item not in to_remove
]
input_config.pop("training_only", None)
else:
train_only = False

# CV setup parameters
if not input_config["cv_options"]["cv_exp"]:
logger.info("Not CV setup...removing CV pipeline parameters from config")
full_config["cv_exp"] = False
else: # CV experiment setup specified
if input_config.get("cv_options", {}).get("cv_exp", False):
if any(
key not in input_config["cv_options"]
for key in ["cv_exp", "cv_path", "n_folds"]
@@ -99,14 +111,14 @@
full_config["cv_path"] = input_config["cv_options"]["cv_path"]
full_config["n_folds"] = input_config["cv_options"]["n_folds"]
full_config["cv_exp"] = True
else:
logger.info("Not CV setup...removing CV pipeline parameters from config")
full_config["cv_exp"] = False
expected_input_keys.remove("cv_options")
input_config.pop("cv_options", None)

# REGENIE setup parameters
if not input_config["regenie_options"]["regenie_exp"]:
logger.info(
"Not using REGENIE integration...removing REGENIE parameters from config"
)
full_config["regenie_exp"] = False
else: # REGENIE integration
if input_config.get("regenie_options", {}).get("regenie_exp", False):
if any(
key not in input_config["regenie_options"]
for key in ["regenie_exp", "step_1", "step_2"]
@@ -124,58 +136,68 @@
full_config["regenie_options"]["step_2"] = input_config["regenie_options"][
"step_2"
]
else:
logger.info(
"Not using REGENIE integration...removing REGENIE parameters from config"
)
full_config["regenie_exp"] = False
expected_input_keys.remove("regenie_options")
input_config.pop("regenie_options", None)

no_pretrain = True
if "use_pretrained_models" in input_config:
if input_config["use_pretrained_models"]:
no_pretrain = False
logger.info("Pretrained Model setup specified.")
to_remove = {"training", "phenotypes_for_training", "seed_gene_results"}
expected_input_keys = [
item for item in expected_input_keys if item not in to_remove
]
if input_config.get("use_pretrained_models", False):
no_pretrain = False
logger.info("Pretrained Model setup specified.")
to_remove = {"training", "phenotypes_for_training", "seed_gene_results"}
expected_input_keys = [
item for item in expected_input_keys if item not in to_remove
]

pretrained_model_path = Path(input_config["pretrained_model_path"])
pretrained_model_path = Path(input_config["pretrained_model_path"])

expected_input_keys.extend(
["use_pretrained_models", "model", "pretrained_model_path"]
)
expected_input_keys.extend(
["use_pretrained_models", "model", "pretrained_model_path"]
)

with open(f"{pretrained_model_path}/model_config.yaml") as f:
pretrained_config = yaml.safe_load(f)
with open(f"{pretrained_model_path}/model_config.yaml") as f:
pretrained_config = yaml.safe_load(f)

required_keys = [
"model",
"rare_variant_annotations",
"training_data_thresholds",
]
for k in pretrained_config:
if k not in required_keys:
raise KeyError(
(
f"Unexpected key in pretrained_model_path/model_config.yaml file : {k} "
"Please review DEEPRVAT_DIR/pretrained_models/model_config.yaml for expected list of keys."
)
required_keys = [
"model",
"rare_variant_annotations",
"training_data_thresholds",
]
for k in pretrained_config:
if k not in required_keys:
raise KeyError(
(
f"Unexpected key in pretrained_model_path/model_config.yaml file : {k} "
"Please review DEEPRVAT_DIR/pretrained_models/model_config.yaml for expected list of keys."
)
else:
input_config[k] = deepcopy(pretrained_config[k])
)
else:
input_config[k] = deepcopy(pretrained_config[k])

if no_pretrain and "phenotypes_for_training" not in input_config:
logger.info("Unspecified phenotype list for training.")
logger.info(
" Setting training phenotypes to be the same set as specified by phenotypes_for_association_testing."
)
input_config["phenotypes_for_training"] = input_config[
"phenotypes_for_association_testing"
]
if train_only:
raise KeyError(("Must specify phenotypes_for_training in config file!"))
else:
logger.info(
" Setting training phenotypes to be the same set as specified by phenotypes_for_association_testing."
)
input_config["phenotypes_for_training"] = input_config[
"phenotypes_for_association_testing"
]

if "y_transformation" in input_config:
full_config["training_data"]["dataset_config"]["y_transformation"] = (
input_config["y_transformation"]
)
full_config["association_testing_data"]["dataset_config"][
"y_transformation"
] = input_config["y_transformation"]
if not train_only:
full_config["association_testing_data"]["dataset_config"][
"y_transformation"
] = input_config["y_transformation"]
else:
expected_input_keys.remove("y_transformation")

Expand All @@ -186,7 +208,10 @@ def create_main_config(
"Please review DEEPRVAT_DIR/example/config/deeprvat_input_config.yaml for list of keys."
)
)
if "MAF" not in input_config["association_testing_data_thresholds"]:
if (
not train_only
and "MAF" not in input_config["association_testing_data_thresholds"]
):
raise KeyError(
(
"Missing required MAF threshold in config['association_testing_data_thresholds']. "
@@ -223,51 +248,59 @@
"Please review DEEPRVAT_DIR/example/config/deeprvat_input_config.yaml for list of keys."
)

# Phenotypes
full_config["phenotypes"] = input_config["phenotypes_for_association_testing"]
# genotypes.h5
full_config["training_data"]["gt_file"] = input_config["gt_filename"]
full_config["association_testing_data"]["gt_file"] = input_config["gt_filename"]
# variants.parquet
full_config["training_data"]["variant_file"] = input_config["variant_filename"]
full_config["association_testing_data"]["variant_file"] = input_config[
full_config["training_data"]["gt_file"] = input_config[
"gt_filename"
] # genotypes.h5
full_config["training_data"]["variant_file"] = input_config[
"variant_filename"
]
# phenotypes.parquet
] # variants.parquet
full_config["training_data"]["dataset_config"]["phenotype_file"] = input_config[
"phenotype_filename"
]
full_config["association_testing_data"]["dataset_config"]["phenotype_file"] = (
input_config["phenotype_filename"]
)
# annotations.parquet
] # phenotypes.parquet
full_config["training_data"]["dataset_config"]["annotation_file"] = input_config[
"annotation_filename"
]
full_config["association_testing_data"]["dataset_config"]["annotation_file"] = (
input_config["annotation_filename"]
)
# protein_coding_genes.parquet
] # annotations.parquet
full_config["association_testing_data"]["dataset_config"]["gene_file"] = (
input_config["gene_filename"]
)
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["gene_file"] = input_config["gene_filename"]
# rare_variant_annotations
) # protein_coding_genes.parquet
full_config["training_data"]["dataset_config"]["rare_embedding"]["config"][
"annotations"
] = input_config["rare_variant_annotations"]
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["annotations"] = input_config["rare_variant_annotations"]
# covariates
] = input_config[
"rare_variant_annotations"
] # rare_variant_annotations
full_config["training_data"]["dataset_config"]["x_phenotypes"] = input_config[
"covariates"
]
full_config["association_testing_data"]["dataset_config"]["x_phenotypes"] = (
input_config["covariates"]
)
] # covariates
if not train_only:
full_config["phenotypes"] = input_config[
"phenotypes_for_association_testing"
] # Phenotypes
full_config["association_testing_data"]["gt_file"] = input_config[
"gt_filename"
] # genotypes.h5
full_config["association_testing_data"]["variant_file"] = input_config[
"variant_filename"
] # variants.parquet
full_config["association_testing_data"]["dataset_config"]["phenotype_file"] = (
input_config["phenotype_filename"]
) # phenotypes.parquet
full_config["association_testing_data"]["dataset_config"]["annotation_file"] = (
input_config["annotation_filename"]
) # annotations.parquet
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["gene_file"] = input_config[
"gene_filename"
] # protein_coding_genes.parquet
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["annotations"] = input_config[
"rare_variant_annotations"
] # rare_variant_annotations
full_config["association_testing_data"]["dataset_config"]["x_phenotypes"] = (
input_config["covariates"] # covariates
)

# Thresholds & variant annotations
anno_list = deepcopy(input_config["rare_variant_annotations"])
full_config["training_data"]["dataset_config"]["rare_embedding"]["config"][
@@ -280,29 +313,29 @@
][k] = f"{k} {v}"
training_anno_list.insert(i + 1, k)
full_config["training_data"]["dataset_config"]["annotations"] = training_anno_list

full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["thresholds"] = {}
association_anno_list = deepcopy(anno_list)
for i, (k, v) in enumerate(
input_config["association_testing_data_thresholds"].items()
):
if not train_only:
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["thresholds"][k] = f"{k} {v}"
association_anno_list.insert(i + 1, k)
full_config["association_testing_data"]["dataset_config"][
"annotations"
] = association_anno_list
]["thresholds"] = {}
association_anno_list = deepcopy(anno_list)
for i, (k, v) in enumerate(
input_config["association_testing_data_thresholds"].items()
):
full_config["association_testing_data"]["dataset_config"]["rare_embedding"][
"config"
]["thresholds"][k] = f"{k} {v}"
association_anno_list.insert(i + 1, k)
full_config["association_testing_data"]["dataset_config"][
"annotations"
] = association_anno_list
# Results evaluation parameters; alpha parameter for significance threshold
if "evaluation" not in full_config:
full_config["evaluation"] = {}
full_config["evaluation"]["correction_method"] = input_config["evaluation"][
"correction_method"
]
full_config["evaluation"]["alpha"] = input_config["evaluation"]["alpha"]

# Results evaluation parameters; alpha parameter for significance threshold
if "evaluation" not in full_config:
full_config["evaluation"] = {}
full_config["evaluation"]["correction_method"] = input_config["evaluation"][
"correction_method"
]
full_config["evaluation"]["alpha"] = input_config["evaluation"]["alpha"]
# DeepRVAT model
full_config["n_repeats"] = input_config["n_repeats"]

@@ -585,9 +618,10 @@ def update_config(
else:
logger.info("Not performing EAC filtering of baseline results")
logger.info(f" Correcting p-values using {correction_method} method")
alpha = config["baseline_results"].get(
"alpha_seed_genes", config["evaluation"].get("alpha")
)
if config["baseline_results"].get("alpha_seed_genes", False):
alpha = config["baseline_results"]["alpha_seed_genes"]
else:
alpha = config["evaluation"].get("alpha")
baseline_df = pval_correction(
baseline_df, alpha, correction_type=correction_method
)
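For context on the pytests this commit introduces (the test files themselves are among the changed files not shown in this excerpt), here is a minimal, hypothetical sketch of how the train-only pruning added to `create_main_config` could be exercised. The helper `prune_for_train_only`, the key list, and the phenotype name are illustrative assumptions, not code from the repository:

```python
# Illustrative sketch only: mirrors the train-only key pruning shown in
# create_main_config above. prune_for_train_only and the key list are
# hypothetical stand-ins, not the repository's actual test code.


def prune_for_train_only(input_config, expected_keys):
    """Drop association-testing keys when the input config requests training only."""
    if input_config.get("training_only", False):
        to_remove = {
            "phenotypes_for_association_testing",
            "association_testing_data_thresholds",
            "evaluation",
        }
        expected_keys = [k for k in expected_keys if k not in to_remove]
        input_config.pop("training_only", None)
    return input_config, expected_keys


def test_train_only_drops_association_keys():
    expected_keys = [
        "phenotypes_for_association_testing",
        "phenotypes_for_training",
        "association_testing_data_thresholds",
        "training_data_thresholds",
        "evaluation",
        "regenie_options",
    ]
    cfg = {"training_only": True, "phenotypes_for_training": ["Some_Phenotype"]}
    cfg, keys = prune_for_train_only(cfg, expected_keys)
    assert "training_only" not in cfg
    assert "phenotypes_for_association_testing" not in keys
    assert "evaluation" not in keys
    assert "phenotypes_for_training" in keys
```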
10 changes: 2 additions & 8 deletions docs/input_data.md
@@ -7,18 +7,12 @@ Configuration for all pipelines is specified in the file `deeprvat_input_config.

In the following, we describe the parameters (both optional and required) that can be specified in the `deeprvat_input_config.yaml` by way of an [example file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config.yaml), which we explain block by block.

```
deeprvat_repo_dir: ../..
```

_Required._ This specifies the path to your copy of the DeepRVAT repository.

```
use_pretrained_models: True
pretrained_model_path : ../../pretrained_models
pretrained_model_path : pretrained_models
```

These parameters are relevant when using pretrained models. `use_pretrained_models` defaults to `False` if not specified.
These parameters are relevant when using pretrained models. `use_pretrained_models` defaults to `False` if not specified. If the `pretrained_models` directory does not sit inside your current experiment directory, set `pretrained_model_path` to its actual location.
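
A quick way to confirm the path is correct — a hedged sketch assuming your input config is named `deeprvat_input_config.yaml` in the current directory — is to check that `pretrained_model_path` points at a directory containing `model_config.yaml`, the file that config generation reads:

```python
# Hedged sketch: verify pretrained_model_path resolves to a directory that
# contains model_config.yaml. File names follow the example config; adjust
# them to your setup.
from pathlib import Path

import yaml

with open("deeprvat_input_config.yaml") as f:
    input_config = yaml.safe_load(f)

model_dir = Path(input_config.get("pretrained_model_path", "pretrained_models"))
model_config = model_dir / "model_config.yaml"
if not model_config.exists():
    raise FileNotFoundError(
        f"{model_config} not found; update pretrained_model_path in your config"
    )
print(f"Using pretrained models from {model_dir.resolve()}")
```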

```
phenotypes_for_association_testing:
6 changes: 4 additions & 2 deletions docs/quickstart.md
Expand Up @@ -52,10 +52,12 @@ snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pret
### Run the training pipeline on some example data

```shell
DEEPRVAT_REPO_PATH="[path_to_deeprvat]"
mkdir deeprvat_train
cd deeprvat_train
ln -s [path_to_deeprvat]/example/* .
snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile
ln -s "$DEEPRVAT_REPO_PATH"/example/* .
ln -s config/deeprvat_input_training_config.yaml . #get the corresponding config.
snakemake -j 1 --snakefile "$DEEPRVAT_REPO_PATH"/pipelines/run_training.snakefile
```


(remaining changed files not shown)
