Drop aggregation during report generation #249

Merged · 48 commits · Feb 29, 2024
6ca396d
drop small sampleset warning when integrating reference data
nebfield Feb 21, 2024
00ef345
drop aggregation
nebfield Feb 21, 2024
e352c6d
update lockfile: drop data.table
nebfield Feb 21, 2024
156eb7a
load scores the same way
nebfield Feb 21, 2024
413cd46
bump utils version
nebfield Feb 21, 2024
0d7cfcf
drop effect_type (shouldn't actually be in columns array)
nebfield Feb 21, 2024
be44734
use dev image
nebfield Feb 22, 2024
7dfc747
update score test
nebfield Feb 22, 2024
7abbb99
fix test
nebfield Feb 22, 2024
bc2bc0f
bump pgscatalog_utils version
nebfield Feb 22, 2024
4904df7
boring but big: re-sync nf-core
nebfield Feb 21, 2024
800b918
fix cache error in report
nebfield Feb 22, 2024
ecba806
nf-core resync
nebfield Feb 22, 2024
23b3710
set deno dir
nebfield Feb 22, 2024
9ac3954
update schema
nebfield Feb 22, 2024
6b209d1
set up auto-generated environment
nebfield Feb 22, 2024
bff64bb
set workDir consistently
nebfield Feb 22, 2024
d9dd9e9
improve caching
nebfield Feb 22, 2024
05ae2fb
fix help
nebfield Feb 22, 2024
eb6624c
set outdir default
nebfield Feb 22, 2024
af0e870
Merge pull request #250 from PGScatalog/resyncnf
nebfield Feb 22, 2024
dde2899
use binary packages to build report image
nebfield Feb 22, 2024
7d36cf4
fix missing dependencies
nebfield Feb 22, 2024
9b13478
fix fraposa_pca
nebfield Feb 23, 2024
c1af46e
add storeDir to relabel_ids
nebfield Feb 23, 2024
35dfda2
fix score metadata in report
nebfield Feb 23, 2024
35bf2e2
store results permanently
nebfield Feb 23, 2024
f1efe7c
write to genotypes_cache, if set
nebfield Feb 23, 2024
fdfd1cf
update conda
nebfield Feb 23, 2024
6786404
add custom example scorefile
nebfield Feb 23, 2024
26d4d82
Update schema to say what the cache is for
smlmbrt Feb 26, 2024
fd4ba44
Add information on cache and pipeline speed
smlmbrt Feb 26, 2024
b4c8ea1
Not only for -resume
smlmbrt Feb 26, 2024
a38dffd
fix storeDir
nebfield Feb 26, 2024
cc18006
fix tempdir on HPC/singularity
nebfield Feb 27, 2024
b119fb9
gcp compatibility updates
nebfield Feb 27, 2024
0af97f8
remove semantic structure from storeDirs (bad)
nebfield Feb 28, 2024
fbf9b84
Merge branch 'drop-report-aggregation' into fix-cloud
nebfield Feb 28, 2024
86d947f
Update cache.rst
smlmbrt Feb 28, 2024
279496c
Merge branch 'drop-report-aggregation' into docs_cache
nebfield Feb 28, 2024
f433dd1
Merge pull request #254 from PGScatalog/docs_cache
nebfield Feb 28, 2024
8628fa8
Merge pull request #256 from PGScatalog/fix-cloud
nebfield Feb 28, 2024
9f23ced
don't copy twice
nebfield Feb 28, 2024
1c77f90
bump report singularity version
nebfield Feb 28, 2024
3c918cc
fix quarto
nebfield Feb 28, 2024
fae1e03
fix output
nebfield Feb 29, 2024
acd124f
fix fraposa
nebfield Feb 29, 2024
c3b58ba
fix capturing intersect counts
nebfield Feb 29, 2024
291 changes: 113 additions & 178 deletions assets/report/renv.lock

Large diffs are not rendered by default.

33 changes: 2 additions & 31 deletions assets/report/report.qmd
@@ -29,7 +29,6 @@ library(jsonlite)
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
```

```{r setup_logs, echo=FALSE}
@@ -367,38 +366,14 @@ pop_summary %>%

# Scores

```{r, echo = FALSE, message = FALSE, eval=!params$run_ancestry}
# problem: aggregated_scores.txt.gz has a different structure to the ancestry outputs
# solution: pivot the table in the report, but fix in pgscatalog_utils in a future release
# problem: big scores can take a lot of memory
# use data.table as a temporary solution to pivoting datasets
scores <- data.table::fread(params$score_path)
n_scores <- sum(grepl("*_SUM$", colnames(scores)))
n_samples <- nrow(scores)

id_cols <- c("sampleset", "IID")
melted_scores <- data.table::melt(scores, id.vars=id_cols, measure.vars=patterns(SUM="*_SUM$", DENOM="DENOM", AVG="*_AVG$"), variable.name = "PGS")

# annoying fix for scores getting converted to a factor level integer
# fixed in data.table 1.14.9 but not released yet
score_names <- sub("_.*", "", colnames(scores)[grepl("_SUM$", colnames(scores))])
setattr(melted_scores$PGS, "levels", score_names)

# don't overwrite params$score_path until report has compiled
# OOM crashes can cause corrupted data
temp_out_path <- tempfile(fileext=".txt.gz")
data.table::fwrite(melted_scores, temp_out_path, sep="\t", compress="gzip")
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
```{r, echo = FALSE, message = FALSE}
scores <- readr::read_tsv(params$score_path)
n_scores <- length(unique(scores$PGS))
n_samples <- length(unique(scores$IID))
print(n_samples)
```


```{asis, echo = any(table(scores$sampleset) < 50)}
```{asis, echo = any(table(scores$sampleset) < 50) && !params$run_ancestry}

::: {.callout-important title="Warning: small sampleset size (n < 50) detected"}
* plink2 uses allele frequency data to [mean impute](https://www.cog-genomics.org/plink/2.0/score) the dosages of missing genotypes
@@ -535,7 +510,3 @@ tibble::tibble(json = json_scorefiles) %>%
))
```

```{r cleanup, eval=!params$run_ancestry, echo=FALSE}
# prevent the report crashing from corrupting the aggregated scores
invisible(file.copy(temp_out_path, params$score_path, overwrite=TRUE))
```
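The removed chunk above pivoted plink2-style wide aggregated scores (`PGS…_SUM` / `PGS…_AVG` columns plus a shared `DENOM`) into the long `sampleset, IID, PGS, SUM, DENOM, AVG` layout that the report now reads directly. A rough Python sketch of that reshape, on fabricated data — the column names mirror plink2's `--score` output, but the values and score IDs are illustrative only:

```python
import pandas as pd

# Hypothetical wide aggregated scores, as plink2 --score emits them:
# one *_SUM / *_AVG pair per PGS, plus a shared DENOM column.
wide = pd.DataFrame({
    "sampleset": ["test", "test"],
    "IID": ["s1", "s2"],
    "DENOM": [100, 100],
    "PGS000001_SUM": [1.5, 2.0],
    "PGS000001_AVG": [0.015, 0.02],
    "PGS000002_SUM": [3.0, 4.0],
    "PGS000002_AVG": [0.03, 0.04],
})

# Melt every score column into (PGS, statistic) rows...
long = wide.melt(
    id_vars=["sampleset", "IID", "DENOM"], var_name="col", value_name="value"
)
long[["PGS", "stat"]] = long["col"].str.rsplit("_", n=1, expand=True)

# ...then spread SUM/AVG back out, one row per (sample, PGS).
long = (
    long.pivot_table(
        index=["sampleset", "IID", "DENOM", "PGS"], columns="stat", values="value"
    )
    .reset_index()[["sampleset", "IID", "PGS", "SUM", "DENOM", "AVG"]]
)
```

With aggregation moved upstream, the report only ever sees the long layout, so this in-report pivot (and with it the data.table dependency) could be dropped.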
8 changes: 4 additions & 4 deletions conf/modules.config
@@ -36,10 +36,10 @@ process {
// container configuration
withLabel: pgscatalog_utils {
ext.conda = "$projectDir/environments/pgscatalog_utils/environment.yml"
ext.docker = 'ghcr.io/pgscatalog/pgscatalog_utils'
ext.singularity = 'oras://ghcr.io/pgscatalog/pgscatalog_utils'
ext.docker_version = ':v0.4.3'
ext.singularity_version = ':v0.4.3-singularity'
ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/pgscatalog_utils/pgscatalog_utils'
ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/pgscatalog_utils/singularity/pgscatalog_utils'
ext.docker_version = ':dev'
ext.singularity_version = ':dev'
}

withLabel: plink2 {
3 changes: 1 addition & 2 deletions tests/modules/combine/test.yml
@@ -12,9 +12,8 @@
- "effect_allele"
- "other_allele"
- "effect_weight"
- "effect_type"
- path: output/combine/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"


6 changes: 3 additions & 3 deletions tests/modules/download/test.yml
@@ -8,7 +8,7 @@
- path: output/download/PGS000001_hmPOS_GRCh37.txt.gz
- path: output/download/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"

- name: pgscatalog test --efo_trait --pgp_id and --pgs_id
command: nextflow run ./tests/modules/download -entry testmultipleaccessions -c ./tests/config/nextflow.config
@@ -24,7 +24,7 @@
- path: output/download/PGS002054_hmPOS_GRCh37.txt.gz
- path: output/download/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"

- name: pgscatalog test bad accession
command: nextflow run ./tests/modules/download -entry testbadaccession -c ./tests/config/nextflow.config
@@ -44,4 +44,4 @@
- path: output/download/PGS000001_hmPOS_GRCh38.txt.gz
- path: output/download/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"
5 changes: 2 additions & 3 deletions tests/modules/match/test.yml
@@ -8,7 +8,7 @@
files:
- path: output/test/match/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"

- name: test match combine module
command: nextflow run ./tests/modules/match -entry testmatchcombine -c ./tests/config/nextflow.config
@@ -20,7 +20,7 @@
files:
- path: output/combine/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"
- path: output/combine/scorefiles.txt.gz
contains:
- "effect_allele"
@@ -32,4 +32,3 @@
- "effect_allele"
- "other_allele"
- "effect_weight"
- "effect_type"
57 changes: 41 additions & 16 deletions tests/subworkflows/test_apply_score.py
@@ -1,35 +1,54 @@
import pytest
import pathlib
import numpy as np
import pandas as pd
import glob
import os
import itertools
import gzip
import re

@pytest.mark.workflow('test apply score subworkflow')

@pytest.mark.workflow("test apply score subworkflow")
def test_aggregated_scores(workflow_dir):
''' Make sure aggregated scores are floats with no missing values '''
"""Make sure aggregated scores are floats with no missing values"""

score_dir = pathlib.Path(workflow_dir, "output/score/")
agg_scores = glob.glob(os.path.join(score_dir, "*.txt.gz"))[0]

df = pd.read_csv(agg_scores, sep = '\t')
df = pd.read_csv(agg_scores, sep="\t")

assert not df.isnull().any().any(), "Missing values in aggregated scores"

assert not df.isnull().any().any(), 'Missing values in aggregated scores'
cols = ["sampleset", "IID", "PGS", "SUM", "DENOM", "AVG"]
assert cols == list(df.columns), "Missing columns"
assert (
len(
set(df.select_dtypes(include=["int64", "float64"]).columns).difference(
set(
[
"SUM",
"AVG",
"DENOM",
]
)
)
)
== 0
)

numeric_cols = df.select_dtypes(include = ['int64', 'float64'])
weight_cols = df.drop(['sampleset', 'IID'], axis = 1)
assert weight_cols.equals(numeric_cols), "Weight columns aren't numeric"

@pytest.mark.workflow('test apply score subworkflow')
@pytest.mark.workflow("test apply score subworkflow")
def test_processed_variants(workflow_dir):
''' Make sure n_lines in scorefile == --score XXX variants processed in log '''
"""Make sure n_lines in scorefile == --score XXX variants processed in log"""
# find directories with scoring file variants in them
scoring_variants = [pathlib.Path(x) for x in glob.glob("work/**/**/*.sscore.vars", root_dir=workflow_dir)]
scoring_variants = [
pathlib.Path(x)
for x in glob.glob("work/**/**/*.sscore.vars", root_dir=workflow_dir)
]
not_symlinks = [not x.is_symlink() for x in scoring_variants]
real_files: list[pathlib.Path] = [i for (i, v) in zip(scoring_variants, not_symlinks) if v]
real_files: list[pathlib.Path] = [
i for (i, v) in zip(scoring_variants, not_symlinks) if v
]
work_dirs: list[pathlib.Path] = [x.parents[0] for x in real_files]

for work_dir in work_dirs:
@@ -39,12 +58,18 @@ def test_processed_variants(workflow_dir):
log: list[str] = f.read().split("\n")

# grab line from log: '--score: n variants processed.'
processed_line: list[str] = list(itertools.compress(log, ["variants processed." in x for x in log]))[0]
processed_variants: int = int(re.findall(r'\d+', processed_line)[0])
processed_line: list[str] = list(
itertools.compress(log, ["variants processed." in x for x in log])
)[0]
processed_variants: int = int(re.findall(r"\d+", processed_line)[0])

scorefile_path = glob.glob("*.scorefile.gz", root_dir=workflow_dir / work_dir)[0]
scorefile_path = glob.glob("*.scorefile.gz", root_dir=workflow_dir / work_dir)[
0
]
with gzip.open(workflow_dir / work_dir / scorefile_path) as f:
num_scorefile_lines = sum(1 for _ in f)

# (-1 for header line)
assert num_scorefile_lines - 1 == processed_variants, "plink log variants processed doesn't match scorefile n variants"
assert (
num_scorefile_lines - 1 == processed_variants
), "plink log variants processed doesn't match scorefile n variants"
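The `test_processed_variants` check above boils down to: the `--score: N variants processed.` line in the plink2 log must match the number of data lines in the scoring file, header excluded. A self-contained sketch of that logic — the log text and scoring file contents here are fabricated stand-ins for the real workflow outputs:

```python
import gzip
import os
import re
import tempfile

# Fabricated plink2 log fragment containing the line the test greps for.
log_text = "\n".join(
    [
        "PLINK v2.00a3",
        "--score: 3 variants processed.",
        "End time: ...",
    ]
)

with tempfile.TemporaryDirectory() as d:
    # Fabricated gzipped scoring file: one header line + three variants.
    scorefile = os.path.join(d, "toy.scorefile.gz")
    with gzip.open(scorefile, "wt") as f:
        f.write("ID\teffect_allele\teffect_weight\n")
        f.write("rs1\tA\t0.1\nrs2\tC\t0.2\nrs3\tG\t0.3\n")

    # Grab the 'variants processed' line and pull out the count.
    log = log_text.split("\n")
    processed_line = next(line for line in log if "variants processed." in line)
    processed_variants = int(re.findall(r"\d+", processed_line)[0])

    # Count lines in the scoring file (-1 for the header).
    with gzip.open(scorefile, "rt") as f:
        num_lines = sum(1 for _ in f)

    assert num_lines - 1 == processed_variants
```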
4 changes: 2 additions & 2 deletions tests/subworkflows/test_liftover_run.yml
@@ -8,7 +8,7 @@
- path: output/combine/scorefiles.txt.gz
- path: output/combine/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"

- name: test input check subworkflow with liftover 37to38
command: nextflow run main.nf --only_input --pgs_id PGS001229 --liftover --target_build GRCh38 -c ./tests/config/nextflow.config --hg19_chain https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz --hg38_chain https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
@@ -20,4 +20,4 @@
- path: output/combine/scorefiles.txt.gz
- path: output/combine/versions.yml
contains:
- "pgscatalog_utils: 0.4.2"
- "pgscatalog_utils: 0.5.2"