Update to v0.15
mrueda committed Nov 30, 2023
1 parent e76d34b commit 3fac55d
Showing 16 changed files with 208 additions and 175 deletions.
8 changes: 6 additions & 2 deletions Changes
@@ -1,5 +1,10 @@
Revision history for Perl distribution Convert-Pheno

0.15 2023-xx-xxT00:00:00Z (Manuel Rueda <[email protected]>)

- Changed 'windows-latest' to 'windows-2019' in GHA build-and-test.yml
  - Refactored .pm to improve readability and maintainability

0.14 2023-11-21T00:00:00Z (Manuel Rueda <[email protected]>)

- Fixed error on pxf.{json,yaml} @pxf-tools-cli examples (v1.0.0-RC3)
@@ -60,5 +65,4 @@ Revision history for Perl distribution Convert-Pheno
- Added share/db/{omim,hpo}.db SQLite databases
- Modified source to accommodate such dbs

NOTE: All versions prior to 0.10 were used for testing deployment via CPAN.

Note: Versions < 0.10 were primarily for testing deployment on CPAN
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@
[![Coverage Status](https://coveralls.io/repos/github/CNAG-Biomedical-Informatics/convert-pheno/badge.svg?branch=main)](https://coveralls.io/github/CNAG-Biomedical-Informatics/convert-pheno?branch=main)
[![CPAN Publish](https://github.com/cnag-biomedical-informatics/convert-pheno/actions/workflows/cpan-publish.yml/badge.svg)](https://github.com/cnag-biomedical-informatics/convert-pheno/actions/workflows/cpan-publish.yml)
[![Kwalitee Score](https://cpants.cpanauthors.org/dist/Convert-Pheno.svg)](https://cpants.cpanauthors.org/dist/Convert-Pheno)
![version](https://img.shields.io/badge/version-0.14_beta-orange)
![version](https://img.shields.io/badge/version-0.15_beta-orange)
[![Docker Build](https://github.com/cnag-biomedical-informatics/convert-pheno/actions/workflows/docker-build.yml/badge.svg)](https://github.com/cnag-biomedical-informatics/convert-pheno/actions/workflows/docker-build.yml)
[![Docker Pulls](https://badgen.net/docker/pulls/manuelrueda/convert-pheno?icon=docker&label=pulls)](https://hub.docker.com/r/manuelrueda/convert-pheno/)
[![Docker Image Size](https://badgen.net/docker/size/manuelrueda/convert-pheno?icon=docker&label=image%20size)](https://hub.docker.com/r/manuelrueda/convert-pheno/)
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.14
0.15
2 changes: 1 addition & 1 deletion api/perl/openapi.json
@@ -3,7 +3,7 @@
"info": {
"title": "Convert-Pheno API",
"description": "Convert-Pheno API configuration",
"version": "0.12"
"version": "0.14"
},
"paths": {
"/api": {
189 changes: 98 additions & 91 deletions lib/Convert/Pheno.pm
@@ -3,17 +3,17 @@ package Convert::Pheno;
use strict;
use warnings;
use autodie;
use feature qw(say);
use feature qw(say);
use File::Spec::Functions qw(catdir catfile);
use Data::Dumper;
use Path::Tiny;
use File::Basename;
use File::ShareDir::ProjectDistDir;
use List::Util qw(any uniq);
use Carp qw(confess);
use Carp qw(confess);
use XML::Fast;
use Moo;
use Types::Standard qw(Str Int Num Enum ArrayRef Undef);
use Types::Standard qw(Str Int Num Enum ArrayRef Undef);
use File::ShareDir::ProjectDistDir qw(dist_dir);

#use Devel::Size qw(size total_size);
@@ -35,8 +35,12 @@ our @EXPORT =

use constant DEVEL_MODE => 0;

# Personalize warn and die functions
$SIG{__WARN__} = sub { warn "Warn: ", @_ };
$SIG{__DIE__} = sub { die "Error: ", @_ };
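# Illustrative sketch (not part of the module): with the handlers above
# installed, ordinary warn/die calls get the prefixes automatically:
#
#   warn "could not parse row";   # prints "Warn: could not parse row at ..."
#   die  "unsupported method";    # raises "Error: unsupported method at ..."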

# Global variables:
our $VERSION = '0.14';
our $VERSION = '0.15';
our $share_dir = dist_dir('Convert-Pheno');

############################################
@@ -74,10 +78,16 @@ has min_text_similarity_score => (
has username => (

#default => ( $ENV{LOGNAME} || $ENV{USER} || getpwuid($<) ) , # getpwuid not implemented in Windows
default => $ENV{'LOGNAME'} || $ENV{'USER'} || $ENV{'USERNAME'} || 'dummy-user',
is => 'ro',
coerce => sub {
$_[0] // ( $ENV{'LOGNAME'} || $ENV{'USER'} || $ENV{'USERNAME'} || 'dummy-user' );
default => $ENV{'LOGNAME'}
|| $ENV{'USER'}
|| $ENV{'USERNAME'}
|| 'dummy-user',
is => 'ro',
coerce => sub {
$_[0] // ( $ENV{'LOGNAME'}
|| $ENV{'USER'}
|| $ENV{'USERNAME'}
|| 'dummy-user' );
},
isa => Str
);
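# Usage sketch (hypothetical constructor arguments, not module API docs):
# the coercion lets an explicit undef fall back to the environment:
#
#   my $conv = Convert::Pheno->new( method => 'pxf2bff', username => undef );
#   # $conv->username now holds $ENV{LOGNAME} || $ENV{USER} || $ENV{USERNAME},
#   # or 'dummy-user' if none of those environment variables is set.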
@@ -89,15 +99,18 @@ has max_lines_sql => (
isa => Int
);

has omop_tables => (

# Table <CONCEPT> is always required
has 'omop_tables' => (
default => sub { [@omop_essential_tables] },
coerce => sub {
@{ $_[0] }
? $_[0] =
[ map { uc($_) } ( uniq( @{ $_[0] }, 'CONCEPT', 'PERSON' ) ) ]
my $tables = shift;

# If tables are provided, process them; otherwise, use default essential tables
$tables =
@$tables
? [ uniq( map { uc($_) } ( 'CONCEPT', 'PERSON', @$tables ) ) ]
: \@omop_essential_tables;

return $tables;
},
is => 'rw',
isa => ArrayRef
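# Sketch of the coercion's effect (illustrative values):
#
#   my $conv = Convert::Pheno->new( omop_tables => [ 'measurement', 'drug_exposure' ] );
#   # $conv->omop_tables is [ 'CONCEPT', 'PERSON', 'MEASUREMENT', 'DRUG_EXPOSURE' ]:
#   # names are upper-cased, de-duplicated with uniq, and the mandatory
#   # CONCEPT and PERSON tables are always included.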
@@ -227,58 +240,46 @@ sub omop2bff {
# IMPORTANT #
#############

# SMALL TO MEDIUM FILES < 1M rows
# File Size Considerations for Data Processing
#
# For SMALL TO MEDIUM FILES (< 1M rows):
# Commonly, database downsizing for data sharing results in PostgreSQL dumps or CSVs being less than 1 million rows.
# With adequate memory (4-16GB), we can efficiently load this data into RAM and effectively consolidate individual data points (e.g., MEASURES, DRUGS).
#
# In many cases, because people are downsizing their DBs for data sharing,
# PostgreSQL dumps or CSVs will be < 1M rows.
# Providing we have enough memory (4-16GB), we'll be able to load data in RAM,
# and consolidate individual values (MEASURES, DRUGS, etc.)

# HUMONGOUS FILES > 1M rows
# NB: Interesting read on the topic
# https://www.perlmonks.org/?node_id=1033692
# Since we're relying heavily on hashes we need to resort to other strategies to load the data
# For HUMONGOUS FILES (> 1M rows):
# As we heavily use hashes, larger files necessitate alternative data loading strategies:
#
# * Option A *: Parallel processing - No change in our code
# Without changing the code, we ask the user to create mini-instances (or split CSV's in chunks) and use
# some sort of parallel processing (e.g., GNU parallel, snakemake, HPC, etc.)
# CONS: Concurrent jobs may fail due to SQLite being opened by multiple threads
# * Option A: Parallel Processing (No code modification required)
# Users can split their data into smaller chunks or mini-instances, employing parallel processing tools (like GNU parallel, snakemake, HPC, etc.).
# Caveat: SQLite’s limitations with concurrent access by multiple threads.
#
# * Option B *: Keeping data consolidated at the individual-object level (as we do with small to medium files)
# * Option B: Data Consolidation at Individual Object Level
# --no-stream
# To do this, we have two options:
# a) Externalize (save to file) THE WHOLE HASH w/ DBM::Deep (but it's very slow)
# b) First dump CSV (me or users) and then use *nix to sort by person_id (or load SQLite and sort there).
# Then, since rows for each individual are adjacent, we can load individual data together. Still,
# we'll be reading one table (e.g. MEASUREMENTS) at a time, thus, this is not really helping much to consolidate...
# Two approaches for this:
# a) Externalize the complete hash using DBM::Deep (although it's significantly slower).
# b) Initially dump data as CSV (either by the user or automatically), then sort it (using *nix or SQLite) by 'person_id'.
# This method doesn't substantially help with data consolidation since we still process one table at a time.
#
# * Option C *: Parsing files line by line (one row of CSV/SQL per JSON object) <=========== IMPLEMENTED ==========
# * Option C: Line-by-Line File Parsing (One row of CSV/SQL per JSON object) <===== CURRENT IMPLEMENTATION
# --stream
# BFF / PXF JSONs are just intermediate files. It's nice that they contain data grouped by individual
# (for visually inspection and display), but at the end of the day they'll end up in Mongo DB.
# If all entries contain the primary key 'person_id' then it's up to the Beacon v2 API to deal with them.
# It's a similar issue to the one we had with genomicVariations in the B2RI, where a given variant belongs to many individuals.
# Here, multiple JSON documents/objects (MEASUREMENTS, DRUGS, etc.) will belong to the same individual.
# Now, since we allow for CSV and SQL as an input, we need to keep the number of steps to a minimum.
# Note: BFF / PXF JSON files serve as intermediate stages. They group data by individual for easier inspection but are ultimately stored in Mongo DB.
# Similar to the genomicVariations issue in B2RI, multiple JSON objects (like MEASUREMENTS, DRUGS) can correspond to a single individual.
#
# - Problems that may arise:
# 1 - <CONCEPT> table is mandatory, but it can be so huge that it takes all RAM memory.
# For instance, <CONCEPT.csv> with 5_808_095 lines = 735 MB
# <CONCEPT_light.csv> with 5_808_094 lines but only 4 columns = 501 MB
# Anything more than 2M lines kills an 8GB RAM machine.
# Solutions:
# a) Not loading the table at all and resort to --ohdsi-db
# b) Creating a temporary SQLite instance for <CONCEPT>
# 2 - How to read line-by-line from an SQL dump
# If the PostgreSQL dump weighs, say, 20GB, do we create CSV tables from it (another ~20GB)?
# Solutions:
# a) Yep, we read @stream_ram_memory_tables and export the needed tables to CSV and go from there.
# b) Nope, we read PostgreSQL file twice, one time to load @stream_ram_memory_tables
# and the second time to load the remaining TABLES. <=========== IMPLEMENTED ==========
# 3 - In --stream mode, do we still allow for --sql2csv? NOPE !!!! <=========== IMPLEMENTED ==========
# We would need to go from functional mode (csv) to filehandles and it will take tons of space.
# Then, --stream and -sql2csv are mutually exclusive.
# Potential Issues and Solutions:
# 1. Mandatory <CONCEPT> Table:
# It can be extremely large, potentially consuming all available RAM (e.g., a 735 MB <CONCEPT.csv> with over 5.8 million lines).
# Solutions:
# a) Avoid loading the <CONCEPT> table entirely, using --ohdsi-db instead.
# b) Use a temporary SQLite instance for the <CONCEPT> table.
# 2. Reading SQL Dumps Line-by-Line:
# For large SQL dumps (e.g., 20GB), should we convert them into CSV (also ~20GB)?
# Solutions:
# a) Yes, first export required tables to CSV and then proceed.
# b) No, read the PostgreSQL dump twice - first to load specified tables, then the rest.
# 3. Streaming Mode Restrictions:
# In --stream mode, --sql2csv is not allowed to prevent excessive space usage and complexity.
#
# Further reading on handling large files: https://www.perlmonks.org/?node_id=1033692
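# A minimal sketch of the line-by-line idea behind --stream (illustrative
# only; emit_json_object() and the variables are hypothetical, not module API):
#
#   use Text::CSV_XS;
#   my $csv = Text::CSV_XS->new( { binary => 1, sep_char => $sep } );
#   open my $fh, '<:encoding(utf8)', $in_file;
#   my $header = $csv->getline($fh);
#   while ( my $row = $csv->getline($fh) ) {
#       my %record;
#       @record{@$header} = @$row;        # one CSV row -> one hash
#       emit_json_object( \%record );     # -> one BFF/PXF JSON object
#   }
#   close $fh;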

# Load variables
my $data;
@@ -659,47 +660,53 @@ sub omop_stream_dispatcher {
my $filepaths = $arg->{filepaths};
my $omop_tables = $self->{prev_omop_tables};

# Open connection to SQLite databases ONCE
# Open a SQLite database connection if required
open_connections_SQLite($self) if $self->{method} ne 'bff2pxf';

# First we do transformations from AoH to HoH to speed up the calculation
my $person = { map { $_->{person_id} => $_ } @{ $self->{data}{PERSON} } };
# Transform Array of Hashes (AoH) to Hash of Hashes (HoH) for faster computation
my $person = transform_aoh_to_hoh($self);

# Give back memory to RAM
delete $self->{data}{PERSON};

# CSVs
if (@$filepaths) {
for (@$filepaths) {
say "Processing file ... <$_>" if $self->{verbose};
read_csv_stream(
{
in => $_,
sep => $self->{sep},
self => $self,
person => $person
}
);
}
}
# Process files based on the input type (CSV or PostgreSQL dump)
return @$filepaths
? process_csv_files( $self, $filepaths, $person )
: process_sqldump( $self, $filepath, $omop_tables, $person );
}

# PosgreSQL dump
else {
sub transform_aoh_to_hoh {

# Now iterate
for my $table ( @{$omop_tables} ) {
my $self = shift;
my $person = { map { $_->{person_id} => $_ } @{ $self->{data}{PERSON} } };
delete $self->{data}{PERSON}; # Free up memory
return $person;
}
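# Worked example (hypothetical data): the map above converts an Array of
# Hashes into a Hash of Hashes keyed by person_id, so each individual's
# record becomes an O(1) lookup instead of a linear scan:
#
#   [ { person_id => 1, sex => 'M' }, { person_id => 2, sex => 'F' } ]
#   # ...becomes...
#   { 1 => { person_id => 1, sex => 'M' },
#     2 => { person_id => 2, sex => 'F' } }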

# We already loaded @stream_ram_memory_tables;
next if any { $_ eq $table } @stream_ram_memory_tables;
say "Processing table ... <$table>" if $self->{verbose};
$self->{omop_tables} = [$table];
read_sqldump_stream(
{ in => $filepath, self => $self, person => $person } );
}
sub process_csv_files {

my ( $self, $filepaths, $person ) = @_;
for my $file (@$filepaths) {
say "Processing file ... <$file>" if $self->{verbose};
read_csv_stream(
{
in => $file,
sep => $self->{sep},
self => $self,
person => $person
}
);
}
return 1;
}

# Close connections ONCE
close_connections_SQLite($self) unless $self->{method} eq 'bff2pxf';
sub process_sqldump {

my ( $self, $filepath, $omop_tables, $person ) = @_;
for my $table (@$omop_tables) {
next if any { $_ eq $table } @stream_ram_memory_tables;
say "Processing table ... <$table>" if $self->{verbose};
$self->{omop_tables} = [$table];
read_sqldump_stream(
{ in => $filepath, self => $self, person => $person } );
}
return 1;
}
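# Note (illustrative, assuming @stream_ram_memory_tables holds names such as
# CONCEPT and PERSON): List::Util's any short-circuits on the first match,
# so the skip test above is cheap:
#
#   use List::Util qw(any);
#   my @ram_tables = qw(CONCEPT PERSON);                  # hypothetical contents
#   print "skip\n" if any { $_ eq 'PERSON' } @ram_tables; # prints "skip"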

20 changes: 11 additions & 9 deletions lib/Convert/Pheno/BFF.pm
100644 → 100755
@@ -29,10 +29,11 @@ sub do_bff2pxf {
# START MAPPING TO PHENOPACKET V2 TERMS #
#########################################

# We need to shuffle a bit some Beacon v2 properties to be Phenopacket compliant
# Order of terms (not alphabetical) taken from:
# - https://phenopacket-schema.readthedocs.io/en/latest/phenopacket.html
# We need to shuffle a bit some Beacon v2 properties to be Phenopacket compliant
# Order of terms (not alphabetical) taken from:
# - https://phenopacket-schema.readthedocs.io/en/latest/phenopacket.html

# Initiate PXF structure
my $pxf;

# ==
@@ -51,8 +52,9 @@
#alternateIds => [],
#_age => $bff->{info}{age}
#timeAtLastEncounter => {},
vitalStatus => { status => 'ALIVE' }, #["UNKNOWN_STATUS", "ALIVE", "DECEASED"]
sex => uc( $bff->{sex}{label} ),
vitalStatus => { status => 'ALIVE' }
, #["UNKNOWN_STATUS", "ALIVE", "DECEASED"]
sex => uc( $bff->{sex}{label} ),

#taxonomy => {} ;
#_age => $bff->{info}{age}
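# Illustrative mapping (hypothetical input): given a Beacon v2 individual
# with sex label 'female', the lines above yield Phenopacket-style terms:
#
#   my $bff = { id => 'P0001', sex => { id => 'NCIT:C16576', label => 'female' } };
#   # subject => { id => 'P0001', vitalStatus => { status => 'ALIVE' },
#   #              sex => 'FEMALE' }   # via uc( $bff->{sex}{label} )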
@@ -156,7 +158,7 @@ sub do_bff2pxf {
routeOfAdministration => $_->{routeOfAdministration},
doseIntervals => $_->{doseIntervals}

#performed => { timestamp => exists $_->{dateOfProcedure} ? $_->{dateOfProcedure} : undef}
#performed => { timestamp => exists $_->{dateOfProcedure} ? $_->{dateOfProcedure} : undef}
}
}
} @{ $bff->{treatments} };
@@ -183,9 +185,9 @@
# exposures
# =========

# Can't be mapped as Sept-2023 from pxf-tools
# Message type "org.phenopackets.schema.v2.Phenopacket" has no field named "exposures" at "Phenopacket".
# Available Fields(except extensions): "['id', 'subject', 'phenotypicFeatures', 'measurements', 'biosamples', 'interpretations', 'diseases', 'medicalActions', 'files', 'metaData']" at line 22
# Can't be mapped as Sept-2023 from pxf-tools
# Message type "org.phenopackets.schema.v2.Phenopacket" has no field named "exposures" at "Phenopacket".
# Available Fields(except extensions): "['id', 'subject', 'phenotypicFeatures', 'measurements', 'biosamples', 'interpretations', 'diseases', 'medicalActions', 'files', 'metaData']" at line 22

# $pxf->{exposures} =
#