From d924e506ba888c5d1ba680a92375a0066012c001 Mon Sep 17 00:00:00 2001
From: Stefano Giorgetti <sgiorgetti@ebi.ac.uk>
Date: Fri, 26 Apr 2024 17:38:49 +0000
Subject: [PATCH 01/37] Changed GENCODE Basic tag to 'gencode_basic' as per
 ENSINT-1885

---
 .../Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm    | 2 +-
 modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm
index 269f2b8a4..1024dc13c 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm
@@ -263,7 +263,7 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash {
   $summary{'transcript_support_level'} = $self->tsl if $self->tsl;
 
   my @tags;
-  push(@tags, 'basic') if $self->gencode_basic();
+  push(@tags, 'gencode_basic') if $self->gencode_basic();
   push(@tags, 'gencode_primary') if $self->gencode_primary();
   push(@tags, 'Ensembl_canonical') if $self->is_canonical();
   
diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm
index 537675025..9efa955a6 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm
@@ -279,7 +279,7 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash {
   $summary{'transcript_support_level'} = $self->tsl if $self->tsl;
 
   my @tags;
-  push(@tags, 'basic') if $self->gencode_basic();
+  push(@tags, 'gencode_basic') if $self->gencode_basic();
   push(@tags, 'gencode_primary') if $self->gencode_primary();
   push(@tags, 'Ensembl_canonical') if $self->is_canonical();
 

From a0b629a2a37dec03ae34f9e8f64b973901b21c98 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-03.ebi.ac.uk>
Date: Mon, 29 Apr 2024 09:49:11 +0100
Subject: [PATCH 02/37] Minor fixes

---
 scripts/xrefs/cleanup_and_split_source.pl     | 11 +++++--
 src/python/ensembl/xrefs/Base.py              |  6 ++--
 src/python/ensembl/xrefs/EmailNotification.py | 29 +++++++++----------
 3 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index e538503c2..e5ddf5560 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -101,12 +101,12 @@
 
 # Extract taxonomy IDs
 my %tax_ids;
-my $skipped_species = 0;
+my ($skipped_species, $added_species) = (0, 0);
 if ($tax_ids_file) {
   open my $fh, '<', $tax_ids_file;
   chomp(my @lines = <$fh>);
   close $fh;
-  my %tax_ids = map { $_ => 1 } @lines;
+  %tax_ids = map { $_ => 1 } @lines;
 
   # Check if any taxonomy IDs already have files
   foreach my $tax_id (keys(%tax_ids)) {
@@ -216,6 +216,12 @@
         make_path($write_path);
 
         $write_file = $write_path."/".$output_file_name."-".$species_id;
+
+        # Check if creating new file
+        if (!-e $write_file) {
+          $added_species++;
+        }
+
         open($out_fh, '>>', $write_file) or die "Couldn't open output file '$write_file' $!";
 
         $current_species_id = $species_id;
@@ -231,6 +237,7 @@
 
 add_to_log_file($log_file, "Source $source_name cleaned up");
 add_to_log_file($log_file, "$source_name skipped species = $skipped_species");
+add_to_log_file($log_file, "$source_name species files created = $added_species")
 
 # Save the clean files directory in source db
 my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url);
diff --git a/src/python/ensembl/xrefs/Base.py b/src/python/ensembl/xrefs/Base.py
index 32b801e93..d5022627f 100644
--- a/src/python/ensembl/xrefs/Base.py
+++ b/src/python/ensembl/xrefs/Base.py
@@ -182,8 +182,6 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args:
         if db and db == 'checksum':
           file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}')
 
-        logging.info(f'I am here inside local ftp with {orig_source_name}')
-
         if not (skip_download_if_file_present and os.path.exists(file_path)):
           shutil.copy(local_file, file_path)
 
@@ -838,11 +836,11 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release:
     pep_path = self.get_path(base_path, species, release, 'ensembl', 'peptides.fa');
 
     # Try to find a species-specific mapper first
-    module_name = f'ensembl.xrefs.mapper.{species}'
+    module_name = f'ensembl.xrefs.mappers.{species}'
     class_name = species
     found = importlib.find_loader(module_name)
     if not found:
-      module_name = 'ensembl.xrefs.mapper.BasicMapper'
+      module_name = 'ensembl.xrefs.mappers.BasicMapper'
       class_name = 'BasicMapper'
 
     # Create a mapper object
diff --git a/src/python/ensembl/xrefs/EmailNotification.py b/src/python/ensembl/xrefs/EmailNotification.py
index dae71e738..75a23012d 100644
--- a/src/python/ensembl/xrefs/EmailNotification.py
+++ b/src/python/ensembl/xrefs/EmailNotification.py
@@ -37,10 +37,7 @@ def run(self):
       if os.path.exists(log_path):
         log_files = os.listdir(log_path)
 
-        parameters = {}
-        sources = {}
-        added_species = {}
-        skipped_species = {}
+        parameters, sources, added_species, skipped_species = {}, {}, {}, {}
 
         main_log_file = os.path.join(base_path, 'logs', log_timestamp, 'logfile_'+log_timestamp)
 
@@ -92,11 +89,11 @@ def run(self):
           sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", data)
           for source in sources_list: sources[source[0]].update({'copied' : os.path.dirname(source[1])})
 
-          # skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data)
-          # skipped_species = {source[0]: source[1] for source in skipped_species_list}
+          skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data)
+          skipped_species = {source[0]: source[1] for source in skipped_species_list}
 
-          # added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) taxonomy IDs added to filter = (\d+)", data)
-          # added_species = {division[0]: division[1] for division in added_species_list}
+          added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) species files created = (\d+)", data)
+          added_species = {source[0]: source[1] for source in added_species_list}
 
           # Include source statistics
           email_message += '<br>--Source Statistics--<br>'
@@ -106,7 +103,7 @@ def run(self):
 
             if source_values.get('downloaded'):
               (download_type, file_path) = source_values['downloaded'].split("|")
-              email_message += f' File downloaded via {download_type} into {file_path}<br>'
+              email_message += f'&nbsp;&nbsp;&nbsp;File downloaded via {download_type} into {file_path}<br>'
             elif source_values.get('copied'): email_message += '&nbsp;&nbsp;&nbsp;File(s) copied from local FTP into %s<br>' % (source_values['copied'])
             elif source_values.get('skipped'): email_message += '&nbsp;&nbsp;&nbsp;File(s) download skipped, already exists in %s<br>' % (source_values['skipped'])
 
@@ -117,13 +114,13 @@ def run(self):
             if source_values.get('preparsed'): email_message += '&nbsp;&nbsp;&nbsp;Pre-parsed &#10004;<br>'
 
           # Include species statistics
-          # email_message += '<br>--Species Statistics--<br>'
-          # for division,count in added_species.items():
-          #   if division == 'Total': continue
-          #   email_message += f'{species_type} taxonomy IDs = {count}<br>'
-          # email_message += 'Skipped Species per source file:<br>'
-          # for source_name,count in skipped_species.items():
-          #   email_message += f'&nbsp;&nbsp;&nbsp;{source_name}: {count}<br>'
+          email_message += '<br>--Species Statistics--<br>'
+          email_message += 'Skipped Species (files already exist):<br>'
+          for source_name, count in skipped_species.items():
+            email_message += f'&nbsp;&nbsp;&nbsp;{source_name}: {count}<br>'
+          email_message += 'Added Species (files created):<br>'
+          for source_name, count in added_species.items():
+            email_message += f'&nbsp;&nbsp;&nbsp;{source_name}: {count}<br>'
 
           email_message += '<br>To run the Xref Process Pipeline based on the data from this pipeline, use the same <b>--base_path</b>, <b>--source_db_url</b>, and <b>--central_db_url</b> (if preparse was run) values provided to this pipeline.'
 

From ec58c92e3f0d0278e6ff50540e7c775025e7a43f Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-03.ebi.ac.uk>
Date: Mon, 29 Apr 2024 10:08:06 +0100
Subject: [PATCH 03/37] Missing semicolon

---
 scripts/xrefs/cleanup_and_split_source.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index e5ddf5560..c08617751 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -237,7 +237,7 @@
 
 add_to_log_file($log_file, "Source $source_name cleaned up");
 add_to_log_file($log_file, "$source_name skipped species = $skipped_species");
-add_to_log_file($log_file, "$source_name species files created = $added_species")
+add_to_log_file($log_file, "$source_name species files created = $added_species");
 
 # Save the clean files directory in source db
 my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url);

From 6b97c441a1de4e376ddb317700ae286473cae293 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-03.ebi.ac.uk>
Date: Mon, 29 Apr 2024 11:11:24 +0100
Subject: [PATCH 04/37] Change glob parameters

---
 .../Production/Pipeline/Xrefs/ScheduleSource.pm     |  2 +-
 scripts/xrefs/cleanup_and_split_source.pl           | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
index eeda0dc8c..b80ebd421 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
@@ -134,7 +134,7 @@ sub run {
       # For Uniprot and Refseq, files might have been split by species
       if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) {
         my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein')));
-        @list_files = glob($file_name . "/**/" . $file_prefix . "-" . $species_id);
+        @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
         $_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files);
       }
 
diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index c08617751..5687a5109 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -110,12 +110,15 @@
 
   # Check if any taxonomy IDs already have files
   foreach my $tax_id (keys(%tax_ids)) {
-    my @tax_files = glob($output_path . "/**/" . $output_file_name . "-" . $tax_id);
+    print Dumper $tax_id;
+    my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id);
+    print Dumper @tax_files;
     if (scalar(@tax_files) > 0) {
       $tax_ids{$tax_id} = 0;
       $skipped_species++;
     }
   }
+  die;
 
   # Do nothing if all taxonomy IDs already have files
   if ($skipped_species == scalar(keys(%tax_ids))) {
@@ -211,8 +214,10 @@
       if (!defined($current_species_id) || (defined($current_species_id) && $species_id ne $current_species_id)) {
         close($out_fh) if (defined($current_species_id));
 
-        my @digits = split('', $species_id);
-        $write_path = catdir($output_path, $digits[0], (scalar(@digits)>1 ? $digits[1] : ""), (scalar(@digits)>2 ? $digits[2] : ""), (scalar(@digits)>3 ? $digits[3] : ""));
+	my $species_id_str = sprintf("%04d", $species_id);
+	my @digits = split('', $species_id_str);
+
+	$write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]);
         make_path($write_path);
 
         $write_file = $write_path."/".$output_file_name."-".$species_id;
@@ -231,7 +236,7 @@
     }
 
     close($in_fh);
-    close($out_fh);
+    close($out_fh) if $out_fh;
   }
 }
 

From 5d99a70491d18a9525313c3e642620f86263d8a3 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-03.ebi.ac.uk>
Date: Mon, 29 Apr 2024 11:15:02 +0100
Subject: [PATCH 05/37] Remove debugging

---
 scripts/xrefs/cleanup_and_split_source.pl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index 5687a5109..f1ea49f9a 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -110,15 +110,12 @@
 
   # Check if any taxonomy IDs already have files
   foreach my $tax_id (keys(%tax_ids)) {
-    print Dumper $tax_id;
     my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id);
-    print Dumper @tax_files;
     if (scalar(@tax_files) > 0) {
       $tax_ids{$tax_id} = 0;
       $skipped_species++;
     }
   }
-  die;
 
   # Do nothing if all taxonomy IDs already have files
   if ($skipped_species == scalar(keys(%tax_ids))) {

From 577c6bafb40da0de50d5d83b40842745e52cd992 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-03.ebi.ac.uk>
Date: Mon, 29 Apr 2024 11:27:43 +0100
Subject: [PATCH 06/37] Fixes to prevent warnings

---
 scripts/xrefs/cleanup_and_split_source.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index f1ea49f9a..6a09971ea 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -161,12 +161,13 @@
       my $species_id;
       if ($is_uniprot) {
         ($species_id) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
-        $species_id =~ s/\s//;
+        $species_id =~ s/\s// if $species_id;
       } else {
         ($species_id) = $record =~ /db_xref=.taxon:(\d+)/;
       }
 
       # Only continue with wanted species
+      next if (!$species_id);
       next if ($tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}));
 
       # Clean up data

From a29d373cb43887427f1c040d5c41070e96db3e9e Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@hl-codon-45-01.ebi.ac.uk>
Date: Tue, 30 Apr 2024 09:55:24 +0100
Subject: [PATCH 07/37] Fix file paths

---
 .../Production/Pipeline/Xrefs/ScheduleSource.pm       | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
index b80ebd421..34da3141e 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
@@ -127,21 +127,24 @@ sub run {
     } else {
       # Create list of files
       opendir(my $dir_handle, $file_name);
-      my @list_files = readdir($dir_handle);
+      my @temp_list_files = readdir($dir_handle);
       closedir($dir_handle);
+
+      my @list_files;
+      foreach my $file (@temp_list_files) {
+        next if ($file =~ /^\./);
+        push(@list_files, $file_name . "/" . $file);
+      }
       if ($preparse) { @list_files = $preparse; }
 
       # For Uniprot and Refseq, files might have been split by species
       if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) {
         my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein')));
         @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
-        $_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files);
       }
 
       foreach my $file (@list_files) {
-        next if ($file =~ /^\./);
         $file =~ s/\n//;
-        $file = $file_name . "/" . $file;
         if (defined $release_file and $file eq $release_file) { next; }
   
         $dataflow_params = {

From 7fb4c0488dcb8b5d921a18b8db0dd937dd083835 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-01.ebi.ac.uk>
Date: Thu, 2 May 2024 09:11:45 +0100
Subject: [PATCH 08/37] Keep original files if no species file

---
 .../Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
index 34da3141e..17a04a762 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
@@ -140,7 +140,10 @@ sub run {
       # For Uniprot and Refseq, files might have been split by species
       if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) {
         my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein')));
-        @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
+        my @species_list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
+        if (scalar(@species_list_files) > 0) {
+          @list_files = @species_list_files;
+        }
       }
 
       foreach my $file (@list_files) {

From 74296b6af9b528c5d7eeb549cb4ce950892b5202 Mon Sep 17 00:00:00 2001
From: jmgonzmart <jmgonzalez@ebi.ac.uk>
Date: Fri, 10 May 2024 16:31:16 +0100
Subject: [PATCH 09/37] Updated HGNC custom download URL

---
 .../Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json  | 2 +-
 .../Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json | 2 +-
 modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json
index aebb77102..b0910be58 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json
@@ -203,7 +203,7 @@
     {
       "name" : "HGNC",
       "parser" : "HGNCParser",
-      "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
+      "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
       "db" : "ccds",
       "priority" : 3
     }
diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
index 0f24ec9a8..7fa14c977 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
@@ -241,7 +241,7 @@
     {
       "name" : "HGNC",
       "parser" : "HGNCParser",
-      "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
+      "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
       "db" : "ccds",
       "priority" : 3
     }
diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
index 9bcbf7936..161a41186 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
@@ -269,7 +269,7 @@
     {
       "name" : "HGNC",
       "parser" : "HGNCParser",
-      "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
+      "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
       "db" : "ccds",
       "priority" : 3
     }

From 0c7c41fe7656659400e48200bc2cd47c7df0a5fc Mon Sep 17 00:00:00 2001
From: vinay-ebi <vinay@ebi.ac.uk>
Date: Wed, 15 May 2024 20:58:04 +0100
Subject: [PATCH 10/37] moved ensembl/xrefs to ensembl/production/xrefs

---
 nextflow/config/xref.config                   | 65 +++++++++++++++----
 nextflow/workflows/xrefDownload.nf            | 10 +--
 .../ensembl/{ => production}/xrefs/Base.py    |  0
 .../{ => production}/xrefs/Checksum.py        |  0
 .../{ => production}/xrefs/DownloadSource.py  |  0
 .../xrefs/EmailNotification.py                |  0
 .../{ => production}/xrefs/ScheduleCleanup.py |  0
 .../xrefs/ScheduleDownload.py                 |  0
 .../xrefs/config/xref_all_sources.json        |  0
 .../xrefs/config/xref_config.ini              |  0
 10 files changed, 57 insertions(+), 18 deletions(-)
 rename src/python/ensembl/{ => production}/xrefs/Base.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/Checksum.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/DownloadSource.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/EmailNotification.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/ScheduleCleanup.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/ScheduleDownload.py (100%)
 rename src/python/ensembl/{ => production}/xrefs/config/xref_all_sources.json (100%)
 rename src/python/ensembl/{ => production}/xrefs/config/xref_config.ini (100%)

diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config
index bc36b8617..66bebf05d 100644
--- a/nextflow/config/xref.config
+++ b/nextflow/config/xref.config
@@ -22,23 +22,62 @@ params.base_path = ''
 params.clean_files = 1
 params.clean_dir = "${params.base_path}/clean_files"
 
-executor {
-  name = 'slurm'
-  queue = 'production'
-  queueSize = 100
+trace {
+    enabled = true
+    file = "trace"
+    overwrite = true
 }
 
-process {
-  errorStrategy = { task.attempt <= process.maxRetries  ? 'retry' : 'finish' }
-  maxRetries = 3
-  time = '1d'
+report {
+    overwrite = true
+    file = "report.html"
+    enable = true
+}
+
+profiles {
 
-  withLabel:small_process {
-    memory = 200.MB
-    executor.perTaskReserve = 200.MB
+  lsf {
+   process {
+    errorStrategy = { task.attempt <= process.maxRetries  ? 'retry' : 'finish' }
+    executor = 'lsf'
+    queue = 'production'
+    queueSize = 100
+    maxRetries = 3
+     withLabel:small_process {
+        memory = 200.MB
+        //very specific to lsf
+        executor.perTaskReserve = 200.MB
+    }
+    withLabel: dm {
+        queue = 'datamover'
+        time = '2h'
+    }
   }
+ }
+
+  slurm {
+   process {
+    errorStrategy = { task.attempt <= process.maxRetries  ? 'retry' : 'finish' }
+    executor = 'slurm'
+    queue = 'production'
+    queueSize = 100
+    maxRetries = 3
+    time = '1d'
 
-  withLabel:mem4GB {
-    time = '3d'
+    withLabel:small_process {
+        memory = 200.MB
+    }
+
+    withLabel: dm {
+        queue = 'datamover'
+        time = '2h'
+        memory = 2.GB
+    }
+    withLabel:mem4GB {
+        time = '3d'
+    }
   }
+ }
 }
+
+
diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf
index 5c808038e..51cc53552 100644
--- a/nextflow/workflows/xrefDownload.nf
+++ b/nextflow/workflows/xrefDownload.nf
@@ -107,7 +107,7 @@ process ScheduleDownload {
   timestamp = new java.util.Date().format("yyyyMMdd_HHmmss")
 
   """
-  python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp
+  python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp
   """
 }
 
@@ -126,7 +126,7 @@ process DownloadSource {
   src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1]
 
   """
-  python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download}
+  python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download}
   """
 }
 
@@ -156,7 +156,7 @@ process ScheduleCleanup {
   path 'dataflow_cleanup_sources.json'
 
   """
-  python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp
+  python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp
   """
 }
 
@@ -171,7 +171,7 @@ process Checksum {
   val 'ChecksumDone'
 
   """
-  python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp
+  python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp
   """
 }
 
@@ -234,6 +234,6 @@ process NotifyByEmail {
   val timestamp
 
   """
-  python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp
+  python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp
   """
 }
diff --git a/src/python/ensembl/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py
similarity index 100%
rename from src/python/ensembl/xrefs/Base.py
rename to src/python/ensembl/production/xrefs/Base.py
diff --git a/src/python/ensembl/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py
similarity index 100%
rename from src/python/ensembl/xrefs/Checksum.py
rename to src/python/ensembl/production/xrefs/Checksum.py
diff --git a/src/python/ensembl/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py
similarity index 100%
rename from src/python/ensembl/xrefs/DownloadSource.py
rename to src/python/ensembl/production/xrefs/DownloadSource.py
diff --git a/src/python/ensembl/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py
similarity index 100%
rename from src/python/ensembl/xrefs/EmailNotification.py
rename to src/python/ensembl/production/xrefs/EmailNotification.py
diff --git a/src/python/ensembl/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py
similarity index 100%
rename from src/python/ensembl/xrefs/ScheduleCleanup.py
rename to src/python/ensembl/production/xrefs/ScheduleCleanup.py
diff --git a/src/python/ensembl/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py
similarity index 100%
rename from src/python/ensembl/xrefs/ScheduleDownload.py
rename to src/python/ensembl/production/xrefs/ScheduleDownload.py
diff --git a/src/python/ensembl/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json
similarity index 100%
rename from src/python/ensembl/xrefs/config/xref_all_sources.json
rename to src/python/ensembl/production/xrefs/config/xref_all_sources.json
diff --git a/src/python/ensembl/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini
similarity index 100%
rename from src/python/ensembl/xrefs/config/xref_config.ini
rename to src/python/ensembl/production/xrefs/config/xref_config.ini

From 039335714034a705b4708a6e7f9f42c66c299fa0 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Wed, 15 May 2024 21:38:07 +0100
Subject: [PATCH 11/37] Update xref.config

Add 4 GB memory to the mem4GB label in the slurm profile
---
 nextflow/config/xref.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config
index 66bebf05d..3737c5aff 100644
--- a/nextflow/config/xref.config
+++ b/nextflow/config/xref.config
@@ -75,6 +75,7 @@ profiles {
     }
     withLabel:mem4GB {
         time = '3d'
+        memory = 4.GB
     }
   }
  }

From 4391762fb75228fed7626bd0c695037124450cfc Mon Sep 17 00:00:00 2001
From: vinay-ebi <vinay@ebi.ac.uk>
Date: Thu, 16 May 2024 10:00:31 +0100
Subject: [PATCH 12/37] base load changed to ensembl.production.xrefs

---
 src/python/ensembl/production/xrefs/Checksum.py          | 2 +-
 src/python/ensembl/production/xrefs/DownloadSource.py    | 2 +-
 src/python/ensembl/production/xrefs/EmailNotification.py | 2 +-
 src/python/ensembl/production/xrefs/ScheduleCleanup.py   | 2 +-
 src/python/ensembl/production/xrefs/ScheduleDownload.py  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py
index dc59d5c3f..7ccb401a7 100644
--- a/src/python/ensembl/production/xrefs/Checksum.py
+++ b/src/python/ensembl/production/xrefs/Checksum.py
@@ -14,7 +14,7 @@
 
 """Checksum module for the Xref Download pipeline."""
 
-from ensembl.xrefs.Base import *
+from ensembl.production.xrefs.Base import *
 
 class Checksum(Base):
   def run(self):
diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py
index b88088960..060fcb116 100644
--- a/src/python/ensembl/production/xrefs/DownloadSource.py
+++ b/src/python/ensembl/production/xrefs/DownloadSource.py
@@ -14,7 +14,7 @@
 
 """Download module to download xref and version files."""
 
-from ensembl.xrefs.Base import *
+from ensembl.production.xrefs.Base import *
 
 class DownloadSource(Base):
   def run(self):
diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py
index 75a23012d..22738d990 100644
--- a/src/python/ensembl/production/xrefs/EmailNotification.py
+++ b/src/python/ensembl/production/xrefs/EmailNotification.py
@@ -14,7 +14,7 @@
 
 """Email module to send user emails notifying of xref pipelines end, with important information and statistics."""
 
-from ensembl.xrefs.Base import *
+from ensembl.production.xrefs.Base import *
 
 from smtplib import SMTP
 from email.message import EmailMessage
diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py
index 515150e9d..58396b33a 100644
--- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py
+++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py
@@ -14,7 +14,7 @@
 
 """Scheduling module to create cleanup jobs for specific xref sources."""
 
-from ensembl.xrefs.Base import *
+from ensembl.production.xrefs.Base import *
 
 class ScheduleCleanup(Base):
   def run(self):
diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py
index b2e48aa86..8001bccc8 100644
--- a/src/python/ensembl/production/xrefs/ScheduleDownload.py
+++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py
@@ -14,7 +14,7 @@
 
 """Scheduling module to create download jobs for all xref sources in config file."""
 
-from ensembl.xrefs.Base import *
+from ensembl.production.xrefs.Base import *
 
 class ScheduleDownload(Base):
   def run(self):

From 6f63ae1947e8eace093818df1a99cbedcbf4ae4a Mon Sep 17 00:00:00 2001
From: "J. Alvarez-Jarreta" <jalvarez@ebi.ac.uk>
Date: Thu, 16 May 2024 10:53:51 +0100
Subject: [PATCH 13/37] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 71c0445d4..529bccc7b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git
     #   ensembl-py
 ensembl-metadata-api @ git+https://github.com/Ensembl/ensembl-metadata-api.git@2.0.1a2
     # via -r requirements.in
-ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.2.2
+ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git
     # via ensembl-metadata-api
 exceptiongroup==1.2.0
     # via

From 15b4eade8a539cc93711e2b983d994abd8da77ec Mon Sep 17 00:00:00 2001
From: danielp <danielp@ebi.ac.uk>
Date: Mon, 20 May 2024 10:05:47 +0100
Subject: [PATCH 14/37] Updated default resources from 100mb to 1gb

---
 .../Production/Pipeline/PipeConfig/Base_conf.pm      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
index ac0fca871..a24322ef5 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
@@ -93,13 +93,13 @@ sub resource_classes {
 
     my %output = (
         #Default is a duplicate of 100M
-        'default'   => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
+        'default'   => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
         #Data mover nodes
-        'dm'        => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'dm_D'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'dm_W'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
+        'dm'        => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'dm_D'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'dm_W'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
         'dm32_D'    => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
         'dmMAX_D'    => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
     );

From 202130276fbf7ef45c9a993cd23c21964bff6110 Mon Sep 17 00:00:00 2001
From: danielp <danielp@ebi.ac.uk>
Date: Thu, 23 May 2024 23:26:15 +0100
Subject: [PATCH 15/37] Updated JSON remodeler to stop "Experimental push on
 scalar is now forbidden" error with new perl version

---
 .../Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm   | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm
index d8e8328da..a6738edc0 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm
@@ -292,7 +292,10 @@ sub merge_xrefs {
         $obj->{$dbname} = [];
       }
       for my $ann ( @{ $subobj->{$dbname} } ) {
-        push $obj->{$dbname}, $self->copy_hash($ann);
+        if (ref($obj->{$dbname}) ne 'ARRAY') {
+          $obj->{$dbname} = [];
+        }
+        push @{ $obj->{$dbname} }, $self->copy_hash($ann);
       }
     }
   }

From 3290f218d42f2e1e996d241c5b35426d735a2ffc Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Fri, 24 May 2024 12:16:18 +0100
Subject: [PATCH 16/37] Update xref_all_sources.json for RGD

---
 .../ensembl/production/xrefs/config/xref_all_sources.json       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json
index 1edc6e1da..e7b0065a4 100644
--- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json
+++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json
@@ -91,7 +91,7 @@
     {
       "name" : "RGD",
       "parser" : "RGDParser",
-      "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt",
+      "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt",
       "priority" : 2
     },
     {

From 975fa4a81c940e90f05493fdedd70e37a9d77d3a Mon Sep 17 00:00:00 2001
From: nwillhoft <70575561+nwillhoft@users.noreply.github.com>
Date: Tue, 28 May 2024 11:04:12 +0100
Subject: [PATCH 17/37] Update tag names and info relating to gencode genesets

---
 modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm
index 79358a04d..d95763721 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm
@@ -383,7 +383,8 @@ feature for the position of this on the genome
 - cds_start_NF: the coding region start could not be confirmed
 - mRNA_end_NF: the mRNA end could not be confirmed
 - mRNA_start_NF: the mRNA start could not be confirmed.
-- basic: the transcript is part of the gencode basic geneset
+- gencode_basic: the transcript is part of the gencode basic geneset
+- gencode_primary: the transcript is part of the gencode primary geneset
 
 Comments
 

From 95c13207e4ae13a8cdfc617095d395ac1d0cbc99 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-01.ebi.ac.uk>
Date: Wed, 29 May 2024 09:28:22 +0100
Subject: [PATCH 18/37] Bugfix for files not being overwritten

---
 nextflow/config/xref.config               |  5 +++--
 nextflow/workflows/xrefDownload.nf        |  6 +++++-
 scripts/xrefs/cleanup_and_split_source.pl | 20 +++++++++++++-------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config
index 3737c5aff..024f80e68 100644
--- a/nextflow/config/xref.config
+++ b/nextflow/config/xref.config
@@ -17,6 +17,7 @@ params.reuse_db = 0
 params.skip_preparse = 1
 params.split_files_by_species = 1
 params.tax_ids_file = ''
+params.update_mode = 0
 
 params.base_path = ''
 params.clean_files = 1
@@ -70,11 +71,11 @@ profiles {
 
     withLabel: dm {
         queue = 'datamover'
-        time = '2h'
+        time = '3h'
         memory = 2.GB
     }
     withLabel:mem4GB {
-        time = '3d'
+        time = '5d'
         memory = 4.GB
     }
   }
diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf
index 51cc53552..65e255fda 100644
--- a/nextflow/workflows/xrefDownload.nf
+++ b/nextflow/workflows/xrefDownload.nf
@@ -18,6 +18,7 @@ println """\
         sources_config_file       : ${params.sources_config_file}
         clean_dir                 : ${params.clean_dir}
         tax_ids_file              : ${params.tax_ids_file}
+        update_mode               : ${params.update_mode}
         """
         .stripIndent()
 
@@ -57,6 +58,9 @@ def helpMessage() {
 
     --tax_ids_file              (optional)      Path to the file containing the taxonomy IDs of the species to extract data for.
                                                 Used to update the data for the provided species.
+
+    --update_mode               (optional)      If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs.
+                                                Only used if --tax_ids_file is set. Default: 0
   """.stripIndent()
 }
 
@@ -198,7 +202,7 @@ process CleanupSplitSource {
   }
 
   """
-  perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params
+  perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params
   """
 }
 
diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl
index 6a09971ea..3beabbcd6 100644
--- a/scripts/xrefs/cleanup_and_split_source.pl
+++ b/scripts/xrefs/cleanup_and_split_source.pl
@@ -19,7 +19,7 @@
 use Getopt::Long;
 use Carp;
 use DBI;
-use File::Path qw/make_path/;
+use File::Path qw/make_path rmtree/;
 use File::Spec::Functions;
 use HTTP::Tiny;
 use JSON;
@@ -28,7 +28,7 @@
 
 use Nextflow::Utils;
 
-my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $log_timestamp);
+my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $update_mode, $log_timestamp);
 GetOptions(
   'base_path=s'     => \$base_path,
   'source_db_url=s' => \$source_db_url,
@@ -37,14 +37,17 @@
   'clean_files=i'   => \$clean_files,
   'version_file:s'  => \$version_file,
   'tax_ids_file:s'  => \$tax_ids_file,
+  'update_mode:i'   => \$update_mode,
   'log_timestamp:s' => \$log_timestamp
 );
 
 # Check that all mandatory parameters are passed
 if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($clean_files)) {
-  croak "Usage: cleanup_source.pl --base_path <base_path> --source_db_url <source_db_url> --name <name> --clean_dir <clean_dir> --clean_files <clean_files> [--version_file <version_file>] [--tax_ids_file <tax_ids_file>] [--log_timestamp <log_timestamp>]";
+  croak "Usage: cleanup_source.pl --base_path <base_path> --source_db_url <source_db_url> --name <name> --clean_dir <clean_dir> --clean_files <clean_files> [--version_file <version_file>] [--tax_ids_file <tax_ids_file>] [--update_mode <update_mode>] [--log_timestamp <log_timestamp>]";
 }
 
+if (!defined($update_mode)) {$update_mode = 0;}
+
 my $log_file;
 if (defined($log_timestamp)) {
   my $log_path = catdir($base_path, 'logs', $log_timestamp);
@@ -71,6 +74,9 @@
 my $output_path = $clean_dir."/".$clean_name;
 
 # Create needed directories
+if (!$update_mode) {
+  rmtree($output_path);
+}
 make_path($output_path);
 
 my $sources_to_remove;
@@ -102,7 +108,7 @@
 # Extract taxonomy IDs
 my %tax_ids;
 my ($skipped_species, $added_species) = (0, 0);
-if ($tax_ids_file) {
+if ($tax_ids_file && $update_mode) {
   open my $fh, '<', $tax_ids_file;
   chomp(my @lines = <$fh>);
   close $fh;
@@ -212,10 +218,10 @@
       if (!defined($current_species_id) || (defined($current_species_id) && $species_id ne $current_species_id)) {
         close($out_fh) if (defined($current_species_id));
 
-	my $species_id_str = sprintf("%04d", $species_id);
-	my @digits = split('', $species_id_str);
+        my $species_id_str = sprintf("%04d", $species_id);
+        my @digits = split('', $species_id_str);
 
-	$write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]);
+        $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]);
         make_path($write_path);
 
         $write_file = $write_path."/".$output_file_name."-".$species_id;

From 3ad6a021581c3a13d4279c26c76c4e038cddcf4e Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-01.ebi.ac.uk>
Date: Tue, 4 Jun 2024 08:59:16 +0100
Subject: [PATCH 19/37] Fix for when no species file is found

---
 modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
index 17a04a762..27d16dcca 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
@@ -148,6 +148,7 @@ sub run {
 
       foreach my $file (@list_files) {
         $file =~ s/\n//;
+        if (!-f $file) { next; }
         if (defined $release_file and $file eq $release_file) { next; }
   
         $dataflow_params = {

From 9586a89aee2ed9ab1db90a5daaa9eab023ceac50 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Tue, 4 Jun 2024 19:55:35 +0100
Subject: [PATCH 20/37] Update ProteinFeatures analysis

New member dbs Phobius (1.01) and SignalP_GRAM_POSITIVE (4.1) added
---
 .../Pipeline/PipeConfig/ProteinFeatures_conf.pm  | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
index ec402777e..2ffa87fdf 100755
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
@@ -227,6 +227,22 @@ sub default_options {
         ipscan_xml      => 'TMHMM',
         ipscan_lookup   => 0,
       },
+      {
+        db               => 'Phobius',
+        ipscan_lookup    => 1,
+        ipscan_name      => 'Phobius',
+        ipscan_xml       => 'PHOBIUS',
+        logic_name       => 'Phobius',
+        program          => 'InterProScan',
+      },
+      {
+        db              => 'SignalP_GRAM_POSITIVE',
+        ipscan_lookup   => 1,
+        ipscan_name     => 'SignalP_GRAM_POSITIVE',
+        ipscan_xml      => 'SIGNALP_GRAM_POSITIVE',
+        logic_name      => 'SignalP_GRAM_POSITIVE',
+        program         => 'InterProScan',
+      },
       #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html)
       {
         logic_name      => 'seg',

From f065884595904aa3e8d9c34b0444b376bc53f941 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Tue, 4 Jun 2024 20:25:59 +0100
Subject: [PATCH 21/37] Update ProteinFeatures_conf.pm

---
 .../Pipeline/PipeConfig/ProteinFeatures_conf.pm           | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
index 2ffa87fdf..e449f7b51 100755
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
@@ -243,6 +243,14 @@ sub default_options {
         logic_name      => 'SignalP_GRAM_POSITIVE',
         program         => 'InterProScan',
       },
+      {
+        db              => 'SignalP_GRAM_NEGATIVE',
+        ipscan_lookup   => 1,
+        ipscan_name     => 'SignalP_GRAM_NEGATIVE',
+        ipscan_xml      => 'SIGNALP_GRAM_NEGATIVE',
+        logic_name      => 'SignalP_GRAM_NEGATIVE',
+        program         => 'InterProScan',
+      },      
       #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html)
       {
         logic_name      => 'seg',

From d283cf09fc2516876cb77df899d56903c604a2bb Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Wed, 19 Jun 2024 15:07:03 +0100
Subject: [PATCH 22/37] Update xref_sources.json

Update xref source Xenbase mapping file to GenePageEnsemblModelMapping_4.1.txt
---
 modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
index 161a41186..d12edb9cf 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json
@@ -254,7 +254,7 @@
     {
       "name" : "Xenbase",
       "parser" : "XenopusJamboreeParser",
-      "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
+      "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
       "priority" : 1
     },
     {

From d74f87eb7e5bbc3b134366aa09b81b83136eab64 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Wed, 19 Jun 2024 15:08:57 +0100
Subject: [PATCH 23/37] Update xref_all_sources.json

update xenbase mapping file
---
 .../Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
index 7fa14c977..5b6361536 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json
@@ -226,7 +226,7 @@
     {
       "name" : "Xenbase",
       "parser" : "XenopusJamboreeParser",
-      "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
+      "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
       "priority" : 1
     },
     {

From b2da23c1840bb10dde5f29cfb9717ef93f913b4a Mon Sep 17 00:00:00 2001
From: Stefano Giorgetti <sgiorgetti@ebi.ac.uk>
Date: Thu, 20 Jun 2024 13:36:10 +0000
Subject: [PATCH 24/37] Fixed display_label as per ENSPROD-9493

---
 .../Production/Pipeline/AlphaFold/InsertProteinFeatures.pm      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm
index 059ac4a3e..0178fc21e 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm
@@ -164,7 +164,7 @@ sub run {
             -db            => 'alphafold',
             -db_version    => $alpha_version,
             -db_file       => $self->param('db_dir') . '/accession_ids.csv',
-            -display_label => 'AlphaFold DB import',
+            -display_label => 'AFDB-ENSP mapping',
             -displayable   => '1',
             -description   => 'Protein features based on AlphaFold predictions, mapped with GIFTS or UniParc'
     );

From 7e4aca521363e3550e1e3cb1deecc670a1339077 Mon Sep 17 00:00:00 2001
From: John Tate <jgt@ebi.ac.uk>
Date: Mon, 24 Jun 2024 17:26:01 +0100
Subject: [PATCH 25/37] Fix use of keys on a scalar

Change the use of an experimental, now removed, feature that allowed a hashref scalar to be used in place of a hash.
---
 .../Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm
index 28aa1ba06..d5d8e3a8a 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm
@@ -218,7 +218,7 @@ sub all_hashes {
     } ## end foreach my $slice (@slices)
 
     for my $seq_type (keys %$batch) {
-        for my $attrib_table (keys $batch->{$seq_type}) {
+        for my $attrib_table (keys %{$batch->{$seq_type}}) {
             $attribute_adaptor->store_batch_on_Object($attrib_table, $batch->{$seq_type}->{$attrib_table}, 1000);
         }
     }

From a3ff2b44c29af7d4a6e819dfdf38f67178f32452 Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@codon-slurm-login-02.ebi.ac.uk>
Date: Wed, 26 Jun 2024 17:07:18 +0100
Subject: [PATCH 26/37] Fixes for 113 issues

---
 .../Pipeline/PipeConfig/XrefProcess_conf.pm        |  2 ++
 .../EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm
index 2e11ce311..0903a1947 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm
@@ -173,6 +173,7 @@ sub pipeline_analyses {
       base_path => $self->o('base_path'),
       release   => $self->o('release')
     },
+    -max_retry_count => 0,
     -flow_into  => {
       '2->A' => 'dump_xref',
       'A->1' => 'schedule_mapping'
@@ -187,6 +188,7 @@ sub pipeline_analyses {
       release     => $self->o('release'),
       config_file => $self->o('config_file')
     },
+    -max_retry_count => 0,
     -flow_into  => { 2 => 'align_factory' },
     -rc_name    => '1GB',
   },
diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm
index b3233ea9d..ef4868850 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm
@@ -84,10 +84,18 @@ sub run {
   $exe =~ s/\n//g;
   my $command_string = sprintf ("%s --showalignment FALSE --showvulgar FALSE --ryo '%s' --gappedextension FALSE --model 'affine:local' %s --subopt no --query %s --target %s --querychunktotal %s --querychunkid %s", $exe, $ryo, $method, $source, $target, $max_chunks, $chunk);
   my $output = `$command_string`;
-  my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments
 
-  while (my $hit = shift @hits) {
-    print $fh $hit . "\n";
+  if ($? == 0) {
+    my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments
+
+    while (my $hit = shift @hits) {
+      print $fh $hit . "\n";
+    }
+  } else {
+    my $job = $self->input_job();
+    $job->adaptor()->db()->get_LogMessageAdaptor()->store_job_message($job->dbID(), $output, 'WORKER_ERROR');  
+
+    throw("Exonerate failed with exit_code: $?\n");
   }
 
   $fh->close();

From ee5da99d1786cf413d571478e1385b08bdef9961 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Fri, 5 Jul 2024 16:13:10 +0100
Subject: [PATCH 27/37] Update SourceFactory.pm

Fix for "Experimental keys on scalar is now forbidden" error
---
 .../Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm
index 4599d8636..37cd8b09e 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm
@@ -59,7 +59,7 @@ sub write_output {
     my $compara_param = $self->param('compara');
     my $cleanup_dir = $self->param('cleanup_dir');
 
-    foreach my $pair (keys $sp_config) {
+    foreach my $pair (keys %{$sp_config}) {
         my $compara = $sp_config->{$pair}->{'compara'};
         if (defined $compara_param && $compara ne $compara_param) {
             print STDERR "Skipping $compara\n";

From d1ce2933a759494c59e81d274029f9e8fd088618 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Fri, 19 Jul 2024 13:27:32 +0100
Subject: [PATCH 28/37] Updated Base class with slurm default resource 1GB

---
 .../Pipeline/PipeConfig/Base_conf.pm          | 44 +++++++------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
index 86d0ca7a9..a340f0602 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
@@ -66,14 +66,14 @@ sub beekeeper_extra_cmdline_options {
 sub resource_classes {
     my $self = shift;
 
+    ## String it together
+    my %time = (
+        H => ' --time=1:00:00',
+        D => ' --time=1-00:00:00',
+        W => ' --time=7-00:00:00'
+    );
 
-    ## Sting it together
-    my %time = (H => ' --time=1:00:00',
-        D         => ' --time=1-00:00:00',
-        W         => ' --time=7-00:00:00',);
-
-    my %memory = ('100M' => '100',
-        '200M'           => '200',
+    my %memory = (
         '500M'           => '500',
         '1GB'            => '1000',
         '2GB'            => '2000',
@@ -89,40 +89,30 @@ sub resource_classes {
     );
 
     my $dq = ' --partition=datamover';
-
     my %output = (
         #Default is a duplicate of 100M
-        'default'   => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
-        'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
+        'default'   => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1G'} . 'm' },
+        'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1G'} . 'm' },
+        'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1G'} . 'm' },
         #Data mover nodes
-        'dm'        => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
-        'dm_D'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
-        'dm_W'      => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
-        'dm32_D'    => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
-        'dmMAX_D'    => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
+        'dm'       => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'dm_D'     => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'dm_W'     => { 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'dm32_D'   => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
+        'dmMAX_D'  => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
     );
-    #Create a dictionary of all possible time and memory combinations. Format would be:
-    #2G={
-    #   'SLURM' => ' --time=1:00:00  --mem=2000m',
-    #   'LSF' => '-q $self->o(production_queue) -M 2000 -R "rusage[mem=2000]"'
-    # };
 
     while ((my $time_key, my $time_value) = each(%time)) {
         while ((my $memory_key, my $memory_value) = each(%memory)) {
             if ($time_key eq 'H') {
-                $output{$memory_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
-                    'SLURM'                    => $time_value . '  --mem=' . $memory_value . 'm' }
+                $output{$memory_key} = { 'SLURM' => $time_value . '  --mem=' . $memory_value . 'm' };
             }
             else {
-                $output{$memory_key . '_' . $time_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
-                    'SLURM'                                      => $time_value . '  --mem=' . $memory_value . 'm' }
+                $output{$memory_key . '_' . $time_key} = { 'SLURM' => $time_value . '  --mem=' . $memory_value . 'm' };
             }
         }
     }
-
     return \%output;
-
 }
 
 1;

From 772caf1c280ce8dc01706f741400674f1abe5367 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Fri, 19 Jul 2024 13:32:16 +0100
Subject: [PATCH 29/37] Fix typo in memory key (1G -> 1GB)

---
 .../Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
index a340f0602..d57a14a70 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
@@ -91,9 +91,9 @@ sub resource_classes {
     my $dq = ' --partition=datamover';
     my %output = (
         #Default is a duplicate of 100M
-        'default'   => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1G'} . 'm' },
-        'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1G'} . 'm' },
-        'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1G'} . 'm' },
+        'default'   => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
+        'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
         #Data mover nodes
         'dm'       => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
         'dm_D'     => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },

From 2b01517b2f6615cec5805a72435c14091e1d99ab Mon Sep 17 00:00:00 2001
From: vinay-ebi <vinay@ebi.ac.uk>
Date: Mon, 22 Jul 2024 16:25:21 +0100
Subject: [PATCH 30/37] decompress upidump.lis.gz file before load to hive db

---
 .../Pipeline/PipeConfig/ProteinFeatures_conf.pm     |  2 +-
 .../Pipeline/ProteinFeatures/LoadUniParc.pm         | 13 +++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
index e449f7b51..dbc95951a 100755
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
@@ -75,7 +75,7 @@ sub default_options {
 
     interpro_file    => 'names.dat',
     interpro2go_file => 'interpro2go',
-    uniparc_file     => 'upidump.lis',
+    uniparc_file     => 'upidump.lis.gz',
     mapping_file     => 'idmapping_selected.tab.gz',
 
     # Files are retrieved and stored locally with the same name.
diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
index 40eddb1b4..4d97c8950 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
@@ -21,16 +21,25 @@ package Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc;
 
 use strict;
 use warnings;
-
+use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 use File::Basename;
-
 use base ('Bio::EnsEMBL::Production::Pipeline::Common::Base');
 
 sub run {
   my ($self) = @_;
   my $uniparc_file = $self->param_required('uniparc_file_local');
 
+
   if (-e $uniparc_file) {
+
+    #check if uniparc file is compressed
+    if ($uniparc_file =~ /\.gz$/){
+        my $uniparc_file_decompress = $uniparc_file;
+        $uniparc_file_decompress =~ s/\.gz$//;
+        gunzip $uniparc_file => $uniparc_file_decompress  or $self->throw("gunzip failed: $GunzipError");
+        $uniparc_file = $uniparc_file_decompress;
+    }
+
     my $dbh = $self->hive_dbh;
     my $sql = "LOAD DATA LOCAL INFILE '$uniparc_file' INTO TABLE uniparc FIELDS TERMINATED BY ' '";
     $dbh->do($sql) or self->throw($dbh->errstr);

From 434e865ab149ed266c76366d2f6820d2df424f64 Mon Sep 17 00:00:00 2001
From: vinay-ebi <vinay@ebi.ac.uk>
Date: Mon, 22 Jul 2024 16:55:38 +0100
Subject: [PATCH 31/37] delete the upidump file after loading into hive db

---
 .../Production/Pipeline/ProteinFeatures/LoadUniParc.pm     | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
index 4d97c8950..56eb488df 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
@@ -37,6 +37,8 @@ sub run {
         my $uniparc_file_decompress = $uniparc_file;
         $uniparc_file_decompress =~ s/\.gz$//;
         gunzip $uniparc_file => $uniparc_file_decompress  or $self->throw("gunzip failed: $GunzipError");
+        #delete compressed file .gz
+        unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file");
         $uniparc_file = $uniparc_file_decompress;
     }
 
@@ -50,9 +52,14 @@ sub run {
     my $index_2 = 'ALTER TABLE uniparc ADD KEY md5sum_idx (md5sum) USING HASH';
     $dbh->do($index_2) or self->throw($dbh->errstr);
 
+    #delete upidump file from pipeline direcotry after loading into hive db
+    unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file");
+
   } else {
     $self->throw("Checksum file '$uniparc_file' does not exist");
   }
+
+
 }
 
 1;

From 693c899042e1f1710f8875215daf9b92edaa55ab Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Mon, 22 Jul 2024 20:39:30 +0100
Subject: [PATCH 32/37] Update
 modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm

Co-authored-by: John Tate <jgt@ebi.ac.uk>
---
 .../EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
index 56eb488df..e54038880 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
@@ -38,7 +38,7 @@ sub run {
         $uniparc_file_decompress =~ s/\.gz$//;
         gunzip $uniparc_file => $uniparc_file_decompress  or $self->throw("gunzip failed: $GunzipError");
         #delete compressed file .gz
-        unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file");
+        unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");
         $uniparc_file = $uniparc_file_decompress;
     }
 

From 51cc0c35a49aa430ddc67092052b5fd6928053d4 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Mon, 22 Jul 2024 20:39:39 +0100
Subject: [PATCH 33/37] Update
 modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm

Co-authored-by: John Tate <jgt@ebi.ac.uk>
---
 .../EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
index e54038880..444b7a2bb 100644
--- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm
@@ -53,7 +53,7 @@ sub run {
     $dbh->do($index_2) or self->throw($dbh->errstr);
 
     #delete upidump file from pipeline direcotry after loading into hive db
-    unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file");
+    unlink  $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");
 
   } else {
     $self->throw("Checksum file '$uniparc_file' does not exist");

From 0eae3f67f37be6f754347bb70d11ac411885df24 Mon Sep 17 00:00:00 2001
From: Paulo Lins <plins@ebi.ac.uk>
Date: Thu, 25 Jul 2024 09:45:23 +0100
Subject: [PATCH 34/37] Include human and mouse symlinks

---
 scripts/py/regulation_ftp_symlinks.py | 53 ++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py
index a8a9c663b..ba66b919a 100644
--- a/scripts/py/regulation_ftp_symlinks.py
+++ b/scripts/py/regulation_ftp_symlinks.py
@@ -42,7 +42,7 @@
 from pathlib import Path
 
 # Human and Mouse follow a different dir structure
-SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"]
+# SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"]
 
 # GENE-SWITCH species
 GENE_SWITCH_SPECIES = [
@@ -54,8 +54,12 @@
 PUBLIC_PUB_PATH = "PUBLIC/pub"
 DATA_FILES_PATH = "data_files/"
 DATA_FILES_PATH_TEMPLATE = "{ftp_path}/data_files/{species}/{assembly}/funcgen"
-RELEASE_FOLDER_PATH_TEMPLATE = "{ftp_path}/release-{release}/regulation/{species}/{assembly}"
-MISC_GENE_SWITCH_PATH_TEMPLATE = "{ftp_path}/misc/gene-switch/regulation/{species}/{assembly}"
+RELEASE_FOLDER_PATH_TEMPLATE = (
+    "{ftp_path}/release-{release}/regulation/{species}/{assembly}"
+)
+MISC_GENE_SWITCH_PATH_TEMPLATE = (
+    "{ftp_path}/misc/gene-switch/regulation/{species}/{assembly}"
+)
 
 ANALYSIS_TYPE_PEAKS = "peaks"
 ANALYSIS_TYPE_SIGNAL = "signal"
@@ -135,7 +139,9 @@ def get_species_with_analysis_type_folder(analysis_type, ftp_path):
     def get_most_recent_release_data_file_path(data_file_path):
         validator.is_dir(Path(data_file_path))
         available_releases = listdir(data_file_path)
-        return Path(data_file_path) / str(max([int(release) for release in available_releases]))
+        return Path(data_file_path) / str(
+            max([int(release) for release in available_releases])
+        )
 
 
 utils = Utils()
@@ -156,10 +162,14 @@ def __init__(self, **path_specifics):
         )
         self.sources = {
             "release_folder": Path(
-                RELEASE_FOLDER_PATH_TEMPLATE.format(**self.aliased_paths(**path_specifics))
+                RELEASE_FOLDER_PATH_TEMPLATE.format(
+                    **self.aliased_paths(**path_specifics)
+                )
             ),
             "misc_folder": Path(
-                MISC_GENE_SWITCH_PATH_TEMPLATE.format(**self.aliased_paths(**path_specifics))
+                MISC_GENE_SWITCH_PATH_TEMPLATE.format(
+                    **self.aliased_paths(**path_specifics)
+                )
             ),
         }
 
@@ -172,7 +182,8 @@ def get(self, key):
 
     def symlink2rf(self, analysis_type, only_remove=False, relative=True):
         target = (
-            Path(path.relpath(self.target, self.sources["release_folder"])) / analysis_type
+            Path(path.relpath(self.target, self.sources["release_folder"]))
+            / analysis_type
             if relative
             else self.target / analysis_type
         )
@@ -188,7 +199,8 @@ def symlink2misc(self, analysis_type, only_remove=False, relative=True):
             makedirs(self.sources["misc_folder"])
 
         target = (
-            Path(path.relpath(self.target, self.sources["misc_folder"])) / analysis_type
+            Path(path.relpath(self.target, self.sources["misc_folder"]))
+            / analysis_type
             if relative
             else self.target / analysis_type
         )
@@ -203,17 +215,26 @@ def _symlink(self, source, target, only_remove):
         if not only_remove:
             source.symlink_to(target, target_is_directory=True)
             if validator.is_symlink(source, check=True):
-                logger.info(f"{source} -> {target} --- was successfully created")
+                logger.info(
+                    f"{source} -> {target} --- was successfully created"
+                )
         else:
             if not validator.is_symlink(source, check=True):
-                logger.info(f"{source} -> {target} -- was successfully removed")
+                logger.info(
+                    f"{source} -> {target} -- was successfully removed"
+                )
 
     def aliased_paths(self, **kwargs):
-        return {key: self.RELEASE_PATH_ALIASES.get(value, value) for key, value in kwargs.items()}
+        return {
+            key: self.RELEASE_PATH_ALIASES.get(value, value)
+            for key, value in kwargs.items()
+        }
 
     @staticmethod
     def search(analysis_type, ftp_path, release):
-        result = utils.get_species_with_analysis_type_folder(analysis_type, ftp_path)
+        result = utils.get_species_with_analysis_type_folder(
+            analysis_type, ftp_path
+        )
         return [
             RegulationSymlinkFTP(
                 analysis_type=analysis_type,
@@ -275,13 +296,17 @@ def parse_arguments():
     ftp_path = args.ftp_path / PUBLIC_PUB_PATH
 
     logger.info("Searching for peaks in data_files ...")
-    peaks = RegulationSymlinkFTP.search(ANALYSIS_TYPE_PEAKS, ftp_path, args.release_version)
+    peaks = RegulationSymlinkFTP.search(
+        ANALYSIS_TYPE_PEAKS, ftp_path, args.release_version
+    )
     for peak in peaks:
         peak.symlink2rf("peaks", only_remove=args.delete_symlinks)
         peak.symlink2misc("peaks", only_remove=args.delete_symlinks)
 
     logger.info("Searching for signals in data_files ...")
-    signals = RegulationSymlinkFTP.search(ANALYSIS_TYPE_SIGNAL, ftp_path, args.release_version)
+    signals = RegulationSymlinkFTP.search(
+        ANALYSIS_TYPE_SIGNAL, ftp_path, args.release_version
+    )
     for signal in signals:
         signal.symlink2rf("signal", only_remove=args.delete_symlinks)
         signal.symlink2misc("signal", only_remove=args.delete_symlinks)

From 3542ca1265fb905cba0675b3d52118ccac2e9391 Mon Sep 17 00:00:00 2001
From: Paulo Lins <plins@codon-slurm-login-02.ebi.ac.uk>
Date: Thu, 25 Jul 2024 10:36:20 +0100
Subject: [PATCH 35/37] include Mouse and Human symlinks

---
 scripts/py/regulation_ftp_symlinks.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py
index ba66b919a..938ae0f09 100644
--- a/scripts/py/regulation_ftp_symlinks.py
+++ b/scripts/py/regulation_ftp_symlinks.py
@@ -43,6 +43,7 @@
 
 # Human and Mouse follow a different dir structure
 # SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"]
+SPECIES_TO_NOT_INCLUDE = []
 
 # GENE-SWITCH species
 GENE_SWITCH_SPECIES = [
@@ -139,8 +140,15 @@ def get_species_with_analysis_type_folder(analysis_type, ftp_path):
     def get_most_recent_release_data_file_path(data_file_path):
         validator.is_dir(Path(data_file_path))
         available_releases = listdir(data_file_path)
+        releases = []  # numeric release directory names only
+        for release in available_releases:
+            try:
+                releases.append(int(release))
+            except ValueError:  # entry name is not an integer; skip it
+                continue
+
         return Path(data_file_path) / str(
-            max([int(release) for release in available_releases])
+            max(releases)
         )
 
 
@@ -235,6 +243,7 @@ def search(analysis_type, ftp_path, release):
         result = utils.get_species_with_analysis_type_folder(
             analysis_type, ftp_path
         )
+        __import__("ipdb").set_trace()
         return [
             RegulationSymlinkFTP(
                 analysis_type=analysis_type,
@@ -244,7 +253,7 @@ def search(analysis_type, ftp_path, release):
                 release=release,
             )
             for species, assemblies in result.items()
-            for assembly in assemblies
+            for assembly in assemblies if assembly not in ["GRCh37", "GRCm38", "NCBIM37"]
         ]
 
 

From ac6c74187bd47a1717dca2bf0066fa783cc9875f Mon Sep 17 00:00:00 2001
From: Paulo Lins <plins@codon-slurm-login-02.ebi.ac.uk>
Date: Thu, 25 Jul 2024 10:43:11 +0100
Subject: [PATCH 36/37] include Mouse and Human symlinks

---
 scripts/py/regulation_ftp_symlinks.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py
index 938ae0f09..3ded73f0c 100644
--- a/scripts/py/regulation_ftp_symlinks.py
+++ b/scripts/py/regulation_ftp_symlinks.py
@@ -42,7 +42,6 @@
 from pathlib import Path
 
 # Human and Mouse follow a different dir structure
-# SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"]
 SPECIES_TO_NOT_INCLUDE = []
 
 # GENE-SWITCH species
@@ -243,7 +242,6 @@ def search(analysis_type, ftp_path, release):
         result = utils.get_species_with_analysis_type_folder(
             analysis_type, ftp_path
         )
-        __import__("ipdb").set_trace()
         return [
             RegulationSymlinkFTP(
                 analysis_type=analysis_type,

From bbc136284d77f6acb7ba8d69026ccc6069401182 Mon Sep 17 00:00:00 2001
From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com>
Date: Tue, 13 Aug 2024 12:15:13 +0100
Subject: [PATCH 37/37] Update ProteinFeatures_conf.pm

Update the Protein feature analysis logic name to lowercase
---
 .../Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
index dbc95951a..41a743aa4 100755
--- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
+++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm
@@ -232,7 +232,7 @@ sub default_options {
         ipscan_lookup    => 1,
         ipscan_name      => 'Phobius',
         ipscan_xml       => 'PHOBIUS',
-        logic_name       => 'Phobius',
+        logic_name       => 'phobius',
         program          => 'InterProScan',
       },
       {
@@ -240,7 +240,7 @@ sub default_options {
         ipscan_lookup   => 1,
         ipscan_name     => 'SignalP_GRAM_POSITIVE',
         ipscan_xml      => 'SIGNALP_GRAM_POSITIVE',
-        logic_name      => 'SignalP_GRAM_POSITIVE',
+        logic_name      => 'signalp_gram_positive',
         program         => 'InterProScan',
       },
       {
@@ -248,7 +248,7 @@ sub default_options {
         ipscan_lookup   => 1,
         ipscan_name     => 'SignalP_GRAM_NEGATIVE',
         ipscan_xml      => 'SIGNALP_GRAM_NEGATIVE',
-        logic_name      => 'SignalP_GRAM_NEGATIVE',
+        logic_name      => 'signalp_gram_negative',
         program         => 'InterProScan',
       },      
       #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html)