From d924e506ba888c5d1ba680a92375a0066012c001 Mon Sep 17 00:00:00 2001 From: Stefano Giorgetti Date: Fri, 26 Apr 2024 17:38:49 +0000 Subject: [PATCH 01/37] Changed GENCODE Basic tag to 'gencode_basic' as per ENSINT-1885 --- .../Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm | 2 +- modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm index 269f2b8a4..1024dc13c 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm @@ -263,7 +263,7 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash { $summary{'transcript_support_level'} = $self->tsl if $self->tsl; my @tags; - push(@tags, 'basic') if $self->gencode_basic(); + push(@tags, 'gencode_basic') if $self->gencode_basic(); push(@tags, 'gencode_primary') if $self->gencode_primary(); push(@tags, 'Ensembl_canonical') if $self->is_canonical(); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm index 537675025..9efa955a6 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm @@ -279,7 +279,7 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash { $summary{'transcript_support_level'} = $self->tsl if $self->tsl; my @tags; - push(@tags, 'basic') if $self->gencode_basic(); + push(@tags, 'gencode_basic') if $self->gencode_basic(); push(@tags, 'gencode_primary') if $self->gencode_primary(); push(@tags, 'Ensembl_canonical') if $self->is_canonical(); From a0b629a2a37dec03ae34f9e8f64b973901b21c98 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 29 Apr 2024 09:49:11 +0100 Subject: [PATCH 02/37] Minor fixes --- scripts/xrefs/cleanup_and_split_source.pl | 11 +++++-- src/python/ensembl/xrefs/Base.py | 6 ++-- src/python/ensembl/xrefs/EmailNotification.py | 29 +++++++++---------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index e538503c2..e5ddf5560 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -101,12 +101,12 @@ # Extract taxonomy IDs my %tax_ids; -my $skipped_species = 0; +my ($skipped_species, $added_species) = (0, 0); if ($tax_ids_file) { open my $fh, '<', $tax_ids_file; chomp(my @lines = <$fh>); close $fh; - my %tax_ids = map { $_ => 1 } @lines; + %tax_ids = map { $_ => 1 } @lines; # Check if any taxonomy IDs already have files foreach my $tax_id (keys(%tax_ids)) { @@ -216,6 +216,12 @@ make_path($write_path); $write_file = $write_path."/".$output_file_name."-".$species_id; + + # Check if creating new file + if (!-e $write_file) { + $added_species++; + } + open($out_fh, '>>', $write_file) or die "Couldn't open output file '$write_file' $!"; $current_species_id = $species_id; @@ -231,6 +237,7 @@ add_to_log_file($log_file, "Source $source_name cleaned up"); add_to_log_file($log_file, "$source_name skipped species = $skipped_species"); +add_to_log_file($log_file, "$source_name species files created = $added_species") # Save the clean files directory in source db my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); diff --git a/src/python/ensembl/xrefs/Base.py b/src/python/ensembl/xrefs/Base.py index 32b801e93..d5022627f 100644 --- a/src/python/ensembl/xrefs/Base.py +++ b/src/python/ensembl/xrefs/Base.py @@ -182,8 +182,6 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: if db and db == 'checksum': file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') - logging.info(f'I am here inside local ftp with {orig_source_name}') - if not (skip_download_if_file_present and os.path.exists(file_path)): shutil.copy(local_file, file_path) @@ -838,11 +836,11 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: pep_path = self.get_path(base_path, species, release, 'ensembl', 'peptides.fa'); # Try to find a species-specific mapper first - module_name = f'ensembl.xrefs.mapper.{species}' + module_name = f'ensembl.xrefs.mappers.{species}' class_name = species found = importlib.find_loader(module_name) if not found: - module_name = 'ensembl.xrefs.mapper.BasicMapper' + module_name = 'ensembl.xrefs.mappers.BasicMapper' class_name = 'BasicMapper' # Create a mapper object diff --git a/src/python/ensembl/xrefs/EmailNotification.py b/src/python/ensembl/xrefs/EmailNotification.py index dae71e738..75a23012d 100644 --- a/src/python/ensembl/xrefs/EmailNotification.py +++ b/src/python/ensembl/xrefs/EmailNotification.py @@ -37,10 +37,7 @@ def run(self): if os.path.exists(log_path): log_files = os.listdir(log_path) - parameters = {} - sources = {} - added_species = {} - skipped_species = {} + parameters, sources, added_species, skipped_species = {}, {}, {}, {} main_log_file = os.path.join(base_path, 'logs', log_timestamp, 'logfile_'+log_timestamp) @@ -92,11 +89,11 @@ def run(self): sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", data) for source in sources_list: sources[source[0]].update({'copied' : os.path.dirname(source[1])}) - # skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data) - # skipped_species = {source[0]: source[1] for source in skipped_species_list} + skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data) + skipped_species = {source[0]: source[1] for source in skipped_species_list} - # added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) taxonomy IDs added to filter = (\d+)", data) - # added_species = {division[0]: division[1] for division in added_species_list} + added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) species files created = (\d+)", data) + added_species = {source[0]: source[1] for source in added_species_list} # Include source statistics email_message += '
--Source Statistics--
' @@ -106,7 +103,7 @@ def run(self): if source_values.get('downloaded'): (download_type, file_path) = source_values['downloaded'].split("|") - email_message += f' File downloaded via {download_type} into {file_path}
' + email_message += f'   File downloaded via {download_type} into {file_path}
' elif source_values.get('copied'): email_message += '   File(s) copied from local FTP into %s
' % (source_values['copied']) elif source_values.get('skipped'): email_message += '   File(s) download skipped, already exists in %s
' % (source_values['skipped']) @@ -117,13 +114,13 @@ def run(self): if source_values.get('preparsed'): email_message += '   Pre-parsed ✔
' # Include species statistics - # email_message += '
--Species Statistics--
' - # for division,count in added_species.items(): - # if division == 'Total': continue - # email_message += f'{species_type} taxonomy IDs = {count}
' - # email_message += 'Skipped Species per source file:
' - # for source_name,count in skipped_species.items(): - # email_message += f'   {source_name}: {count}
' + email_message += '
--Species Statistics--
' + email_message += 'Skipped Species (files already exist):
' + for source_name, count in skipped_species.items(): + email_message += f'   {source_name}: {count}
' + email_message += 'Added Species (files created):
' + for source_name, count in added_species.items(): + email_message += f'   {source_name}: {count}
' email_message += '
To run the Xref Process Pipeline based on the data from this pipeline, use the same --base_path, --source_db_url, and --central_db_url (if preparse was run) values provided to this pipeline.' From ec58c92e3f0d0278e6ff50540e7c775025e7a43f Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 29 Apr 2024 10:08:06 +0100 Subject: [PATCH 03/37] Missing semicolon --- scripts/xrefs/cleanup_and_split_source.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index e5ddf5560..c08617751 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -237,7 +237,7 @@ add_to_log_file($log_file, "Source $source_name cleaned up"); add_to_log_file($log_file, "$source_name skipped species = $skipped_species"); -add_to_log_file($log_file, "$source_name species files created = $added_species") +add_to_log_file($log_file, "$source_name species files created = $added_species"); # Save the clean files directory in source db my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); From 6b97c441a1de4e376ddb317700ae286473cae293 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 29 Apr 2024 11:11:24 +0100 Subject: [PATCH 04/37] Change glob parameters --- .../Production/Pipeline/Xrefs/ScheduleSource.pm | 2 +- scripts/xrefs/cleanup_and_split_source.pl | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm index eeda0dc8c..b80ebd421 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm @@ -134,7 +134,7 @@ sub run { # For Uniprot and Refseq, files might have been split by species if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) { my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein'))); - @list_files = glob($file_name . "/**/" . $file_prefix . "-" . $species_id); + @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id); $_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files); } diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index c08617751..5687a5109 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -110,12 +110,15 @@ # Check if any taxonomy IDs already have files foreach my $tax_id (keys(%tax_ids)) { - my @tax_files = glob($output_path . "/**/" . $output_file_name . "-" . $tax_id); + print Dumper $tax_id; + my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id); + print Dumper @tax_files; if (scalar(@tax_files) > 0) { $tax_ids{$tax_id} = 0; $skipped_species++; } } + die; # Do nothing if all taxonomy IDs already have files if ($skipped_species == scalar(keys(%tax_ids))) { @@ -211,8 +214,10 @@ if (!defined($current_species_id) || (defined($current_species_id) && $species_id ne $current_species_id)) { close($out_fh) if (defined($current_species_id)); - my @digits = split('', $species_id); - $write_path = catdir($output_path, $digits[0], (scalar(@digits)>1 ? $digits[1] : ""), (scalar(@digits)>2 ? $digits[2] : ""), (scalar(@digits)>3 ? $digits[3] : "")); + my $species_id_str = sprintf("%04d", $species_id); + my @digits = split('', $species_id_str); + + $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); make_path($write_path); $write_file = $write_path."/".$output_file_name."-".$species_id; @@ -231,7 +236,7 @@ } close($in_fh); - close($out_fh); + close($out_fh) if $out_fh; } } From 5d99a70491d18a9525313c3e642620f86263d8a3 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 29 Apr 2024 11:15:02 +0100 Subject: [PATCH 05/37] Remove debugging --- scripts/xrefs/cleanup_and_split_source.pl | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 5687a5109..f1ea49f9a 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -110,15 +110,12 @@ # Check if any taxonomy IDs already have files foreach my $tax_id (keys(%tax_ids)) { - print Dumper $tax_id; my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id); - print Dumper @tax_files; if (scalar(@tax_files) > 0) { $tax_ids{$tax_id} = 0; $skipped_species++; } } - die; # Do nothing if all taxonomy IDs already have files if ($skipped_species == scalar(keys(%tax_ids))) { From 577c6bafb40da0de50d5d83b40842745e52cd992 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 29 Apr 2024 11:27:43 +0100 Subject: [PATCH 06/37] Fixes to prevent warnings --- scripts/xrefs/cleanup_and_split_source.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index f1ea49f9a..6a09971ea 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -161,12 +161,13 @@ my $species_id; if ($is_uniprot) { ($species_id) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/; - $species_id =~ s/\s//; + $species_id =~ s/\s// if $species_id; } else { ($species_id) = $record =~ /db_xref=.taxon:(\d+)/; } # Only continue with wanted species + next if (!$species_id); next if ($tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id})); # Clean up data From a29d373cb43887427f1c040d5c41070e96db3e9e Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 30 Apr 2024 09:55:24 +0100 Subject: [PATCH 07/37] Fix file paths --- .../Production/Pipeline/Xrefs/ScheduleSource.pm | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm index b80ebd421..34da3141e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm @@ -127,21 +127,24 @@ sub run { } else { # Create list of files opendir(my $dir_handle, $file_name); - my @list_files = readdir($dir_handle); + my @temp_list_files = readdir($dir_handle); closedir($dir_handle); + + my @list_files; + foreach my $file (@temp_list_files) { + next if ($file =~ /^\./); + push(@list_files, $file_name . "/" . $file); + } if ($preparse) { @list_files = $preparse; } # For Uniprot and Refseq, files might have been split by species if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) { my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein'))); @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id); - $_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files); } foreach my $file (@list_files) { - next if ($file =~ /^\./); $file =~ s/\n//; - $file = $file_name . "/" . $file; if (defined $release_file and $file eq $release_file) { next; } $dataflow_params = { From 7fb4c0488dcb8b5d921a18b8db0dd937dd083835 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Thu, 2 May 2024 09:11:45 +0100 Subject: [PATCH 08/37] Keep original files if no species file --- .../Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm index 34da3141e..17a04a762 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm @@ -140,7 +140,10 @@ sub run { # For Uniprot and Refseq, files might have been split by species if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) { my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein'))); - @list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id); + my @species_list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id); + if (scalar(@species_list_files) > 0) { + @list_files = @species_list_files; + } } foreach my $file (@list_files) { From 74296b6af9b528c5d7eeb549cb4ce950892b5202 Mon Sep 17 00:00:00 2001 From: jmgonzmart Date: Fri, 10 May 2024 16:31:16 +0100 Subject: [PATCH 09/37] Updated HGNC custom download URL --- .../Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json | 2 +- .../Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json | 2 +- modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json index aebb77102..b0910be58 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json @@ -203,7 +203,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json index 0f24ec9a8..7fa14c977 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json @@ -241,7 +241,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json index 9bcbf7936..161a41186 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json @@ -269,7 +269,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } From 0c7c41fe7656659400e48200bc2cd47c7df0a5fc Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Wed, 15 May 2024 20:58:04 +0100 Subject: [PATCH 10/37] moved ensembl/xrefs to ensembl/production/xrefs --- nextflow/config/xref.config | 65 +++++++++++++++---- nextflow/workflows/xrefDownload.nf | 10 +-- .../ensembl/{ => production}/xrefs/Base.py | 0 .../{ => production}/xrefs/Checksum.py | 0 .../{ => production}/xrefs/DownloadSource.py | 0 .../xrefs/EmailNotification.py | 0 .../{ => production}/xrefs/ScheduleCleanup.py | 0 .../xrefs/ScheduleDownload.py | 0 .../xrefs/config/xref_all_sources.json | 0 .../xrefs/config/xref_config.ini | 0 10 files changed, 57 insertions(+), 18 deletions(-) rename src/python/ensembl/{ => production}/xrefs/Base.py (100%) rename src/python/ensembl/{ => production}/xrefs/Checksum.py (100%) rename src/python/ensembl/{ => production}/xrefs/DownloadSource.py (100%) rename src/python/ensembl/{ => production}/xrefs/EmailNotification.py (100%) rename src/python/ensembl/{ => production}/xrefs/ScheduleCleanup.py (100%) rename src/python/ensembl/{ => production}/xrefs/ScheduleDownload.py (100%) rename src/python/ensembl/{ => production}/xrefs/config/xref_all_sources.json (100%) rename src/python/ensembl/{ => production}/xrefs/config/xref_config.ini (100%) diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index bc36b8617..66bebf05d 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -22,23 +22,62 @@ params.base_path = '' params.clean_files = 1 params.clean_dir = "${params.base_path}/clean_files" -executor { - name = 'slurm' - queue = 'production' - queueSize = 100 +trace { + enabled = true + file = "trace" + overwrite = true } -process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } - maxRetries = 3 - time = '1d' +report { + overwrite = true + file = "report.html" + enable = true +} + +profiles { - withLabel:small_process { - memory = 200.MB - executor.perTaskReserve = 200.MB + lsf { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + executor = 'lsf' + queue = 'production' + queueSize = 100 + maxRetries = 3 + withLabel:small_process { + memory = 200.MB + //very specific to lsf + executor.perTaskReserve = 200.MB + } + withLabel: dm { + queue = 'datamover' + time = '2h' + } } + } + + slurm { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + executor = 'slurm' + queue = 'production' + queueSize = 100 + maxRetries = 3 + time = '1d' - withLabel:mem4GB { - time = '3d' + withLabel:small_process { + memory = 200.MB + } + + withLabel: dm { + queue = 'datamover' + time = '2h' + memory = 2.GB + } + withLabel:mem4GB { + time = '3d' + } } + } } + + diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 5c808038e..51cc53552 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -107,7 +107,7 @@ process ScheduleDownload { timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") """ - python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -126,7 +126,7 @@ process DownloadSource { src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] """ - python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} """ } @@ -156,7 +156,7 @@ process ScheduleCleanup { path 'dataflow_cleanup_sources.json' """ - python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp """ } @@ -171,7 +171,7 @@ process Checksum { val 'ChecksumDone' """ - python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp """ } @@ -234,6 +234,6 @@ process NotifyByEmail { val timestamp """ - python ${params.scripts_dir}/run_module.py --module ensembl.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ } diff --git a/src/python/ensembl/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py similarity index 100% rename from src/python/ensembl/xrefs/Base.py rename to src/python/ensembl/production/xrefs/Base.py diff --git a/src/python/ensembl/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py similarity index 100% rename from src/python/ensembl/xrefs/Checksum.py rename to src/python/ensembl/production/xrefs/Checksum.py diff --git a/src/python/ensembl/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py similarity index 100% rename from src/python/ensembl/xrefs/DownloadSource.py rename to src/python/ensembl/production/xrefs/DownloadSource.py diff --git a/src/python/ensembl/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py similarity index 100% rename from src/python/ensembl/xrefs/EmailNotification.py rename to src/python/ensembl/production/xrefs/EmailNotification.py diff --git a/src/python/ensembl/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py similarity index 100% rename from src/python/ensembl/xrefs/ScheduleCleanup.py rename to src/python/ensembl/production/xrefs/ScheduleCleanup.py diff --git a/src/python/ensembl/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py similarity index 100% rename from src/python/ensembl/xrefs/ScheduleDownload.py rename to src/python/ensembl/production/xrefs/ScheduleDownload.py diff --git a/src/python/ensembl/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json similarity index 100% rename from src/python/ensembl/xrefs/config/xref_all_sources.json rename to src/python/ensembl/production/xrefs/config/xref_all_sources.json diff --git a/src/python/ensembl/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini similarity index 100% rename from src/python/ensembl/xrefs/config/xref_config.ini rename to src/python/ensembl/production/xrefs/config/xref_config.ini From 039335714034a705b4708a6e7f9f42c66c299fa0 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Wed, 15 May 2024 21:38:07 +0100 Subject: [PATCH 11/37] Update xref.config add memory 4gg to slurm profile --- nextflow/config/xref.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 66bebf05d..3737c5aff 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -75,6 +75,7 @@ profiles { } withLabel:mem4GB { time = '3d' + memory = 4.GB } } } From 4391762fb75228fed7626bd0c695037124450cfc Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Thu, 16 May 2024 10:00:31 +0100 Subject: [PATCH 12/37] base load changed to ensembl.production.xrefs --- src/python/ensembl/production/xrefs/Checksum.py | 2 +- src/python/ensembl/production/xrefs/DownloadSource.py | 2 +- src/python/ensembl/production/xrefs/EmailNotification.py | 2 +- src/python/ensembl/production/xrefs/ScheduleCleanup.py | 2 +- src/python/ensembl/production/xrefs/ScheduleDownload.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index dc59d5c3f..7ccb401a7 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -14,7 +14,7 @@ """Checksum module for the Xref Download pipeline.""" -from ensembl.xrefs.Base import * +from ensembl.production.xrefs.Base import * class Checksum(Base): def run(self): diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py index b88088960..060fcb116 100644 --- a/src/python/ensembl/production/xrefs/DownloadSource.py +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -14,7 +14,7 @@ """Download module to download xref and version files.""" -from ensembl.xrefs.Base import * +from ensembl.production.xrefs.Base import * class DownloadSource(Base): def run(self): diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 75a23012d..22738d990 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -14,7 +14,7 @@ """Email module to send user emails notifying of xref pipelines end, with important information and statistics.""" -from ensembl.xrefs.Base import * +from ensembl.production.xrefs.Base import * from smtplib import SMTP from email.message import EmailMessage diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 515150e9d..58396b33a 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -14,7 +14,7 @@ """Scheduling module to create cleanup jobs for specific xref sources.""" -from ensembl.xrefs.Base import * +from ensembl.production.xrefs.Base import * class ScheduleCleanup(Base): def run(self): diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index b2e48aa86..8001bccc8 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -14,7 +14,7 @@ """Scheduling module to create download jobs for all xref sources in config file.""" -from ensembl.xrefs.Base import * +from ensembl.production.xrefs.Base import * class ScheduleDownload(Base): def run(self): From 6f63ae1947e8eace093818df1a99cbedcbf4ae4a Mon Sep 17 00:00:00 2001 From: "J. Alvarez-Jarreta" Date: Thu, 16 May 2024 10:53:51 +0100 Subject: [PATCH 13/37] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 71c0445d4..529bccc7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git # ensembl-py ensembl-metadata-api @ git+https://github.com/Ensembl/ensembl-metadata-api.git@2.0.1a2 # via -r requirements.in -ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.2.2 +ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git # via ensembl-metadata-api exceptiongroup==1.2.0 # via From 15b4eade8a539cc93711e2b983d994abd8da77ec Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 20 May 2024 10:05:47 +0100 Subject: [PATCH 14/37] Updated default resources from 100mb to 1gb --- .../Production/Pipeline/PipeConfig/Base_conf.pm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm index ac0fca871..a24322ef5 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm @@ -93,13 +93,13 @@ sub resource_classes { my %output = ( #Default is a duplicate of 100M - 'default' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' }, + 'default' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $pq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' }, #Data mover nodes - 'dm' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'dm_D' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'dm_W' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' }, + 'dm' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'dm_D' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'dm_W' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' }, 'dm32_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' }, 'dmMAX_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' }, ); From 202130276fbf7ef45c9a993cd23c21964bff6110 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 23 May 2024 23:26:15 +0100 Subject: [PATCH 15/37] Updated JSON remodeler to stop Experimental push on scalar is now forbidden with new perl version --- .../Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm index d8e8328da..a6738edc0 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm @@ -292,7 +292,10 @@ sub merge_xrefs { $obj->{$dbname} = []; } for my $ann ( @{ $subobj->{$dbname} } ) { - push $obj->{$dbname}, $self->copy_hash($ann); + if (ref($obj->{$dbname}) ne 'ARRAY') { + $obj->{$dbname} = []; + } + push @{ $obj->{$dbname} }, $self->copy_hash($ann); } } } From 3290f218d42f2e1e996d241c5b35426d735a2ffc Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 24 May 2024 12:16:18 +0100 Subject: [PATCH 16/37] Update xref_all_sources.json for RGD --- .../ensembl/production/xrefs/config/xref_all_sources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index 1edc6e1da..e7b0065a4 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -91,7 +91,7 @@ { "name" : "RGD", "parser" : "RGDParser", - "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", "priority" : 2 }, { From 975fa4a81c940e90f05493fdedd70e37a9d77d3a Mon Sep 17 00:00:00 2001 From: nwillhoft <70575561+nwillhoft@users.noreply.github.com> Date: Tue, 28 May 2024 11:04:12 +0100 Subject: [PATCH 17/37] Update tag names and info relating to gencode genesets --- modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm index 79358a04d..d95763721 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm @@ -383,7 +383,8 @@ feature for the position of this on the genome - cds_start_NF: the coding region start could not be confirmed - mRNA_end_NF: the mRNA end could not be confirmed - mRNA_start_NF: the mRNA start could not be confirmed. -- basic: the transcript is part of the gencode basic geneset +- gencode_basic: the transcript is part of the gencode basic geneset +- gencode_primary: the transcript is part of the gencode primary geneset Comments From 95c13207e4ae13a8cdfc617095d395ac1d0cbc99 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Wed, 29 May 2024 09:28:22 +0100 Subject: [PATCH 18/37] Bugfix for files not being overwritten --- nextflow/config/xref.config | 5 +++-- nextflow/workflows/xrefDownload.nf | 6 +++++- scripts/xrefs/cleanup_and_split_source.pl | 20 +++++++++++++------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 3737c5aff..024f80e68 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -17,6 +17,7 @@ params.reuse_db = 0 params.skip_preparse = 1 params.split_files_by_species = 1 params.tax_ids_file = '' +params.update_mode = 0 params.base_path = '' params.clean_files = 1 @@ -70,11 +71,11 @@ profiles { withLabel: dm { queue = 'datamover' - time = '2h' + time = '3h' memory = 2.GB } withLabel:mem4GB { - time = '3d' + time = '5d' memory = 4.GB } } diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 51cc53552..65e255fda 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -18,6 +18,7 @@ println """\ sources_config_file : ${params.sources_config_file} clean_dir : ${params.clean_dir} tax_ids_file : ${params.tax_ids_file} + update_mode : ${params.update_mode} """ .stripIndent() @@ -57,6 +58,9 @@ def helpMessage() { --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. Used to update the data for the provided species. + + --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. + Only used if --tax_ids_file is set. Default: 0 """.stripIndent() } @@ -198,7 +202,7 @@ process CleanupSplitSource { } """ - perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params + perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params """ } diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 6a09971ea..3beabbcd6 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -19,7 +19,7 @@ use Getopt::Long; use Carp; use DBI; -use File::Path qw/make_path/; +use File::Path qw/make_path rmtree/; use File::Spec::Functions; use HTTP::Tiny; use JSON; @@ -28,7 +28,7 @@ use Nextflow::Utils; -my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $log_timestamp); +my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $update_mode, $log_timestamp); GetOptions( 'base_path=s' => \$base_path, 'source_db_url=s' => \$source_db_url, @@ -37,14 +37,17 @@ 'clean_files=i' => \$clean_files, 'version_file:s' => \$version_file, 'tax_ids_file:s' => \$tax_ids_file, + 'update_mode:i' => \$update_mode, 'log_timestamp:s' => \$log_timestamp ); # Check that all mandatory parameters are passed if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($clean_files)) { - croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--log_timestamp ]"; + croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; } +if (!defined($update_mode)) {$update_mode = 0;} + my $log_file; if (defined($log_timestamp)) { my $log_path = catdir($base_path, 'logs', $log_timestamp); @@ -71,6 +74,9 @@ my $output_path = $clean_dir."/".$clean_name; # Create needed directories +if (!$update_mode) { + rmtree($output_path); +} make_path($output_path); my $sources_to_remove; @@ -102,7 +108,7 @@ # Extract taxonomy IDs my %tax_ids; my ($skipped_species, $added_species) = (0, 0); -if ($tax_ids_file) { +if ($tax_ids_file && $update_mode) { open my $fh, '<', $tax_ids_file; chomp(my @lines = <$fh>); close $fh; @@ -212,10 +218,10 @@ if (!defined($current_species_id) || (defined($current_species_id) && $species_id ne $current_species_id)) { close($out_fh) if (defined($current_species_id)); - my $species_id_str = sprintf("%04d", $species_id); - my @digits = split('', $species_id_str); + my $species_id_str = sprintf("%04d", $species_id); + my @digits = split('', $species_id_str); - $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); + $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); make_path($write_path); $write_file = $write_path."/".$output_file_name."-".$species_id; From 3ad6a021581c3a13d4279c26c76c4e038cddcf4e Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 4 Jun 2024 08:59:16 +0100 Subject: [PATCH 19/37] Fix for when no species file is found --- modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm index 17a04a762..27d16dcca 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm @@ -148,6 +148,7 @@ sub run { foreach my $file (@list_files) { $file =~ s/\n//; + if (!-f $file) { next; } if (defined $release_file and $file eq $release_file) { next; } $dataflow_params = { From 9586a89aee2ed9ab1db90a5daaa9eab023ceac50 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:55:35 +0100 Subject: [PATCH 20/37] Update ProteinFeatures analysis new member db Phobius (1.01) and SignalP_GRAM_POSITIVE (4.1) added --- .../Pipeline/PipeConfig/ProteinFeatures_conf.pm | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm index ec402777e..2ffa87fdf 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm @@ -227,6 +227,22 @@ sub default_options { ipscan_xml => 'TMHMM', ipscan_lookup => 0, }, + { + db => 'Phobius', + ipscan_lookup => 1, + ipscan_name => 'Phobius', + ipscan_xml => 'PHOBIUS', + logic_name => 'Phobius', + program => 'InterProScan', + }, + { + db => 'SignalP_GRAM_POSITIVE', + ipscan_lookup => 1, + ipscan_name => 'SignalP_GRAM_POSITIVE', + ipscan_xml => 'SIGNALP_GRAM_POSITIVE', + logic_name => 'SignalP_GRAM_POSITIVE', + program => 'InterProScan', + }, #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html) { logic_name => 'seg', From f065884595904aa3e8d9c34b0444b376bc53f941 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Tue, 4 Jun 2024 20:25:59 +0100 Subject: [PATCH 21/37] Update ProteinFeatures_conf.pm --- .../Pipeline/PipeConfig/ProteinFeatures_conf.pm | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm index 2ffa87fdf..e449f7b51 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm @@ -243,6 +243,14 @@ sub default_options { logic_name => 'SignalP_GRAM_POSITIVE', program => 'InterProScan', }, + { + db => 'SignalP_GRAM_NEGATIVE', + ipscan_lookup => 1, + ipscan_name => 'SignalP_GRAM_NEGATIVE', + ipscan_xml => 'SIGNALP_GRAM_NEGATIVE', + logic_name => 'SignalP_GRAM_NEGATIVE', + program => 'InterProScan', + }, #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html) { logic_name => 'seg', From d283cf09fc2516876cb77df899d56903c604a2bb Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:07:03 +0100 Subject: [PATCH 22/37] Update xref_sources.json update xref source Xenbase mapping file GenePageEnsemblModelMapping_4.1.txt --- modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json index 161a41186..d12edb9cf 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json @@ -254,7 +254,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { From d74f87eb7e5bbc3b134366aa09b81b83136eab64 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:08:57 +0100 Subject: [PATCH 23/37] Update xref_all_sources.json update xenbase mapping file --- .../Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json index 7fa14c977..5b6361536 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json @@ -226,7 +226,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { From b2da23c1840bb10dde5f29cfb9717ef93f913b4a Mon Sep 17 00:00:00 2001 From: Stefano Giorgetti Date: Thu, 20 Jun 2024 13:36:10 +0000 Subject: [PATCH 24/37] Fixed as per ENSPROD-9493 --- .../Production/Pipeline/AlphaFold/InsertProteinFeatures.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm index 059ac4a3e..0178fc21e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm @@ -164,7 +164,7 @@ sub run { -db => 'alphafold', -db_version => $alpha_version, -db_file => $self->param('db_dir') . '/accession_ids.csv', - -display_label => 'AlphaFold DB import', + -display_label => 'AFDB-ENSP mapping', -displayable => '1', -description => 'Protein features based on AlphaFold predictions, mapped with GIFTS or UniParc' ); From 7e4aca521363e3550e1e3cb1deecc670a1339077 Mon Sep 17 00:00:00 2001 From: John Tate Date: Mon, 24 Jun 2024 17:26:01 +0100 Subject: [PATCH 25/37] Fix use of keys on a scalar Change the use of an experimental, now removed, feature that allowed a hashref scalar to be used in place of a hash. --- .../Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm index 28aa1ba06..d5d8e3a8a 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm @@ -218,7 +218,7 @@ sub all_hashes { } ## end foreach my $slice (@slices) for my $seq_type (keys %$batch) { - for my $attrib_table (keys $batch->{$seq_type}) { + for my $attrib_table (keys %{$batch->{$seq_type}}) { $attribute_adaptor->store_batch_on_Object($attrib_table, $batch->{$seq_type}->{$attrib_table}, 1000); } } From a3ff2b44c29af7d4a6e819dfdf38f67178f32452 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Wed, 26 Jun 2024 17:07:18 +0100 Subject: [PATCH 26/37] Fixes for 113 issues --- .../Pipeline/PipeConfig/XrefProcess_conf.pm | 2 ++ .../EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm index 2e11ce311..0903a1947 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm @@ -173,6 +173,7 @@ sub pipeline_analyses { base_path => $self->o('base_path'), release => $self->o('release') }, + -max_retry_count => 0, -flow_into => { '2->A' => 'dump_xref', 'A->1' => 'schedule_mapping' @@ -187,6 +188,7 @@ sub pipeline_analyses { release => $self->o('release'), config_file => $self->o('config_file') }, + -max_retry_count => 0, -flow_into => { 2 => 'align_factory' }, -rc_name => '1GB', }, diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm index b3233ea9d..ef4868850 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm @@ -84,10 +84,18 @@ sub run { $exe =~ s/\n//g; my $command_string = sprintf ("%s --showalignment FALSE --showvulgar FALSE --ryo '%s' --gappedextension FALSE --model 'affine:local' %s --subopt no --query %s --target %s --querychunktotal %s --querychunkid %s", $exe, $ryo, $method, $source, $target, $max_chunks, $chunk); my $output = `$command_string`; - my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments - while (my $hit = shift @hits) { - print $fh $hit . "\n"; + if ($? == 0) { + my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments + + while (my $hit = shift @hits) { + print $fh $hit . "\n"; + } + } else { + my $job = $self->input_job(); + $job->adaptor()->db()->get_LogMessageAdaptor()->store_job_message($job->dbID(), $output, 'WORKER_ERROR'); + + throw("Exonerate failed with exit_code: $?\n"); } $fh->close(); From ee5da99d1786cf413d571478e1385b08bdef9961 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:13:10 +0100 Subject: [PATCH 27/37] Update SourceFactory.pm Fix for Experimental keys on a scalar is now forbidden --- .../Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm index 4599d8636..37cd8b09e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm @@ -59,7 +59,7 @@ sub write_output { my $compara_param = $self->param('compara'); my $cleanup_dir = $self->param('cleanup_dir'); - foreach my $pair (keys $sp_config) { + foreach my $pair (keys %{$sp_config}) { my $compara = $sp_config->{$pair}->{'compara'}; if (defined $compara_param && $compara ne $compara_param) { print STDERR "Skipping $compara\n"; From d1ce2933a759494c59e81d274029f9e8fd088618 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:27:32 +0100 Subject: [PATCH 28/37] Updated Base class with slurm default resource 1GB --- .../Pipeline/PipeConfig/Base_conf.pm | 44 +++++++------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm index 86d0ca7a9..a340f0602 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm @@ -66,14 +66,14 @@ sub beekeeper_extra_cmdline_options { sub resource_classes { my $self = shift; + ## String it together + my %time = ( + H => ' --time=1:00:00', + D => ' --time=1-00:00:00', + W => ' --time=7-00:00:00' + ); - ## Sting it together - my %time = (H => ' --time=1:00:00', - D => ' --time=1-00:00:00', - W => ' --time=7-00:00:00',); - - my %memory = ('100M' => '100', - '200M' => '200', + my %memory = ( '500M' => '500', '1GB' => '1000', '2GB' => '2000', @@ -89,40 +89,30 @@ sub resource_classes { ); my $dq = ' --partition=datamover'; - my %output = ( #Default is a duplicate of 100M - 'default' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' }, - 'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' }, + 'default' => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1G'} . 'm' }, + 'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1G'} . 'm' }, + 'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1G'} . 'm' }, #Data mover nodes - 'dm' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, - 'dm_D' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, - 'dm_W' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' }, - 'dm32_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' }, - 'dmMAX_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' }, + 'dm' => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'dm_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'dm_W' => { 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'dm32_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' }, + 'dmMAX_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' }, ); - #Create a dictionary of all possible time and memory combinations. Format would be: - #2G={ - # 'SLURM' => ' --time=1:00:00 --mem=2000m', - # 'LSF' => '-q $self->o(production_queue) -M 2000 -R "rusage[mem=2000]"' - # }; while ((my $time_key, my $time_value) = each(%time)) { while ((my $memory_key, my $memory_value) = each(%memory)) { if ($time_key eq 'H') { - $output{$memory_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"', - 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' } + $output{$memory_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }; } else { - $output{$memory_key . '_' . $time_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"', - 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' } + $output{$memory_key . '_' . $time_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }; } } } - return \%output; - } 1; From 772caf1c280ce8dc01706f741400674f1abe5367 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:32:16 +0100 Subject: [PATCH 29/37] Update Typo GB --- .../Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm index a340f0602..d57a14a70 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm @@ -91,9 +91,9 @@ sub resource_classes { my $dq = ' --partition=datamover'; my %output = ( #Default is a duplicate of 100M - 'default' => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1G'} . 'm' }, - 'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1G'} . 'm' }, - 'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1G'} . 'm' }, + 'default' => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, + 'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' }, #Data mover nodes 'dm' => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' }, 'dm_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' }, From 2b01517b2f6615cec5805a72435c14091e1d99ab Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Mon, 22 Jul 2024 16:25:21 +0100 Subject: [PATCH 30/37] decompress upidump.lis.gz file before load to hive db --- .../Pipeline/PipeConfig/ProteinFeatures_conf.pm | 2 +- .../Pipeline/ProteinFeatures/LoadUniParc.pm | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm index e449f7b51..dbc95951a 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm @@ -75,7 +75,7 @@ sub default_options { interpro_file => 'names.dat', interpro2go_file => 'interpro2go', - uniparc_file => 'upidump.lis', + uniparc_file => 'upidump.lis.gz', mapping_file => 'idmapping_selected.tab.gz', # Files are retrieved and stored locally with the same name. diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm index 40eddb1b4..4d97c8950 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm @@ -21,16 +21,25 @@ package Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc; use strict; use warnings; - +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use File::Basename; - use base ('Bio::EnsEMBL::Production::Pipeline::Common::Base'); sub run { my ($self) = @_; my $uniparc_file = $self->param_required('uniparc_file_local'); + if (-e $uniparc_file) { + + #check if uniparc file is compressed + if ($uniparc_file =~ /\.gz$/){ + my $uniparc_file_decompress = $uniparc_file; + $uniparc_file_decompress =~ s/\.gz$//; + gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError"); + $uniparc_file = $uniparc_file_decompress; + } + my $dbh = $self->hive_dbh; my $sql = "LOAD DATA LOCAL INFILE '$uniparc_file' INTO TABLE uniparc FIELDS TERMINATED BY ' '"; $dbh->do($sql) or self->throw($dbh->errstr); From 434e865ab149ed266c76366d2f6820d2df424f64 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Mon, 22 Jul 2024 16:55:38 +0100 Subject: [PATCH 31/37] delete the upidump file after loading into hive db --- .../Production/Pipeline/ProteinFeatures/LoadUniParc.pm | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm index 4d97c8950..56eb488df 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm @@ -37,6 +37,8 @@ sub run { my $uniparc_file_decompress = $uniparc_file; $uniparc_file_decompress =~ s/\.gz$//; gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError"); + #delete compressed file .gz + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file"); $uniparc_file = $uniparc_file_decompress; } @@ -50,9 +52,14 @@ sub run { my $index_2 = 'ALTER TABLE uniparc ADD KEY md5sum_idx (md5sum) USING HASH'; $dbh->do($index_2) or self->throw($dbh->errstr); + #delete upidump file from pipeline direcotry after loading into hive db + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file"); + } else { $self->throw("Checksum file '$uniparc_file' does not exist"); } + + } 1; From 693c899042e1f1710f8875215daf9b92edaa55ab Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:39:30 +0100 Subject: [PATCH 32/37] Update modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm Co-authored-by: John Tate --- .../EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm index 56eb488df..e54038880 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm @@ -38,7 +38,7 @@ sub run { $uniparc_file_decompress =~ s/\.gz$//; gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError"); #delete compressed file .gz - unlink $uniparc_file or $self->throw("unable to delete $uniparc_file"); + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!"); $uniparc_file = $uniparc_file_decompress; } From 51cc0c35a49aa430ddc67092052b5fd6928053d4 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:39:39 +0100 Subject: [PATCH 33/37] Update modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm Co-authored-by: John Tate --- .../EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm index e54038880..444b7a2bb 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm @@ -53,7 +53,7 @@ sub run { $dbh->do($index_2) or self->throw($dbh->errstr); #delete upidump file from pipeline direcotry after loading into hive db - unlink $uniparc_file or $self->throw("unable to delete $uniparc_file"); + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!"); } else { $self->throw("Checksum file '$uniparc_file' does not exist"); From 0eae3f67f37be6f754347bb70d11ac411885df24 Mon Sep 17 00:00:00 2001 From: Paulo Lins Date: Thu, 25 Jul 2024 09:45:23 +0100 Subject: [PATCH 34/37] Include human and mouse symlinks --- scripts/py/regulation_ftp_symlinks.py | 53 ++++++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py index a8a9c663b..ba66b919a 100644 --- a/scripts/py/regulation_ftp_symlinks.py +++ b/scripts/py/regulation_ftp_symlinks.py @@ -42,7 +42,7 @@ from pathlib import Path # Human and Mouse follow a different dir structure -SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"] +# SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"] # GENE-SWITCH species GENE_SWITCH_SPECIES = [ @@ -54,8 +54,12 @@ PUBLIC_PUB_PATH = "PUBLIC/pub" DATA_FILES_PATH = "data_files/" DATA_FILES_PATH_TEMPLATE = "{ftp_path}/data_files/{species}/{assembly}/funcgen" -RELEASE_FOLDER_PATH_TEMPLATE = "{ftp_path}/release-{release}/regulation/{species}/{assembly}" -MISC_GENE_SWITCH_PATH_TEMPLATE = "{ftp_path}/misc/gene-switch/regulation/{species}/{assembly}" +RELEASE_FOLDER_PATH_TEMPLATE = ( + "{ftp_path}/release-{release}/regulation/{species}/{assembly}" +) +MISC_GENE_SWITCH_PATH_TEMPLATE = ( + "{ftp_path}/misc/gene-switch/regulation/{species}/{assembly}" +) ANALYSIS_TYPE_PEAKS = "peaks" ANALYSIS_TYPE_SIGNAL = "signal" @@ -135,7 +139,9 @@ def get_species_with_analysis_type_folder(analysis_type, ftp_path): def get_most_recent_release_data_file_path(data_file_path): validator.is_dir(Path(data_file_path)) available_releases = listdir(data_file_path) - return Path(data_file_path) / str(max([int(release) for release in available_releases])) + return Path(data_file_path) / str( + max([int(release) for release in available_releases]) + ) utils = Utils() @@ -156,10 +162,14 @@ def __init__(self, **path_specifics): ) self.sources = { "release_folder": Path( - RELEASE_FOLDER_PATH_TEMPLATE.format(**self.aliased_paths(**path_specifics)) + RELEASE_FOLDER_PATH_TEMPLATE.format( + **self.aliased_paths(**path_specifics) + ) ), "misc_folder": Path( - MISC_GENE_SWITCH_PATH_TEMPLATE.format(**self.aliased_paths(**path_specifics)) + MISC_GENE_SWITCH_PATH_TEMPLATE.format( + **self.aliased_paths(**path_specifics) + ) ), } @@ -172,7 +182,8 @@ def get(self, key): def symlink2rf(self, analysis_type, only_remove=False, relative=True): target = ( - Path(path.relpath(self.target, self.sources["release_folder"])) / analysis_type + Path(path.relpath(self.target, self.sources["release_folder"])) + / analysis_type if relative else self.target / analysis_type ) @@ -188,7 +199,8 @@ def symlink2misc(self, analysis_type, only_remove=False, relative=True): makedirs(self.sources["misc_folder"]) target = ( - Path(path.relpath(self.target, self.sources["misc_folder"])) / analysis_type + Path(path.relpath(self.target, self.sources["misc_folder"])) + / analysis_type if relative else self.target / analysis_type ) @@ -203,17 +215,26 @@ def _symlink(self, source, target, only_remove): if not only_remove: source.symlink_to(target, target_is_directory=True) if validator.is_symlink(source, check=True): - logger.info(f"{source} -> {target} --- was successfully created") + logger.info( + f"{source} -> {target} --- was successfully created" + ) else: if not validator.is_symlink(source, check=True): - logger.info(f"{source} -> {target} -- was successfully removed") + logger.info( + f"{source} -> {target} -- was successfully removed" + ) def aliased_paths(self, **kwargs): - return {key: self.RELEASE_PATH_ALIASES.get(value, value) for key, value in kwargs.items()} + return { + key: self.RELEASE_PATH_ALIASES.get(value, value) + for key, value in kwargs.items() + } @staticmethod def search(analysis_type, ftp_path, release): - result = utils.get_species_with_analysis_type_folder(analysis_type, ftp_path) + result = utils.get_species_with_analysis_type_folder( + analysis_type, ftp_path + ) return [ RegulationSymlinkFTP( analysis_type=analysis_type, @@ -275,13 +296,17 @@ def parse_arguments(): ftp_path = args.ftp_path / PUBLIC_PUB_PATH logger.info("Searching for peaks in data_files ...") - peaks = RegulationSymlinkFTP.search(ANALYSIS_TYPE_PEAKS, ftp_path, args.release_version) + peaks = RegulationSymlinkFTP.search( + ANALYSIS_TYPE_PEAKS, ftp_path, args.release_version + ) for peak in peaks: peak.symlink2rf("peaks", only_remove=args.delete_symlinks) peak.symlink2misc("peaks", only_remove=args.delete_symlinks) logger.info("Searching for signals in data_files ...") - signals = RegulationSymlinkFTP.search(ANALYSIS_TYPE_SIGNAL, ftp_path, args.release_version) + signals = RegulationSymlinkFTP.search( + ANALYSIS_TYPE_SIGNAL, ftp_path, args.release_version + ) for signal in signals: signal.symlink2rf("signal", only_remove=args.delete_symlinks) signal.symlink2misc("signal", only_remove=args.delete_symlinks) From 3542ca1265fb905cba0675b3d52118ccac2e9391 Mon Sep 17 00:00:00 2001 From: Paulo Lins Date: Thu, 25 Jul 2024 10:36:20 +0100 Subject: [PATCH 35/37] include Mouse and Human symlinks --- scripts/py/regulation_ftp_symlinks.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py index ba66b919a..938ae0f09 100644 --- a/scripts/py/regulation_ftp_symlinks.py +++ b/scripts/py/regulation_ftp_symlinks.py @@ -43,6 +43,7 @@ # Human and Mouse follow a different dir structure # SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"] +SPECIES_TO_NOT_INCLUDE = [] # GENE-SWITCH species GENE_SWITCH_SPECIES = [ @@ -139,8 +140,15 @@ def get_species_with_analysis_type_folder(analysis_type, ftp_path): def get_most_recent_release_data_file_path(data_file_path): validator.is_dir(Path(data_file_path)) available_releases = listdir(data_file_path) + releases = [] + for release in available_releases: + try: + releases.append(int(release)) + except: + continue + return Path(data_file_path) / str( - max([int(release) for release in available_releases]) + max(releases) ) @@ -235,6 +243,7 @@ def search(analysis_type, ftp_path, release): result = utils.get_species_with_analysis_type_folder( analysis_type, ftp_path ) + __import__("ipdb").set_trace() return [ RegulationSymlinkFTP( analysis_type=analysis_type, @@ -244,7 +253,7 @@ def search(analysis_type, ftp_path, release): release=release, ) for species, assemblies in result.items() - for assembly in assemblies + for assembly in assemblies if assembly not in ["GRCh37", "GRCm38", "NCBIM37"] ] From ac6c74187bd47a1717dca2bf0066fa783cc9875f Mon Sep 17 00:00:00 2001 From: Paulo Lins Date: Thu, 25 Jul 2024 10:43:11 +0100 Subject: [PATCH 36/37] include Mouse and Human symlinks --- scripts/py/regulation_ftp_symlinks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py index 938ae0f09..3ded73f0c 100644 --- a/scripts/py/regulation_ftp_symlinks.py +++ b/scripts/py/regulation_ftp_symlinks.py @@ -42,7 +42,6 @@ from pathlib import Path # Human and Mouse follow a different dir structure -# SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"] SPECIES_TO_NOT_INCLUDE = [] # GENE-SWITCH species @@ -243,7 +242,6 @@ def search(analysis_type, ftp_path, release): result = utils.get_species_with_analysis_type_folder( analysis_type, ftp_path ) - __import__("ipdb").set_trace() return [ RegulationSymlinkFTP( analysis_type=analysis_type, From bbc136284d77f6acb7ba8d69026ccc6069401182 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:15:13 +0100 Subject: [PATCH 37/37] Update ProteinFeatures_conf.pm Update the Protein feature analysis logic name to lowercase --- .../Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm index dbc95951a..41a743aa4 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm @@ -232,7 +232,7 @@ sub default_options { ipscan_lookup => 1, ipscan_name => 'Phobius', ipscan_xml => 'PHOBIUS', - logic_name => 'Phobius', + logic_name => 'phobius', program => 'InterProScan', }, { @@ -240,7 +240,7 @@ sub default_options { ipscan_lookup => 1, ipscan_name => 'SignalP_GRAM_POSITIVE', ipscan_xml => 'SIGNALP_GRAM_POSITIVE', - logic_name => 'SignalP_GRAM_POSITIVE', + logic_name => 'signalp_gram_positive', program => 'InterProScan', }, { @@ -248,7 +248,7 @@ sub default_options { ipscan_lookup => 1, ipscan_name => 'SignalP_GRAM_NEGATIVE', ipscan_xml => 'SIGNALP_GRAM_NEGATIVE', - logic_name => 'SignalP_GRAM_NEGATIVE', + logic_name => 'signalp_gram_negative', program => 'InterProScan', }, #seg replaces low complexity regions in protein sequences with X characters(https://rothlab.ucdavis.edu/genhelp/seg.html)