From 8aacdf841e8f65f876e7ec32b6cd3a553f07103e Mon Sep 17 00:00:00 2001 From: nitin-ebi <79518737+nitin-ebi@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:41:31 +0100 Subject: [PATCH] EVA-3227 Add file size to metadata Json (#46) * add file size to metadata JSON --- eva_sub_cli/etc/eva_schema.json | 4 ++++ eva_sub_cli/nextflow/validation.nf | 18 +++++++++--------- eva_sub_cli/validators/docker_validator.py | 2 +- eva_sub_cli/validators/validator.py | 16 +++++++++++----- tests/test_docker_validator.py | 2 +- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index e4e3028..1a016af 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -346,6 +346,10 @@ "type": "string", "description": "File name" }, + "fileSize": { + "type": "number", + "description": "Size of file in bytes" + }, "md5": { "type": "string", "description": "MD5 hash of the file" diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index a382150..1525ea9 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -78,8 +78,8 @@ workflow { check_vcf_valid(vcf_channel) check_vcf_reference(vcf_channel) - generate_md5_digests(vcf_files) - collect_md5(generate_md5_digests.out.md5_digest.collect()) + generate_file_size_and_md5_digests(vcf_files) + collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect()) // Metadata conversion @@ -151,34 +151,34 @@ process check_vcf_reference { """ } -process generate_md5_digests { +process generate_file_size_and_md5_digests { input: path(vcf_file) output: - path "${vcf_file}.md5", emit: md5_digest + path "${vcf_file}_file_size_and_digest_info.txt", emit: file_size_and_digest_info script: // Capture the realpath of the vcf to be able to resolve the file based on path instead of name """ - md5sum `readlink $vcf_file` > ${vcf_file}.md5 + echo "\$(md5sum $vcf_file | awk '{print \$1}') \$(stat -c%s $vcf_file) \$(readlink -f $vcf_file)" > ${vcf_file}_file_size_and_digest_info.txt """ } -process collect_md5 { +process collect_file_size_and_md5 { publishDir output_dir, overwrite: true, mode: "copy" input: - path(file_digests) + path(file_size_and_digests) output: - path "md5sums.txt", emit: md5_digest_log + path "file_info.txt", emit: file_info_log script: """ - cat $file_digests > md5sums.txt + cat $file_size_and_digests > file_info.txt """ } diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index e0345a1..eb9c977 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -12,7 +12,7 @@ logger = logging_config.get_logger(__name__) container_image = 'ebivariation/eva-sub-cli' -container_tag = 'v0.0.1.dev10' +container_tag = 'v0.0.1.dev11' container_validation_dir = '/opt/vcf_validation' container_validation_output_dir = 'vcf_validation_output' diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 9795531..bd201d5 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -363,7 +363,7 @@ def _collect_metadata_results(self): self._parse_semantic_metadata_results() self._convert_biovalidator_validation_to_spreadsheet() self._write_spreadsheet_validation_results() - self._collect_md5sum_to_metadata() + self._collect_file_info_to_metadata() def _load_spreadsheet_conversion_errors(self): errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', @@ -513,23 +513,27 @@ def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf): if attributes_dict[attribute] == json_attribute: return attribute - def _collect_md5sum_to_metadata(self): - md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'md5sums.txt')) + def _collect_file_info_to_metadata(self): + md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'file_info.txt')) file_path_2_md5 = {} file_name_2_md5 = {} + file_path_2_file_size = {} + file_name_2_file_size = {} if md5sum_file: with open(md5sum_file) as open_file: for line in open_file: sp_line = line.split(' ') md5sum = sp_line[0] - vcf_file = line.strip()[len(md5sum):].lstrip() # Remove the md5: the rest is the file path + file_size = int(sp_line[1]) + vcf_file = sp_line[2].strip() file_path_2_md5[vcf_file] = md5sum file_name_2_md5[os.path.basename(vcf_file)] = md5sum + file_path_2_file_size[vcf_file] = file_size + file_name_2_file_size[os.path.basename(vcf_file)] = file_size if self.metadata_json_post_validation: with open(self.metadata_json_post_validation) as open_file: try: json_data = json.load(open_file) - analysis_aliases = [a.get('analysisAlias') for a in json_data.get('analysis', [])] file_rows = [] files_from_metadata = json_data.get('files', []) if files_from_metadata: @@ -538,6 +542,8 @@ def _collect_md5sum_to_metadata(self): file_path = self._validation_file_path_for(file_dict.get('fileName')) file_dict['md5'] = file_path_2_md5.get(file_path) or \ file_name_2_md5.get(file_dict.get('fileName')) or '' + file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \ + file_name_2_file_size.get(file_dict.get('fileName')) or '' file_rows.append(file_dict) else: self.error('No file found in metadata and multiple analysis alias exist: ' diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 72aba00..fbe4343 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -130,7 +130,7 @@ def test_validate(self): json_data = json.load(open_file) assert json_data.get('files') == [ {'analysisAlias': 'AA', 'fileName': 'input_passed.vcf', 'fileType': 'vcf', - 'md5': '96a80c9368cc3c37095c86fbe6044fb2'} + 'md5': '96a80c9368cc3c37095c86fbe6044fb2', 'fileSize': 103} ] # Check metadata errors