EVA-3227 Add file size to metadata Json (#46)
* add file size to metadata JSON
nitin-ebi authored Jul 30, 2024
1 parent 5ffe5ee commit 8aacdf8
Showing 5 changed files with 26 additions and 16 deletions.
eva_sub_cli/etc/eva_schema.json (4 changes: 4 additions & 0 deletions)
@@ -346,6 +346,10 @@
         "type": "string",
         "description": "File name"
       },
+      "fileSize": {
+        "type": "number",
+        "description": "Size of file in bytes"
+      },
       "md5": {
         "type": "string",
         "description": "MD5 hash of the file"
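With this schema addition, a validated `files` entry carries the file size in bytes alongside the checksum. For illustration, this is the entry the updated docker validator test further down asserts against, written as a Python dict (the `file_entry` name is only for illustration; the values come from that test):

    # Example 'files' entry after validation (values from tests/test_docker_validator.py below)
    file_entry = {'analysisAlias': 'AA', 'fileName': 'input_passed.vcf', 'fileType': 'vcf',
                  'md5': '96a80c9368cc3c37095c86fbe6044fb2', 'fileSize': 103}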
eva_sub_cli/nextflow/validation.nf (18 changes: 9 additions & 9 deletions)
@@ -78,8 +78,8 @@ workflow {
     check_vcf_valid(vcf_channel)
     check_vcf_reference(vcf_channel)

-    generate_md5_digests(vcf_files)
-    collect_md5(generate_md5_digests.out.md5_digest.collect())
+    generate_file_size_and_md5_digests(vcf_files)
+    collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())


     // Metadata conversion
@@ -151,34 +151,34 @@ process check_vcf_reference {
     """
 }

-process generate_md5_digests {
+process generate_file_size_and_md5_digests {
     input:
     path(vcf_file)

     output:
-    path "${vcf_file}.md5", emit: md5_digest
+    path "${vcf_file}_file_size_and_digest_info.txt", emit: file_size_and_digest_info

     script:
     // Capture the realpath of the vcf to be able to resolve the file based on path instead of name
     """
-    md5sum `readlink $vcf_file` > ${vcf_file}.md5
+    echo "\$(md5sum $vcf_file | awk '{print \$1}') \$(stat -c%s $vcf_file) \$(readlink -f $vcf_file)" > ${vcf_file}_file_size_and_digest_info.txt
     """
 }

-process collect_md5 {
+process collect_file_size_and_md5 {
     publishDir output_dir,
         overwrite: true,
         mode: "copy"

     input:
-    path(file_digests)
+    path(file_size_and_digests)

     output:
-    path "md5sums.txt", emit: md5_digest_log
+    path "file_info.txt", emit: file_info_log

     script:
     """
-    cat $file_digests > md5sums.txt
+    cat $file_size_and_digests > file_info.txt
     """
 }

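Each `*_file_size_and_digest_info.txt` emitted above, and hence each line of the collected `file_info.txt`, is a single space-separated record of MD5 digest, size in bytes and resolved path. A minimal Python sketch of the same computation, with a hypothetical `file_size_and_digest_line` helper (the pipeline itself uses `md5sum`, `stat -c%s` and `readlink -f` as shown above):

    import hashlib
    import os

    def file_size_and_digest_line(vcf_path):
        # Mirrors the shell command: MD5, size in bytes and resolved path, space-separated
        real_path = os.path.realpath(vcf_path)            # readlink -f
        with open(real_path, 'rb') as handle:
            md5 = hashlib.md5(handle.read()).hexdigest()  # md5sum | awk '{print $1}'
        size = os.stat(real_path).st_size                 # stat -c%s
        return f'{md5} {size} {real_path}'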
eva_sub_cli/validators/docker_validator.py (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)

 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev10'
+container_tag = 'v0.0.1.dev11'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'

eva_sub_cli/validators/validator.py (16 changes: 11 additions & 5 deletions)
@@ -363,7 +363,7 @@ def _collect_metadata_results(self):
         self._parse_semantic_metadata_results()
         self._convert_biovalidator_validation_to_spreadsheet()
         self._write_spreadsheet_validation_results()
-        self._collect_md5sum_to_metadata()
+        self._collect_file_info_to_metadata()

     def _load_spreadsheet_conversion_errors(self):
         errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
@@ -513,23 +513,27 @@ def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
             if attributes_dict[attribute] == json_attribute:
                 return attribute

-    def _collect_md5sum_to_metadata(self):
-        md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'md5sums.txt'))
+    def _collect_file_info_to_metadata(self):
+        md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'file_info.txt'))
         file_path_2_md5 = {}
         file_name_2_md5 = {}
+        file_path_2_file_size = {}
+        file_name_2_file_size = {}
         if md5sum_file:
             with open(md5sum_file) as open_file:
                 for line in open_file:
                     sp_line = line.split(' ')
                     md5sum = sp_line[0]
-                    vcf_file = line.strip()[len(md5sum):].lstrip()  # Remove the md5: the rest is the file path
+                    file_size = int(sp_line[1])
+                    vcf_file = sp_line[2].strip()
                     file_path_2_md5[vcf_file] = md5sum
                     file_name_2_md5[os.path.basename(vcf_file)] = md5sum
+                    file_path_2_file_size[vcf_file] = file_size
+                    file_name_2_file_size[os.path.basename(vcf_file)] = file_size
         if self.metadata_json_post_validation:
             with open(self.metadata_json_post_validation) as open_file:
                 try:
                     json_data = json.load(open_file)
                     analysis_aliases = [a.get('analysisAlias') for a in json_data.get('analysis', [])]
                     file_rows = []
                     files_from_metadata = json_data.get('files', [])
                     if files_from_metadata:
@@ -538,6 +542,8 @@ def _collect_md5sum_to_metadata(self):
                             file_path = self._validation_file_path_for(file_dict.get('fileName'))
                             file_dict['md5'] = file_path_2_md5.get(file_path) or \
                                                file_name_2_md5.get(file_dict.get('fileName')) or ''
+                            file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
+                                                    file_name_2_file_size.get(file_dict.get('fileName')) or ''
                             file_rows.append(file_dict)
                     else:
                         self.error('No file found in metadata and multiple analysis alias exist: '
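Taken together, `_collect_file_info_to_metadata` now expects each `file_info.txt` line in the `<md5> <size> <path>` form written by the Nextflow step. A minimal parsing sketch with an illustrative line (MD5 and size borrowed from the test below; the path is hypothetical):

    import os

    line = '96a80c9368cc3c37095c86fbe6044fb2 103 /opt/vcf_files/input_passed.vcf\n'  # illustrative
    sp_line = line.split(' ')
    md5sum = sp_line[0]                      # MD5 digest
    file_size = int(sp_line[1])              # size in bytes
    vcf_file = sp_line[2].strip()            # resolved file path
    file_name = os.path.basename(vcf_file)   # fallback key when the metadata stores only a file name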
tests/test_docker_validator.py (2 changes: 1 addition & 1 deletion)
@@ -130,7 +130,7 @@ def test_validate(self):
             json_data = json.load(open_file)
             assert json_data.get('files') == [
                 {'analysisAlias': 'AA', 'fileName': 'input_passed.vcf', 'fileType': 'vcf',
-                 'md5': '96a80c9368cc3c37095c86fbe6044fb2'}
+                 'md5': '96a80c9368cc3c37095c86fbe6044fb2', 'fileSize': 103}
             ]

             # Check metadata errors
