diff --git a/quast_libs/ca_utils/align_contigs.py b/quast_libs/ca_utils/align_contigs.py index 05c484f4d9..0e351cfdda 100644 --- a/quast_libs/ca_utils/align_contigs.py +++ b/quast_libs/ca_utils/align_contigs.py @@ -8,7 +8,7 @@ from __future__ import with_statement import os -from os.path import isfile, join, getsize, basename, dirname +from os.path import isfile, basename import datetime import shutil import sys @@ -19,7 +19,7 @@ create_nucmer_output_dir, clean_tmp_files, get_installed_emem, reset_aligner_selection, draw_mummer_plot from quast_libs.log import get_logger -from quast_libs.qutils import is_python2, safe_create +from quast_libs.qutils import is_python2, md5 logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) @@ -33,8 +33,8 @@ class NucmerStatus: def create_nucmer_successful_check(fpath, contigs_fpath, ref_fpath): nucmer_successful_check_file = open(fpath, 'w') - nucmer_successful_check_file.write("Assembly file size in bytes: %d\n" % getsize(contigs_fpath)) - nucmer_successful_check_file.write("Reference file size in bytes: %d\n" % getsize(ref_fpath)) + nucmer_successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath)) + nucmer_successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath)) nucmer_successful_check_file.write("Successfully finished on " + datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + '\n') nucmer_successful_check_file.close() @@ -44,9 +44,9 @@ def check_nucmer_successful_check(fpath, contigs_fpath, ref_fpath): successful_check_content = open(fpath).read().split('\n') if len(successful_check_content) < 2: return False - if not successful_check_content[0].strip().endswith(str(getsize(contigs_fpath))): + if successful_check_content[0].strip().split()[-1] != str(md5(contigs_fpath)): return False - if not successful_check_content[1].strip().endswith(str(getsize(ref_fpath))): + if successful_check_content[1].strip().split()[-1] != str(md5(ref_fpath)): return False return True diff --git a/quast_libs/qutils.py b/quast_libs/qutils.py index c86bc7baab..d1d31b3c3f 100644 --- a/quast_libs/qutils.py +++ b/quast_libs/qutils.py @@ -7,6 +7,7 @@ from __future__ import with_statement import glob +import hashlib import shutil import subprocess import os @@ -908,3 +909,11 @@ def is_ascii_string(line): return False else: return True + + +def md5(fname): + hash_md5 = hashlib.md5() + with open(fname, 'rb') as f: + for chunk in iter(lambda: f.read(65536), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() diff --git a/quast_libs/search_references_meta.py b/quast_libs/search_references_meta.py index 312e3738c9..78755fa159 100644 --- a/quast_libs/search_references_meta.py +++ b/quast_libs/search_references_meta.py @@ -18,7 +18,7 @@ from quast_libs import qconfig, qutils from quast_libs.fastaparser import _get_fasta_file_handler from quast_libs.log import get_logger -from quast_libs.qutils import is_non_empty_file, is_python2, slugify, correct_name, get_dir_for_download +from quast_libs.qutils import is_non_empty_file, is_python2, slugify, correct_name, get_dir_for_download, md5 logger = get_logger(qconfig.LOGGER_META_NAME) try: @@ -314,14 +314,14 @@ def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res qutils.call_subprocess(shlex.split(cmd), stdout=open(res_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger) logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath)) with open(check_fpath, 'w') as check_file: - check_file.writelines('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath))) + check_file.writelines('Assembly: %s md5 checksum: %s\n' % (contigs_fpath, md5(contigs_fpath))) def get_blast_output_fpath(blast_output_fpath, label): return blast_output_fpath + '_' + slugify(label) -def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels): +def check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels): downloaded_organisms = [] not_founded_organisms = [] blast_assemblies = [assembly for assembly in assemblies] @@ -336,8 +336,8 @@ def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpat if '---' in line: assembly_info = False if line and assembly_info: - assembly, size = line.split()[1], line.split()[3] - if assembly in files_sizes.keys() and int(size) == files_sizes[assembly]: + assembly, md5 = line.split()[1], line.split()[-1] + if assembly in files_md5.keys() and md5 == files_md5[assembly]: existing_assembly = assemblies_fpaths[assembly] logger.main_info(' Using existing BLAST alignments for %s... ' % labels[i]) blast_assemblies.remove(existing_assembly) @@ -356,10 +356,10 @@ def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath= err_fpath = os.path.join(downloaded_dirpath, 'blast.err') blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check') blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res') - files_sizes = dict((assembly.fpath, os.path.getsize(assembly.fpath)) for assembly in assemblies) + files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies) assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies) blast_assemblies, downloaded_organisms, not_founded_organisms = \ - check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels) + check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels) organisms = [] if ref_txt_fpath: @@ -569,7 +569,7 @@ def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_ text = check_file.read() text = text[:text.find('\n')] else: - text = 'Assembly: %s size: %d\n' % (assembly.fpath, os.path.getsize(assembly.fpath)) + text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath)) with open(check_fpath, 'w') as check_file: check_file.writelines(text) check_file.writelines('\n---\n')