diff --git a/src/aiida_quantumespresso_hp/calculations/hp.py b/src/aiida_quantumespresso_hp/calculations/hp.py index 8949fb2..f4b49aa 100644 --- a/src/aiida_quantumespresso_hp/calculations/hp.py +++ b/src/aiida_quantumespresso_hp/calculations/hp.py @@ -189,6 +189,8 @@ def define(cls, spec): message='The calculation stopped prematurely because it ran out of walltime.') spec.exit_code(410, 'ERROR_CONVERGENCE_NOT_REACHED', message='The electronic minimization cycle did not reach self-consistency.') + spec.exit_code(462, 'ERROR_COMPUTING_CHOLESKY', + message='The code failed during the cholesky factorization.') @classproperty def filename_output_hubbard_chi(cls): # pylint: disable=no-self-argument diff --git a/src/aiida_quantumespresso_hp/parsers/hp.py b/src/aiida_quantumespresso_hp/parsers/hp.py index 7cbfe44..35ed8db 100644 --- a/src/aiida_quantumespresso_hp/parsers/hp.py +++ b/src/aiida_quantumespresso_hp/parsers/hp.py @@ -21,12 +21,9 @@ def parse(self, **kwargs): return self.exit_codes.ERROR_NO_RETRIEVED_FOLDER # The stdout is always parsed by default. - for parse_method in [ - self.parse_stdout, - ]: - exit_code = parse_method() - if exit_code: - return exit_code + exit_code = self.parse_stdout() + if exit_code: + return exit_code # If it only initialized, then we do NOT parse the `{prefix}.Hubbard_parameters.dat`` # and the {prefix}.chi.dat files. 
@@ -112,16 +109,15 @@ def parse_stdout(self): else: self.out('parameters', orm.Dict(parsed_data)) - exit_statuses = [ + for exit_status in [ 'ERROR_INVALID_NAMELIST', - 'ERROR_OUTPUT_STDOUT_INCOMPLETE', 'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS', 'ERROR_MISSING_PERTURBATION_FILE', 'ERROR_CONVERGENCE_NOT_REACHED', 'ERROR_OUT_OF_WALLTIME', - ] - - for exit_status in exit_statuses: + 'ERROR_COMPUTING_CHOLESKY', + 'ERROR_OUTPUT_STDOUT_INCOMPLETE', + ]: if exit_status in logs['error']: return self.exit_codes.get(exit_status) diff --git a/src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py b/src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py index 3f7ff09..6b4f101 100644 --- a/src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py +++ b/src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py @@ -28,33 +28,11 @@ def parse_raw_output(stdout): if 'JOB DONE' in line: is_prematurely_terminated = False - if 'reading inputhp namelist' in line: - logs.error.append('ERROR_INVALID_NAMELIST') - - # If the atoms were not ordered correctly in the parent calculation - if 'WARNING! 
All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf' in line: - logs.error.append('ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS') - - # If the calculation run out of walltime we expect to find the following string - match = re.search(r'.*Maximum CPU time exceeded.*', line) - if match: - logs.error.append('ERROR_OUT_OF_WALLTIME') - - # If not all expected perturbation files were found for a chi_collect calculation - if 'Error in routine hub_read_chi (1)' in line: - logs.error.append('ERROR_MISSING_PERTURBATION_FILE') - - # If the run did not convergence we expect to find the following string - match = re.search(r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*', line) - if match: - logs.error.append('ERROR_CONVERGENCE_NOT_REACHED') + detect_important_message(logs, line) # A calculation that will only perturb a single atom will only print one line match = re.search(r'.*The grid of q-points.*\s+([0-9])+\s+q-points.*', line) if match: - ### DEBUG - print(int(match.group(1))) - ### DEBUG parsed_data['number_of_qpoints'] = int(match.group(1)) # Determine the atomic sites that will be perturbed, or that the calculation expects @@ -87,4 +65,50 @@ def parse_raw_output(stdout): if is_prematurely_terminated: logs.error.append('ERROR_OUTPUT_STDOUT_INCOMPLETE') + # Remove duplicate log messages by turning it into a set. Then convert back to list as that is what is expected + logs.error = list(set(logs.error)) + logs.warning = list(set(logs.warning)) + return parsed_data, logs + + +def detect_important_message(logs, line): + """Detect error or warning messages, and append to the log if a match is found.""" + REG_ERROR_CONVERGENCE_NOT_REACHED = re.compile( + r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*' + ) + ERROR_POSITIONS = 'WARNING! 
All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf' + message_map = { + 'error': { + 'Error in routine hub_read_chi (1)': 'ERROR_MISSING_PERTURBATION_FILE', + 'Maximum CPU time exceeded': 'ERROR_OUT_OF_WALLTIME', + 'reading inputhp namelist': 'ERROR_INVALID_NAMELIST', + 'problems computing cholesky': 'ERROR_COMPUTING_CHOLESKY', + REG_ERROR_CONVERGENCE_NOT_REACHED: 'ERROR_CONVERGENCE_NOT_REACHED', + ERROR_POSITIONS: 'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS' + }, + 'warning': { + 'Warning:': None, + 'DEPRECATED:': None, + } + } + + # Match any known error and warning messages + for marker, message in message_map['error'].items(): + # Replace with isinstance(marker, re.Pattern) once Python 3.6 is dropped + if hasattr(marker, 'search'): + if marker.match(line): + if message is None: + message = line + logs.error.append(message) + else: + if marker in line: + if message is None: + message = line + logs.error.append(message) + + for marker, message in message_map['warning'].items(): + if marker in line: + if message is None: + message = line + logs.warning.append(message) diff --git a/src/aiida_quantumespresso_hp/workflows/hp/base.py b/src/aiida_quantumespresso_hp/workflows/hp/base.py index 50f70b6..17d35f7 100644 --- a/src/aiida_quantumespresso_hp/workflows/hp/base.py +++ b/src/aiida_quantumespresso_hp/workflows/hp/base.py @@ -142,6 +142,8 @@ def validate_parameters(self): if self.inputs.only_initialization.value: self.ctx.inputs.parameters['INPUTHP']['determine_num_pert_only'] = True + self.ctx.inputs.settings = self.ctx.inputs.settings.get_dict() if 'settings' in self.ctx.inputs else {} + def set_max_seconds(self, max_wallclock_seconds): """Set the `max_seconds` to a fraction of `max_wallclock_seconds` option to prevent out-of-walltime problems. 
@@ -176,7 +178,37 @@ def handle_unrecoverable_failure(self, node): self.report_error_handled(node, 'unrecoverable error, aborting...') return ProcessHandlerReport(True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE) - @process_handler(priority=500, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED) + @process_handler(priority=460, exit_codes=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY) + def handle_computing_cholesky(self, _): + """Handle `ERROR_COMPUTING_CHOLESKY`: set parallel diagonalization to 1 and restart. + + Parallelization of diagonalization may produce in some cases too much numerical noise, + giving rise to Cholesky factorization issues. As other diagonalization algorithms are + not available in `hp.x`, we try to set the diagonalization flag to 1, if not already set. + """ + settings = self.ctx.inputs.settings + cmdline = settings.get('cmdline', []) + + for key in ['-ndiag', '-northo', '-nd']: + if key in cmdline: + + index = cmdline.index(key) + + if int(cmdline[index+1]) == 1: + self.report('diagonalization flag already to 1, stopping') + return ProcessHandlerReport(False) + + cmdline[index+1] = '1' # enforce to be 1 + break + else: + cmdline += ['-nd', '1'] + + settings['cmdline'] = cmdline + self.report('set parallelization flag for diagonalization to 1, restarting') + return ProcessHandlerReport(True) + + + @process_handler(priority=410, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED) def handle_convergence_not_reached(self, _): """Handle `ERROR_CONVERGENCE_NOT_REACHED`: decrease `alpha_mix`, increase `niter_max`, and restart. 
diff --git a/tests/parsers/fixtures/hp/failed_computing_cholesky/aiida.out b/tests/parsers/fixtures/hp/failed_computing_cholesky/aiida.out new file mode 100644 index 0000000..63236b7 --- /dev/null +++ b/tests/parsers/fixtures/hp/failed_computing_cholesky/aiida.out @@ -0,0 +1,7 @@ + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + Error in routine cdiaghg (386): + problems computing cholesky + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + stopping ... diff --git a/tests/parsers/test_hp.py b/tests/parsers/test_hp.py index 67bb3bf..14d174b 100644 --- a/tests/parsers/test_hp.py +++ b/tests/parsers/test_hp.py @@ -5,6 +5,8 @@ from aiida.common import AttributeDict import pytest +from aiida_quantumespresso_hp.calculations.hp import HpCalculation + @pytest.fixture def generate_inputs_default(generate_hubbard_structure): @@ -217,58 +219,21 @@ def test_hp_failed_invalid_namelist(aiida_localhost, generate_calc_job_node, gen assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_INVALID_NAMELIST.status -def test_failed_stdout_incomplete(generate_calc_job_node, generate_parser, generate_inputs_default): - """Test calculation that exited prematurely and so the stdout is incomplete.""" - name = 'failed_stdout_incomplete' - entry_point_calc_job = 'quantumespresso.hp' - entry_point_parser = 'quantumespresso.hp' - - node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default()) - parser = generate_parser(entry_point_parser) - _, calcfunction = parser.parse_from_node(node, store_provenance=False) - - assert calcfunction.is_finished, calcfunction.exception - assert calcfunction.is_failed, calcfunction.exit_status - assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status - - -def test_failed_no_hubbard_parameters( +@pytest.mark.parametrize(('name', 'exit_status'), ( + ('failed_no_hubbard_parameters', 
HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status), + ('failed_no_hubbard_chi', HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status), + ('failed_out_of_walltime', HpCalculation.exit_codes.ERROR_OUT_OF_WALLTIME.status), + ('failed_stdout_incomplete', HpCalculation.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status), + ('failed_computing_cholesky', HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY.status), +)) +def test_failed_calculation( generate_calc_job_node, generate_parser, generate_inputs_default, + name, + exit_status, ): - """Test calculation that did not generate the Hubbard parameters output file.""" - name = 'failed_no_hubbard_parameters' - entry_point_calc_job = 'quantumespresso.hp' - entry_point_parser = 'quantumespresso.hp' - - node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default()) - parser = generate_parser(entry_point_parser) - _, calcfunction = parser.parse_from_node(node, store_provenance=False) - - assert calcfunction.is_finished, calcfunction.exception - assert calcfunction.is_failed, calcfunction.exit_status - assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status - - -def test_failed_no_hubbard_chi(generate_calc_job_node, generate_parser, generate_inputs_default): - """Test calculation that did not generate the Hubbard chi output file.""" - name = 'failed_no_hubbard_chi' - entry_point_calc_job = 'quantumespresso.hp' - entry_point_parser = 'quantumespresso.hp' - - node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default()) - parser = generate_parser(entry_point_parser) - _, calcfunction = parser.parse_from_node(node, store_provenance=False) - - assert calcfunction.is_finished, calcfunction.exception - assert calcfunction.is_failed, calcfunction.exit_status - assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status - - -def 
test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generate_inputs_default): -    """Test calculation that run out of walltime.""" -    name = 'failed_out_of_walltime' +    """Test calculation failing with the correct exit status when detecting error messages.""" entry_point_calc_job = 'quantumespresso.hp' entry_point_parser = 'quantumespresso.hp' @@ -278,4 +243,4 @@ def test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generat assert calcfunction.is_finished, calcfunction.exception assert calcfunction.is_failed, calcfunction.exit_status -    assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUT_OF_WALLTIME.status +    assert calcfunction.exit_status == exit_status diff --git a/tests/workflows/hp/test_base.py b/tests/workflows/hp/test_base.py index c9b415e..729d2fc 100644 --- a/tests/workflows/hp/test_base.py +++ b/tests/workflows/hp/test_base.py @@ -115,3 +115,49 @@ def test_handle_convergence_not_reached(generate_workchain_hp, generate_inputs_h assert result.do_break assert process.ctx.inputs.parameters['INPUTHP'] == expected + + +# yapf: disable +@pytest.mark.usefixtures('aiida_profile') +@pytest.mark.parametrize( +    ('cmdline', 'expected'), +    ( +        ([], ['-nd', '1']), +        (['-nd', '2'], ['-nd', '1']), +        (['-nk', '2', '-nd', '2'], ['-nk', '2', '-nd', '1']), +        (['-nk', '2'], ['-nk', '2', '-nd', '1']), +    ), +) +# yapf: enable +def test_handle_computing_cholesky(generate_workchain_hp, generate_inputs_hp, cmdline, expected): +    """Test `HpBaseWorkChain.handle_computing_cholesky`.""" +    from aiida.orm import Dict + +    inputs_hp = {'hp': generate_inputs_hp()} +    inputs_hp['hp']['settings'] = Dict({'cmdline': cmdline}) + +    process = generate_workchain_hp(exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs=inputs_hp) +    process.setup() +    process.validate_parameters() + +    result = process.handle_computing_cholesky(process.ctx.children[-1]) +    assert isinstance(result, ProcessHandlerReport) +    assert result.do_break + +    assert
process.ctx.inputs.settings['cmdline'] == expected + + +def test_handle_computing_cholesky_fail(generate_workchain_hp, generate_inputs_hp): + """Test `HpBaseWorkChain.handle_computing_cholesky` failing.""" + from aiida.orm import Dict + + inputs_hp = {'hp': generate_inputs_hp()} + inputs_hp['hp']['settings'] = Dict({'cmdline': ['-nd', '1']}) + + process = generate_workchain_hp(exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs=inputs_hp) + process.setup() + process.validate_parameters() + + result = process.handle_computing_cholesky(process.ctx.children[-1]) + assert isinstance(result, ProcessHandlerReport) + assert not result.do_break