Skip to content

Commit

Permalink
HpCalculation: exit code and handler for cholesky (#38)
Browse files Browse the repository at this point in the history
Fixes #38 

The infamous Cholesky factorization error for `hp.x`
is now detected and handled. The handler will simply 
set the parallelization flag for diagonalization to 1, as
other diagonalization options are not yet available.
  • Loading branch information
bastonero authored May 30, 2023
1 parent 02021a0 commit 2e76141
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 84 deletions.
2 changes: 2 additions & 0 deletions src/aiida_quantumespresso_hp/calculations/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ def define(cls, spec):
message='The calculation stopped prematurely because it ran out of walltime.')
spec.exit_code(410, 'ERROR_CONVERGENCE_NOT_REACHED',
message='The electronic minimization cycle did not reach self-consistency.')
spec.exit_code(462, 'ERROR_COMPUTING_CHOLESKY',
message='The code failed during the cholesky factorization.')

@classproperty
def filename_output_hubbard_chi(cls): # pylint: disable=no-self-argument
Expand Down
18 changes: 7 additions & 11 deletions src/aiida_quantumespresso_hp/parsers/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,9 @@ def parse(self, **kwargs):
return self.exit_codes.ERROR_NO_RETRIEVED_FOLDER

# The stdout is always parsed by default.
for parse_method in [
self.parse_stdout,
]:
exit_code = parse_method()
if exit_code:
return exit_code
exit_code = self.parse_stdout()
if exit_code:
return exit_code

# If it only initialized, then we do NOT parse the `{prefix}.Hubbard_parameters.dat``
# and the {prefix}.chi.dat files.
Expand Down Expand Up @@ -112,16 +109,15 @@ def parse_stdout(self):
else:
self.out('parameters', orm.Dict(parsed_data))

exit_statuses = [
for exit_status in [
'ERROR_INVALID_NAMELIST',
'ERROR_OUTPUT_STDOUT_INCOMPLETE',
'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS',
'ERROR_MISSING_PERTURBATION_FILE',
'ERROR_CONVERGENCE_NOT_REACHED',
'ERROR_OUT_OF_WALLTIME',
]

for exit_status in exit_statuses:
'ERROR_COMPUTING_CHOLESKY',
'ERROR_OUTPUT_STDOUT_INCOMPLETE',
]:
if exit_status in logs['error']:
return self.exit_codes.get(exit_status)

Expand Down
70 changes: 47 additions & 23 deletions src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,11 @@ def parse_raw_output(stdout):
if 'JOB DONE' in line:
is_prematurely_terminated = False

if 'reading inputhp namelist' in line:
logs.error.append('ERROR_INVALID_NAMELIST')

# If the atoms were not ordered correctly in the parent calculation
if 'WARNING! All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf' in line:
logs.error.append('ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS')

# If the calculation run out of walltime we expect to find the following string
match = re.search(r'.*Maximum CPU time exceeded.*', line)
if match:
logs.error.append('ERROR_OUT_OF_WALLTIME')

# If not all expected perturbation files were found for a chi_collect calculation
if 'Error in routine hub_read_chi (1)' in line:
logs.error.append('ERROR_MISSING_PERTURBATION_FILE')

# If the run did not convergence we expect to find the following string
match = re.search(r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*', line)
if match:
logs.error.append('ERROR_CONVERGENCE_NOT_REACHED')
detect_important_message(logs, line)

# A calculation that will only perturb a single atom will only print one line
match = re.search(r'.*The grid of q-points.*\s+([0-9])+\s+q-points.*', line)
if match:
### DEBUG
print(int(match.group(1)))
### DEBUG
parsed_data['number_of_qpoints'] = int(match.group(1))

# Determine the atomic sites that will be perturbed, or that the calculation expects
Expand Down Expand Up @@ -87,4 +65,50 @@ def parse_raw_output(stdout):
if is_prematurely_terminated:
logs.error.append('ERROR_OUTPUT_STDOUT_INCOMPLETE')

# Remove duplicate log messages by turning it into a set. Then convert back to list as that is what is expected
logs.error = list(set(logs.error))
logs.warning = list(set(logs.warning))

return parsed_data, logs


# Compiled once at import time: `detect_important_message` is called for every line
# of the hp.x stdout, so rebuilding the pattern and the map per call is wasted work.
REG_ERROR_CONVERGENCE_NOT_REACHED = re.compile(
    r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*'
)
ERROR_POSITIONS = 'WARNING! All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf'

# Maps a marker (plain substring or compiled regex) found in a stdout line to the
# log message to record. A value of `None` means "record the line itself".
MESSAGE_MAP = {
    'error': {
        'Error in routine hub_read_chi (1)': 'ERROR_MISSING_PERTURBATION_FILE',
        'Maximum CPU time exceeded': 'ERROR_OUT_OF_WALLTIME',
        'reading inputhp namelist': 'ERROR_INVALID_NAMELIST',
        'problems computing cholesky': 'ERROR_COMPUTING_CHOLESKY',
        REG_ERROR_CONVERGENCE_NOT_REACHED: 'ERROR_CONVERGENCE_NOT_REACHED',
        ERROR_POSITIONS: 'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS'
    },
    'warning': {
        'Warning:': None,
        'DEPRECATED:': None,
    }
}


def detect_important_message(logs, line):
    """Detect error or warning messages in ``line`` and append them to ``logs``.

    :param logs: object with ``error`` and ``warning`` list attributes to append to.
    :param line: a single line of the hp.x stdout.
    """
    for marker, message in MESSAGE_MAP['error'].items():
        # Replace with isinstance(marker, re.Pattern) once Python 3.6 is dropped
        matched = marker.match(line) if hasattr(marker, 'search') else marker in line
        if matched:
            # `None` means the raw line itself is the message to record.
            logs.error.append(message if message is not None else line)

    for marker, message in MESSAGE_MAP['warning'].items():
        if marker in line:
            logs.warning.append(message if message is not None else line)
34 changes: 33 additions & 1 deletion src/aiida_quantumespresso_hp/workflows/hp/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def validate_parameters(self):
if self.inputs.only_initialization.value:
self.ctx.inputs.parameters['INPUTHP']['determine_num_pert_only'] = True

self.ctx.inputs.settings = self.ctx.inputs.settings.get_dict() if 'settings' in self.ctx.inputs else {}

def set_max_seconds(self, max_wallclock_seconds):
"""Set the `max_seconds` to a fraction of `max_wallclock_seconds` option to prevent out-of-walltime problems.
Expand Down Expand Up @@ -176,7 +178,37 @@ def handle_unrecoverable_failure(self, node):
self.report_error_handled(node, 'unrecoverable error, aborting...')
return ProcessHandlerReport(True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE)

@process_handler(priority=500, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED)
@process_handler(priority=460, exit_codes=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY)
def handle_computing_cholesky(self, _):
    """Handle `ERROR_COMPUTING_CHOLESKY`: set parallel diagonalization to 1 and restart.

    Parallelization of the diagonalization may in some cases produce too much numerical
    noise, giving rise to Cholesky factorization issues. As other diagonalization
    algorithms are not available in `hp.x`, we force the diagonalization parallelization
    flag to 1. If the flag is already 1, there is nothing left to try and we give up.
    """
    settings = self.ctx.inputs.settings
    cmdline = settings.get('cmdline', [])

    # All the spellings of the diagonalization flag accepted on the QE command line.
    for key in ['-ndiag', '-northo', '-nd']:
        if key in cmdline:

            # assumes the flag is always followed by its value — TODO confirm upstream validation
            index = cmdline.index(key)

            if int(cmdline[index + 1]) == 1:
                self.report('diagonalization flag already set to 1, stopping')
                return ProcessHandlerReport(False)

            cmdline[index + 1] = '1'  # enforce to be 1
            break
    else:
        # No diagonalization flag present: add it explicitly.
        cmdline += ['-nd', '1']

    settings['cmdline'] = cmdline
    self.report('set parallelization flag for diagonalization to 1, restarting')
    return ProcessHandlerReport(True)


@process_handler(priority=410, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED)
def handle_convergence_not_reached(self, _):
"""Handle `ERROR_CONVERGENCE_NOT_REACHED`: decrease `alpha_mix`, increase `niter_max`, and restart.
Expand Down
7 changes: 7 additions & 0 deletions tests/parsers/fixtures/hp/failed_computing_cholesky/aiida.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Error in routine cdiaghg (386):
problems computing cholesky
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

stopping ...
63 changes: 14 additions & 49 deletions tests/parsers/test_hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from aiida.common import AttributeDict
import pytest

from aiida_quantumespresso_hp.calculations.hp import HpCalculation


@pytest.fixture
def generate_inputs_default(generate_hubbard_structure):
Expand Down Expand Up @@ -217,58 +219,21 @@ def test_hp_failed_invalid_namelist(aiida_localhost, generate_calc_job_node, gen
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_INVALID_NAMELIST.status


def test_failed_stdout_incomplete(generate_calc_job_node, generate_parser, generate_inputs_default):
    """Test calculation that exited prematurely and so the stdout is incomplete."""
    entry_point = 'quantumespresso.hp'

    node = generate_calc_job_node(entry_point, test_name='failed_stdout_incomplete', inputs=generate_inputs_default())
    _, calcfunction = generate_parser(entry_point).parse_from_node(node, store_provenance=False)

    expected_status = node.process_class.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status
    assert calcfunction.is_finished, calcfunction.exception
    assert calcfunction.is_failed, calcfunction.exit_status
    assert calcfunction.exit_status == expected_status


def test_failed_no_hubbard_parameters(
@pytest.mark.parametrize(('name', 'exit_status'), (
('failed_no_hubbard_parameters', HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status),
('failed_no_hubbard_chi', HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status),
('failed_out_of_walltime', HpCalculation.exit_codes.ERROR_OUT_OF_WALLTIME.status),
('failed_stdout_incomplete', HpCalculation.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status),
('failed_computing_cholesky', HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY.status),
))
def test_failed_calculation(
generate_calc_job_node,
generate_parser,
generate_inputs_default,
name,
exit_status,
):
"""Test calculation that did not generate the Hubbard parameters output file."""
name = 'failed_no_hubbard_parameters'
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default())
parser = generate_parser(entry_point_parser)
_, calcfunction = parser.parse_from_node(node, store_provenance=False)

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status


def test_failed_no_hubbard_chi(generate_calc_job_node, generate_parser, generate_inputs_default):
"""Test calculation that did not generate the Hubbard chi output file."""
name = 'failed_no_hubbard_chi'
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default())
parser = generate_parser(entry_point_parser)
_, calcfunction = parser.parse_from_node(node, store_provenance=False)

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status


def test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generate_inputs_default):
"""Test calculation that run out of walltime."""
name = 'failed_out_of_walltime'
"""Test calculation failing with the correct exit status when detecting error messages."""
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

Expand All @@ -278,4 +243,4 @@ def test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generat

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUT_OF_WALLTIME.status
assert calcfunction.exit_status == exit_status
46 changes: 46 additions & 0 deletions tests/workflows/hp/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,49 @@ def test_handle_convergence_not_reached(generate_workchain_hp, generate_inputs_h
assert result.do_break

assert process.ctx.inputs.parameters['INPUTHP'] == expected


# yapf: disable
@pytest.mark.usefixtures('aiida_profile')
@pytest.mark.parametrize(
    ('cmdline', 'expected'),
    (
        ([], ['-nd', '1']),
        (['-nd', '2'], ['-nd', '1']),
        (['-nk', '2', '-nd', '2'], ['-nk', '2', '-nd', '1']),
        (['-nk', '2'], ['-nk', '2', '-nd', '1']),
    ),
)
# yapf: enable
def test_handle_computing_cholesky(generate_workchain_hp, generate_inputs_hp, cmdline, expected):
    """Test `HpBaseWorkChain.handle_computing_cholesky`."""
    from aiida.orm import Dict

    # Inject the command line under test through the `settings` input node.
    hp_inputs = generate_inputs_hp()
    hp_inputs['settings'] = Dict({'cmdline': cmdline})

    process = generate_workchain_hp(
        exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs={'hp': hp_inputs}
    )
    process.setup()
    process.validate_parameters()

    report = process.handle_computing_cholesky(process.ctx.children[-1])

    assert isinstance(report, ProcessHandlerReport)
    assert report.do_break
    assert process.ctx.inputs.settings['cmdline'] == expected


def test_handle_computing_cholesky_fail(generate_workchain_hp, generate_inputs_hp):
    """Test `HpBaseWorkChain.handle_computing_cholesky` failing."""
    from aiida.orm import Dict

    # The diagonalization flag is already 1, so the handler has nothing left to try.
    hp_inputs = generate_inputs_hp()
    hp_inputs['settings'] = Dict({'cmdline': ['-nd', '1']})

    process = generate_workchain_hp(
        exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs={'hp': hp_inputs}
    )
    process.setup()
    process.validate_parameters()

    report = process.handle_computing_cholesky(process.ctx.children[-1])

    assert isinstance(report, ProcessHandlerReport)
    assert not report.do_break

0 comments on commit 2e76141

Please sign in to comment.