Skip to content

Commit

Permalink
HpCalculation: exit code and handler for cholesky (#38)
Browse files Browse the repository at this point in the history
Fixes #38 

The infamous Cholesky factorization error for `hp.x`
is now detected and handled. The handler will simply 
set the parallelization flag for diagonalization to 1, as
other diagonalization options are not yet available.
  • Loading branch information
bastonero authored May 30, 2023
1 parent 02021a0 commit 2e76141
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 84 deletions.
2 changes: 2 additions & 0 deletions src/aiida_quantumespresso_hp/calculations/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ def define(cls, spec):
message='The calculation stopped prematurely because it ran out of walltime.')
spec.exit_code(410, 'ERROR_CONVERGENCE_NOT_REACHED',
message='The electronic minimization cycle did not reach self-consistency.')
spec.exit_code(462, 'ERROR_COMPUTING_CHOLESKY',
message='The code failed during the cholesky factorization.')

@classproperty
def filename_output_hubbard_chi(cls): # pylint: disable=no-self-argument
Expand Down
18 changes: 7 additions & 11 deletions src/aiida_quantumespresso_hp/parsers/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,9 @@ def parse(self, **kwargs):
return self.exit_codes.ERROR_NO_RETRIEVED_FOLDER

# The stdout is always parsed by default.
for parse_method in [
self.parse_stdout,
]:
exit_code = parse_method()
if exit_code:
return exit_code
exit_code = self.parse_stdout()
if exit_code:
return exit_code

# If it only initialized, then we do NOT parse the `{prefix}.Hubbard_parameters.dat``
# and the {prefix}.chi.dat files.
Expand Down Expand Up @@ -112,16 +109,15 @@ def parse_stdout(self):
else:
self.out('parameters', orm.Dict(parsed_data))

exit_statuses = [
for exit_status in [
'ERROR_INVALID_NAMELIST',
'ERROR_OUTPUT_STDOUT_INCOMPLETE',
'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS',
'ERROR_MISSING_PERTURBATION_FILE',
'ERROR_CONVERGENCE_NOT_REACHED',
'ERROR_OUT_OF_WALLTIME',
]

for exit_status in exit_statuses:
'ERROR_COMPUTING_CHOLESKY',
'ERROR_OUTPUT_STDOUT_INCOMPLETE',
]:
if exit_status in logs['error']:
return self.exit_codes.get(exit_status)

Expand Down
70 changes: 47 additions & 23 deletions src/aiida_quantumespresso_hp/parsers/parse_raw/hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,11 @@ def parse_raw_output(stdout):
if 'JOB DONE' in line:
is_prematurely_terminated = False

if 'reading inputhp namelist' in line:
logs.error.append('ERROR_INVALID_NAMELIST')

# If the atoms were not ordered correctly in the parent calculation
if 'WARNING! All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf' in line:
logs.error.append('ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS')

# If the calculation run out of walltime we expect to find the following string
match = re.search(r'.*Maximum CPU time exceeded.*', line)
if match:
logs.error.append('ERROR_OUT_OF_WALLTIME')

# If not all expected perturbation files were found for a chi_collect calculation
if 'Error in routine hub_read_chi (1)' in line:
logs.error.append('ERROR_MISSING_PERTURBATION_FILE')

# If the run did not convergence we expect to find the following string
match = re.search(r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*', line)
if match:
logs.error.append('ERROR_CONVERGENCE_NOT_REACHED')
detect_important_message(logs, line)

# A calculation that will only perturb a single atom will only print one line
match = re.search(r'.*The grid of q-points.*\s+([0-9])+\s+q-points.*', line)
if match:
### DEBUG
print(int(match.group(1)))
### DEBUG
parsed_data['number_of_qpoints'] = int(match.group(1))

# Determine the atomic sites that will be perturbed, or that the calculation expects
Expand Down Expand Up @@ -87,4 +65,50 @@ def parse_raw_output(stdout):
if is_prematurely_terminated:
logs.error.append('ERROR_OUTPUT_STDOUT_INCOMPLETE')

# Remove duplicate log messages by turning it into a set. Then convert back to list as that is what is expected
logs.error = list(set(logs.error))
logs.warning = list(set(logs.warning))

return parsed_data, logs


# Compiled once at import time: `detect_important_message` is called for every line
# of the hp.x stdout, so rebuilding the pattern and the map per call is wasted work.
REG_ERROR_CONVERGENCE_NOT_REACHED = re.compile(
    r'.*Convergence has not been reached after\s+([0-9]+)\s+iterations!.*'
)
ERROR_POSITIONS = 'WARNING! All Hubbard atoms must be listed first in the ATOMIC_POSITIONS card of PWscf'

# Maps a marker (plain substring or compiled regex) found in a stdout line to the
# log message to record. A value of `None` means "record the line itself".
MESSAGE_MAP = {
    'error': {
        'Error in routine hub_read_chi (1)': 'ERROR_MISSING_PERTURBATION_FILE',
        'Maximum CPU time exceeded': 'ERROR_OUT_OF_WALLTIME',
        'reading inputhp namelist': 'ERROR_INVALID_NAMELIST',
        'problems computing cholesky': 'ERROR_COMPUTING_CHOLESKY',
        REG_ERROR_CONVERGENCE_NOT_REACHED: 'ERROR_CONVERGENCE_NOT_REACHED',
        ERROR_POSITIONS: 'ERROR_INCORRECT_ORDER_ATOMIC_POSITIONS'
    },
    'warning': {
        'Warning:': None,
        'DEPRECATED:': None,
    }
}


def detect_important_message(logs, line):
    """Detect error or warning messages in ``line`` and append them to ``logs``.

    :param logs: object with ``error`` and ``warning`` list attributes to append to.
    :param line: a single line of the hp.x stdout.
    """
    for marker, message in MESSAGE_MAP['error'].items():
        # Replace with isinstance(marker, re.Pattern) once Python 3.6 is dropped
        matched = marker.match(line) if hasattr(marker, 'search') else marker in line
        if matched:
            # `None` means the raw line itself is the message to record.
            logs.error.append(message if message is not None else line)

    for marker, message in MESSAGE_MAP['warning'].items():
        if marker in line:
            logs.warning.append(message if message is not None else line)
34 changes: 33 additions & 1 deletion src/aiida_quantumespresso_hp/workflows/hp/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def validate_parameters(self):
if self.inputs.only_initialization.value:
self.ctx.inputs.parameters['INPUTHP']['determine_num_pert_only'] = True

self.ctx.inputs.settings = self.ctx.inputs.settings.get_dict() if 'settings' in self.ctx.inputs else {}

def set_max_seconds(self, max_wallclock_seconds):
"""Set the `max_seconds` to a fraction of `max_wallclock_seconds` option to prevent out-of-walltime problems.
Expand Down Expand Up @@ -176,7 +178,37 @@ def handle_unrecoverable_failure(self, node):
self.report_error_handled(node, 'unrecoverable error, aborting...')
return ProcessHandlerReport(True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE)

@process_handler(priority=500, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED)
@process_handler(priority=460, exit_codes=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY)
def handle_computing_cholesky(self, _):
    """Handle `ERROR_COMPUTING_CHOLESKY`: set parallel diagonalization to 1 and restart.

    Parallelization of the diagonalization may in some cases produce too much numerical
    noise, giving rise to Cholesky factorization issues. As other diagonalization
    algorithms are not available in `hp.x`, we force the diagonalization parallelization
    flag to 1. If the flag is already 1, there is nothing left to try and we give up.
    """
    settings = self.ctx.inputs.settings
    cmdline = settings.get('cmdline', [])

    # All the spellings of the diagonalization flag accepted on the QE command line.
    for key in ['-ndiag', '-northo', '-nd']:
        if key in cmdline:

            # assumes the flag is always followed by its value — TODO confirm upstream validation
            index = cmdline.index(key)

            if int(cmdline[index + 1]) == 1:
                self.report('diagonalization flag already set to 1, stopping')
                return ProcessHandlerReport(False)

            cmdline[index + 1] = '1'  # enforce to be 1
            break
    else:
        # No diagonalization flag present: add it explicitly.
        cmdline += ['-nd', '1']

    settings['cmdline'] = cmdline
    self.report('set parallelization flag for diagonalization to 1, restarting')
    return ProcessHandlerReport(True)


@process_handler(priority=410, exit_codes=HpCalculation.exit_codes.ERROR_CONVERGENCE_NOT_REACHED)
def handle_convergence_not_reached(self, _):
"""Handle `ERROR_CONVERGENCE_NOT_REACHED`: decrease `alpha_mix`, increase `niter_max`, and restart.
Expand Down
7 changes: 7 additions & 0 deletions tests/parsers/fixtures/hp/failed_computing_cholesky/aiida.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Error in routine cdiaghg (386):
problems computing cholesky
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

stopping ...
63 changes: 14 additions & 49 deletions tests/parsers/test_hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from aiida.common import AttributeDict
import pytest

from aiida_quantumespresso_hp.calculations.hp import HpCalculation


@pytest.fixture
def generate_inputs_default(generate_hubbard_structure):
Expand Down Expand Up @@ -217,58 +219,21 @@ def test_hp_failed_invalid_namelist(aiida_localhost, generate_calc_job_node, gen
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_INVALID_NAMELIST.status


def test_failed_stdout_incomplete(generate_calc_job_node, generate_parser, generate_inputs_default):
    """Test calculation that exited prematurely and so the stdout is incomplete."""
    entry_point = 'quantumespresso.hp'

    node = generate_calc_job_node(entry_point, test_name='failed_stdout_incomplete', inputs=generate_inputs_default())
    _, calcfunction = generate_parser(entry_point).parse_from_node(node, store_provenance=False)

    expected_status = node.process_class.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status
    assert calcfunction.is_finished, calcfunction.exception
    assert calcfunction.is_failed, calcfunction.exit_status
    assert calcfunction.exit_status == expected_status


def test_failed_no_hubbard_parameters(
@pytest.mark.parametrize(('name', 'exit_status'), (
('failed_no_hubbard_parameters', HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status),
('failed_no_hubbard_chi', HpCalculation.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status),
('failed_out_of_walltime', HpCalculation.exit_codes.ERROR_OUT_OF_WALLTIME.status),
('failed_stdout_incomplete', HpCalculation.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE.status),
('failed_computing_cholesky', HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY.status),
))
def test_failed_calculation(
generate_calc_job_node,
generate_parser,
generate_inputs_default,
name,
exit_status,
):
"""Test calculation that did not generate the Hubbard parameters output file."""
name = 'failed_no_hubbard_parameters'
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default())
parser = generate_parser(entry_point_parser)
_, calcfunction = parser.parse_from_node(node, store_provenance=False)

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_MISSING.status


def test_failed_no_hubbard_chi(generate_calc_job_node, generate_parser, generate_inputs_default):
"""Test calculation that did not generate the Hubbard chi output file."""
name = 'failed_no_hubbard_chi'
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

node = generate_calc_job_node(entry_point_calc_job, test_name=name, inputs=generate_inputs_default())
parser = generate_parser(entry_point_parser)
_, calcfunction = parser.parse_from_node(node, store_provenance=False)

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUTPUT_HUBBARD_CHI_MISSING.status


def test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generate_inputs_default):
"""Test calculation that run out of walltime."""
name = 'failed_out_of_walltime'
"""Test calculation failing with the correct exit status when detecting error messages."""
entry_point_calc_job = 'quantumespresso.hp'
entry_point_parser = 'quantumespresso.hp'

Expand All @@ -278,4 +243,4 @@ def test_failed_out_of_walltime(generate_calc_job_node, generate_parser, generat

assert calcfunction.is_finished, calcfunction.exception
assert calcfunction.is_failed, calcfunction.exit_status
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUT_OF_WALLTIME.status
assert calcfunction.exit_status == exit_status
46 changes: 46 additions & 0 deletions tests/workflows/hp/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,49 @@ def test_handle_convergence_not_reached(generate_workchain_hp, generate_inputs_h
assert result.do_break

assert process.ctx.inputs.parameters['INPUTHP'] == expected


# yapf: disable
@pytest.mark.usefixtures('aiida_profile')
@pytest.mark.parametrize(
    ('cmdline', 'expected'),
    (
        ([], ['-nd', '1']),
        (['-nd', '2'], ['-nd', '1']),
        (['-nk', '2', '-nd', '2'], ['-nk', '2', '-nd', '1']),
        (['-nk', '2'], ['-nk', '2', '-nd', '1']),
    ),
)
# yapf: enable
def test_handle_computing_cholesky(generate_workchain_hp, generate_inputs_hp, cmdline, expected):
    """Test `HpBaseWorkChain.handle_computing_cholesky`."""
    from aiida.orm import Dict

    # Inject the command line under test through the `settings` input node.
    hp_inputs = generate_inputs_hp()
    hp_inputs['settings'] = Dict({'cmdline': cmdline})

    process = generate_workchain_hp(
        exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs={'hp': hp_inputs}
    )
    process.setup()
    process.validate_parameters()

    report = process.handle_computing_cholesky(process.ctx.children[-1])

    assert isinstance(report, ProcessHandlerReport)
    assert report.do_break
    assert process.ctx.inputs.settings['cmdline'] == expected


def test_handle_computing_cholesky_fail(generate_workchain_hp, generate_inputs_hp):
    """Test `HpBaseWorkChain.handle_computing_cholesky` failing."""
    from aiida.orm import Dict

    # The diagonalization flag is already 1, so the handler has nothing left to try.
    hp_inputs = generate_inputs_hp()
    hp_inputs['settings'] = Dict({'cmdline': ['-nd', '1']})

    process = generate_workchain_hp(
        exit_code=HpCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY, inputs={'hp': hp_inputs}
    )
    process.setup()
    process.validate_parameters()

    report = process.handle_computing_cholesky(process.ctx.children[-1])

    assert isinstance(report, ProcessHandlerReport)
    assert not report.do_break

0 comments on commit 2e76141

Please sign in to comment.