Commit
optimize order parameter evaluation, allow hamiltonians to be evaluated on single states
svandenhaute committed Jun 6, 2024
1 parent c0df3a5 commit 9681561
Showing 7 changed files with 116 additions and 34 deletions.
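In short, Hamiltonian.evaluate (and HamiltonianOrderParameter.evaluate) now accepts either a whole Dataset or a single Geometry / future of a Geometry. A minimal sketch of the two call patterns, assuming the EinsteinCrystal setup and `dataset` fixture used in the tests below (the import path is assumed as well):

    from psiflow.hamiltonians import EinsteinCrystal  # import path assumed

    hamiltonian = EinsteinCrystal(dataset[0], force_constant=0.1)

    evaluated = hamiltonian.evaluate(dataset, batch_size=100)  # Dataset in, Dataset out
    state = hamiltonian.evaluate(dataset[3])                   # future of a Geometry in, AppFuture out
    print(state.result().energy)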
9 changes: 6 additions & 3 deletions psiflow/data.py
@@ -812,7 +812,7 @@ def _batch_frames(
 @join_app
 @typeguard.typechecked
 def batch_apply(
-    func: Callable,
+    funcs: list[Callable],
     batch_size: int,
     length: int,
     inputs: list = [],
@@ -821,6 +821,9 @@ def batch_apply(
     nbatches = math.ceil(length / batch_size)
     batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)]
     future = batch_frames(batch_size, inputs=[inputs[0]], outputs=batches)
-    evaluated = [func(Dataset(None, extxyz=e)) for e in future.outputs]
-    f = join_frames(inputs=[e.extxyz for e in evaluated], outputs=[outputs[0]])
+    datasets = [Dataset(None, extxyz=e) for e in future.outputs]
+    for func in funcs:
+        datasets = [func(d) for d in datasets]
+    # evaluated = [func(Dataset(None, extxyz=e)) for e in future.outputs]
+    f = join_frames(inputs=[d.extxyz for d in datasets], outputs=[outputs[0]])
     return f
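batch_apply thus takes a list of callables rather than a single one and applies them in order to every batch before joining the results, which is what lets the order parameter below reuse the hamiltonian's batched evaluation without splitting the dataset twice. A hypothetical composition, using names from the other files in this commit; `hamiltonian` and `dataset` are assumed to exist:

    from functools import partial

    funcs = [
        hamiltonian.single_evaluate,            # evaluate each batch with the hamiltonian
        partial(insert_in_dataset, name="CV"),  # then insert the result under the order parameter's name
    ]
    future = batch_apply(
        funcs,
        5,                  # batch_size
        dataset.length(),
        inputs=[dataset.extxyz],
        outputs=[psiflow.context().new_file("data_", ".xyz")],
    )
    result = Dataset(None, future.outputs[0])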
9 changes: 5 additions & 4 deletions psiflow/hamiltonians/_plumed.py
@@ -42,13 +42,14 @@ def try_manual_plumed_linking() -> str:
 def remove_comments_printflush(plumed_input: str) -> str:
     new_input = []
     for line in list(plumed_input.split("\n")):
-        if line.strip().startswith("#"):
+        pre_comment = line.strip().split("#")[0].strip()
+        if len(pre_comment) == 0:
             continue
-        if line.strip().startswith("PRINT"):
+        if pre_comment.startswith("PRINT"):
             continue
-        if line.strip().startswith("FLUSH"):
+        if pre_comment.startswith("FLUSH"):
             continue
-        new_input.append(line)
+        new_input.append(pre_comment)
     return "\n".join(new_input)


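Comment handling is now based on the part of each line before the first '#', so trailing comments no longer hide PRINT or FLUSH keywords, and lines that are pure comments are still dropped. A small illustration, reusing a fragment of the PLUMED input from the updated test further down:

    plumed_str = "CV: VOLUME\nCV0: CV #lkasdjf\nPRINT ARG=CV STRIDE=10 FILE=COLVAR\n# a comment\n"
    print(remove_comments_printflush(plumed_str))
    # CV: VOLUME
    # CV0: CV
    # (the PRINT line and the pure-comment line are removed, and the trailing '#lkasdjf' is stripped)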
53 changes: 37 additions & 16 deletions psiflow/hamiltonians/hamiltonian.py
@@ -1,11 +1,12 @@
 from __future__ import annotations  # necessary for type-guarding class methods
 
 import logging
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import typeguard
 from parsl.app.app import python_app
 from parsl.app.futures import DataFuture
+from parsl.dataflow.futures import AppFuture
 from parsl.data_provider.files import File
 
 import psiflow
@@ -22,16 +23,20 @@ def evaluate_function(
     outputs: list = [],
     parsl_resource_specification: dict = {},
     **parameters,  # dict values can be futures, so app must wait for those
-) -> None:
+) -> Optional[Geometry]:
     import numpy as np
     from ase import Atoms
 
     from psiflow.data import _read_frames, _write_frames
     from psiflow.geometry import NullState
 
     assert len(inputs) >= 1
-    assert len(outputs) == 1
-    states = _read_frames(inputs=[inputs[0]])
+    if isinstance(inputs[0], Geometry):
+        assert len(outputs) == 0
+        states = [inputs[0]]
+    else:
+        assert len(outputs) == 1
+        states = _read_frames(inputs=[inputs[0]])
     calculators, index_mapping = load_calculators(states, inputs[1], **parameters)
     for i, state in enumerate(states):
         if state == NullState:
@@ -54,25 +59,41 @@ def evaluate_function(
                 print(e)
                 stress = np.zeros((3, 3))
             state.stress = stress
-    _write_frames(*states, outputs=[outputs[0]])
+    if isinstance(inputs[0], Geometry):
+        return states[0]
+    else:
+        _write_frames(*states, outputs=[outputs[0]])
 
 
 @typeguard.typechecked
 @psiflow.serializable  # otherwise MixtureHamiltonian.hamiltonians is not serialized
 class Hamiltonian:
     external: Optional[psiflow._DataFuture]
 
-    def evaluate(self, dataset: Dataset, batch_size: Optional[int] = 100) -> Dataset:
-        future = batch_apply(
-            self.single_evaluate,
-            batch_size,
-            dataset.length(),
-            inputs=[dataset.extxyz],
-            outputs=[
-                psiflow.context().new_file("data_", ".xyz")
-            ],  # join_app needs outputs kwarg here!
-        )
-        return Dataset(None, future.outputs[0])
+    def evaluate(
+        self,
+        arg: Union[Dataset, Geometry, AppFuture[Geometry]],
+        batch_size: Optional[int] = 100,
+    ) -> Union[AppFuture, Dataset]:
+        if isinstance(arg, Dataset):
+            future = batch_apply(
+                [self.single_evaluate],
+                batch_size,
+                arg.length(),
+                inputs=[arg.extxyz],
+                outputs=[
+                    psiflow.context().new_file("data_", ".xyz")
+                ],  # join_app needs outputs kwarg here!
+            )
+            return Dataset(None, future.outputs[0])
+        else:
+            future = self.evaluate_app(
+                self.load_calculators,
+                inputs=[arg, self.external],
+                outputs=[],
+                **self.parameters,
+            )
+            return future
 
     # mostly for internal use
     def single_evaluate(self, dataset: Dataset) -> Dataset:
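Hamiltonian.evaluate therefore dispatches on its argument: a Dataset still goes through batch_apply and yields a Dataset, whereas a Geometry (or an AppFuture of one) is passed straight to evaluate_app without any intermediate .xyz files and yields an AppFuture of the evaluated Geometry. A sketch of the consistency this buys, mirroring the new assertion in tests/test_hamiltonian.py; `hamiltonian` and `dataset` are assumed as above:

    import numpy as np

    evaluated = hamiltonian.evaluate(dataset)      # batched path: Dataset in, Dataset out
    single = hamiltonian.evaluate(evaluated[3])    # single-state path: AppFuture of a Geometry
    assert np.allclose(single.result().energy, evaluated[3].result().energy)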
63 changes: 54 additions & 9 deletions psiflow/sampling/order.py
@@ -1,20 +1,22 @@
 from __future__ import annotations  # necessary for type-guarding class methods
 
-from typing import Union
+from functools import partial
+from typing import Union, Optional
 
 import typeguard
 from ase.units import kJ, mol
 from parsl.app.app import python_app
 from parsl.dataflow.futures import AppFuture
 
 import psiflow
-from psiflow.data import Dataset
+from psiflow.data import Dataset, batch_apply
 from psiflow.geometry import Geometry
 from psiflow.hamiltonians._plumed import PlumedHamiltonian
 from psiflow.hamiltonians.hamiltonian import Hamiltonian
 
 
-def _insert_in_state(
+@typeguard.typechecked
+def insert_in_state(
     state: Geometry,
     name: str,
 ) -> Geometry:
@@ -24,7 +26,32 @@ def _insert_in_state(
     return state
 
 
-insert_in_state = python_app(_insert_in_state, executors=["default_threads"])
+@typeguard.typechecked
+def _insert(
+    state_or_states: Union[Geometry, list[Geometry]],
+    name: str,
+) -> Union[list[Geometry], Geometry]:
+    if not isinstance(state_or_states, list):
+        return insert_in_state(state_or_states, name)
+    else:
+        for state in state_or_states:
+            insert_in_state(state, name)  # modify list in place
+        return state_or_states
+
+
+insert = python_app(_insert, executors=["default_threads"])
+
+
+@typeguard.typechecked
+def insert_in_dataset(
+    data: Dataset,
+    name: str,
+) -> Dataset:
+    geometries = insert(
+        data.geometries(),
+        name,
+    )
+    return Dataset(geometries)
 
 
 @typeguard.typechecked
@@ -51,11 +78,29 @@ def __init__(self, name: str, hamiltonian: Hamiltonian):
         super().__init__(name)
         self.hamiltonian = hamiltonian
 
-    def evaluate(self, state: Union[Geometry, AppFuture]) -> AppFuture:
-        return insert_in_state(
-            self.hamiltonian.evaluate(Dataset([state]))[0],
-            self.name,
-        )
+    def evaluate(
+        self,
+        arg: Union[Dataset, Geometry, AppFuture[Geometry]],
+        batch_size: Optional[int] = 100,
+    ) -> Union[Dataset, AppFuture]:
+        if isinstance(arg, Dataset):
+            # avoid batching the dataset twice:
+            # apply hamiltonian in batched sense and put insert afterwards
+            funcs = [
+                self.hamiltonian.single_evaluate,
+                partial(insert_in_dataset, name=self.name),
+            ]
+            future = batch_apply(
+                funcs,
+                batch_size,
+                arg.length(),
+                inputs=[arg.extxyz],
+                outputs=[psiflow.context().new_file("data_", ".xyz")],
+            )
+            return Dataset(None, future.outputs[0])
+        else:
+            state = self.hamiltonian.evaluate(arg)
+            return insert(state, self.name)
 
     def __eq__(self, other):
         if type(other) is not HamiltonianOrderParameter:
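HamiltonianOrderParameter.evaluate now accepts a Dataset and handles it in one batched pass that chains the hamiltonian evaluation with the insert, while a single state reuses the hamiltonian's new single-state path instead of being wrapped in a one-frame Dataset. Usage as exercised by the new test in tests/test_sampling.py; `order` is assumed to be a HamiltonianOrderParameter named "CV" and `dataset` the usual test fixture:

    data = order.evaluate(dataset[:10], batch_size=5)   # one batched evaluate-and-insert pass
    values = data.get("CV").result()                    # per-frame order-parameter values

    state = order.evaluate(dataset[3])                  # single state: AppFuture of a Geometry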
1 change: 1 addition & 0 deletions psiflow/sampling/walker.py
@@ -187,6 +187,7 @@ def quench(walkers: list[Walker], dataset: Dataset) -> None:
     coefficients = []
     for walker in walkers:
         c = all_hamiltonians.get_coefficients(1.0 * walker.hamiltonian)
+        assert c is not None
         coefficients.append(c)
     coefficients = np.array(coefficients)

8 changes: 6 additions & 2 deletions tests/test_hamiltonian.py
@@ -32,9 +32,9 @@ def test_get_filename_hills():
 RESTART
 UNITS LENGTH=A ENERGY=kj/mol TIME=fs
 CV: VOLUME
-CV0: CV
+CV0: CV #lkasdjf
 METAD ARG=CV0 SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad FILE=test_hills sdld
-METADD ARG=CV SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad sdld
+METADD ARG=CV SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad sdld #fjalsdkfj
 PRINT ARG=CV,metad.bias STRIDE=10 FILE=COLVAR
 FLUSH STRIDE=10
 """
@@ -62,6 +62,10 @@ def test_einstein(dataset, dataset_h2):
     for i in range(1, 10):
         assert evaluated[i].result().energy > 0.0
         assert not np.allclose(evaluated[i].result().stress, 0.0)
+        assert np.allclose(
+            evaluated[i].result().energy,
+            hamiltonian.evaluate(evaluated[i]).result().energy,
+        )
 
     # test evaluation with NullState in data
     data = hamiltonian.evaluate(dataset[:5] + Dataset([NullState]) + dataset[5:10])
7 changes: 7 additions & 0 deletions tests/test_sampling.py
@@ -450,6 +450,13 @@ def test_order_parameter(dataset):
     assert state.energy is None
     assert np.allclose(CV, np.linalg.det(dataset[3].result().cell))
 
+    # test batch evaluation of order parameter
+    data = order.evaluate(dataset[:10], batch_size=5)
+    volumes = data.get("CV").result()
+    for i in range(10):
+        volume = np.linalg.det(dataset[i].result().cell)
+        assert np.allclose(volume, volumes[i])
+
 
 def test_walker_serialization(dataset, tmp_path):
     einstein = EinsteinCrystal(dataset[0], force_constant=0.1)
