Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to select backends TF/PT #1545

Merged
merged 52 commits into from
May 11, 2024
Merged
Show file tree
Hide file tree
Changes from 51 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
575c5a9
begin add gpaw
thangckt Mar 16, 2024
3be325d
Create gpaw.py
thangckt Mar 29, 2024
d3cc70d
Merge pull request #1 from deepmodeling/devel
thangckt Mar 29, 2024
547e13b
Update gpaw.py
thangckt Apr 1, 2024
cfa904b
u
thangckt Apr 1, 2024
946561b
Update arginfo.py
thangckt Apr 2, 2024
11103f5
Merge pull request #2 from deepmodeling/devel
thangckt Apr 2, 2024
0ddbf7c
u
thangckt Apr 2, 2024
0b94ff6
u
thangckt Apr 3, 2024
494b796
u
thangckt Apr 4, 2024
2a58c7e
Merge branch 'devel' of https://github.com/thangckt/dpgen into devel
thangckt Apr 4, 2024
01fbd2f
Merge pull request #4 from deepmodeling/devel
thangckt May 2, 2024
babd77e
modify to use pytorch
thangckt May 5, 2024
28b8a49
option to choose between TF and PT
thangckt May 6, 2024
2822be0
Delete gpaw.py
thangckt May 6, 2024
99efd85
finish add option to select TF/PT
thangckt May 7, 2024
bdecc9f
Merge pull request #5 from deepmodeling/devel
thangckt May 7, 2024
f4f5665
Merge pull request #6 from thangckt/devel
thangckt May 7, 2024
ef1df18
remove GPAW to PR
thangckt May 7, 2024
a1b3ff8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
11dca54
Update simplify.py
thangckt May 7, 2024
a0684ca
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 7, 2024
220735d
Revert "Merge branch 'PR' of https://github.com/thangckt/dpgen into PR"
thangckt May 7, 2024
8f7f491
reset add GPAW from here
thangckt May 7, 2024
05b2412
u
thangckt May 7, 2024
ed832e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
af3fc27
Update arginfo.py
thangckt May 7, 2024
8a949d0
u
thangckt May 7, 2024
7def8ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
52a9989
Update run.py
thangckt May 7, 2024
0d99ea9
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 7, 2024
8bdea17
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
67fab2b
Merge branch 'devel' into PR
thangckt May 7, 2024
706146a
Update run.py
thangckt May 8, 2024
e72f4c8
Merge branch 'devel' into PR
thangckt May 8, 2024
98eccb8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 8, 2024
702510c
Update run.py
thangckt May 8, 2024
6ffd1eb
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 8, 2024
624a48b
Update arginfo.py
thangckt May 8, 2024
f3f49d3
remove gpaw
thangckt May 9, 2024
0f9b1b0
Update run.py
thangckt May 9, 2024
5049771
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 9, 2024
47a766e
Update run.py
thangckt May 9, 2024
083ad45
Update run.py
thangckt May 9, 2024
a500987
Update run.py
thangckt May 9, 2024
35b2713
Update run.py
thangckt May 9, 2024
dd44e79
Update run.py
thangckt May 9, 2024
e07f336
Update run.py
thangckt May 9, 2024
cc19ffa
Update run.py
thangckt May 10, 2024
1e89be7
Merge branch 'devel' into select_backend
thangckt May 10, 2024
4023ccd
support training_init_model
njzjz May 10, 2024
cf68032
fix typo
njzjz May 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions dpgen/generator/arginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def training_args() -> list[Argument]:
list[dargs.Argument]
List of training arguments.
"""
doc_train_backend = (
"The backend of the training. Currently only support tensorflow and pytorch."
)
doc_numb_models = "Number of models to be trained in 00.train. 4 is recommend."
doc_training_iter0_model_path = "The model used to init the first iter training. Number of element should be equal to numb_models."
doc_training_init_model = "Iteration > 0, the model parameters will be initilized from the model trained at the previous iteration. Iteration == 0, the model parameters will be initialized from training_iter0_model_path."
Expand Down Expand Up @@ -123,6 +126,13 @@ def training_args() -> list[Argument]:
doc_training_finetune_model = "At interation 0, finetune the model parameters from the given frozen models. Number of element should be equal to numb_models."

return [
Argument(
"train_backend",
str,
optional=True,
default="tensorflow",
doc=doc_train_backend,
),
Argument("numb_models", int, optional=False, doc=doc_numb_models),
Argument(
"training_iter0_model_path",
Expand Down
104 changes: 73 additions & 31 deletions dpgen/generator/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,19 @@
run_opt_file = os.path.join(ROOT_PATH, "generator/lib/calypso_run_opt.py")


def _get_model_suffix(jdata) -> str:
"""Return the model suffix based on the backend."""
suffix_map = {"tensorflow": ".pb", "pytorch": ".pth"}
backend = jdata.get("train_backend", "tensorflow")
if backend in suffix_map:
suffix = suffix_map[backend]
else:
raise ValueError(
f"The backend {backend} is not available. Supported backends are: 'tensorflow', 'pytorch'."
)
return suffix


def get_job_names(jdata):
jobkeys = []
for ii in jdata.keys():
Expand Down Expand Up @@ -172,7 +185,7 @@ def _check_empty_iter(iter_index, max_v=0):
return all(empty_sys)


def copy_model(numb_model, prv_iter_index, cur_iter_index):
def copy_model(numb_model, prv_iter_index, cur_iter_index, suffix=".pb"):
cwd = os.getcwd()
prv_train_path = os.path.join(make_iter_name(prv_iter_index), train_name)
cur_train_path = os.path.join(make_iter_name(cur_iter_index), train_name)
Expand All @@ -184,7 +197,8 @@ def copy_model(numb_model, prv_iter_index, cur_iter_index):
os.chdir(cur_train_path)
os.symlink(os.path.relpath(prv_train_task), train_task_fmt % ii)
os.symlink(
os.path.join(train_task_fmt % ii, "frozen_model.pb"), "graph.%03d.pb" % ii
os.path.join(train_task_fmt % ii, f"frozen_model{suffix}"),
"graph.%03d%s" % (ii, suffix),
)
os.chdir(cwd)
with open(os.path.join(cur_train_path, "copied"), "w") as fp:
Expand Down Expand Up @@ -315,18 +329,19 @@ def make_train(iter_index, jdata, mdata):
number_old_frames = 0
number_new_frames = 0

suffix = _get_model_suffix(jdata)
model_devi_engine = jdata.get("model_devi_engine", "lammps")
if iter_index > 0 and _check_empty_iter(iter_index - 1, fp_task_min):
log_task("prev data is empty, copy prev model")
copy_model(numb_models, iter_index - 1, iter_index)
copy_model(numb_models, iter_index - 1, iter_index, suffix)
return
elif (
model_devi_engine != "calypso"
and iter_index > 0
and _check_skip_train(model_devi_jobs[iter_index - 1])
):
log_task("skip training at step %d " % (iter_index - 1))
copy_model(numb_models, iter_index - 1, iter_index)
copy_model(numb_models, iter_index - 1, iter_index, suffix)
wanghan-iapcm marked this conversation as resolved.
Show resolved Hide resolved
return
else:
iter_name = make_iter_name(iter_index)
Expand Down Expand Up @@ -647,7 +662,9 @@ def make_train(iter_index, jdata, mdata):
)
if copied_models is not None:
for ii in range(len(copied_models)):
_link_old_models(work_path, [copied_models[ii]], ii, basename="init.pb")
_link_old_models(
work_path, [copied_models[ii]], ii, basename=f"init{suffix}"
)
# Copy user defined forward files
symlink_user_forward_files(mdata=mdata, task_type="train", work_path=work_path)
# HDF5 format for training data
Expand Down Expand Up @@ -699,6 +716,7 @@ def run_train(iter_index, jdata, mdata):
# print("debug:run_train:mdata", mdata)
# load json param
numb_models = jdata["numb_models"]
suffix = _get_model_suffix(jdata)
# train_param = jdata['train_param']
train_input_file = default_train_input_file
training_reuse_iter = jdata.get("training_reuse_iter")
Expand Down Expand Up @@ -730,7 +748,11 @@ def run_train(iter_index, jdata, mdata):
"training_init_model, training_init_frozen_model, and training_finetune_model are mutually exclusive."
)

train_command = mdata.get("train_command", "dp")
train_command = mdata.get("train_command", "dp").strip()
# assert train_command == "dp", "The 'train_command' should be 'dp'" # the tests should be updated to run this command
if suffix == ".pth":
train_command += " --pt"

train_resources = mdata["train_resources"]

# paths
Expand Down Expand Up @@ -761,9 +783,9 @@ def run_train(iter_index, jdata, mdata):
if training_init_model:
init_flag = " --init-model old/model.ckpt"
elif training_init_frozen_model is not None:
init_flag = " --init-frz-model old/init.pb"
init_flag = f" --init-frz-model old/init{suffix}"
elif training_finetune_model is not None:
init_flag = " --finetune old/init.pb"
init_flag = f" --finetune old/init{suffix}"
command = f"{train_command} train {train_input_file}{extra_flags}"
command = f"{{ if [ ! -f model.ckpt.index ]; then {command}{init_flag}; else {command} --restart model.ckpt; fi }}"
command = f"/bin/sh -c {shlex.quote(command)}"
Expand Down Expand Up @@ -792,23 +814,35 @@ def run_train(iter_index, jdata, mdata):
if "srtab_file_path" in jdata.keys():
forward_files.append(zbl_file)
if training_init_model:
forward_files += [
os.path.join("old", "model.ckpt.meta"),
os.path.join("old", "model.ckpt.index"),
os.path.join("old", "model.ckpt.data-00000-of-00001"),
]
if suffix == ".pb":
forward_files += [
os.path.join("old", "model.ckpt.meta"),
os.path.join("old", "model.ckpt.index"),
os.path.join("old", "model.ckpt.data-00000-of-00001"),
]
elif suffix == ".pth":
forward_files += [os.path.join("old", "model.ckpt.pt"))
elif training_init_frozen_model is not None or training_finetune_model is not None:
forward_files.append(os.path.join("old", "init.pb"))
forward_files.append(os.path.join("old", f"init{suffix}"))

backward_files = ["frozen_model.pb", "lcurve.out", "train.log"]
backward_files += [
"model.ckpt.meta",
"model.ckpt.index",
"model.ckpt.data-00000-of-00001",
backward_files = [
f"frozen_model{suffix}",
"lcurve.out",
"train.log",
"checkpoint",
]
if jdata.get("dp_compress", False):
backward_files.append("frozen_model_compressed.pb")
backward_files.append(f"frozen_model_compressed{suffix}")

if suffix == ".pb":
backward_files += [
"model.ckpt.meta",
"model.ckpt.index",
"model.ckpt.data-00000-of-00001",
]
elif suffix == ".pth":
backward_files += ["model.ckpt.pt"]

if not jdata.get("one_h5", False):
init_data_sys_ = jdata["init_data_sys"]
init_data_sys = []
Expand Down Expand Up @@ -879,13 +913,14 @@ def post_train(iter_index, jdata, mdata):
log_task("copied model, do not post train")
return
# symlink models
suffix = _get_model_suffix(jdata)
for ii in range(numb_models):
if not jdata.get("dp_compress", False):
model_name = "frozen_model.pb"
else:
model_name = "frozen_model_compressed.pb"
model_name = f"frozen_model{suffix}"
if jdata.get("dp_compress", False):
model_name = f"frozen_model_compressed{suffix}"

ofile = os.path.join(work_path, "graph.%03d%s" % (ii, suffix))
wanghan-iapcm marked this conversation as resolved.
Show resolved Hide resolved
task_file = os.path.join(train_task_fmt % ii, model_name)
ofile = os.path.join(work_path, "graph.%03d.pb" % ii)
if os.path.isfile(ofile):
os.remove(ofile)
os.symlink(task_file, ofile)
Expand Down Expand Up @@ -1124,7 +1159,8 @@ def make_model_devi(iter_index, jdata, mdata):
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = sorted(glob.glob(os.path.join(train_path, "graph*pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
work_path = os.path.join(iter_name, model_devi_name)
create_path(work_path)
if model_devi_engine == "calypso":
Expand Down Expand Up @@ -1305,7 +1341,8 @@ def _make_model_devi_revmat(iter_index, jdata, mdata, conf_systems):
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = sorted(glob.glob(os.path.join(train_path, "graph*pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1502,7 +1539,8 @@ def _make_model_devi_native(iter_index, jdata, mdata, conf_systems):
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1644,7 +1682,8 @@ def _make_model_devi_native_gromacs(iter_index, jdata, mdata, conf_systems):
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1827,7 +1866,8 @@ def _make_model_devi_amber(
.replace("@qm_theory@", jdata["low_level"])
.replace("@rcut@", str(jdata["cutoff"]))
)
models = sorted(glob.glob(os.path.join(train_path, "graph.*.pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph.*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1935,7 +1975,9 @@ def run_md_model_devi(iter_index, jdata, mdata):
run_tasks = [os.path.basename(ii) for ii in run_tasks_]
# dlog.info("all_task is ", all_task)
# dlog.info("run_tasks in run_model_deviation",run_tasks_)
all_models = glob.glob(os.path.join(work_path, "graph*pb"))

suffix = _get_model_suffix(jdata)
all_models = glob.glob(os.path.join(work_path, f"graph*{suffix}"))
model_names = [os.path.basename(ii) for ii in all_models]

model_devi_engine = jdata.get("model_devi_engine", "lammps")
Expand Down
5 changes: 4 additions & 1 deletion dpgen/simplify/simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
record_iter,
)
from dpgen.generator.run import (
_get_model_suffix,
data_system_fmt,
fp_name,
fp_task_fmt,
Expand Down Expand Up @@ -186,7 +187,9 @@ def make_model_devi(iter_index, jdata, mdata):
# link the model
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = glob.glob(os.path.join(train_path, f"graph*{suffix}"))

for mm in models:
model_name = os.path.basename(mm)
os.symlink(mm, os.path.join(work_path, model_name))
Expand Down
Loading