Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to select backends TF/PT #1545

Merged
merged 52 commits into from
May 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
575c5a9
begin add gpaw
thangckt Mar 16, 2024
3be325d
Create gpaw.py
thangckt Mar 29, 2024
d3cc70d
Merge pull request #1 from deepmodeling/devel
thangckt Mar 29, 2024
547e13b
Update gpaw.py
thangckt Apr 1, 2024
cfa904b
u
thangckt Apr 1, 2024
946561b
Update arginfo.py
thangckt Apr 2, 2024
11103f5
Merge pull request #2 from deepmodeling/devel
thangckt Apr 2, 2024
0ddbf7c
u
thangckt Apr 2, 2024
0b94ff6
u
thangckt Apr 3, 2024
494b796
u
thangckt Apr 4, 2024
2a58c7e
Merge branch 'devel' of https://github.com/thangckt/dpgen into devel
thangckt Apr 4, 2024
01fbd2f
Merge pull request #4 from deepmodeling/devel
thangckt May 2, 2024
babd77e
modify to use pytorch
thangckt May 5, 2024
28b8a49
option to choose between TF and PT
thangckt May 6, 2024
2822be0
Delete gpaw.py
thangckt May 6, 2024
99efd85
finish add option to select TF/PT
thangckt May 7, 2024
bdecc9f
Merge pull request #5 from deepmodeling/devel
thangckt May 7, 2024
f4f5665
Merge pull request #6 from thangckt/devel
thangckt May 7, 2024
ef1df18
remove GPAW to PR
thangckt May 7, 2024
a1b3ff8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
11dca54
Update simplify.py
thangckt May 7, 2024
a0684ca
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 7, 2024
220735d
Revert "Merge branch 'PR' of https://github.com/thangckt/dpgen into PR"
thangckt May 7, 2024
8f7f491
reset add GPAW from here
thangckt May 7, 2024
05b2412
u
thangckt May 7, 2024
ed832e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
af3fc27
Update arginfo.py
thangckt May 7, 2024
8a949d0
u
thangckt May 7, 2024
7def8ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
52a9989
Update run.py
thangckt May 7, 2024
0d99ea9
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 7, 2024
8bdea17
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
67fab2b
Merge branch 'devel' into PR
thangckt May 7, 2024
706146a
Update run.py
thangckt May 8, 2024
e72f4c8
Merge branch 'devel' into PR
thangckt May 8, 2024
98eccb8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 8, 2024
702510c
Update run.py
thangckt May 8, 2024
6ffd1eb
Merge branch 'PR' of https://github.com/thangckt/dpgen into PR
thangckt May 8, 2024
624a48b
Update arginfo.py
thangckt May 8, 2024
f3f49d3
remove gpaw
thangckt May 9, 2024
0f9b1b0
Update run.py
thangckt May 9, 2024
5049771
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 9, 2024
47a766e
Update run.py
thangckt May 9, 2024
083ad45
Update run.py
thangckt May 9, 2024
a500987
Update run.py
thangckt May 9, 2024
35b2713
Update run.py
thangckt May 9, 2024
dd44e79
Update run.py
thangckt May 9, 2024
e07f336
Update run.py
thangckt May 9, 2024
cc19ffa
Update run.py
thangckt May 10, 2024
1e89be7
Merge branch 'devel' into select_backend
thangckt May 10, 2024
4023ccd
support training_init_model
njzjz May 10, 2024
cf68032
fix typo
njzjz May 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions dpgen/generator/arginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def training_args() -> list[Argument]:
list[dargs.Argument]
List of training arguments.
"""
doc_train_backend = (
"The backend of the training. Currently only support tensorflow and pytorch."
)
doc_numb_models = "Number of models to be trained in 00.train. 4 is recommend."
doc_training_iter0_model_path = "The model used to init the first iter training. Number of element should be equal to numb_models."
doc_training_init_model = "Iteration > 0, the model parameters will be initilized from the model trained at the previous iteration. Iteration == 0, the model parameters will be initialized from training_iter0_model_path."
Expand Down Expand Up @@ -123,6 +126,13 @@ def training_args() -> list[Argument]:
doc_training_finetune_model = "At interation 0, finetune the model parameters from the given frozen models. Number of element should be equal to numb_models."

return [
Argument(
"train_backend",
str,
optional=True,
default="tensorflow",
doc=doc_train_backend,
),
Argument("numb_models", int, optional=False, doc=doc_numb_models),
Argument(
"training_iter0_model_path",
Expand Down
104 changes: 73 additions & 31 deletions dpgen/generator/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,19 @@
run_opt_file = os.path.join(ROOT_PATH, "generator/lib/calypso_run_opt.py")


def _get_model_suffix(jdata) -> str:
"""Return the model suffix based on the backend."""
suffix_map = {"tensorflow": ".pb", "pytorch": ".pth"}
backend = jdata.get("train_backend", "tensorflow")
if backend in suffix_map:
suffix = suffix_map[backend]
else:
raise ValueError(

Check warning on line 135 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L135

Added line #L135 was not covered by tests
f"The backend {backend} is not available. Supported backends are: 'tensorflow', 'pytorch'."
)
return suffix


def get_job_names(jdata):
jobkeys = []
for ii in jdata.keys():
Expand Down Expand Up @@ -172,7 +185,7 @@
return all(empty_sys)


def copy_model(numb_model, prv_iter_index, cur_iter_index):
def copy_model(numb_model, prv_iter_index, cur_iter_index, suffix=".pb"):
cwd = os.getcwd()
prv_train_path = os.path.join(make_iter_name(prv_iter_index), train_name)
cur_train_path = os.path.join(make_iter_name(cur_iter_index), train_name)
Expand All @@ -184,7 +197,8 @@
os.chdir(cur_train_path)
os.symlink(os.path.relpath(prv_train_task), train_task_fmt % ii)
os.symlink(
os.path.join(train_task_fmt % ii, "frozen_model.pb"), "graph.%03d.pb" % ii
os.path.join(train_task_fmt % ii, f"frozen_model{suffix}"),
"graph.%03d%s" % (ii, suffix),
)
os.chdir(cwd)
with open(os.path.join(cur_train_path, "copied"), "w") as fp:
Expand Down Expand Up @@ -315,18 +329,19 @@
number_old_frames = 0
number_new_frames = 0

suffix = _get_model_suffix(jdata)
model_devi_engine = jdata.get("model_devi_engine", "lammps")
if iter_index > 0 and _check_empty_iter(iter_index - 1, fp_task_min):
log_task("prev data is empty, copy prev model")
copy_model(numb_models, iter_index - 1, iter_index)
copy_model(numb_models, iter_index - 1, iter_index, suffix)

Check warning on line 336 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L336

Added line #L336 was not covered by tests
return
elif (
model_devi_engine != "calypso"
and iter_index > 0
and _check_skip_train(model_devi_jobs[iter_index - 1])
):
log_task("skip training at step %d " % (iter_index - 1))
copy_model(numb_models, iter_index - 1, iter_index)
copy_model(numb_models, iter_index - 1, iter_index, suffix)

Check warning on line 344 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L344

Added line #L344 was not covered by tests
wanghan-iapcm marked this conversation as resolved.
Show resolved Hide resolved
return
else:
iter_name = make_iter_name(iter_index)
Expand Down Expand Up @@ -647,7 +662,9 @@
)
if copied_models is not None:
for ii in range(len(copied_models)):
_link_old_models(work_path, [copied_models[ii]], ii, basename="init.pb")
_link_old_models(
work_path, [copied_models[ii]], ii, basename=f"init{suffix}"
)
# Copy user defined forward files
symlink_user_forward_files(mdata=mdata, task_type="train", work_path=work_path)
# HDF5 format for training data
Expand Down Expand Up @@ -699,6 +716,7 @@
# print("debug:run_train:mdata", mdata)
# load json param
numb_models = jdata["numb_models"]
suffix = _get_model_suffix(jdata)
# train_param = jdata['train_param']
train_input_file = default_train_input_file
training_reuse_iter = jdata.get("training_reuse_iter")
Expand Down Expand Up @@ -730,7 +748,11 @@
"training_init_model, training_init_frozen_model, and training_finetune_model are mutually exclusive."
)

train_command = mdata.get("train_command", "dp")
train_command = mdata.get("train_command", "dp").strip()
# assert train_command == "dp", "The 'train_command' should be 'dp'" # the tests should be updated to run this command
if suffix == ".pth":
train_command += " --pt"

Check warning on line 754 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L754

Added line #L754 was not covered by tests

train_resources = mdata["train_resources"]

# paths
Expand Down Expand Up @@ -761,9 +783,9 @@
if training_init_model:
init_flag = " --init-model old/model.ckpt"
elif training_init_frozen_model is not None:
init_flag = " --init-frz-model old/init.pb"
init_flag = f" --init-frz-model old/init{suffix}"
elif training_finetune_model is not None:
init_flag = " --finetune old/init.pb"
init_flag = f" --finetune old/init{suffix}"
command = f"{train_command} train {train_input_file}{extra_flags}"
command = f"{{ if [ ! -f model.ckpt.index ]; then {command}{init_flag}; else {command} --restart model.ckpt; fi }}"
command = f"/bin/sh -c {shlex.quote(command)}"
Expand Down Expand Up @@ -792,23 +814,35 @@
if "srtab_file_path" in jdata.keys():
forward_files.append(zbl_file)
if training_init_model:
forward_files += [
os.path.join("old", "model.ckpt.meta"),
os.path.join("old", "model.ckpt.index"),
os.path.join("old", "model.ckpt.data-00000-of-00001"),
]
if suffix == ".pb":
forward_files += [

Check warning on line 818 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L817-L818

Added lines #L817 - L818 were not covered by tests
os.path.join("old", "model.ckpt.meta"),
os.path.join("old", "model.ckpt.index"),
os.path.join("old", "model.ckpt.data-00000-of-00001"),
]
elif suffix == ".pth":
forward_files += [os.path.join("old", "model.ckpt.pt")]

Check warning on line 824 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L823-L824

Added lines #L823 - L824 were not covered by tests
elif training_init_frozen_model is not None or training_finetune_model is not None:
forward_files.append(os.path.join("old", "init.pb"))
forward_files.append(os.path.join("old", f"init{suffix}"))

backward_files = ["frozen_model.pb", "lcurve.out", "train.log"]
backward_files += [
"model.ckpt.meta",
"model.ckpt.index",
"model.ckpt.data-00000-of-00001",
backward_files = [
f"frozen_model{suffix}",
"lcurve.out",
"train.log",
"checkpoint",
]
if jdata.get("dp_compress", False):
backward_files.append("frozen_model_compressed.pb")
backward_files.append(f"frozen_model_compressed{suffix}")

Check warning on line 835 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L835

Added line #L835 was not covered by tests

if suffix == ".pb":
backward_files += [
"model.ckpt.meta",
"model.ckpt.index",
"model.ckpt.data-00000-of-00001",
]
elif suffix == ".pth":
backward_files += ["model.ckpt.pt"]

Check warning on line 844 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L843-L844

Added lines #L843 - L844 were not covered by tests

if not jdata.get("one_h5", False):
init_data_sys_ = jdata["init_data_sys"]
init_data_sys = []
Expand Down Expand Up @@ -879,13 +913,14 @@
log_task("copied model, do not post train")
return
# symlink models
suffix = _get_model_suffix(jdata)

Check warning on line 916 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L916

Added line #L916 was not covered by tests
for ii in range(numb_models):
if not jdata.get("dp_compress", False):
model_name = "frozen_model.pb"
else:
model_name = "frozen_model_compressed.pb"
model_name = f"frozen_model{suffix}"
if jdata.get("dp_compress", False):
model_name = f"frozen_model_compressed{suffix}"

Check warning on line 920 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L918-L920

Added lines #L918 - L920 were not covered by tests

ofile = os.path.join(work_path, "graph.%03d%s" % (ii, suffix))

Check warning on line 922 in dpgen/generator/run.py

View check run for this annotation

Codecov / codecov/patch

dpgen/generator/run.py#L922

Added line #L922 was not covered by tests
wanghan-iapcm marked this conversation as resolved.
Show resolved Hide resolved
task_file = os.path.join(train_task_fmt % ii, model_name)
ofile = os.path.join(work_path, "graph.%03d.pb" % ii)
if os.path.isfile(ofile):
os.remove(ofile)
os.symlink(task_file, ofile)
Expand Down Expand Up @@ -1124,7 +1159,8 @@
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = sorted(glob.glob(os.path.join(train_path, "graph*pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
work_path = os.path.join(iter_name, model_devi_name)
create_path(work_path)
if model_devi_engine == "calypso":
Expand Down Expand Up @@ -1305,7 +1341,8 @@
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = sorted(glob.glob(os.path.join(train_path, "graph*pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1502,7 +1539,8 @@
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1644,7 +1682,8 @@
iter_name = make_iter_name(iter_index)
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1827,7 +1866,8 @@
.replace("@qm_theory@", jdata["low_level"])
.replace("@rcut@", str(jdata["cutoff"]))
)
models = sorted(glob.glob(os.path.join(train_path, "graph.*.pb")))
suffix = _get_model_suffix(jdata)
models = sorted(glob.glob(os.path.join(train_path, f"graph.*{suffix}")))
task_model_list = []
for ii in models:
task_model_list.append(os.path.join("..", os.path.basename(ii)))
Expand Down Expand Up @@ -1935,7 +1975,9 @@
run_tasks = [os.path.basename(ii) for ii in run_tasks_]
# dlog.info("all_task is ", all_task)
# dlog.info("run_tasks in run_model_deviation",run_tasks_)
all_models = glob.glob(os.path.join(work_path, "graph*pb"))

suffix = _get_model_suffix(jdata)
all_models = glob.glob(os.path.join(work_path, f"graph*{suffix}"))
model_names = [os.path.basename(ii) for ii in all_models]

model_devi_engine = jdata.get("model_devi_engine", "lammps")
Expand Down
5 changes: 4 additions & 1 deletion dpgen/simplify/simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
record_iter,
)
from dpgen.generator.run import (
_get_model_suffix,
data_system_fmt,
fp_name,
fp_task_fmt,
Expand Down Expand Up @@ -186,7 +187,9 @@
# link the model
train_path = os.path.join(iter_name, train_name)
train_path = os.path.abspath(train_path)
models = glob.glob(os.path.join(train_path, "graph*pb"))
suffix = _get_model_suffix(jdata)
models = glob.glob(os.path.join(train_path, f"graph*{suffix}"))

Check warning on line 191 in dpgen/simplify/simplify.py

View check run for this annotation

Codecov / codecov/patch

dpgen/simplify/simplify.py#L190-L191

Added lines #L190 - L191 were not covered by tests

for mm in models:
model_name = os.path.basename(mm)
os.symlink(mm, os.path.join(work_path, model_name))
Expand Down