From 9ab511a661bf592c83e3cbed6bddda8de8c54b16 Mon Sep 17 00:00:00 2001 From: zhangjiajin Date: Mon, 26 Sep 2022 09:11:42 +0000 Subject: [PATCH] release 1.8.5 --- README.cn.md | 10 ++----- README.md | 11 +++---- RELEASE.md | 2 +- docs/cn/user/ascend_910.md | 7 ----- docs/cn/user/security_configure.md | 29 ++++++++++++++++++- docs/en/user/ascend_910.md | 7 ----- docs/en/user/security_configure.md | 27 +++++++++++++++++ evaluate_service/RELEASE.md | 2 +- evaluate_service/evaluate_service/__init__.py | 2 +- evaluate_service/evaluate_service/main.py | 6 ++-- .../evaluate_service/run_flask.py | 20 +++++++++++-- evaluate_service/setup.py | 3 +- .../data_augmentation/cyclesr/cyclesr.yml | 1 + examples/nas/modnas/darts.yml | 1 + examples/nas/modnas/mbv2.yml | 1 + examples/nas/modnas/ps.yml | 1 + examples/nas/modnas/pxl.yml | 1 + setup.py | 3 +- vega/__init__.py | 2 +- .../nas/sp_nas/spnas_trainer_callback.py | 6 ++-- vega/algorithms/nas/sp_nas/src/dataset.py | 4 +-- .../nas/sp_nas/src/model_utils/config.py | 2 +- vega/common/backend_register.py | 12 ++++---- vega/common/file_ops.py | 18 ++++++++++-- vega/common/general.py | 1 + vega/common/utils.py | 6 ++++ vega/core/pipeline/pipeline.py | 6 ++-- vega/core/scheduler/local_master.py | 2 ++ vega/core/scheduler/run_dask.py | 3 ++ vega/core/scheduler/worker_env.py | 8 ++--- vega/model_zoo/model_zoo.py | 6 ++-- vega/security/conf.py | 10 ++++--- vega/security/kmc/kmc.py | 21 ++++++++++++-- vega/security/post.py | 20 +++++++++++-- vega/security/run_dask.py | 3 +- vega/security/utils.py | 15 ++++------ vega/trainer/callbacks/hccl.py | 3 +- 37 files changed, 200 insertions(+), 82 deletions(-) diff --git a/README.cn.md b/README.cn.md index 63671c8..43bb3c0 100644 --- a/README.cn.md +++ b/README.cn.md @@ -9,16 +9,12 @@ --- -**Vega ver1.8.4 发布** +**Vega ver1.8.5 发布** - 错误修正 - - 修正ASHA算法更新数据时失败的问题。 - - 修正HCCL+Apex下,loss不更新的问题。 - - 增加字典类指标。 - - 更新安全配置文档。 - - 移除安全模式下对Horovod和TensorFlow的支持。 - - 增加安全模型下对Python3.9及以上版本的要求。 + - 
修正SPNAS算法集群训练失败时的问题。 + - 修正了安全模式下模型拷贝失败等问题。 --- diff --git a/README.md b/README.md index 8200484..2d43108 100644 --- a/README.md +++ b/README.md @@ -8,16 +8,13 @@ --- -**Vega ver1.8.4 released** +**Vega ver1.8.5 released** - Bug Fixed: - - Fixed bug that ASHA failed to update data. - - Fixed bug that loss is not updated on HCCL+Apex. - - Add dictionary metrics. - - Update the security configuration document. - - Not Allowed Horovod and TensorFlow in safe mode. - - Python 3.9 or later is required in the security model. + - Fixed a bug when the SPNAS algorithm cluster training fails. + - Fixed bugs such as model copy failure in safe mode. + --- diff --git a/RELEASE.md b/RELEASE.md index 54e2d47..f4db9e6 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,4 +1,4 @@ -**Vega ver1.8.4 released:** +**Vega ver1.8.5 released:** **Introduction** diff --git a/docs/cn/user/ascend_910.md b/docs/cn/user/ascend_910.md index 5ba594d..4d58ae7 100644 --- a/docs/cn/user/ascend_910.md +++ b/docs/cn/user/ascend_910.md @@ -137,10 +137,3 @@ pip3 install --user --no-deps noah-vega ```bash pip3 show noah-vega ``` - -另外要注意的是,dask和distributed这两个包,需要安装如下版本: - -```bash -pip3 install --user distributed==2021.7.0 -pip3 install --user dask==2021.7.0 -``` diff --git a/docs/cn/user/security_configure.md b/docs/cn/user/security_configure.md index a367e9e..7b71df1 100644 --- a/docs/cn/user/security_configure.md +++ b/docs/cn/user/security_configure.md @@ -17,6 +17,7 @@ Vega的安全配置,包括如下步骤: 1. **Python3.9及以上** 2. **dask和distributed版本为2022.2.0** + ## 1.安装OpenSSL 首先要安装OpenSSL 1.1.1,从源码编译安装,或者直接安装编译后的发行包。 @@ -87,7 +88,7 @@ openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out s rm server.csr ``` -执行如下脚本生成评估服务客户端所使用的证书的加密私钥,执行该命令时,会提示输入加密密码,密码的强度要求如服务器端私钥,且和服务器段私钥密码不同,请记录好改密码,后继还需使用: +执行如下脚本生成评估服务客户端所使用的证书的加密私钥,执行该命令时,会提示输入加密密码,密码的强度要求如服务器端私钥,且和服务器端私钥密码不同,请记录好该密码,后继还需使用: ```shell openssl genrsa -aes-256-ofb -out client.key 4096 @@ -172,6 +173,17 @@ chmod 600 ~/.vega/* 1. 
如上的秘钥、证书、加密材料也可以放到其他目录位置,注意访问权限要设置为`600`,并在后继的配置文件中同步修改该文件的位置,需要使用绝对路径。 2. 在训练集群上,需要保留`ca.crt`、`client.key`、`client.crt`、`ksmaster_client.dat`、`ksstandby_client.dat`、`server_dask.key`、`server_dask.crt`、`client_dask.key`、`client_dask.crt`,并删除其他文件。 3. 评估服务上,需要保留`ca.crt`、`server.key`、`server.crt`、`ksmaster_server.dat`、`ksstandby_server.dat`,并删除其他文件。 +4. 以下为默认配置的加密套件: + + ```txt + ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256:DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM + ``` + + 如需缩小范围,可在`client.ini`与`vega.ini`中加入配置: + + ```ini + ciphers=ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM + ``` 在`~/.vega`目录下创建`server.ini`和`client.ini`。 @@ -290,3 +302,18 @@ find ~/.local/ -name *.pem ### 9.5 Horovod 和 TensorFlow 在安全模式下,Vega不支持Horovod数据并行,也不支持TensorFlow框架,Vega在运行前检查若是Horovod数据并行程序,或者TensorFlow框架,会自动退出。 + +### 9.6 限定Distributed仅使用tls1.3协议进行通信 + +若需要限定开源软件Distributed的组件间的通信仅使用tls1.3协议,需要配置`~/.config/dask/distributed.yaml` + +distributed.yaml: + +```yaml +distributed: + comm: + tls: + min-version: 1.3 +``` + +请参考Dask的[配置指导](https://docs.dask.org/en/stable/configuration.html)。 diff --git a/docs/en/user/ascend_910.md b/docs/en/user/ascend_910.md index f666e46..915a9bf 100644 --- a/docs/en/user/ascend_910.md +++ b/docs/en/user/ascend_910.md @@ -144,10 +144,3 @@ Run the following command to view the Vega dependency package: ```bash pip3 show noah-vega ``` - -Note that the following versions must be installed for the dask and distributed packages: - -```bash -pip3 install --user distributed==2021.7.0 -pip3 install --user dask==2021.7.0 -``` \ No newline at end of file diff --git a/docs/en/user/security_configure.md b/docs/en/user/security_configure.md index 4b85e07..c470e64 100644 --- a/docs/en/user/security_configure.md +++ b/docs/en/user/security_configure.md @@ -17,6 
+17,7 @@ requirements: 1. **Python 3.9 or later.** 2. **Dask and Distributed version is 2022.2.0.** + ## 1. Install OpenSSL You need to install OpenSSL 1.1.1, compile and install from the source code, or directly install the compiled release package. @@ -173,6 +174,17 @@ Description: 1. The preceding keys, certificates, and encryption materials can also be stored in other directories. The access permission must be set to 600, and the file location must be changed to an absolute path in subsequent configuration files. 2. In the train cluster, reserve `ca.crt`, `client.key`, `client.crt`, `ksmaster_client.dat`, `ksstandby_client.dat`, and `server_dask.key`, `server_dask.crt`, `client_dask.key`, `client_dask.crt`, and delete other files. 3. In the evaluate service, reserve `ca.crt`, `server.key`, `server.crt`, `ksmaster_server.dat`, and `ksstandby_server.dat` files, and delete other files. +4. The default cipher suites are as follows: + + ```txt + ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256:DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM + ``` + + To narrow down the scope, add configurations to the `client.ini` and `vega.ini` files: + + ```ini + ciphers=ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM + ``` Create `server.ini` and `client.ini` in the `~/.vega` directory. @@ -292,3 +304,18 @@ Find the private key file of the open-source software on which Vega depends amon ### 9.5 Horovod and TensorFlow In security mode, Vega does not support Horovod or the TensorFlow framework. Vega automatically exits if Vega run on Horovod or the TensorFlow framework. 
+ +### 9.6 Only TLS 1.3 can be used for Distributed + +If only the TLS 1.3 protocol needs to be used for communication between Distributed components, configure `~/.config/dask/distributed.yaml` + +distributed.yaml: + +```yaml +distributed: + comm: + tls: + min-version: 1.3 +``` + +For details, see the [Configuration Guide](https://docs.dask.org/en/stable/configuration.html). diff --git a/evaluate_service/RELEASE.md b/evaluate_service/RELEASE.md index 81f473d..1ef5dd2 100644 --- a/evaluate_service/RELEASE.md +++ b/evaluate_service/RELEASE.md @@ -1,4 +1,4 @@ -**Evaluate Service ver1.8.4 released:** +**Evaluate Service ver1.8.5 released:** **Introduction** diff --git a/evaluate_service/evaluate_service/__init__.py b/evaluate_service/evaluate_service/__init__.py index 50ff521..3f3d48f 100644 --- a/evaluate_service/evaluate_service/__init__.py +++ b/evaluate_service/evaluate_service/__init__.py @@ -16,4 +16,4 @@ """Evaluate service.""" -__version__ = "1.8.4" +__version__ = "1.8.5" diff --git a/evaluate_service/evaluate_service/main.py b/evaluate_service/evaluate_service/main.py index f3dc563..86a394d 100644 --- a/evaluate_service/evaluate_service/main.py +++ b/evaluate_service/evaluate_service/main.py @@ -176,8 +176,10 @@ def upload_files(self): logging.warning("The timestamp is {}.".format(self.now_time)) self.upload_file_path = os.path.join(self.current_path, "out", self.now_time) self.share_dir = os.path.join(self.current_path, "out", self.job_id) - os.makedirs(self.upload_file_path) - os.makedirs(self.share_dir) + if not os.path.exists(self.upload_file_path): + os.makedirs(self.upload_file_path) + if not os.path.exists(self.share_dir): + os.makedirs(self.share_dir) patterns = [".pkl", ".pth", ".pt", ".pb", ".ckpt", ".air", '.om', ".onnx", ".caffemodel", ".pbtxt", ".prototxt"] model_file = request.files.get("model_file") diff --git a/evaluate_service/evaluate_service/run_flask.py b/evaluate_service/evaluate_service/run_flask.py index 355260b..8822b8e 100644 --- 
a/evaluate_service/evaluate_service/run_flask.py +++ b/evaluate_service/evaluate_service/run_flask.py @@ -18,6 +18,7 @@ import configparser import logging +import ssl import os from multiprocessing import Process import gevent @@ -76,13 +77,28 @@ def run_flask(app, host, port, security_mode): encrypted_password = config.get('security').get('encrypted_password') key_component_1 = config.get('security').get('key_component_1') key_component_2 = config.get('security').get('key_component_2') + ciphers = config.get('security').get('ciphers') + cipher_suites = "ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256" \ + ":ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384" \ + ":DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256" \ + ":DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM" + + if ciphers: + ciphersList = [cipher for cipher in ciphers.split(':') if cipher in cipher_suites.split(':')] + if ciphersList == []: + raise ssl.SSLError("The ciphers are invalid, please check.") + else: + ciphers = ':'.join(ciphersList) + else: + ciphers = cipher_suites + if not check_risky_files((ca_cert, server_cert, server_secret_key, key_component_1, key_component_2)): return try: if encrypted_password == "": - ssl_context = create_context(ca_cert, server_cert, server_secret_key) + ssl_context = create_context(ca_cert, server_cert, server_secret_key, ciphers) else: - ssl_context = create_context(ca_cert, server_cert, server_secret_key, + ssl_context = create_context(ca_cert, server_cert, server_secret_key, ciphers, encrypted_password, key_component_1, key_component_2) except Exception: logging.error("Fail to create context.") diff --git a/evaluate_service/setup.py b/evaluate_service/setup.py index 3a9e468..724b069 100644 --- a/evaluate_service/setup.py +++ b/evaluate_service/setup.py @@ -60,7 +60,7 @@ def run(self): setuptools.setup( name="evaluate-service", - version="1.8.4", + 
version="1.8.5", packages=["evaluate_service"], include_package_data=True, python_requires=">=3.6", @@ -80,6 +80,7 @@ def run(self): "Flask-RESTful", "Flask-Limiter", "gevent", + "PyYAML", ], cmdclass={ "build_py": custom_build_py, diff --git a/examples/data_augmentation/cyclesr/cyclesr.yml b/examples/data_augmentation/cyclesr/cyclesr.yml index c560262..25103c3 100644 --- a/examples/data_augmentation/cyclesr/cyclesr.yml +++ b/examples/data_augmentation/cyclesr/cyclesr.yml @@ -1,5 +1,6 @@ general: backend: pytorch + requires: ["tensorboardX"] pipeline: [fully_train] diff --git a/examples/nas/modnas/darts.yml b/examples/nas/modnas/darts.yml index 4892809..6927e10 100644 --- a/examples/nas/modnas/darts.yml +++ b/examples/nas/modnas/darts.yml @@ -1,5 +1,6 @@ general: backend: pytorch + requires: ["tensorboardX"] pipeline: [nas, fully_train] diff --git a/examples/nas/modnas/mbv2.yml b/examples/nas/modnas/mbv2.yml index 9a35f49..caeed46 100644 --- a/examples/nas/modnas/mbv2.yml +++ b/examples/nas/modnas/mbv2.yml @@ -1,5 +1,6 @@ general: backend: pytorch + requires: ["tensorboardX"] pipeline: [fully_train] diff --git a/examples/nas/modnas/ps.yml b/examples/nas/modnas/ps.yml index c661483..fdcef37 100644 --- a/examples/nas/modnas/ps.yml +++ b/examples/nas/modnas/ps.yml @@ -1,5 +1,6 @@ general: backend: pytorch + requires: ["tensorboardX"] pipeline: [nas, fully_train] diff --git a/examples/nas/modnas/pxl.yml b/examples/nas/modnas/pxl.yml index b9a807c..fd40822 100644 --- a/examples/nas/modnas/pxl.yml +++ b/examples/nas/modnas/pxl.yml @@ -1,5 +1,6 @@ general: backend: pytorch + requires: ["tensorboardX"] pipeline: [nas, fully_train] diff --git a/setup.py b/setup.py index b50bb15..2ab348a 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ setuptools.setup( name="noah-vega", - version="1.8.4", + version="1.8.5", packages=["vega"], include_package_data=True, python_requires=">=3.6", @@ -59,7 +59,6 @@ "dill", "scikit-learn", "opencv-python", - "tensorboardX", ], 
entry_points=""" [console_scripts] diff --git a/vega/__init__.py b/vega/__init__.py index de09e94..dc994b7 100644 --- a/vega/__init__.py +++ b/vega/__init__.py @@ -34,7 +34,7 @@ "get_quota", ] -__version__ = "1.8.4" +__version__ = "1.8.5" import sys diff --git a/vega/algorithms/nas/sp_nas/spnas_trainer_callback.py b/vega/algorithms/nas/sp_nas/spnas_trainer_callback.py index 6360823..62f0ca4 100644 --- a/vega/algorithms/nas/sp_nas/spnas_trainer_callback.py +++ b/vega/algorithms/nas/sp_nas/spnas_trainer_callback.py @@ -41,6 +41,7 @@ def valid(): """Construct the trainer of SpNas.""" config_val = DatasetConfig().to_dict() + dataset_type = config_val.type config_val = config_val['_class_data'].val prefix = "FasterRcnn_eval.mindrecord" mindrecord_dir = config_val.mindrecord_dir @@ -49,7 +50,7 @@ def valid(): if not os.path.exists(mindrecord_file): if not os.path.isdir(mindrecord_dir): os.makedirs(mindrecord_dir) - if config_val.dataset == "coco": + if dataset_type == "CocoDataset": if os.path.isdir(config_val.coco_root): data_to_mindrecord_byte_image(config_val, "coco", False, prefix, file_num=1) else: @@ -67,6 +68,7 @@ def valid(): def train(): """Train fasterrcnn dataset.""" config_train = DatasetConfig().to_dict() + dataset_type = config_train.type config_train = config_train['_class_data'].train prefix = "FasterRcnn.mindrecord" mindrecord_dir = config_train.mindrecord_dir @@ -78,7 +80,7 @@ def train(): if rank == 0 and not os.path.exists(mindrecord_file): if not os.path.isdir(mindrecord_dir): os.makedirs(mindrecord_dir) - if config.dataset == "coco": + if dataset_type == "CocoDataset": if os.path.isdir(config_train.coco_root): if not os.path.exists(config_train.coco_root): logging.info("Please make sure config:coco_root is valid.") diff --git a/vega/algorithms/nas/sp_nas/src/dataset.py b/vega/algorithms/nas/sp_nas/src/dataset.py index 593850d..3afdc29 100644 --- a/vega/algorithms/nas/sp_nas/src/dataset.py +++ b/vega/algorithms/nas/sp_nas/src/dataset.py @@ -361,10 
+361,10 @@ def create_coco_label(is_training, config): from pycocotools.coco import COCO coco_root = config.coco_root - data_type = config.val_data_type if is_training: data_type = config.train_data_type - + else: + data_type = config.val_data_type # Classes need to train or test. train_cls = config.coco_classes train_cls_dict = {} diff --git a/vega/algorithms/nas/sp_nas/src/model_utils/config.py b/vega/algorithms/nas/sp_nas/src/model_utils/config.py index a8f6958..d9a574f 100644 --- a/vega/algorithms/nas/sp_nas/src/model_utils/config.py +++ b/vega/algorithms/nas/sp_nas/src/model_utils/config.py @@ -66,7 +66,7 @@ def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_ else: parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, help=help_description) - args = parser.parse_args() + args, _ = parser.parse_known_args() return args diff --git a/vega/common/backend_register.py b/vega/common/backend_register.py index 0af3f64..8b59072 100644 --- a/vega/common/backend_register.py +++ b/vega/common/backend_register.py @@ -17,7 +17,8 @@ """Backend Register.""" import os -import sys +import logging +import traceback __all__ = [ "set_backend", @@ -135,7 +136,8 @@ def get_devices(): def import_extension_module(): """Import extension module.""" - try: - import ascend_automl - except ImportError: - pass + if is_npu_device(): + try: + import ascend_automl + except ImportError: + logging.debug(traceback.format_exc()) diff --git a/vega/common/file_ops.py b/vega/common/file_ops.py index 0895511..64db41a 100644 --- a/vega/common/file_ops.py +++ b/vega/common/file_ops.py @@ -155,9 +155,23 @@ def copy_file(cls, src, dst): if os.path.isfile(src): shutil.copy(src, dst) else: - logger.error("failed to copy file, file is not existed, file={}.".format(src)) + logger.error(f"failed to copy file, file is not existed, file={src}.") + except OSError as os_error: + if os_error.errno == 13 and os.path.abspath(os_error.filename) != 
os.path.abspath(src): + need_try_again = True + os_error_filename = os.path.abspath(os_error.filename) + else: + logger.error(f"Failed to copy file, src={src}, dst={dst}, msg={os_error}") except Exception as ex: - logger.error("Failed to copy file, src={}, dst={}, msg={}".format(src, dst, str(ex))) + logger.error(f"Failed to copy file, src={src}, dst={dst}, msg={ex}") + + if "need_try_again" in locals(): + try: + logger.info("The dest file is readonly, remove the dest file and try again.") + os.remove(os_error_filename) + shutil.copy(src, dst) + except Exception as ex: + logger.error(f"Failed to copy file after removed dest file, src={src}, dst={dst}, msg={ex}") @classmethod def download_dataset(cls, src_path, local_path=None): diff --git a/vega/common/general.py b/vega/common/general.py index 9320eca..f32587f 100644 --- a/vega/common/general.py +++ b/vega/common/general.py @@ -132,3 +132,4 @@ class General(ConfigSerializable): ms_execute_mode = 0 # 0-GRAPH_MODE 1-PYNATIVE_MODE dataset_sink_mode = True security = False + skip_trainer_error = True diff --git a/vega/common/utils.py b/vega/common/utils.py index 1487d1c..201f9a9 100644 --- a/vega/common/utils.py +++ b/vega/common/utils.py @@ -187,8 +187,14 @@ def verify_requires(requires): if requires and isinstance(requires, list): failed = [] for pkg in requires: + _lower = False try: __import__(pkg.split("=")[0].replace("<", "").replace(">", "").lower()) + except Exception: + _lower = True + try: + if _lower: + __import__(pkg.split("=")[0].replace("<", "").replace(">", "")) except Exception: failed.append(pkg) if failed: diff --git a/vega/core/pipeline/pipeline.py b/vega/core/pipeline/pipeline.py index 1fff5c0..63783bb 100644 --- a/vega/core/pipeline/pipeline.py +++ b/vega/core/pipeline/pipeline.py @@ -28,7 +28,6 @@ from vega.common.general import General from vega.report import ReportServer from vega.common.message_server import MessageServer -from vega.common.parameter_sharing import ParameterSharing from 
.pipe_step import PipeStep from .conf import PipeStepConfig, PipelineConfig @@ -106,10 +105,11 @@ def _set_evaluator_config(self, step_cfg): PipeStepConfig.evaluator_enable = True def _interval_time(self, start, end): - seconds = (end - start).seconds + time_difference = end - start + seconds = time_difference.seconds minutes, seconds = divmod(seconds, 60) hours, minutes = divmod(minutes, 60) - return "%d:%02d:%02d" % (hours, minutes, seconds) + return "%d:%02d:%02d" % (hours + time_difference.days * 24, minutes, seconds) def _show_pipeline_info(self): logging.info("-" * 48) diff --git a/vega/core/scheduler/local_master.py b/vega/core/scheduler/local_master.py index 4efcc23..4f3dbdf 100644 --- a/vega/core/scheduler/local_master.py +++ b/vega/core/scheduler/local_master.py @@ -65,6 +65,8 @@ def run(self, worker, evaluator=None): except Exception as e: logging.debug(traceback.format_exc()) logging.error(f"Failed to run worker, id: {worker.worker_id}, message: {e}") + if not General.skip_trainer_error: + raise e self._update(step_name, worker_id) diff --git a/vega/core/scheduler/run_dask.py b/vega/core/scheduler/run_dask.py index 148f53b..f3febe3 100644 --- a/vega/core/scheduler/run_dask.py +++ b/vega/core/scheduler/run_dask.py @@ -19,6 +19,7 @@ import os import subprocess import shutil +import time from distributed import Client from vega.common import General @@ -78,6 +79,7 @@ def run_local_worker(slave_ip, address, local_dir): ) else: from vega.security.run_dask import run_local_worker_security + time.sleep(1) return run_local_worker_security(slave_ip, address, local_dir) @@ -102,4 +104,5 @@ def run_remote_worker(slave_ip, address, local_dir): return id else: from vega.security.run_dask import run_remote_worker_security + time.sleep(1) return run_remote_worker_security(slave_ip, address, local_dir) diff --git a/vega/core/scheduler/worker_env.py b/vega/core/scheduler/worker_env.py index 3cba44b..daa9bf0 100644 --- a/vega/core/scheduler/worker_env.py +++ 
b/vega/core/scheduler/worker_env.py @@ -107,8 +107,8 @@ def _set_rank_info(self, device_id, rank_table_file, ip): os.environ['MASTER_PORT'] = rank_table_json['server_list'][0].get('server_port', port) else: # multi-nodes - if "DLS_TASK_INDEX" in os.environ: - index = int(os.environ["DLS_TASK_INDEX"]) + if "DLS_TASK_INDEX" in os.environ or 'VC_TASK_INDEX' in os.environ: + index = int(os.getenv('DLS_TASK_INDEX', os.getenv('VC_TASK_INDEX'))) devices = server_list[index]["device"] rank_id = list(filter(lambda x: x["device_id"] == device_id, devices))[0]["rank_id"] rank_size = str(sum([len(x["device"]) for x in server_list])) @@ -120,8 +120,8 @@ def _set_rank_info(self, device_id, rank_table_file, ip): rank_size = str(sum([len(x["device"]) for x in server_list])) os.environ['RANK_ID'] = rank_id os.environ['RANK_SIZE'] = rank_size - except Exception: - logging.warn(f"wrong rank table file: {rank_table_file}") + except Exception as e: + logging.warn(f"wrong rank table file: {rank_table_file}, message: {e}") def _get_device_index(self, worker): ports_list = json.loads(os.environ["vega_workers_list"]) diff --git a/vega/model_zoo/model_zoo.py b/vega/model_zoo/model_zoo.py index 2e0ced4..b26085a 100644 --- a/vega/model_zoo/model_zoo.py +++ b/vega/model_zoo/model_zoo.py @@ -175,7 +175,7 @@ def _load_pretrained_model(cls, model, pretrained_model_file, exclude_weight_pre elif vega.is_tf_backend(): return cls._load_tf_model(model, pretrained_model_file) else: - return cls._load_ms_model(model, pretrained_model_file) + return cls._load_ms_model(model, pretrained_model_file, exclude_weight_prefix) @classmethod def _load_torch_model(cls, model, pretrained_model_file, exclude_weight_prefix=None): @@ -214,7 +214,7 @@ def _load_tf_model(cls, model, pretrained_model_file): return model @classmethod - def _load_ms_model(cls, model, pretrained_model_file): + def _load_ms_model(cls, model, pretrained_model_file, exclude_weight_prefix): from mindspore.train.serialization import 
load_checkpoint if hasattr(model, "pretrained"): pretrained_weight = model.pretrained(pretrained_model_file) @@ -227,7 +227,7 @@ def _load_ms_model(cls, model, pretrained_model_file): pretrained_weight = os.path.join(pretrained_model_file, file) break network = model if not hasattr(model, "get_ori_model") else model.get_ori_model() - load_checkpoint(pretrained_weight, net=network) + load_checkpoint(pretrained_weight, net=network, filter_prefix=exclude_weight_prefix) return model @classmethod diff --git a/vega/security/conf.py b/vega/security/conf.py index b168581..db23cea 100644 --- a/vega/security/conf.py +++ b/vega/security/conf.py @@ -59,7 +59,7 @@ def load(self) -> bool: if "security" not in config.sections(): return False keys = [] - pass_check_keys = ["encrypted_password", "white_list"] + pass_check_keys = ["encrypted_password", "white_list", "ciphers"] for key in config["security"]: if key not in self.keys: return False @@ -69,8 +69,9 @@ def load(self) -> bool: keys.append(key) if len(keys) != len(self.keys): missing_keys = list(set(self.keys) - set(keys)) - logging.error(f"setting items {missing_keys} are missing in {self.file_name}") - return False + if missing_keys != ["ciphers"]: + logging.error(f"setting items {missing_keys} are missing in {self.file_name}") + return False return True @@ -100,11 +101,12 @@ def __init__(self): self.encrypted_password = None self.key_component_1 = None self.key_component_2 = None + self.ciphers = None self.white_list = [] self.file_name = os.path.expanduser("~/.vega/client.ini") self.keys = [ "ca_cert", "client_cert", "client_secret_key", "encrypted_password", - "key_component_1", "key_component_2"] + "key_component_1", "key_component_2", "ciphers"] _server_config = ServerConfig() diff --git a/vega/security/kmc/kmc.py b/vega/security/kmc/kmc.py index 37e3131..61da454 100644 --- a/vega/security/kmc/kmc.py +++ b/vega/security/kmc/kmc.py @@ -18,6 +18,7 @@ import ctypes import os +import random from ctypes.util import 
find_library import logging import platform @@ -28,7 +29,6 @@ _libc_dll: ctypes.CDLL = None ADVANCE_DAY = 3 - def hmac(domain_id: int, plain_text: str) -> str: """Encode HMAC code.""" p_char = ctypes.c_char_p() @@ -102,6 +102,11 @@ def _decrypt(domain_id: int, cipher_text: str): def check_and_update_mk(domain_id: int, advance_day: int) -> bool: """Check and update mk.""" + try: + _kmc_dll.KeRefreshMkMask() + except Exception as err: + logging.error('refresh_task failed, catch error: %s', err) + ret = _kmc_dll.KeCheckAndUpdateMk(domain_id, advance_day) if ret != 0: logging.error(f"failed to call KeCheckAndUpdateMk, code={ret}") @@ -180,7 +185,11 @@ def _init_kmc_config(primary_key_store_file, standby_key_store_file, alg_id, dom config.procLockPerm = 0o0600 config.sdpAlgId = alg_id config.hmacAlgId = 2052 # HMAC_SHA256 2052; HMAC_SHA384 2053 HMAC_SHA512 2054 - config.semKey = 0x20161516 + DEFAULT_SEM_KEY = 0x20160000 + MIN_HEX_SEM_KEY = 0x1111 + MAX_HEX_SEM_KEY = 0x9999 + config.semKey = DEFAULT_SEM_KEY + \ + random.randint(MIN_HEX_SEM_KEY, MAX_HEX_SEM_KEY) _kmc_dll.KeInitialize.restype = ctypes.c_int _kmc_dll.KeInitialize.argtypes = [ctypes.POINTER(KMCConfig)] return _kmc_dll.KeInitialize(ctypes.byref(config)) @@ -197,6 +206,13 @@ def init(primary_key_store_file: str, standby_key_store_file: str, alg_id: int, if ret != 0: logging.error(f"failed to call KeInitialized, code={ret}") return False + domain_id = 0 + try: + _kmc_dll.KeActiveNewKey(domain_id) + except Exception: + logging.error("failed to call KeActiveNewKey.") + + check_and_update_mk(domain_id, ADVANCE_DAY) return True @@ -223,6 +239,5 @@ def decrypt(cert_pem_file, secret_key_file, key_mm, key_component_1, key_compone if decrypt_mm == "": logging.error("kmc init error.") raise Exception('ERROR: kmc init failed!') - check_and_update_mk(domain_id, ADVANCE_DAY) finalize() return decrypt_mm diff --git a/vega/security/post.py b/vega/security/post.py index a5110e1..af28e2b 100644 --- a/vega/security/post.py +++ 
b/vega/security/post.py @@ -15,9 +15,11 @@ """Rest post operation in security mode.""" import urllib +import ssl import json import logging import requests + from .conf import get_config from .utils import create_context from .args import check_msg @@ -34,6 +36,20 @@ def post(host, files, data): encrypted_password = sec_cfg.encrypted_password key_component_1 = sec_cfg.key_component_1 key_component_2 = sec_cfg.key_component_2 + ciphers = sec_cfg.ciphers + cipher_suites = "ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256" \ + ":ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384" \ + ":DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256" \ + ":DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM" + + if ciphers: + ciphersList = [cipher for cipher in ciphers.split(':') if cipher in cipher_suites.split(':')] + if ciphersList == []: + raise ssl.SSLError("The ciphers are invalid, please check.") + else: + ciphers = ':'.join(ciphersList) + else: + ciphers = cipher_suites if not cert_pem_file or not secret_key_file or not ca_file: logging.error("CERT file is not existed.") @@ -42,9 +58,9 @@ def post(host, files, data): logging.error(f"The cert {ca_file} and {cert_pem_file} are invalid, please check.") if encrypted_password == "": - context = create_context(ca_file, cert_pem_file, secret_key_file) + context = create_context(ca_file, cert_pem_file, secret_key_file, ciphers) else: - context = create_context(ca_file, cert_pem_file, secret_key_file, encrypted_password, key_component_1, + context = create_context(ca_file, cert_pem_file, secret_key_file, ciphers, encrypted_password, key_component_1, key_component_2) if host.lower().startswith('https') is False: raise Exception(f'The host {host} must start with https') diff --git a/vega/security/run_dask.py b/vega/security/run_dask.py index 69e2fba..8025155 100644 --- a/vega/security/run_dask.py +++ b/vega/security/run_dask.py @@ 
-25,7 +25,7 @@ from distributed.security import Security from .conf import get_config from .verify_cert import verify_cert - +import ssl sec_cfg = get_config('server') @@ -38,6 +38,7 @@ def get_client_security(address): sec = Security(tls_ca_file=sec_cfg.ca_cert, tls_client_cert=sec_cfg.client_cert_dask, tls_client_key=sec_cfg.client_secret_key_dask, + tls_min_version=ssl.TLSVersion.TLSv1_3, require_encryption=True) return Client(address, security=sec) diff --git a/vega/security/utils.py b/vega/security/utils.py index 9b6c220..d73bc9a 100644 --- a/vega/security/utils.py +++ b/vega/security/utils.py @@ -18,18 +18,15 @@ import logging -def create_context(ca_file, cert_pem_file, secret_key_file, key_mm=None, key_component_1=None, key_component_2=None): +def create_context(ca_file, cert_pem_file, secret_key_file, ciphers, key_mm=None, key_component_1=None, key_component_2=None): """Create the SSL context.""" - ciphers = "ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256" \ - ":ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384" \ - ":DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256" \ - ":DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM" context = ssl.SSLContext(ssl.PROTOCOL_TLS) - context.options += ssl.OP_NO_TLSv1 - context.options += ssl.OP_NO_TLSv1_1 + context.options |= ssl.OP_NO_TLSv1 + context.options |= ssl.OP_NO_TLSv1_1 if sys.version_info >= (3, 7): - context.options += ssl.OP_NO_TLSv1_2 - context.options += ssl.OP_NO_RENEGOTIATION + context.minimum_version = ssl.TLSVersion.TLSv1_3 + context.options |= ssl.OP_NO_TLSv1_2 + context.options |= ssl.OP_NO_RENEGOTIATION context.options -= ssl.OP_ALL context.verify_mode = ssl.CERT_REQUIRED context.set_ciphers(ciphers) diff --git a/vega/trainer/callbacks/hccl.py b/vega/trainer/callbacks/hccl.py index 6fccb78..36c3df4 100644 --- a/vega/trainer/callbacks/hccl.py +++ b/vega/trainer/callbacks/hccl.py @@ -16,6 +16,7 @@ 
"""Data parallel callback.""" +import os import logging import vega from vega.common import ClassFactory, ClassType @@ -59,7 +60,7 @@ def _init_ms_trainer(self): from mindspore.communication.management import init logger.info("init HCCL") - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + context.set_auto_parallel_context(device_num=int(os.getenv('RANK_SIZE', '1')), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() def before_epoch(self, epoch, logs=None):