add hot boot arg (PaddlePaddle#980)
LiuChiachi authored Sep 8, 2021
1 parent 1a6729e commit a61d4d3
Showing 2 changed files with 33 additions and 3 deletions.
22 changes: 22 additions & 0 deletions examples/model_compression/minilmv2/README.md
@@ -34,6 +34,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py
--student_model_type tinybert \
--num_relation_heads 48 \
--student_model_name_or_path tinybert-6l-768d-zh \
--init_from_student False \
--teacher_model_type bert \
--teacher_model_name_or_path bert-base-chinese \
--max_seq_length 128 \
@@ -51,6 +52,26 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py
--input_dir ${dataset} \

```

The parameters are described as follows:

- `student_model_type` The type of the student model.
- `num_relation_heads` The number of relation heads after the attention heads are recombined.
- `student_model_name_or_path` The name of the student model (it must match the student model type), or the path to a student model.
- `init_from_student` Whether to initialize this distillation run's student model from the parameters at `student_model_name_or_path`. A bool; defaults to False (see the sketch after this list).
- `teacher_model_type` The type of the teacher model.
- `teacher_model_name_or_path` The name of the teacher model.
- `max_seq_length` The maximum sentence length; longer sequences are truncated.
- `warmup_steps` The number of learning-rate warmup steps.
- `save_steps` The interval, in steps, at which the model is saved.
- `teacher_layer_index` The teacher layer that the student model learns from.
- `student_layer_index` The student layer that learns from the teacher model.
- `output_dir` The directory the model is written to.
- `device` The device to run the program on; defaults to gpu.
- `input_dir` The directory where the pretraining data is stored.
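
As a reference, here is a minimal, self-contained sketch (not part of this commit) of how a `distutils.util.strtobool`-typed flag such as `init_from_student` behaves on the command line:

```python
import argparse
import distutils.util

parser = argparse.ArgumentParser()
parser.add_argument(
    "--init_from_student",
    type=distutils.util.strtobool,
    default=False,
    help="Whether to initialize the student from student_model_name_or_path.")

# strtobool accepts "y/yes/t/true/on/1" and "n/no/f/false/off/0"
# (case-insensitive) and returns 1 or 0, so the parsed value is
# truthy/falsy as expected in an `if` test.
args = parser.parse_args(["--init_from_student", "True"])
assert args.init_from_student == 1
```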



### Evaluation Method

Assume the pretrained model is stored under `${pretrained_models}`. We also provide a [model](https://paddlenlp.bj.bcebos.com/models/general_distill/minilmv2_6l_768d_ch.tar.gz) that we have already pretrained for reference. It has the same architecture as `tinybert-6l-768d-zh`, so it can be loaded directly with `TinyBertForSequenceClassification.from_pretrained()`.
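
For example, a minimal loading sketch, assuming the tarball above has been extracted locally (the extraction path and `num_classes` value below are illustrative assumptions):

```python
from paddlenlp.transformers import TinyBertForSequenceClassification, TinyBertTokenizer

# Hypothetical extraction path for the tarball linked above.
pretrained_models = "./minilmv2_6l_768d_ch"

# Same architecture as tinybert-6l-768d-zh, so the TinyBert classes apply;
# num_classes depends on the downstream CLUE task (2 is illustrative).
model = TinyBertForSequenceClassification.from_pretrained(
    pretrained_models, num_classes=2)
tokenizer = TinyBertTokenizer.from_pretrained("tinybert-6l-768d-zh")
```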
@@ -80,6 +101,7 @@ python -u ./run_clue.py \

```


For the different tasks, we recommend different fine-tuning hyperparameters for `${learning_rate}`, `${num_train_epochs}`, and `${max_seq_len}`; the following configurations can be used as a reference:

| TASK_NAME | AFQMC | TNEWS | IFLYTEK | OCNLI | CMNLI | CLUEWSC2020 | CSL |
14 changes: 11 additions & 3 deletions examples/model_compression/minilmv2/general_distill.py
@@ -17,6 +17,7 @@
import random
import time
from functools import partial
import distutils.util
from concurrent.futures import ThreadPoolExecutor

import numpy as np
@@ -66,6 +67,11 @@ def parse_args():
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])), )
parser.add_argument(
"--init_from_student",
type=distutils.util.strtobool,
default=False,
help="Whether to use the parameters of student model to initialize.")
parser.add_argument(
"--teacher_model_name_or_path",
default=None,
@@ -85,7 +91,6 @@
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)

parser.add_argument(
"--max_seq_length",
default=128,
@@ -248,8 +253,11 @@ def do_train(args):
# For student
model_class, tokenizer_class = MODEL_CLASSES[args.student_model_type]
tokenizer = tokenizer_class.from_pretrained(args.student_model_name_or_path)
tinybert = TinyBertModel(vocab_size=21128, num_hidden_layers=6)
student = model_class(tinybert)
if args.init_from_student:
student = model_class.from_pretrained(args.student_model_name_or_path)
else:
tinybert = TinyBertModel(vocab_size=21128, num_hidden_layers=6)
student = model_class(tinybert)

# For teacher
teacher_model_class, _ = MODEL_CLASSES[args.teacher_model_type]
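Taken together, the new flag enables a "hot boot": the student resumes from existing weights instead of a random initialization. Below is a minimal, self-contained sketch of the branch added above, assuming `model_class` resolves to `TinyBertForPretraining` for `--student_model_type tinybert` (an assumption; the actual mapping lives in `MODEL_CLASSES`):

```python
from paddlenlp.transformers import TinyBertForPretraining, TinyBertModel

def build_student(init_from_student, student_model_name_or_path):
    if init_from_student:
        # Hot boot: initialize the student from previously saved or
        # pretrained weights at student_model_name_or_path.
        return TinyBertForPretraining.from_pretrained(student_model_name_or_path)
    # Cold start: a randomly initialized 6-layer student with a
    # Chinese BERT vocabulary (21128 tokens), as in the diff above.
    backbone = TinyBertModel(vocab_size=21128, num_hidden_layers=6)
    return TinyBertForPretraining(backbone)
```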
