diff --git a/examples/csmsc/tts2/local/inference_npu.sh b/examples/csmsc/tts2/local/inference_npu.sh
new file mode 100755
index 00000000000..0746a0cdbfa
--- /dev/null
+++ b/examples/csmsc/tts2/local/inference_npu.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Usage: ./local/inference_npu.sh TRAIN_OUTPUT_PATH
+# Runs inference.py with --device npu; expects BIN_DIR in the environment.
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_e2e_npu.sh b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
new file mode 100755
index 00000000000..1209a532b17
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# Usage: ./local/synthesize_e2e_npu.sh CONFIG_PATH TRAIN_OUTPUT_PATH CKPT_NAME
+# Runs synthesize_e2e.py with --ngpu=0 --nnpu=1; edit stage/stop_stage to pick a vocoder.
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+
+
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready now
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+        # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_npu.sh b/examples/csmsc/tts2/local/synthesize_npu.sh
new file mode 100755
index 00000000000..90fcef83d2c
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_npu.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# Usage: ./local/synthesize_npu.sh CONFIG_PATH TRAIN_OUTPUT_PATH CKPT_NAME
+# Runs synthesize.py with --ngpu=0 --nnpu=1; edit stage/stop_stage to pick a vocoder.
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
diff --git a/examples/csmsc/tts2/local/train_npu.sh b/examples/csmsc/tts2/local/train_npu.sh
new file mode 100755
index 00000000000..46243e1555c
--- /dev/null
+++ b/examples/csmsc/tts2/local/train_npu.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Usage: ./local/train_npu.sh CONFIG_PATH TRAIN_OUTPUT_PATH
+# Runs train.py with --ngpu=0 --nnpu=1; expects BIN_DIR in the environment.
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=0 \
+    --nnpu=1 \
+    --phones-dict=dump/phone_id_map.txt \
+    --tones-dict=dump/tone_id_map.txt \
+    --use-relative-path=True
diff --git a/examples/csmsc/tts2/run_npu.sh b/examples/csmsc/tts2/run_npu.sh
new file mode 100755
index 00000000000..f36c93f74d9
--- /dev/null
+++ b/examples/csmsc/tts2/run_npu.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Usage: ./run_npu.sh [--stage N] [--stop-stage N]
+# Stages: 0 preprocess, 1 train, 2 synthesize, 3 synthesize_e2e, 4 inference.
+
+set -e
+source path.sh
+
+npus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_npu.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
+fi
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 8a5269825a2..21d105adeb2 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu"],
+        choices=["gpu", "cpu", "xpu", "npu"],
         help="Device selected for inference.", )
     parser.add_argument('--cpu_threads', type=int, default=1)
 
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index c90090daa1e..b82d6880224 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -45,15 +45,18 @@ def train_sp(args, config):
     # decides device type and whether to run in parallel
     # setup running environment correctly
     world_size = paddle.distributed.get_world_size()
-    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
-        if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device("xpu")
-    else:
+    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
         paddle.set_device("gpu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
+        paddle.device.set_device("xpu")
+    elif args.nnpu > 0:
+        paddle.device.set_device("npu")
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+    else:
+        paddle.set_device("cpu")
 
     # set the random seed, it is a must for multiprocess training
     seed_everything(config.seed)
@@ -191,9 +194,19 @@ def main():
         "--nxpu",
         type=int,
         default=0,
-        help="if nxpu == 0 and ngpu == 0, use cpu.")
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
 
     parser.add_argument(
         "--use-relative-path",
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 9a07df64de8..d29dd8110b2 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -591,7 +591,12 @@ def get_predictor(
     config = inference.Config(
         str(Path(model_dir) / model_file),
         str(Path(model_dir) / params_file))
-    config.enable_memory_optim()
+    # The raw version strings compare lexicographically ("2.10.0" <= "2.5.2"),
+    # so compare the numeric release parts; "0.0.0" stays excluded as before.
+    release = tuple(
+        int(p) for p in paddle.__version__.split(".")[:3] if p.isdigit())
+    if (0, 0, 0) < release <= (2, 5, 2):
+        config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, device_id)
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index e7cf7850e91..9eb459894e8 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -219,12 +219,21 @@ def parse_args():
     )
     # other
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -245,10 +254,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")
 
     evaluate(args)
 
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index c63a5fbe976..b9073124bd0 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -299,12 +299,21 @@ def parse_args():
         default=None,
         help="dir to save inference models")
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
     )
     parser.add_argument(
         "--text",
@@ -339,10 +348,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")
 
     evaluate(args)
 