Skip to content
This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit

Permalink
[SCRIPT] add script for converting ElectraForPretrain params to backb…
Browse files Browse the repository at this point in the history
…one (#1527)

* add script for converting ElectraForPretrain params to backbone

Signed-off-by: Sheng Zha <[email protected]>

* fix pretraining utils for mxnp usage

Signed-off-by: Sheng Zha <[email protected]>
  • Loading branch information
szha authored Feb 22, 2021
1 parent 0211c65 commit 46c9221
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 55 deletions.
49 changes: 49 additions & 0 deletions scripts/pretraining/convert_electra_pretrain_backbone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Convert pre-trained model parameters from ElectraForPretrain to ElectraModel"""

import os
import argparse
import mxnet as mx

from pretraining_utils import get_electra_pretraining_model


def parse_args(argv=None):
    """Parse the command-line options of the conversion script.

    Parameters
    ----------
    argv
        Optional list of argument strings. When None (the default),
        argparse reads the arguments from ``sys.argv[1:]``.

    Returns
    -------
    args
        Namespace with ``model_name``, ``params_file``, ``out_file``,
        ``generator_units_scale`` and ``generator_layers_scale``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Registered as a plain optional argument: placing it alone inside a
    # required mutually exclusive group forced callers to always pass it,
    # which made the default value unreachable.
    parser.add_argument('--model-name', type=str, default='google_electra_small',
                        help='Name of the pretrained model.')
    parser.add_argument('--params-file', type=str, required=True,
                        help='Path to the pretrained parameter file.')
    parser.add_argument('--out-file', type=str, default=None,
                        help='Output file path.')
    parser.add_argument('--generator_units_scale', type=float, default=None,
                        help='The scale size of the generator units, same as used in pretraining.')
    parser.add_argument('--generator_layers_scale', type=float, default=None,
                        help='The scale size of the generator layer, same as used in pretraining.')

    return parser.parse_args(argv)


def convert_params(model_name, generator_units_scale, generator_layers_scale,
                   params_path, out_path):
    """Save the discriminator backbone of a pretraining checkpoint.

    The full pretraining model is rebuilt on CPU with its parameters loaded
    from ``params_path``; the parameters of its ``disc_backbone`` are then
    written to ``out_path``.

    Parameters
    ----------
    model_name
        Name of the pretrained model, forwarded to
        ``get_electra_pretraining_model``.
    generator_units_scale
        Generator units scale, forwarded unchanged.
    generator_layers_scale
        Generator layers scale, forwarded unchanged.
    params_path
        Path of the pretraining parameter file to load.
    out_path
        Destination path for the backbone parameters.
    """
    cpu_contexts = [mx.cpu()]
    # Only the model is needed; the config and tokenizer are discarded.
    _cfg, _tokenizer, pretrain_model = get_electra_pretraining_model(
        model_name,
        cpu_contexts,
        generator_units_scale=generator_units_scale,
        generator_layers_scale=generator_layers_scale,
        params_path=params_path)
    pretrain_model.disc_backbone.save_parameters(out_path)


if __name__ == '__main__':
args = parse_args()
out_path = args.out_file
if not out_path:
params_file = args.params_file
file_name_sep = os.path.basename(params_file).split(os.path.extsep)
file_name_sep.insert(-1, 'backbone')
out_path = os.path.join(
os.path.dirname(params_file),
os.path.extsep.join(file_name_sep))
convert_params(args.model_name, args.generator_units_scale, args.generator_layers_scale,
args.params_file, out_path)
63 changes: 52 additions & 11 deletions scripts/pretraining/pretraining_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import collections

import numpy as np
from mxnet import npx
from mxnet import np as mxnp, npx
from mxnet.gluon import HybridBlock
from mxnet.gluon.data import ArrayDataset

Expand All @@ -16,6 +16,8 @@
from gluonnlp.data.loading import NumpyDataset, DatasetLoader
from gluonnlp.data.sampler import SplitSampler, FixedBucketSampler
from gluonnlp.op import select_vectors_by_position, update_vectors_by_position
from gluonnlp.initializer import TruncNorm
from gluonnlp.models.electra import ElectraModel, ElectraForPretrain, get_pretrained_electra

PretrainFeature = collections.namedtuple(
'PretrainFeature',
Expand Down Expand Up @@ -519,39 +521,39 @@ def dynamic_masking(self, input_ids, valid_lengths):
np.not_equal(input_ids, ignore_token)
valid_lengths = valid_lengths.astype(np.float32)
valid_candidates = valid_candidates.astype(np.float32)
num_masked_position = np.maximum(
num_masked_position = mxnp.maximum(
1, np.minimum(N, round(valid_lengths * self._mask_prob)))

# Get the masking probability of each position
sample_probs = self._proposal_distribution * valid_candidates
sample_probs /= np.sum(sample_probs, axis=-1, keepdims=True)
sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
sample_probs = npx.stop_gradient(sample_probs)
gumbels = np.random.gumbel(np.zeros_like(sample_probs))
gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
# Following the instruction of the official repo to avoid duplicate positions
# with Top_k Sampling as https://github.com/google-research/electra/issues/41
masked_positions = npx.topk(
np.log(sample_probs) + gumbels, k=N,
mxnp.log(sample_probs) + gumbels, k=N,
axis=-1, ret_typ='indices', dtype=np.int32)

masked_weights = npx.sequence_mask(
np.ones_like(masked_positions),
mxnp.ones_like(masked_positions),
sequence_length=num_masked_position,
use_sequence_length=True, axis=1, value=0)
masked_positions = masked_positions * masked_weights
length_masks = npx.sequence_mask(
np.ones_like(input_ids, dtype=np.float32),
mxnp.ones_like(input_ids, dtype=np.float32),
sequence_length=valid_lengths,
use_sequence_length=True, axis=1, value=0)
unmasked_tokens = select_vectors_by_position(
input_ids, masked_positions) * masked_weights
masked_weights = masked_weights.astype(np.float32)
replaced_positions = (
np.random.uniform(
np.zeros_like(masked_positions),
np.ones_like(masked_positions)) < self._replace_prob) * masked_positions
mxnp.random.uniform(
mxnp.zeros_like(masked_positions),
mxnp.ones_like(masked_positions)) < self._replace_prob) * masked_positions
# dealing with multiple zero values in replaced_positions which causes
# the [CLS] being replaced
filled = np.where(
filled = mxnp.where(
replaced_positions,
self.vocab.mask_id,
self.vocab.cls_id).astype(
Expand All @@ -568,3 +570,42 @@ def dynamic_masking(self, input_ids, valid_lengths):
masked_positions=masked_positions,
masked_weights=masked_weights)
return masked_input


def get_electra_pretraining_model(model_name, ctx_l,
                                  max_seq_length=128,
                                  hidden_dropout_prob=0.1,
                                  attention_dropout_prob=0.1,
                                  generator_units_scale=None,
                                  generator_layers_scale=None,
                                  params_path=None):
    """Build the ELECTRA pretraining model and initialize or load its parameters.

    An ELECTRA pretraining model is built with a generator and a discriminator,
    in which the generator has the same embedding as the discriminator but a
    different backbone.

    Parameters
    ----------
    model_name
        Name passed to ``get_pretrained_electra`` to fetch the base config
        and the tokenizer (the backbone weights are not loaded).
    ctx_l
        Context(s) handed to ``initialize`` / ``load_parameters``.
    max_seq_length
        Value written to ``cfg.MODEL.max_length``.
    hidden_dropout_prob
        Value written to ``cfg.MODEL.hidden_dropout_prob``.
    attention_dropout_prob
        Value written to ``cfg.MODEL.attention_dropout_prob``.
    generator_units_scale
        Overrides ``cfg.MODEL.generator_units_scale`` when truthy.
    generator_layers_scale
        Overrides ``cfg.MODEL.generator_layers_scale`` when truthy.
    params_path
        When truthy, parameters are loaded from this file; otherwise the
        model is freshly initialized.

    Returns
    -------
    cfg
        The frozen configuration used to build the model.
    tokenizer
        The tokenizer returned by ``get_pretrained_electra``.
    model
        The hybridized ``ElectraForPretrain`` instance.
    """
    base_cfg, tokenizer, _, _ = get_pretrained_electra(model_name, load_backbone=False)
    cfg = ElectraModel.get_cfg().clone_merge(base_cfg)

    # The config must be unfrozen while the overrides are applied.
    cfg.defrost()
    cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
    cfg.MODEL.attention_dropout_prob = attention_dropout_prob
    cfg.MODEL.max_length = max_seq_length
    # Keep the original generator size if not designated
    scale_overrides = (('generator_layers_scale', generator_layers_scale),
                       ('generator_units_scale', generator_units_scale))
    for option_name, scale in scale_overrides:
        if scale:
            setattr(cfg.MODEL, option_name, scale)
    cfg.freeze()

    pretrain_model = ElectraForPretrain(cfg,
                                        uniform_generator=False,
                                        tied_generator=False,
                                        tied_embeddings=True,
                                        disallow_correct=False,
                                        weight_initializer=TruncNorm(stdev=0.02))
    if params_path:
        pretrain_model.load_parameters(params_path, ctx=ctx_l)
    else:
        pretrain_model.initialize(ctx=ctx_l)
    pretrain_model.hybridize()
    return cfg, tokenizer, pretrain_model
53 changes: 9 additions & 44 deletions scripts/pretraining/run_electra.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from mxnet.lr_scheduler import PolyScheduler

from sklearn import metrics
from pretraining_utils import ElectraMasker, get_pretrain_data_npz, get_pretrain_data_text
from pretraining_utils import ElectraMasker, get_pretrain_data_npz, \
get_pretrain_data_text, get_electra_pretraining_model
from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, logging_config, naming_convention
from gluonnlp.initializer import TruncNorm
from gluonnlp.models.electra import ElectraModel, ElectraForPretrain, get_pretrained_electra
from gluonnlp.models.electra import ElectraModel, ElectraForPretrain
from gluonnlp.utils.parameter import clip_grad_global_norm
try:
import horovod.mxnet as hvd
Expand Down Expand Up @@ -128,41 +128,6 @@ def parse_args():
return args


def get_pretraining_model(model_name, ctx_l,
                          max_seq_length=128,
                          hidden_dropout_prob=0.1,
                          attention_dropout_prob=0.1,
                          generator_units_scale=None,
                          generator_layers_scale=None):
    """Build and initialize the ELECTRA pretraining model.

    An ELECTRA pretraining model is built with a generator and a discriminator,
    in which the generator has the same embedding as the discriminator but a
    different backbone.

    Returns the frozen config, the tokenizer obtained from
    ``get_pretrained_electra`` and the initialized, hybridized
    ``ElectraForPretrain`` model.
    """
    pretrained = get_pretrained_electra(model_name, load_backbone=False)
    base_cfg, tokenizer, _, _ = pretrained
    cfg = ElectraModel.get_cfg().clone_merge(base_cfg)

    # Overrides can only be written while the config is unfrozen.
    cfg.defrost()
    model_cfg = cfg.MODEL
    model_cfg.hidden_dropout_prob = hidden_dropout_prob
    model_cfg.attention_dropout_prob = attention_dropout_prob
    model_cfg.max_length = max_seq_length
    # Keep the original generator size if not designated
    if generator_layers_scale:
        model_cfg.generator_layers_scale = generator_layers_scale
    if generator_units_scale:
        model_cfg.generator_units_scale = generator_units_scale
    cfg.freeze()

    initializer = TruncNorm(stdev=0.02)
    pretrain_model = ElectraForPretrain(cfg,
                                        uniform_generator=False,
                                        tied_generator=False,
                                        tied_embeddings=True,
                                        disallow_correct=False,
                                        weight_initializer=initializer)
    pretrain_model.initialize(ctx=ctx_l)
    pretrain_model.hybridize()
    return cfg, tokenizer, pretrain_model


ElectraOutput = collections.namedtuple('ElectraOutput',
['mlm_scores',
'rtd_scores',
Expand Down Expand Up @@ -237,12 +202,12 @@ def train(args):
logging.info('Training info: num_buckets: {}, '
'num_workers: {}, rank: {}'.format(
args.num_buckets, num_workers, rank))
cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l,
args.max_seq_length,
args.hidden_dropout_prob,
args.attention_dropout_prob,
args.generator_units_scale,
args.generator_layers_scale)
cfg, tokenizer, model = get_electra_pretraining_model(args.model_name, ctx_l,
args.max_seq_length,
args.hidden_dropout_prob,
args.attention_dropout_prob,
args.generator_units_scale,
args.generator_layers_scale)
data_masker = ElectraMasker(
tokenizer, args.max_seq_length, mask_prob=args.mask_prob,
replace_prob=args.replace_prob)
Expand Down

0 comments on commit 46c9221

Please sign in to comment.