add code for apple silicon #291

Open: wants to merge 4 commits into base: old_gpt_2_chinese_before_2021_4_22
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,3 +9,5 @@ data/
.idea/modules.xml
.idea/vcs.xml
.idea
tensorboard_summary
/checkpoint
8 changes: 7 additions & 1 deletion eval.py
@@ -70,7 +70,13 @@ def main():
n_ctx = model_config.n_ctx
full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
full_tokenizer.max_len = n_ctx
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)  # smoke test: confirm an MPS tensor can actually be allocated
    device = "mps"
print('using device:', device)

raw_data_path = args.raw_data_path
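The same three-way device check now appears in both eval.py and generate.py. Not part of this PR, but the logic could be factored into one helper; a minimal sketch, assuming PyTorch 1.12 or newer for the torch.backends.mps backend:

```python
import torch

def pick_device() -> torch.device:
    """Prefer CUDA, then Apple-silicon MPS, then fall back to CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

device = pick_device()
print("using device:", device)
```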
240 changes: 170 additions & 70 deletions generate.py
@@ -1,14 +1,15 @@
import argparse
import os

import torch
import torch.nn.functional as F
import os
import argparse
from tqdm import trange
from transformers import GPT2LMHeadModel


def is_word(word):
for item in list(word):
if item not in 'qwertyuiopasdfghjklzxcvbnm':
if item not in "qwertyuiopasdfghjklzxcvbnm":
return False
return True

@@ -24,29 +25,33 @@ def _is_chinese_char(char):
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
cp = ord(char)
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True

return False
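A quick illustrative check of the codepoint ranges above (not from the diff):

```python
assert _is_chinese_char("中")      # U+4E2D, inside the CJK Unified Ideographs block
assert not _is_chinese_char("a")  # ASCII letters fall outside every listed range
```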


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (vocabulary size)
top_k > 0: keep only top k tokens with highest probability (top-k filtering).
top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (vocabulary size)
top_k > 0: keep only top k tokens with highest probability (top-k filtering).
top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
assert logits.dim() == 1 # batch size 1 for now - could be updated for more but the code would be less clear
assert (
logits.dim() == 1
) # batch size 1 for now - could be updated for more but the code would be less clear
top_k = min(top_k, logits.size(-1)) # Safety check
if top_k > 0:
# Remove all tokens with a probability less than the last token of the top-k
@@ -68,28 +73,49 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
return logits
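For context, a toy call (not from the diff) shows how the function is meant to be used: filter the logits, renormalize with softmax, then sample.

```python
import torch
import torch.nn.functional as F

logits = torch.tensor([2.0, 1.5, 0.3, -0.5, -1.0])  # toy 5-token vocabulary
filtered = top_k_top_p_filtering(logits.clone(), top_k=3, top_p=0.9)
probs = F.softmax(filtered, dim=-1)  # filtered entries get probability 0
next_token = torch.multinomial(probs, num_samples=1)
```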


def sample_sequence(model, context, length, n_ctx, tokenizer, temperature=1.0, top_k=30, top_p=0.0, repitition_penalty=1.0,
device='cpu'):
def sample_sequence(
model,
context,
length,
n_ctx,
tokenizer,
temperature=1.0,
top_k=30,
top_p=0.0,
repitition_penalty=1.0,
device="cpu",
):
if torch.backends.mps.is_available():
device = "mps"
context = torch.tensor(context, dtype=torch.long, device=device)
context = context.unsqueeze(0)
generated = context
with torch.no_grad():
for _ in trange(length):
inputs = {'input_ids': generated[0][-(n_ctx - 1):].unsqueeze(0)}
inputs = {"input_ids": generated[0][-(n_ctx - 1) :].unsqueeze(0)}
outputs = model(
**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
**inputs
) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
next_token_logits = outputs[0][0, -1, :]
for id in set(generated):
next_token_logits[id] /= repitition_penalty
next_token_logits = next_token_logits / temperature
next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
next_token_logits[tokenizer.convert_tokens_to_ids("[UNK]")] = -float("Inf")
filtered_logits = top_k_top_p_filtering(
next_token_logits, top_k=top_k, top_p=top_p
)
next_token = torch.multinomial(
F.softmax(filtered_logits, dim=-1), num_samples=1
)
generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
return generated.tolist()[0]


def fast_sample_sequence(model, context, length, temperature=1.0, top_k=30, top_p=0.0, device='cpu'):
def fast_sample_sequence(
model, context, length, temperature=1.0, top_k=30, top_p=0.0, device="cpu"
):
if torch.backends.mps.is_available():
device = "mps"
inputs = torch.LongTensor(context).view(1, -1).to(device)
if len(context) > 1:
_, past = model(inputs[:, :-1], None)[:2]
@@ -104,46 +130,107 @@ def fast_sample_sequence(model, context, length, temperature=1.0, top_k=30, top_
output, past = output[:2]
output = output[-1].squeeze(0) / temperature
filtered_logits = top_k_top_p_filtering(output, top_k=top_k, top_p=top_p)
next_token = torch.multinomial(torch.softmax(filtered_logits, dim=-1), num_samples=1)
next_token = torch.multinomial(
torch.softmax(filtered_logits, dim=-1), num_samples=1
)
generate.append(next_token.item())
prev = next_token.view(1, 1)
return generate
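The fast path gets its speed from encoding the prompt once, then reusing the cached key/value states (`past`) and feeding a single new token per forward pass instead of re-running the full sequence. The diff keeps the old tuple-style transformers API; with a recent transformers release the same idea looks roughly like this (a sketch under that assumption, not the PR's code):

```python
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2").eval()  # any GPT-2 checkpoint
input_ids = torch.tensor([[464, 3290]])  # arbitrary prompt token ids

with torch.no_grad():
    out = model(input_ids, use_cache=True)  # encode the whole prompt once
    past = out.past_key_values
    next_id = out.logits[:, -1, :].argmax(-1, keepdim=True)
    for _ in range(5):  # afterwards, one token per forward pass
        out = model(next_id, past_key_values=past, use_cache=True)
        past = out.past_key_values
        next_id = out.logits[:, -1, :].argmax(-1, keepdim=True)
```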


# The --fast_pattern command-line flag selects the generation mode
def generate(n_ctx, model, context, length, tokenizer, temperature=1, top_k=0, top_p=0.0, repitition_penalty=1.0, device='cpu',
is_fast_pattern=False):
def generate(
n_ctx,
model,
context,
length,
tokenizer,
temperature=1,
top_k=0,
top_p=0.0,
repitition_penalty=1.0,
device="cpu",
is_fast_pattern=False,
):
if torch.backends.mps.is_available():
device = "mps"
if is_fast_pattern:
return fast_sample_sequence(model, context, length, temperature=temperature, top_k=top_k, top_p=top_p,
device=device)
return fast_sample_sequence(
model,
context,
length,
temperature=temperature,
top_k=top_k,
top_p=top_p,
device=device,
)
else:
return sample_sequence(model, context, length, n_ctx, tokenizer=tokenizer, temperature=temperature, top_k=top_k, top_p=top_p,
repitition_penalty=repitition_penalty, device=device)
return sample_sequence(
model,
context,
length,
n_ctx,
tokenizer=tokenizer,
temperature=temperature,
top_k=top_k,
top_p=top_p,
repitition_penalty=repitition_penalty,
device=device,
)


def main():
parser = argparse.ArgumentParser()
parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='device(s) for generation')
parser.add_argument('--length', default=-1, type=int, required=False, help='length of generated text')
parser.add_argument('--batch_size', default=1, type=int, required=False, help='batch size for generation')
parser.add_argument('--nsamples', default=10, type=int, required=False, help='number of samples to generate')
parser.add_argument('--temperature', default=1, type=float, required=False, help='sampling temperature')
parser.add_argument('--topk', default=8, type=int, required=False, help='top-k: sample among the k most likely tokens')
parser.add_argument('--topp', default=0, type=float, required=False, help='top-p: cumulative-probability cutoff (nucleus sampling)')
parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                    help='model config file')
parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='opening text of the generated article')
parser.add_argument('--no_wordpiece', action='store_true', help='do not use WordPiece tokenization')
parser.add_argument('--segment', action='store_true', help='treat Chinese at the word level')
parser.add_argument('--fast_pattern', action='store_true', help='use the faster generation method')
parser.add_argument('--save_samples', action='store_true', help='save the generated samples')
parser.add_argument('--save_samples_path', default='.', type=str, required=False, help='path for saving samples')
parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
parser.add_argument(
    "--device", default="0,1,2,3", type=str, required=False, help="device(s) for generation"
)
parser.add_argument(
    "--length", default=-1, type=int, required=False, help="length of generated text"
)
parser.add_argument(
    "--batch_size", default=1, type=int, required=False, help="batch size for generation"
)
parser.add_argument(
    "--nsamples", default=10, type=int, required=False, help="number of samples to generate"
)
parser.add_argument(
    "--temperature", default=1, type=float, required=False, help="sampling temperature"
)
parser.add_argument(
    "--topk", default=8, type=int, required=False, help="top-k: sample among the k most likely tokens"
)
parser.add_argument(
    "--topp",
    default=0,
    type=float,
    required=False,
    help="top-p: cumulative-probability cutoff (nucleus sampling)",
)
parser.add_argument(
    "--model_config",
    default="config/model_config_small.json",
    type=str,
    required=False,
    help="model config file",
)
parser.add_argument(
    "--tokenizer_path",
    default="cache/vocab_small.txt",
    type=str,
    required=False,
    help="vocabulary path",
)
parser.add_argument(
    "--model_path",
    default="model/final_model",
    type=str,
    required=False,
    help="model path",
)
parser.add_argument(
    "--prefix",
    default="萧炎",
    type=str,
    required=False,
    help="opening text of the generated article",
)
parser.add_argument("--no_wordpiece", action="store_true", help="do not use WordPiece tokenization")
parser.add_argument("--segment", action="store_true", help="treat Chinese at the word level")
parser.add_argument("--fast_pattern", action="store_true", help="use the faster generation method")
parser.add_argument("--save_samples", action="store_true", help="save the generated samples")
parser.add_argument(
    "--save_samples_path", default=".", type=str, required=False, help="path for saving samples"
)
parser.add_argument("--repetition_penalty", default=1.0, type=float, required=False)

args = parser.parse_args()
print('args:\n' + args.__repr__())
print("args:\n" + args.__repr__())

if args.segment:
from tokenizations import tokenization_bert_word_level as tokenization_bert
@@ -159,7 +246,13 @@ def main():
topp = args.topp
repetition_penalty = args.repetition_penalty

device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)  # smoke test: confirm an MPS tensor can actually be allocated
    device = "mps"

tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(args.model_path)
@@ -173,7 +266,9 @@ def main():
if args.save_samples:
if not os.path.exists(args.save_samples_path):
os.makedirs(args.save_samples_path)
samples_file = open(args.save_samples_path + '/samples.txt', 'w', encoding='utf8')
samples_file = open(
args.save_samples_path + "/samples.txt", "w", encoding="utf8"
)
while True:
raw_text = args.prefix
context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
@@ -184,32 +279,37 @@ def main():
model=model,
context=context_tokens,
length=length,
is_fast_pattern=args.fast_pattern, tokenizer=tokenizer,
temperature=temperature, top_k=topk, top_p=topp, repitition_penalty=repetition_penalty, device=device
is_fast_pattern=args.fast_pattern,
tokenizer=tokenizer,
temperature=temperature,
top_k=topk,
top_p=topp,
repitition_penalty=repetition_penalty,
device=device,
)
for i in range(batch_size):
generated += 1
text = tokenizer.convert_ids_to_tokens(out)
for i, item in enumerate(text[:-1]):  # ensure English words have spaces around them
if is_word(item) and is_word(text[i + 1]):
text[i] = item + ' '
text[i] = item + " "
for i, item in enumerate(text):
if item == '[MASK]':
text[i] = ''
elif item == '[CLS]':
text[i] = '\n\n'
elif item == '[SEP]':
text[i] = '\n'
if item == "[MASK]":
text[i] = ""
elif item == "[CLS]":
text[i] = "\n\n"
elif item == "[SEP]":
text[i] = "\n"
info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
print(info)
text = ''.join(text).replace('##', '').strip()
text = "".join(text).replace("##", "").strip()
print(text)
if args.save_samples:
samples_file.write(info)
samples_file.write(text)
samples_file.write('\n')
samples_file.write('=' * 90)
samples_file.write('\n' * 2)
samples_file.write("\n")
samples_file.write("=" * 90)
samples_file.write("\n" * 2)
print("=" * 80)
if generated == nsamples:
# close file when finish writing.
@@ -218,5 +318,5 @@ def main():
break


if __name__ == '__main__':
if __name__ == "__main__":
main()