-
Notifications
You must be signed in to change notification settings - Fork 0
/
bert.py
24 lines (20 loc) · 1008 Bytes
/
bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForPreTraining
def getBertModel(multilingual=True):
    """Load a pretrained BERT tokenizer/model pair from HuggingFace.

    Args:
        multilingual: when True (default) load the uncased multilingual
            BERT checkpoint; otherwise load the Danish BERT
            ("Maltehb/danish-bert-botxo").

    Returns:
        A (tokenizer, model) tuple.
    """
    # Both branches use the same checkpoint for tokenizer and model,
    # so pick the name once and load the pair from it.
    checkpoint = "bert-base-multilingual-uncased" if multilingual else "Maltehb/danish-bert-botxo"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertModel.from_pretrained(checkpoint)
    return tokenizer, model
def getContextualizedEmbeddings(text, tokenizer, model, max_length=0):
    """Encode `text` with `tokenizer` and return the model's forward output.

    Args:
        text: input string (or whatever batch form the tokenizer accepts).
        tokenizer: a HuggingFace-style tokenizer called as
            ``tokenizer(text, padding=..., truncation=True, ...)``.
        model: a model invoked as ``model(**encoded_input)``.
        max_length: if > 0, pad/truncate every sequence to exactly this
            length; if <= 0 (default), pad dynamically to the longest
            sequence in the batch.

    Returns:
        The model's forward-pass output (contextualized embeddings).
    """
    # NOTE(review): the original comment claimed start/end tokens were
    # removed here, but no such stripping ever happens — behavior kept as-is.
    if max_length > 0:
        # Fixed-size encoding: pad up to and truncate down to max_length.
        encoded_input = tokenizer(text, padding='max_length', truncation=True,
                                  max_length=max_length, return_tensors='pt')
    else:
        # Dynamic padding to the longest sequence in the batch.
        encoded_input = tokenizer(text, padding=True, truncation=True,
                                  return_tensors='pt')
    return model(**encoded_input)