# huggingfaceDemo.py — Longformer tokenization/embedding demo.
# (GitHub page-scrape residue removed here: notification banner text and the
# rendered line-number gutter, which are not part of the program.)
import numpy as np
from transformers import LongformerModel, LongformerTokenizer, LongformerConfig
# Demo: tokenize a small batch of DNA-like strings with a pretrained
# Longformer and extract their token-level embeddings as a NumPy array.
# Loads the checkpoint from a local 'pre-model/longformer-base-4096' directory.
model_name = 'pre-model/' + 'longformer-base-4096'
config = LongformerConfig.from_pretrained(model_name)
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerModel.from_pretrained(model_name, config=config)

txts = ["ATGCAGTA", "ATGCATGCA", "ACGTACGATGCAAA"]

# Length (in tokens, excluding special tokens) of the longest sequence.
# BUG FIX: renamed from `max`, which shadowed the builtin max().
max_tokens = 0
for txt in txts:
    tokens = tokenizer.tokenize(txt)
    token_value = len(tokens)
    print(tokens)
    print(token_value)
    max_tokens = max(max_tokens, token_value)

# NOTE(review): unused in the live code — it was only referenced by the
# removed commented-out experiment below; kept for a follow-up demo.
txts1 = ["ATGCA", "ACTGACGTA", "ACG"]

# Batch-encode with padding to the longest item in the batch.
# NOTE(review): max_length is passed without truncation=True, which makes
# transformers emit a warning, and max_tokens excludes the special tokens
# that __call__ adds — confirm whether truncation is actually intended here.
encoded_inputs = tokenizer(txts, return_tensors='pt', padding=True,
                           max_length=max_tokens)
print(encoded_inputs)

# Round-trip check: decode each padded id row back to text
# (output includes the special/pad tokens).
for input_id in encoded_inputs['input_ids']:
    seq = tokenizer.decode(input_id)
    print(seq)

# Forward pass; element [0] of the output is the token-level hidden states
# (batch, seq_len, hidden_size per the transformers docs — confirm).
X_enpr_features = model(**encoded_inputs)[0]
print(X_enpr_features.shape)

# Detach from the autograd graph before converting to NumPy.
X_enpr = X_enpr_features.detach().numpy()
print(X_enpr)
print(X_enpr.size)
print(X_enpr.shape)

# Removed: a commented-out dead-code tail that re-encoded txts1 with the
# misspelled keyword `max_len` (the real parameter is `max_length`) and
# saved results via np.savez — delete-commented-out-code cleanup.