Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
truncate the end if the sequence is too long...
Browse files (browse the repository at this point in the history)
  • Loading branch information
scottyih committed Apr 2, 2021
1 parent 43b6b78 commit 5fe254d
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion elq/main_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,13 @@ def _process_biencoder_dataloader(samples, tokenizer, biencoder_params, logger):
max_seq_len = 0
for sample in samples:
samples_text_tuple
- encoded_sample = [101] + tokenizer.encode(sample['text']) + [102]
+ # truncate the end if the sequence is too long...
+ encoded_sample = [101] + tokenizer.encode(sample['text'])[:biencoder_params["max_context_length"]-2] + [102]
max_seq_len = max(len(encoded_sample), max_seq_len)
samples_text_tuple.append(encoded_sample + [0 for _ in range(biencoder_params["max_context_length"] - len(encoded_sample))])
+
+ # print(samples_text_tuple)
+
tensor_data_tuple = [torch.tensor(samples_text_tuple)]
tensor_data = TensorDataset(*tensor_data_tuple)
sampler = SequentialSampler(tensor_data)
Expand Down

0 comments on commit 5fe254d

Please sign in to comment.