diff --git a/capreolus/extractor/__init__.py b/capreolus/extractor/__init__.py index 220a4af2..a5c3ac2a 100644 --- a/capreolus/extractor/__init__.py +++ b/capreolus/extractor/__init__.py @@ -68,7 +68,7 @@ def _build_vocab(self, qids, docids, topics): def build_from_benchmark(self, *args, **kwargs): raise NotImplementedError - def id2vec(self, qid, posdocid, negdocid=None, label=None): + def id2vec(self, qid, posdocid, negdocid=None, label=None, *args, **kwargs): """ Creates a feature from the (qid, docid) pair. If negdocid is supplied, that's also included in the feature (needed for training with pairwise hinge loss) diff --git a/capreolus/extractor/bagofwords.py b/capreolus/extractor/bagofwords.py index 17cb986a..63931f33 100644 --- a/capreolus/extractor/bagofwords.py +++ b/capreolus/extractor/bagofwords.py @@ -112,7 +112,7 @@ def preprocess(self, qids, docids, topics): self._build_vocab(qids, docids, topics) - def id2vec(self, q_id, posdoc_id, negdoc_id=None, **kwargs): + def id2vec(self, q_id, posdoc_id, negdoc_id=None, *args, **kwargs): query_toks = self.qid2toks[q_id] posdoc_toks = self.docid2toks.get(posdoc_id) diff --git a/capreolus/extractor/bertpassage.py b/capreolus/extractor/bertpassage.py index f584db5c..7255ad1e 100644 --- a/capreolus/extractor/bertpassage.py +++ b/capreolus/extractor/bertpassage.py @@ -11,11 +11,13 @@ from capreolus.utils.exceptions import MissingDocError from capreolus.tokenizer.punkt import PunktTokenizer +from .common import SingleTrainingPassagesMixin + logger = get_logger(__name__) @Extractor.register -class BertPassage(Extractor): +class BertPassage(Extractor, SingleTrainingPassagesMixin): """ Extracts passages from the document to be later consumed by a BERT based model. Does NOT use all the passages. The first passages is always used. 
Use the `prob` config to control the probability @@ -37,6 +39,7 @@ class BertPassage(Extractor): config_spec = [ ConfigOption("maxseqlen", 256, "Maximum input length (query+document)"), ConfigOption("maxqlen", 20, "Maximum query length"), + ConfigOption("padq", False, "Always pad queries to maxqlen"), ConfigOption("usecache", False, "Should the extracted features be cached?"), ConfigOption("passagelen", 150, "Length of the extracted passage"), ConfigOption("stride", 100, "Stride"), @@ -85,60 +88,6 @@ def get_tf_feature_description(self): return feature_description - def create_tf_train_feature(self, sample): - """ - Returns a set of features from a doc. - Of the num_passages passages that are present in a document, we use only a subset of it. - params: - sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen] - - Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen]. - Yes, the output shape is different to the input shape because we sample from the passages. - """ - num_passages = self.config["numpassages"] - - def _bytes_feature(value): - """Returns a bytes_list from a string / byte. 
Our features are multi-dimensional tensors.""" - if isinstance(value, type(tf.constant(0))): # if value ist tensor - value = value.numpy() # get value of tensor - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"] - posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = ( - sample["pos_mask"], - sample["pos_seg"], - sample["neg_mask"], - sample["neg_seg"], - ) - label = sample["label"] - features = [] - - for i in range(num_passages): - # Always use the first passage, then sample from the remaining passages - if i > 0 and self.rng.random() > self.config["prob"]: - continue - - bert_input_line = posdoc[i] - bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line))) - passage = bert_input_line.split(self.sep_tok)[-2] - - # Ignore empty passages as well - if passage.strip() == self.pad_tok: - continue - - feature = { - "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])), - "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])), - "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])), - "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])), - "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])), - "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])), - "label": _bytes_feature(tf.io.serialize_tensor(label[i])), - } - features.append(feature) - - return features - def create_tf_dev_feature(self, sample): """ Unlike the train feature, the dev set uses all passages. 
Both the input and the output are dicts with the shape @@ -171,13 +120,13 @@ def _bytes_feature(value): return [feature] - def parse_tf_train_example(self, example_proto): + def parse_tf_dev_example(self, example_proto): feature_description = self.get_tf_feature_description() parsed_example = tf.io.parse_example(example_proto, feature_description) def parse_tensor_as_int(x): parsed_tensor = tf.io.parse_tensor(x, tf.int64) - parsed_tensor.set_shape([self.config["maxseqlen"]]) + parsed_tensor.set_shape([self.config["numpassages"], self.config["maxseqlen"]]) return parsed_tensor @@ -197,31 +146,31 @@ def parse_label_tensor(x): return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label - def parse_tf_dev_example(self, example_proto): - feature_description = self.get_tf_feature_description() - parsed_example = tf.io.parse_example(example_proto, feature_description) - - def parse_tensor_as_int(x): - parsed_tensor = tf.io.parse_tensor(x, tf.int64) - parsed_tensor.set_shape([self.config["numpassages"], self.config["maxseqlen"]]) - - return parsed_tensor - - def parse_label_tensor(x): - parsed_tensor = tf.io.parse_tensor(x, tf.float32) - parsed_tensor.set_shape([self.config["numpassages"], 2]) + def _filter_inputs(self, bert_inputs, bert_masks, bert_segs, n_valid_psg): + """Preserve only one passage from all available passages.""" + assert n_valid_psg <= len( + bert_inputs + ), f"Passages only have {len(bert_inputs)} entries, but got {n_valid_psg} valid passages." 
+ valid_indexes = list(range(0, n_valid_psg)) + if len(valid_indexes) == 0: + valid_indexes = [0] + random_i = self.rng.choice(valid_indexes) + return list(map(lambda arr: arr[random_i], [bert_inputs, bert_masks, bert_segs])) + + def _encode_inputs(self, query_toks, passages): + """Convert the query and passages into BERT inputs, mask, segments.""" + bert_inputs, bert_masks, bert_segs = [], [], [] + n_valid_psg = 0 + for tokenized_passage in passages: + if tokenized_passage != [self.pad_tok]: # end of the passage + n_valid_psg += 1 - return parsed_tensor - - pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64) - pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64) - pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64) - neg_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64) - neg_mask = tf.map_fn(parse_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64) - neg_seg = tf.map_fn(parse_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64) - label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32) + inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage) + bert_inputs.append(inp) + bert_masks.append(mask) + bert_segs.append(seg) - return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label + return bert_inputs, bert_masks, bert_segs, n_valid_psg def _get_passages(self, docid): doc = self.index.get_doc(docid) @@ -321,60 +270,71 @@ def _prepare_bert_input(self, query_toks, psg_toks): if len(query_toks) > maxqlen: logger.warning(f"Truncating query from {len(query_toks)} to {maxqlen}") query_toks = query_toks[:maxqlen] + else: # if the len(query_toks) <= maxqlen, whether to pad it + if self.config["padq"]: + query_toks = padlist(query_toks, padlen=maxqlen, pad_token=self.pad_tok) psg_toks = psg_toks[: maxseqlen - len(query_toks) - 3] psg_toks = " 
".join(psg_toks).split() # in case that psg_toks is np.array input_line = [self.cls_tok] + query_toks + [self.sep_tok] + psg_toks + [self.sep_tok] padded_input_line = padlist(input_line, padlen=maxseqlen, pad_token=self.pad_tok) inp = self.tokenizer.convert_tokens_to_ids(padded_input_line) - mask = [1] * len(input_line) + [0] * (len(padded_input_line) - len(input_line)) + mask = [1 if tok != self.pad_tok else 0 for tok in input_line] + [0] * (len(padded_input_line) - len(input_line)) seg = [0] * (len(query_toks) + 2) + [1] * (len(padded_input_line) - len(query_toks) - 2) return inp, mask, seg - def id2vec(self, qid, posid, negid=None, label=None): + def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs): """ See parent class for docstring """ + training = kwargs.get("training", True) # default to be training + assert label is not None maxseqlen = self.config["maxseqlen"] numpassages = self.config["numpassages"] query_toks = self.qid2toks[qid] - pos_bert_inputs, pos_bert_masks, pos_bert_segs = [], [], [] # N.B: The passages in self.docid2passages are not bert tokenized pos_passages = self._get_passages(posid) - for tokenized_passage in pos_passages: - inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage) - pos_bert_inputs.append(inp) - pos_bert_masks.append(mask) - pos_bert_segs.append(seg) + pos_bert_inputs, pos_bert_masks, pos_bert_segs, n_valid_psg = self._encode_inputs(query_toks, pos_passages) + if training: + pos_bert_inputs, pos_bert_masks, pos_bert_segs = self._filter_inputs( + pos_bert_inputs, pos_bert_masks, pos_bert_segs, n_valid_psg + ) + else: + assert len(pos_bert_inputs) == numpassages + + pos_bert_inputs, pos_bert_masks, pos_bert_segs = map( + lambda lst: np.array(lst, dtype=np.long), [pos_bert_inputs, pos_bert_masks, pos_bert_segs] + ) # TODO: Rename the posdoc key in the below dict to 'pos_bert_input' data = { "qid": qid, "posdocid": posid, - "pos_bert_input": np.array(pos_bert_inputs, dtype=np.long), - "pos_mask": 
np.array(pos_bert_masks, dtype=np.long), - "pos_seg": np.array(pos_bert_segs, dtype=np.long), + "pos_bert_input": pos_bert_inputs, + "pos_mask": pos_bert_masks, + "pos_seg": pos_bert_segs, "negdocid": "", - "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long), - "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long), - "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long), - "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0), + "neg_bert_input": np.zeros_like(pos_bert_inputs, dtype=np.long), + "neg_mask": np.zeros_like(pos_bert_masks, dtype=np.long), + "neg_seg": np.zeros_like(pos_bert_segs, dtype=np.long), + "label": np.array(label, dtype=np.float32), + # ^^^ not change the shape of the label as it is only needed during training } if not negid: return data - neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], [] neg_passages = self._get_passages(negid) - - for tokenized_passage in neg_passages: - inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage) - neg_bert_inputs.append(inp) - neg_bert_masks.append(mask) - neg_bert_segs.append(seg) + neg_bert_inputs, neg_bert_masks, neg_bert_segs, n_valid_psg = self._encode_inputs(query_toks, neg_passages) + if training: + neg_bert_inputs, neg_bert_masks, neg_bert_segs = self._filter_inputs( + neg_bert_inputs, neg_bert_masks, neg_bert_segs, n_valid_psg + ) + else: + assert len(neg_bert_inputs) == numpassages if not neg_bert_inputs: raise MissingDocError(qid, negid) diff --git a/capreolus/extractor/berttext.py b/capreolus/extractor/berttext.py index a835bf51..feeb7720 100644 --- a/capreolus/extractor/berttext.py +++ b/capreolus/extractor/berttext.py @@ -117,7 +117,7 @@ def preprocess(self, qids, docids, topics): self._build_vocab(qids, docids, topics) - def id2vec(self, qid, posid, negid=None): + def id2vec(self, qid, posid, negid=None, *args, **kwargs): tokenizer = self.tokenizer qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"] diff --git 
a/capreolus/extractor/birch_bertpassage.py b/capreolus/extractor/birch_bertpassage.py new file mode 100644 index 00000000..26d78fda --- /dev/null +++ b/capreolus/extractor/birch_bertpassage.py @@ -0,0 +1,71 @@ +import tensorflow as tf +import numpy as np + +from capreolus import get_logger +from capreolus.utils.exceptions import MissingDocError +from . import Extractor +from .bertpassage import BertPassage +from .common import MultipleTrainingPassagesMixin + +logger = get_logger(__name__) + + +@Extractor.register +class BirchBertPassage(MultipleTrainingPassagesMixin, BertPassage): + module_name = "birchbertpassage" + + config_spec = BertPassage.config_spec + + def id2vec(self, qid, posid, negid=None, label=None, **kwargs): + """ + See parent class for docstring + """ + assert label is not None + maxseqlen = self.config["maxseqlen"] + numpassages = self.config["numpassages"] + + query_toks = self.qid2toks[qid] + pos_bert_inputs, pos_bert_masks, pos_bert_segs = [], [], [] + + # N.B: The passages in self.docid2passages are not bert tokenized + pos_passages = self._get_passages(posid) + for tokenized_passage in pos_passages: + inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage) + pos_bert_inputs.append(inp) + pos_bert_masks.append(mask) + pos_bert_segs.append(seg) + + # TODO: Rename the posdoc key in the below dict to 'pos_bert_input' + data = { + "qid": qid, + "posdocid": posid, + "pos_bert_input": np.array(pos_bert_inputs, dtype=np.long), + "pos_mask": np.array(pos_bert_masks, dtype=np.long), + "pos_seg": np.array(pos_bert_segs, dtype=np.long), + "negdocid": "", + "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long), + "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long), + "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long), + "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0), + } + + if not negid: + return data + + neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], [] + neg_passages = 
self._get_passages(negid) + + for tokenized_passage in neg_passages: + inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage) + neg_bert_inputs.append(inp) + neg_bert_masks.append(mask) + neg_bert_segs.append(seg) + + if not neg_bert_inputs: + raise MissingDocError(qid, negid) + + data["negdocid"] = negid + data["neg_bert_input"] = np.array(neg_bert_inputs, dtype=np.long) + data["neg_mask"] = np.array(neg_bert_masks, dtype=np.long) + data["neg_seg"] = np.array(neg_bert_segs, dtype=np.long) + return data diff --git a/capreolus/extractor/common.py b/capreolus/extractor/common.py index 1e1bf736..c767b8c6 100644 --- a/capreolus/extractor/common.py +++ b/capreolus/extractor/common.py @@ -1,5 +1,6 @@ import numpy as np from pymagnitude import Magnitude, MagnitudeUtils +import tensorflow as tf from capreolus import constants, get_logger @@ -63,3 +64,168 @@ def save_vocab_file(itos, fn): with open(fn, "wt") as outf: for idx, term in sorted(itos.items()): print(term, file=outf) + + +class MultipleTrainingPassagesMixin: + """ + Prepare and parse TF training feature that contain multiple passage per query. + That is, the "pos_bert_input" features prepared by extractor's `id2vec()` function should have 3 dimension + """ + + def create_tf_train_feature(self, sample): + """ + Returns a set of features from a doc. + Of the num_passages passages that are present in a document, we use only a subset of it. + params: + sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen] + Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen]. + Yes, the output shape is different to the input shape because we sample from the passages. + """ + num_passages = self.config["numpassages"] + + def _bytes_feature(value): + """Returns a bytes_list from a string / byte. 
Our features are multi-dimensional tensors.""" + if isinstance(value, type(tf.constant(0))): # if value ist tensor + value = value.numpy() # get value of tensor + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + def transpose_neg_input(neg_inp): + return tf.cast(tf.transpose(neg_inp, perm=[1, 0, 2]), tf.int64) + + posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"] + posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = ( + sample["pos_mask"], + sample["pos_seg"], + sample["neg_mask"], + sample["neg_seg"], + ) + label = sample["label"] + features = [] + + negdoc = transpose_neg_input(negdoc) + negdoc_seg = transpose_neg_input(negdoc_seg) + negdoc_mask = transpose_neg_input(negdoc_mask) + + for i in range(num_passages): + if i > 0 and self.rng.random() > self.config["prob"]: + continue + + bert_input_line = posdoc[i] + bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line))) + passage = bert_input_line.split(self.sep_tok)[-2] + + # Ignore empty passages as well + if passage.strip() == self.pad_tok: + continue + + feature = { + "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])), + "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])), + "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])), + "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])), + "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])), + "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])), + "label": _bytes_feature(tf.io.serialize_tensor(label[i])), + } + features.append(feature) + + return features + + def parse_tf_train_example(self, example_proto): + maxseqlen = self.config["maxseqlen"] + + feature_description = self.get_tf_feature_description() + parsed_example = tf.io.parse_example(example_proto, feature_description) + + def parse_tensor_as_int(x): + parsed_tensor = tf.io.parse_tensor(x, tf.int64) + 
parsed_tensor.set_shape([maxseqlen]) + return parsed_tensor + + def parse_neg_tensor_as_int(x): + parsed_tensor = tf.io.parse_tensor(x, tf.int64) + return parsed_tensor + + def parse_label_tensor(x): + parsed_tensor = tf.io.parse_tensor(x, tf.float32) + return parsed_tensor + + pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64) + pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64) + pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64) + neg_bert_input = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64) + neg_mask = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64) + neg_seg = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64) + label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32) + + return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label + + +class SingleTrainingPassagesMixin: + """ + Prepare and parse TF training feature that contain single passage per query. + That is, the "pos_bert_input" features prepared by extractor's `id2vec()` function should have 2 dimension + """ + + def create_tf_train_feature(self, sample): + """ + Returns a set of features from a doc. + Of the num_passages passages that are present in a document, we use only a subset of it. + params: + sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen] + + Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen]. + Yes, the output shape is different to the input shape because we sample from the passages. + """ + num_passages = self.config["numpassages"] + + def _bytes_feature(value): + """Returns a bytes_list from a string / byte. 
Our features are multi-dimensional tensors.""" + if isinstance(value, type(tf.constant(0))): # if value ist tensor + value = value.numpy() # get value of tensor + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"] + posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = ( + sample["pos_mask"], + sample["pos_seg"], + sample["neg_mask"], + sample["neg_seg"], + ) + label = sample["label"] + feature = { + "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc)), + "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask)), + "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg)), + "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc)), + "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask)), + "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg)), + "label": _bytes_feature(tf.io.serialize_tensor(label)), + } + return [feature] + + def parse_tf_train_example(self, example_proto): + feature_description = self.get_tf_feature_description() + parsed_example = tf.io.parse_example(example_proto, feature_description) + + def parse_tensor_as_int(x): + parsed_tensor = tf.io.parse_tensor(x, tf.int64) + parsed_tensor.set_shape([self.config["maxseqlen"]]) + + return parsed_tensor + + def parse_label_tensor(x): + parsed_tensor = tf.io.parse_tensor(x, tf.float32) + parsed_tensor.set_shape([2]) + + return parsed_tensor + + pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64) + pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64) + pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64) + neg_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64) + neg_mask = tf.map_fn(parse_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64) + neg_seg = tf.map_fn(parse_tensor_as_int, 
parsed_example["neg_seg"], dtype=tf.int64) + label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32) + + return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label diff --git a/capreolus/extractor/deeptileextractor.py b/capreolus/extractor/deeptileextractor.py index be3c5d78..da5eddb5 100644 --- a/capreolus/extractor/deeptileextractor.py +++ b/capreolus/extractor/deeptileextractor.py @@ -249,7 +249,7 @@ def preprocess(self, qids, docids, topics): self._build_vocab(qids, docids, topics) self._build_embedding_matrix() - def id2vec(self, qid, posdocid, negdocid=None, **kwargs): + def id2vec(self, qid, posdocid, negdocid=None, *args, **kwargs): query_toks = padlist(self.qid2toks[qid], self.config["maxqlen"], pad_token=self.pad_tok) posdoc_tilebar = self.create_visualization_matrix(query_toks, self.docid2segments[posdocid], self.embeddings) diff --git a/capreolus/extractor/embedtext.py b/capreolus/extractor/embedtext.py index 42e5e9c8..d7270d35 100644 --- a/capreolus/extractor/embedtext.py +++ b/capreolus/extractor/embedtext.py @@ -125,7 +125,7 @@ def _add_oov_to_vocab(self, tokens): def _tok2vec(self, toks): return [self.stoi[tok] for tok in toks] - def id2vec(self, qid, posid, negid=None, **kwargs): + def id2vec(self, qid, posid, negid=None, *args, **kwargs): query = self.qid2toks[qid] # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values diff --git a/capreolus/extractor/lce_bertpassage.py b/capreolus/extractor/lce_bertpassage.py index 81ed5c7b..c6d49702 100644 --- a/capreolus/extractor/lce_bertpassage.py +++ b/capreolus/extractor/lce_bertpassage.py @@ -5,108 +5,22 @@ from capreolus.utils.exceptions import MissingDocError from . 
import Extractor from .bertpassage import BertPassage +from .common import MultipleTrainingPassagesMixin logger = get_logger(__name__) @Extractor.register -class LCEBertPassage(BertPassage): +class LCEBertPassage(MultipleTrainingPassagesMixin, BertPassage): module_name = "LCEbertpassage" config_spec = BertPassage.config_spec - def create_tf_train_feature(self, sample): - """ - Returns a set of features from a doc. - Of the num_passages passages that are present in a document, we use only a subset of it. - params: - sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen] - Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen]. - Yes, the output shape is different to the input shape because we sample from the passages. - """ - num_passages = self.config["numpassages"] - - def _bytes_feature(value): - """Returns a bytes_list from a string / byte. Our features are multi-dimensional tensors.""" - if isinstance(value, type(tf.constant(0))): # if value ist tensor - value = value.numpy() # get value of tensor - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - def transpose_neg_input(neg_inp): - return tf.cast(tf.transpose(neg_inp, perm=[1, 0, 2]), tf.int64) - - posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"] - posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = ( - sample["pos_mask"], - sample["pos_seg"], - sample["neg_mask"], - sample["neg_seg"], - ) - label = sample["label"] - features = [] - - negdoc = transpose_neg_input(negdoc) - negdoc_seg = transpose_neg_input(negdoc_seg) - negdoc_mask = transpose_neg_input(negdoc_mask) - - for i in range(num_passages): - if i > 0 and self.rng.random() > self.config["prob"]: - continue - - bert_input_line = posdoc[i] - bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line))) - passage = 
bert_input_line.split(self.sep_tok)[-2] - - # Ignore empty passages as well - if passage.strip() == self.pad_tok: - continue - - feature = { - "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])), - "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])), - "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])), - "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])), - "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])), - "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])), - "label": _bytes_feature(tf.io.serialize_tensor(label[i])), - } - features.append(feature) - - return features - - def parse_tf_train_example(self, example_proto): - maxseqlen = self.config["maxseqlen"] - - feature_description = self.get_tf_feature_description() - parsed_example = tf.io.parse_example(example_proto, feature_description) - - def parse_tensor_as_int(x): - parsed_tensor = tf.io.parse_tensor(x, tf.int64) - parsed_tensor.set_shape([maxseqlen]) - return parsed_tensor - - def parse_neg_tensor_as_int(x): - parsed_tensor = tf.io.parse_tensor(x, tf.int64) - return parsed_tensor - - def parse_label_tensor(x): - parsed_tensor = tf.io.parse_tensor(x, tf.float32) - return parsed_tensor - - pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64) - pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64) - pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64) - neg_bert_input = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64) - neg_mask = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64) - neg_seg = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64) - label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32) - - return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label - - def 
id2vec(self, qid, posid, negids=None, label=None): + def id2vec(self, qid, posid, negids=None, label=None, **kwargs): """ See parent class for docstring """ + training = kwargs.get("training", True) # default to be training assert label is not None maxseqlen = self.config["maxseqlen"] numpassages = self.config["numpassages"] @@ -122,6 +36,7 @@ def id2vec(self, qid, posid, negids=None, label=None): pos_bert_masks.append(mask) pos_bert_segs.append(seg) + label = [label] if training else label # TODO: Rename the posdoc key in the below dict to 'pos_bert_input' data = { "qid": qid, @@ -133,7 +48,7 @@ def id2vec(self, qid, posid, negids=None, label=None): "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long), "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long), "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long), - "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0), + "label": np.repeat(np.array(label, dtype=np.float32), numpassages, 0), } if negids is None: diff --git a/capreolus/extractor/pooled_bertpassage.py b/capreolus/extractor/pooled_bertpassage.py index 93a0967b..8f8d8fc5 100644 --- a/capreolus/extractor/pooled_bertpassage.py +++ b/capreolus/extractor/pooled_bertpassage.py @@ -31,6 +31,7 @@ class PooledBertPassage(BertPassage): config_spec = [ ConfigOption("maxseqlen", 256, "Maximum input length (query+document)"), ConfigOption("maxqlen", 20, "Maximum query length"), + ConfigOption("padq", False, "Always pad queries to maxqlen"), ConfigOption("usecache", False, "Should the extracted features be cached?"), ConfigOption("passagelen", 150, "Length of the extracted passage"), ConfigOption("stride", 100, "Stride"), @@ -117,7 +118,7 @@ def parse_label_tensor(x): return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label - def id2vec(self, qid, posid, negid=None, label=None): + def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs): """ See parent class for docstring """ diff 
--git a/capreolus/extractor/slowembedtext.py b/capreolus/extractor/slowembedtext.py index fbc11837..f1f3f001 100644 --- a/capreolus/extractor/slowembedtext.py +++ b/capreolus/extractor/slowembedtext.py @@ -168,7 +168,7 @@ def _tok2vec(self, toks): # return [self.embeddings[self.stoi[tok]] for tok in toks] return [self.stoi[tok] for tok in toks] - def id2vec(self, qid, posid, negid=None, label=None): + def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs): assert label is not None query = self.qid2toks[qid] diff --git a/capreolus/reranker/birch.py b/capreolus/reranker/birch.py index fa4aea9f..9e9b9c47 100644 --- a/capreolus/reranker/birch.py +++ b/capreolus/reranker/birch.py @@ -141,7 +141,7 @@ class Birch(Reranker): Dependency( key="extractor", module="extractor", - name="bertpassage", + name="birchbertpassage", default_config_overrides={"tokenizer": {"pretrained": "bert-large-uncased"}}, ), Dependency(key="trainer", module="trainer", name="pytorch"), diff --git a/capreolus/reranker/common.py b/capreolus/reranker/common.py index ebdb969c..3c9fbb38 100644 --- a/capreolus/reranker/common.py +++ b/capreolus/reranker/common.py @@ -78,9 +78,12 @@ def call(self, y_true, y_pred): class TFCategoricalCrossEntropyLoss(CategoricalCrossentropy): def call(self, ytrue, ypred): + """Shape: (batch_size, 2)""" tf.debugging.assert_equal(tf.shape(ytrue), tf.shape(ypred)) + batch_size = tf.shape(ypred)[0] - return super(TFCategoricalCrossEntropyLoss, self).call(ytrue, ypred) + losses = super(TFCategoricalCrossEntropyLoss, self).call(ytrue, ypred) + return losses / tf.cast(batch_size, losses.dtype) class TFLCELoss(CategoricalCrossentropy): diff --git a/capreolus/reranker/ptBERTMaxP.py b/capreolus/reranker/ptBERTMaxP.py new file mode 100644 index 00000000..7b922e2e --- /dev/null +++ b/capreolus/reranker/ptBERTMaxP.py @@ -0,0 +1,135 @@ +import random + +import torch +from torch import nn +from transformers import AutoModelForSequenceClassification + +from capreolus import 
class ElectraRelevanceHead(nn.Module):
    """BERT-style classification head (dropout + out_proj only -- no dense layer).

    See transformers.ElectraClassificationHead for the head this replaces.

    Args:
        dropout: module applied to the first-token representation.
        out_proj: module projecting that representation to the output logits.
    """

    def __init__(self, dropout, out_proj, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dropout = dropout
        self.out_proj = out_proj

    def forward(self, inputs, **kwargs):
        """Project the first token of every sequence to logits.

        ``nn.Module.__call__`` dispatches to ``forward``; the previous
        implementation only defined ``call`` (the Keras hook name), so invoking
        the head as a module failed.

        Args:
            inputs: tensor indexed as ``inputs[:, 0, :]``, i.e. at least 3-D.
            **kwargs: accepted and ignored.
        """
        first_token = inputs[:, 0, :]  # take <s> token (equiv. to [CLS])
        return self.out_proj(self.dropout(first_token))

    # Kept so that any caller using the old method name keeps working.
    call = forward


class PTBERTMaxP_Class(nn.Module):
    """Scores passages with a sequence-classification model and, at inference
    time, aggregates the passage scores into one score per document."""

    def __init__(self, extractor, config, *args, **kwargs):
        super(PTBERTMaxP_Class, self).__init__(*args, **kwargs)
        self.extractor = extractor

        # ELECTRA checkpoints get their classifier swapped for ElectraRelevanceHead.
        electra_checkpoints = {
            "electra-base-msmarco": "Capreolus/electra-base-msmarco",
            "electra-base": "google/electra-base-discriminator",
        }
        pretrained = config["pretrained"]

        # TODO hidden prob missing below? (only the generic branch passes hidden_dropout_prob)
        if pretrained in electra_checkpoints:
            self.bert = AutoModelForSequenceClassification.from_pretrained(electra_checkpoints[pretrained])
            dropout, fc = self.bert.classifier.dropout, self.bert.classifier.out_proj
            self.bert.classifier = ElectraRelevanceHead(dropout, fc)
        elif pretrained == "bert-base-msmarco":
            self.bert = AutoModelForSequenceClassification.from_pretrained("Capreolus/bert-base-msmarco")
        else:
            self.bert = AutoModelForSequenceClassification.from_pretrained(
                pretrained, hidden_dropout_prob=config["hidden_dropout_prob"]
            )

        self.config = config

    def forward(self, doc_input, doc_mask, doc_seg):
        """
        doc_input: (BS, N_PSG, SEQ_LEN) -> [psg-1, psg-2, ..., [PAD], [PAD]]

        In training mode the inputs are passed to the model unchanged and its
        first output is returned; otherwise ``predict_step`` is used.
        NOTE(review): the training branch does not reshape, so it presumably
        receives one passage per sample -- confirm against the extractor.
        """
        if "roberta" in self.config["pretrained"]:
            doc_seg = torch.zeros_like(doc_mask)  # since roberta does not have segment input

        if self.training:
            return self.bert(doc_input, attention_mask=doc_mask, token_type_ids=doc_seg)[0]

        return self.predict_step(doc_input, doc_mask, doc_seg)

    def predict_step(self, doc_input, doc_mask, doc_seg):
        """
        Scores each passage and aggregates the scores per document according to
        ``config["aggregation"]`` ("max", "first", "sum" or "avg").

        Returns a tensor with one score per row of the batch.

        Raises:
            ValueError: if the aggregation method is unknown.
        """
        batch_size = doc_input.shape[0]
        num_passages = self.extractor.config["numpassages"]
        maxseqlen = self.extractor.config["maxseqlen"]

        # A passage counts for "sum"/"avg" only if it has more than 5 attended
        # tokens in segment 1 (presumably this filters out padding passages).
        passage_position = (doc_mask * doc_seg).sum(dim=-1)  # (B, P)
        passage_mask = (passage_position > 5).long()  # (B, P)

        flat_shape = [batch_size * num_passages, maxseqlen]
        doc_input = doc_input.reshape(flat_shape)
        doc_mask = doc_mask.reshape(flat_shape)
        doc_seg = doc_seg.reshape(flat_shape)

        # Column 1 of the model output is used as the passage score
        # (presumably the "relevant" class -- confirm against the label convention).
        passage_scores = self.bert(doc_input, attention_mask=doc_mask, token_type_ids=doc_seg)[0][:, 1]
        passage_scores = passage_scores.reshape([batch_size, num_passages])

        aggregation = self.config["aggregation"]
        if aggregation == "max":
            passage_scores = passage_scores.max(dim=1)[0]  # (batch size, )
        elif aggregation == "first":
            passage_scores = passage_scores[:, 0]
        elif aggregation == "sum":
            passage_scores = torch.sum(passage_mask * passage_scores, dim=1)
        elif aggregation == "avg":
            # Average per document: the divisor must be each row's own passage
            # count, not the count over the whole batch. Clamp so that a row
            # without any counted passage yields 0 instead of NaN.
            passage_counts = torch.sum(passage_mask, dim=1).clamp(min=1)
            passage_scores = torch.sum(passage_mask * passage_scores, dim=1) / passage_counts
        else:
            raise ValueError("Unknown aggregation method: {}".format(self.config["aggregation"]))

        return passage_scores
@Reranker.register
class PTBERTMaxP(Reranker):
    """
    BERT-MaxP reranker implemented in PyTorch.

    Reference: Zhuyun Dai and Jamie Callan. Deeper Text Understanding for IR
    with Contextual Neural Language Modeling. SIGIR 2019.
    https://arxiv.org/pdf/1905.09217.pdf
    """

    module_name = "ptBERTMaxP"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="bertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base-msmarco, or HuggingFace supported models",
        ),
        ConfigOption("aggregation", "max"),
        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
    ]

    def build_model(self):
        """Create the underlying PTBERTMaxP_Class, store it on ``self.model`` and return it."""
        self.model = PTBERTMaxP_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        """Return ``[pos_scores, neg_scores]``, each flattened with ``view(-1)``."""
        feature_keys = (
            ("pos_bert_input", "pos_mask", "pos_seg"),
            ("neg_bert_input", "neg_mask", "neg_seg"),
        )
        scores = []
        for input_key, mask_key, seg_key in feature_keys:
            output = self.model(d[input_key], d[mask_key], d[seg_key])
            scores.append(output.view(-1))
        return scores

    def test(self, d):
        """Return the flattened scores for the ``pos_*`` features only."""
        pos_output = self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"])
        return pos_output.view(-1)
""" - all_qids = sorted(self.qid_to_reldocs) + all_qids = list(self.qid_to_reldocs) if len(all_qids) == 0: raise RuntimeError("TrainDataset has no valid qids") @@ -121,7 +121,7 @@ def generate_samples(self): try: # Convention for label - [1, 0] indicates that doc belongs to class 1 (i.e relevant # ^ This is used with categorical cross entropy loss - yield self.extractor.id2vec(qid, posdocid, negdocid, label=[1, 0]) + yield self.extractor.id2vec(qid, posdocid, negdocid, label=[1, 0], training=True) except MissingDocError: # at training time we warn but ignore on missing docs logger.warning( @@ -146,19 +146,20 @@ def get_hash(self): return "pair_{0}".format(key) def generate_samples(self): - all_qids = sorted(self.qid_to_reldocs) + all_qids = list(self.qid_to_reldocs) if len(all_qids) == 0: raise RuntimeError("TrainDataset has no valid training pairs") while True: self.rng.shuffle(all_qids) for qid in all_qids: + posdocid = self.rng.choice(self.qid_to_reldocs[qid]) + negdocid = self.rng.choice(self.qid_to_negdocs[qid]) + # Convention for label - [1, 0] indicates that doc belongs to class 1 (i.e relevant # ^ This is used with categorical cross entropy loss - for docid in self.qid_to_reldocs[qid]: - yield self.extractor.id2vec(qid, docid, negid=None, label=[0, 1]) - for docid in self.qid_to_negdocs[qid]: - yield self.extractor.id2vec(qid, docid, negid=None, label=[1, 0]) + yield self.extractor.id2vec(qid, posdocid, negid=None, label=[0, 1], training=True) + yield self.extractor.id2vec(qid, negdocid, negid=None, label=[1, 0], training=True) # REF-TODO returning all docs in a row does not make sense w/ pytorch # (with TF the dataset itself is shuffled, so this is okay) # REF-TODO make sure always negid empty is ok @@ -223,9 +224,9 @@ def generate_samples(self): for docid in docids: try: if docid in self.qid_to_reldocs[qid]: - yield self.extractor.id2vec(qid, docid, label=[0, 1]) + yield self.extractor.id2vec(qid, docid, label=[0, 1], training=False) else: - yield 
self.extractor.id2vec(qid, docid, label=[1, 0]) + yield self.extractor.id2vec(qid, docid, label=[1, 0], training=False) except MissingDocError: # when predictiong we raise an exception on missing docs, as this may invalidate results logger.error("got none features for prediction: qid=%s posid=%s", qid, docid) diff --git a/capreolus/tests/test_extractor.py b/capreolus/tests/test_extractor.py index 7c636307..4eafea30 100644 --- a/capreolus/tests/test_extractor.py +++ b/capreolus/tests/test_extractor.py @@ -612,7 +612,8 @@ def get_doc(*args, **kwargs): def test_bertpassage_id2vec(monkeypatch): benchmark = DummyBenchmark() extractor = BertPassage( - {"numpassages": 5, "passagelen": 5, "maxseqlen": 15, "stride": 3, "index": {"collection": {"name": "dummy"}}}, + # cannot test the numpassages > 1 cases, as the passages were randomly selected + {"numpassages": 1, "passagelen": 5, "maxseqlen": 15, "stride": 3, "index": {"collection": {"name": "dummy"}}}, provide=benchmark, ) @@ -627,7 +628,7 @@ def get_doc(*args, **kwargs): tokenizer = extractor.tokenizer.bert_tokenizer - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][0]) == [ + assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"]) == [ "[CLS]", "sc", "##oo", @@ -644,59 +645,7 @@ def get_doc(*args, **kwargs): "we", "[SEP]", ] - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][1]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "now", - "had", - "here", - "[SEP]", - ] - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][2]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "but", - "one", - "ten", - "[SEP]", - ] - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][3]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "thousand", - "of", - "those", - "[SEP]", - ] - - assert 
tokenizer.convert_ids_to_tokens(data["neg_bert_input"][0]) == [ + assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"]) == [ "[CLS]", "sc", "##oo", @@ -713,63 +662,12 @@ def get_doc(*args, **kwargs): "we", "[SEP]", ] - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][1]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "now", - "had", - "here", - "[SEP]", - ] - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][2]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "but", - "one", - "ten", - "[SEP]", - ] - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][3]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "thousand", - "of", - "those", - "[SEP]", - ] def test_bertpassage_id2vec_with_pad(monkeypatch): benchmark = DummyBenchmark() extractor = BertPassage( - {"numpassages": 5, "passagelen": 5, "maxseqlen": 20, "stride": 3, "index": {"collection": {"name": "dummy"}}}, + {"numpassages": 1, "passagelen": 5, "maxseqlen": 20, "stride": 3, "index": {"collection": {"name": "dummy"}}}, provide=benchmark, ) @@ -784,7 +682,7 @@ def get_doc(*args, **kwargs): tokenizer = extractor.tokenizer.bert_tokenizer - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][0]) == [ + assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"]) == [ "[CLS]", "sc", "##oo", @@ -806,101 +704,15 @@ def get_doc(*args, **kwargs): "[PAD]", "[PAD]", ] - tf.debugging.assert_equal( - data["pos_mask"][0], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["pos_seg"][0], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][1]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - 
"doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "now", - "had", - "here", - "but", - "one", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] - tf.debugging.assert_equal( - data["pos_mask"][1], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["pos_seg"][1], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][2]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "but", - "one", - "ten", - "thousand", - "of", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] tf.debugging.assert_equal( - data["pos_mask"][2], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) + data["pos_mask"], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) ) tf.debugging.assert_equal( - data["pos_seg"][2], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) + data["pos_seg"], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) ) - assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][3]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "thousand", - "of", - "those", - "men", - "in", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] - tf.debugging.assert_equal( - data["pos_mask"][3], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["pos_seg"][3], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][0]) == [ + assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"]) == [ "[CLS]", "sc", "##oo", @@ -923,95 +735,8 @@ def 
get_doc(*args, **kwargs): "[PAD]", ] tf.debugging.assert_equal( - data["neg_mask"][0], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["neg_seg"][0], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][1]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "now", - "had", - "here", - "but", - "one", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] - tf.debugging.assert_equal( - data["neg_mask"][1], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["neg_seg"][1], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][2]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "but", - "one", - "ten", - "thousand", - "of", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] - tf.debugging.assert_equal( - data["neg_mask"][2], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) - ) - tf.debugging.assert_equal( - data["neg_seg"][2], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) - ) - - assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][3]) == [ - "[CLS]", - "sc", - "##oo", - "##by", - "doo", - "##by", - "doo", - "where", - "are", - "you", - "[SEP]", - "thousand", - "of", - "those", - "men", - "in", - "[SEP]", - "[PAD]", - "[PAD]", - "[PAD]", - ] - tf.debugging.assert_equal( - data["neg_mask"][3], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) + data["neg_mask"], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64) ) 
tf.debugging.assert_equal( - data["neg_seg"][3], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) + data["neg_seg"], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64) ) diff --git a/capreolus/trainer/pytorch.py b/capreolus/trainer/pytorch.py index 3e30e89f..a3199cc8 100644 --- a/capreolus/trainer/pytorch.py +++ b/capreolus/trainer/pytorch.py @@ -73,7 +73,7 @@ def build(self): torch.manual_seed(self.config["seed"]) torch.cuda.manual_seed_all(self.config["seed"]) - def single_train_iteration(self, reranker, train_dataloader): + def single_train_iteration(self, reranker, train_dataloader, cur_iter): """Train model for one iteration using instances from train_dataloader. Args: @@ -86,6 +86,7 @@ def single_train_iteration(self, reranker, train_dataloader): """ iter_loss = [] + cur_step = cur_iter * self.n_batch_per_iter batches_since_update = 0 batches_per_step = self.config["gradacc"] @@ -112,9 +113,11 @@ def single_train_iteration(self, reranker, train_dataloader): self.optimizer.zero_grad() if (bi + 1) % self.n_batch_per_iter == 0: - # REF-TODO: save scheduler state along with optimizer - self.lr_scheduler.step() break + # REF-TODO: save scheduler state along with optimizer + # hacky: use step instead the internally calculated epoch to support step-wise lr update + self.lr_scheduler.step(epoch=cur_step) + cur_step += 1 return torch.stack(iter_loss).mean() @@ -210,7 +213,8 @@ def train(self, reranker, train_dataset, train_output_path, dev_data, dev_output # REF-TODO how to handle interactions between fastforward and schedule? 
--> just save its state self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR( - self.optimizer, lambda epoch: self.lr_multiplier(step=epoch * self.n_batch_per_iter) + self.optimizer, + lambda step: self.lr_multiplier(step=step), ) if self.config["softmaxloss"]: @@ -254,7 +258,7 @@ def train(self, reranker, train_dataset, train_output_path, dev_data, dev_output model.train() iter_start_time = time.time() - iter_loss_tensor = self.single_train_iteration(reranker, train_dataloader) + iter_loss_tensor = self.single_train_iteration(reranker, train_dataloader, cur_iter=niter) logger.info("A single iteration takes {}".format(time.time() - iter_start_time)) train_loss.append(iter_loss_tensor.item()) logger.info("iter = %d loss = %f", niter, train_loss[-1]) diff --git a/docs/reproduction/MS_MARCO.md b/docs/reproduction/MS_MARCO.md index 6ba5d09b..7793b470 100644 --- a/docs/reproduction/MS_MARCO.md +++ b/docs/reproduction/MS_MARCO.md @@ -7,11 +7,14 @@ first follow [this](../setup/setup-cc.md) guide to set up the environment on CC Once the environment is set, you can verify the installation with [these instructions](./PARADE.md#testing-installation). ## Running MS MARCO -This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. -1. Make sure you are in the top-level `capreolus` directory; -2. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. +This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. Before replication, make sure you are in the top-level `capreolus` directory; +### TensorFlow version +1. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. 
```bash - python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco.txt + python -m capreolus.run rerank.train with \ + file=docs/reproduction/config_msmarco.txt \ + reranker.name=TFBERTMaxP \ + reranker.trainer.usecache=True ``` This would train the monoBERT for only 3k steps with batch size to be 4, then rerank the *top100* documents per query. The script should take no more than 24 hours to finish, and could be fit into a single `v100l`. @@ -27,6 +30,8 @@ This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. python -m capreolus.run rerank.train with \ file=docs/reproduction/config_msmarco.txt \ + reranker.name=TFBERTMaxP \ + reranker.trainer.bertlr=1e-5 \ threshold=$threshold \ reranker.trainer.niters=$niters \ reranker.trainer.batch=$batch_size \ @@ -38,6 +43,34 @@ This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. After data is prepared, it would take 4~6 hours to train and 6~10 hours to inference with *4 V100s* for BERT-base. This should achieve `MRR@10=0.35+`. +### PyTorch version +Similar with reproduction with TensorFlow: +1. To fine-tune a "mini" version of the MS MARCO: + ```bash + python -m capreolus.run rerank.train with \ + file=docs/reproduction/config_msmarco.txt \ + reranker.name=ptBERTMaxP + ``` + +2. 
To fine-tune a full version on MS MARCO Passage: + ```bash + niters=10 + batch_size=16 + validatefreq=$niters # to ensure the validation is run only at the end of training + decayiters=$niters # either same with $itersize or 0 + threshold=1000 # the top-k documents to rerank + + python -m capreolus.run rerank.train with \ + file=docs/reproduction/config_msmarco.txt \ + reranker.name=ptBERTMaxP \ + threshold=$threshold \ + reranker.trainer.niters=$niters \ + reranker.trainer.batch=$batch_size \ + reranker.trainer.decayiters=$decayiters \ + reranker.trainer.validatefreq=$validatefreq \ + fold=s1 + ``` + ### For CC slurm users: In case you are new to [slurm](https://slurm.schedmd.com/documentation.html), a sample slurm script for the *full version* fine-tuning could be found under `docs/reproduction/sample_slurm_script.sh`. This should work on `cedar` directly via `sbatch sample_slurm_script.sh`. @@ -52,4 +85,7 @@ To adapt it to the `mini` version, simply change the GPU number and request time + Results (with hypperparameter-6) replicated by [@Dahlia-Chehata](https://github.com/Dahlia-Chehata) on 2021-03-29 (commit [`7915aad`](https://github.com/capreolus-ir/capreolus/commit/7915aad75406527a3b88498926cff85259808696)) (Tesla V100 on Compute Canada) + Results (with hypperparameter-7) replicated by [@larryli1999](https://github.com/larryli1999) on 2021-05-16 (commit [`6d1aed2`](https://github.com/capreolus-ir/capreolus/commit/6d1aed29de7828ceb94560a8bf7c87f1af5458b5)) (Tesla V100 on Compute Canada) + Results (MRR@10=0.356) replicated by [@andrewyguo](https://github.com/andrewyguo) on 2021-05-29 (commit [`1ce71d9`](https://github.com/capreolus-ir/capreolus/commit/1ce71d93ab5473b40d4ae02768fd053261b27320)) (Tesla V100 on Compute Canada) -+ Results (MRR@10=0.356) reproduced by [@lingwei-gu](https://github.com/lingwei-gu) on 2022-01-20 (commit [`1bbf0f2`](https://github.com/capreolus-ir/capreolus/commit/1bbf0f295b09774e2fb2a1db7dfddef88adec7be)) (Tesla V100 on Compute 
Canada) ++ Results (MRR@10=0.3513) replicated by [@nimasadri11](https://github.com/nimasadri11) on 2021-09-26 (commit [`2d41e28`](https://github.com/capreolus-ir/capreolus/commit/2d41e28092813c22bfa5e8e2a88ff3ca944a688a)) (Tesla V100 on Compute Canada) ++ Results (MRR@10=0.353 w/ pytorch, MRR@10=0.352 w/ tensorflow) replicated by [@leungjch](https://github.com/leungjch) on 2021-10-19 (commit [`b5e7448`](https://github.com/crystina-z/capreolus/commit/b5e7448ab88aea69ab0df0838424254d0a079e7b)) (Tesla V100 on Compute Canada) ++ Results (MRR@10=0.353 w/ pytorch, MRR@10=0.353 w/ tensorflow) replicated by [@AlexWang000](https://github.com/AlexWang000) on 2021-11-05 (commit [`1c570c3`](https://github.com/crystina-z/capreolus/commit/1c570c34630a984ccff3843d89223effa5d48aba)) (Tesla V100 on Compute Canada) ++ Results (MRR@10=0.356) reproduced by [@lingwei-gu](https://github.com/lingwei-gu) on 2022-01-20 (commit [`1bbf0f2`](https://github.com/capreolus-ir/capreolus/commit/1bbf0f295b09774e2fb2a1db7dfddef88adec7be)) (Tesla V100 on Compute Canada) \ No newline at end of file diff --git a/docs/reproduction/config_maxp_pt-robust04_title.txt b/docs/reproduction/config_maxp_pt-robust04_title.txt new file mode 100644 index 00000000..9ef0cb8f --- /dev/null +++ b/docs/reproduction/config_maxp_pt-robust04_title.txt @@ -0,0 +1,23 @@ +optimize=nDCG@20 +threshold=100 +testthreshold=100 + +reranker.name=ptBERTMaxP +reranker.aggregation=max + +reranker.trainer.niters=36 +reranker.trainer.batch=16 +reranker.trainer.warmupiters=8 +reranker.trainer.decayiters=36 +reranker.trainer.validatefreq=2 +reranker.trainer.lr=0.00002 +reranker.trainer.decay=0.1 +reranker.trainer.decayiters=36 +reranker.trainer.decaytype=linear + +reranker.extractor.usecache=True +reranker.extractor.maxqlen=20 +reranker.extractor.maxseqlen=256 +reranker.extractor.numpassages=16 +reranker.extractor.passagelen=150 +reranker.extractor.stride=75 diff --git a/docs/reproduction/config_msmarco.txt 
b/docs/reproduction/config_msmarco.txt index f85790ae..b2cab3c6 100644 --- a/docs/reproduction/config_msmarco.txt +++ b/docs/reproduction/config_msmarco.txt @@ -5,16 +5,15 @@ testthreshold=1 benchmark.name=msmarcopsg rank.searcher.name=msmarcopsgbm25 -reranker.name=TFBERTMaxP +# reranker.name=TFBERTMaxP reranker.pretrained=bert-base-uncased reranker.extractor.usecache=True reranker.extractor.numpassages=1 -reranker.extractor.maxseqlen=512 +reranker.extractor.maxseqlen=256 reranker.extractor.maxqlen=50 reranker.extractor.tokenizer.pretrained=bert-base-uncased -reranker.trainer.usecache=True reranker.trainer.niters=1 reranker.trainer.batch=4 reranker.trainer.evalbatch=256 @@ -23,6 +22,5 @@ reranker.trainer.warmupiters=1 reranker.trainer.decay=0.1 reranker.trainer.decayiters=1 reranker.trainer.decaytype=linear - -reranker.trainer.loss=pairwise_hinge_loss reranker.trainer.decay=0.1 +reranker.trainer.lr=1e-5 diff --git a/docs/reproduction/config_msmarco_lce.txt b/docs/reproduction/config_msmarco_lce.txt index ed4b8a8c..f4604ee5 100644 --- a/docs/reproduction/config_msmarco_lce.txt +++ b/docs/reproduction/config_msmarco_lce.txt @@ -1,5 +1,5 @@ optimize=MRR@10 -threshold=1000 +threshold=100 testthreshold=1 benchmark.name=msmarcopsg @@ -26,11 +26,11 @@ reranker.trainer.seed=42 reranker.trainer.batch=16 reranker.trainer.evalbatch=256 reranker.trainer.itersize=48000 -reranker.trainer.niters=10 +reranker.trainer.niters=1 reranker.trainer.lr=0.00001 reranker.trainer.bertlr=0.00001 reranker.trainer.decay=0.1 -reranker.trainer.decayiters=10 +reranker.trainer.decayiters=1 reranker.trainer.decaytype=linear reranker.trainer.warmupiters=1 -reranker.trainer.validatefreq=10 +reranker.trainer.validatefreq=1 diff --git a/docs/reproduction/monoELECTRA+LCE.md b/docs/reproduction/monoELECTRA+LCE.md index 006a1511..d3d099ba 100644 --- a/docs/reproduction/monoELECTRA+LCE.md +++ b/docs/reproduction/monoELECTRA+LCE.md @@ -6,11 +6,25 @@ Basically reproduce the results in [this](to-be-added) 
paper. For the set-up and monoBERT w/ hinge loss experiments, please refer to [this](MS_MARCO.md) page ## Running MS MARCO -The config file (config_msmarco_lce.txt)[config_msmarco_lce.txt] could be used out-of-box, with the following command: - +1. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. ```bash python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco_lce.txt ``` +This would train the monoBERT for only 3k steps with batch size to be 16, +then rerank the *top100* documents per query. +The script should take no more than 24 hours to finish. +At the end of execution, it would display a bunch of metrics, where `MRR@10` should be around `0.359`. + +2. Once the above is done, we can fine-tune a full version on MS MARCO Passage using the following scripts: +Once the above is done, we can fine-tune a full version on MS MARCO Passage using the following scripts: +```bash +python -m capreolus.run rerank.train with \ + file=docs/reproduction/config_msmarco_lce.txt \ + threshold=1000 \ + reranker.trainer.niters=10 \ + reranker.trainer.decayiters=10 \ + reranker.trainer.validatefreq=10 +``` The config would achieve `MRR@10` around `0.395~0.4` (maybe <0.01 points fluctuation). It trains monoELECTRA with the hard negative data prepared from the [TCT-ColBERT](https://cs.uwaterloo.ca/~jimmylin/publications/Lin_etal_2021_RepL4NLP.pdf), and uses LCE loss with 3 hard negative per query.
@@ -18,7 +32,13 @@ To experiments with different hard negative example, simply spcify `sampler.nneg For example, the following command would run the same config but with 7 hard negatives per query, which should gives `MRR@10` around `0.405~0.41` ```bash -python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco_lce.txt sampler.nneg=7 +python -m capreolus.run rerank.train with \ + file=docs/reproduction/config_msmarco_lce.txt \ + threshold=1000 \ + reranker.trainer.niters=10 \ + reranker.trainer.decayiters=10 \ + reranker.trainer.validatefreq=10 \ + sampler.nneg=7 ``` ## Replication Logs