diff --git a/capreolus/extractor/__init__.py b/capreolus/extractor/__init__.py
index 220a4af2..a5c3ac2a 100644
--- a/capreolus/extractor/__init__.py
+++ b/capreolus/extractor/__init__.py
@@ -68,7 +68,7 @@ def _build_vocab(self, qids, docids, topics):
     def build_from_benchmark(self, *args, **kwargs):
         raise NotImplementedError
 
-    def id2vec(self, qid, posdocid, negdocid=None, label=None):
+    def id2vec(self, qid, posdocid, negdocid=None, label=None, *args, **kwargs):
         """
         Creates a feature from the (qid, docid) pair.
         If negdocid is supplied, that's also included in the feature (needed for training with pairwise hinge loss)
diff --git a/capreolus/extractor/bagofwords.py b/capreolus/extractor/bagofwords.py
index 17cb986a..63931f33 100644
--- a/capreolus/extractor/bagofwords.py
+++ b/capreolus/extractor/bagofwords.py
@@ -112,7 +112,7 @@ def preprocess(self, qids, docids, topics):
 
         self._build_vocab(qids, docids, topics)
 
-    def id2vec(self, q_id, posdoc_id, negdoc_id=None, **kwargs):
+    def id2vec(self, q_id, posdoc_id, negdoc_id=None, *args, **kwargs):
         query_toks = self.qid2toks[q_id]
         posdoc_toks = self.docid2toks.get(posdoc_id)
 
diff --git a/capreolus/extractor/bertpassage.py b/capreolus/extractor/bertpassage.py
index f584db5c..7255ad1e 100644
--- a/capreolus/extractor/bertpassage.py
+++ b/capreolus/extractor/bertpassage.py
@@ -11,11 +11,13 @@
 from capreolus.utils.exceptions import MissingDocError
 from capreolus.tokenizer.punkt import PunktTokenizer
 
+from .common import SingleTrainingPassagesMixin
+
 logger = get_logger(__name__)
 
 
 @Extractor.register
-class BertPassage(Extractor):
+class BertPassage(Extractor, SingleTrainingPassagesMixin):
     """
     Extracts passages from the document to be later consumed by a BERT based model.
     Does NOT use all the passages. The first passages is always used. Use the `prob` config to control the probability
@@ -37,6 +39,7 @@ class BertPassage(Extractor):
     config_spec = [
         ConfigOption("maxseqlen", 256, "Maximum input length (query+document)"),
         ConfigOption("maxqlen", 20, "Maximum query length"),
+        ConfigOption("padq", False, "Always pad queries to maxqlen"),
         ConfigOption("usecache", False, "Should the extracted features be cached?"),
         ConfigOption("passagelen", 150, "Length of the extracted passage"),
         ConfigOption("stride", 100, "Stride"),
@@ -85,60 +88,6 @@ def get_tf_feature_description(self):
 
         return feature_description
 
-    def create_tf_train_feature(self, sample):
-        """
-        Returns a set of features from a doc.
-        Of the num_passages passages that are present in a document, we use only a subset of it.
-        params:
-        sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen]
-
-        Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen].
-        Yes, the output shape is different to the input shape because we sample from the passages.
-        """
-        num_passages = self.config["numpassages"]
-
-        def _bytes_feature(value):
-            """Returns a bytes_list from a string / byte. Our features are multi-dimensional tensors."""
-            if isinstance(value, type(tf.constant(0))):  # if value ist tensor
-                value = value.numpy()  # get value of tensor
-            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
-
-        posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"]
-        posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = (
-            sample["pos_mask"],
-            sample["pos_seg"],
-            sample["neg_mask"],
-            sample["neg_seg"],
-        )
-        label = sample["label"]
-        features = []
-
-        for i in range(num_passages):
-            # Always use the first passage, then sample from the remaining passages
-            if i > 0 and self.rng.random() > self.config["prob"]:
-                continue
-
-            bert_input_line = posdoc[i]
-            bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line)))
-            passage = bert_input_line.split(self.sep_tok)[-2]
-
-            # Ignore empty passages as well
-            if passage.strip() == self.pad_tok:
-                continue
-
-            feature = {
-                "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])),
-                "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])),
-                "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])),
-                "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])),
-                "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])),
-                "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])),
-                "label": _bytes_feature(tf.io.serialize_tensor(label[i])),
-            }
-            features.append(feature)
-
-        return features
-
     def create_tf_dev_feature(self, sample):
         """
         Unlike the train feature, the dev set uses all passages. Both the input and the output are dicts with the shape
@@ -171,13 +120,13 @@ def _bytes_feature(value):
 
         return [feature]
 
-    def parse_tf_train_example(self, example_proto):
+    def parse_tf_dev_example(self, example_proto):
         feature_description = self.get_tf_feature_description()
         parsed_example = tf.io.parse_example(example_proto, feature_description)
 
         def parse_tensor_as_int(x):
             parsed_tensor = tf.io.parse_tensor(x, tf.int64)
-            parsed_tensor.set_shape([self.config["maxseqlen"]])
+            parsed_tensor.set_shape([self.config["numpassages"], self.config["maxseqlen"]])
 
             return parsed_tensor
 
@@ -197,31 +146,31 @@ def parse_label_tensor(x):
 
         return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
 
-    def parse_tf_dev_example(self, example_proto):
-        feature_description = self.get_tf_feature_description()
-        parsed_example = tf.io.parse_example(example_proto, feature_description)
-
-        def parse_tensor_as_int(x):
-            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
-            parsed_tensor.set_shape([self.config["numpassages"], self.config["maxseqlen"]])
-
-            return parsed_tensor
-
-        def parse_label_tensor(x):
-            parsed_tensor = tf.io.parse_tensor(x, tf.float32)
-            parsed_tensor.set_shape([self.config["numpassages"], 2])
+    def _filter_inputs(self, bert_inputs, bert_masks, bert_segs, n_valid_psg):
+        """Preserve only one passage from all available passages."""
+        assert n_valid_psg <= len(
+            bert_inputs
+        ), f"Passages only have {len(bert_inputs)} entries, but got {n_valid_psg} valid passages."
+        valid_indexes = list(range(0, n_valid_psg))
+        if len(valid_indexes) == 0:
+            valid_indexes = [0]
+        random_i = self.rng.choice(valid_indexes)
+        return list(map(lambda arr: arr[random_i], [bert_inputs, bert_masks, bert_segs]))
+
+    def _encode_inputs(self, query_toks, passages):
+        """Convert the query and passages into BERT inputs, mask, segments."""
+        bert_inputs, bert_masks, bert_segs = [], [], []
+        n_valid_psg = 0
+        for tokenized_passage in passages:
+            if tokenized_passage != [self.pad_tok]:  # count only passages that are not pure padding
+                n_valid_psg += 1
 
-            return parsed_tensor
-
-        pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64)
-        pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64)
-        pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64)
-        neg_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64)
-        neg_mask = tf.map_fn(parse_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64)
-        neg_seg = tf.map_fn(parse_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64)
-        label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32)
+            inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
+            bert_inputs.append(inp)
+            bert_masks.append(mask)
+            bert_segs.append(seg)
 
-        return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
+        return bert_inputs, bert_masks, bert_segs, n_valid_psg
 
     def _get_passages(self, docid):
         doc = self.index.get_doc(docid)
@@ -321,60 +270,71 @@ def _prepare_bert_input(self, query_toks, psg_toks):
         if len(query_toks) > maxqlen:
             logger.warning(f"Truncating query from {len(query_toks)} to {maxqlen}")
             query_toks = query_toks[:maxqlen]
+        else:  # query already fits within maxqlen; pad it to maxqlen only when "padq" is enabled
+            if self.config["padq"]:
+                query_toks = padlist(query_toks, padlen=maxqlen, pad_token=self.pad_tok)
         psg_toks = psg_toks[: maxseqlen - len(query_toks) - 3]
 
         psg_toks = " ".join(psg_toks).split()  # in case that psg_toks is np.array
         input_line = [self.cls_tok] + query_toks + [self.sep_tok] + psg_toks + [self.sep_tok]
         padded_input_line = padlist(input_line, padlen=maxseqlen, pad_token=self.pad_tok)
         inp = self.tokenizer.convert_tokens_to_ids(padded_input_line)
-        mask = [1] * len(input_line) + [0] * (len(padded_input_line) - len(input_line))
+        mask = [1 if tok != self.pad_tok else 0 for tok in input_line] + [0] * (len(padded_input_line) - len(input_line))
         seg = [0] * (len(query_toks) + 2) + [1] * (len(padded_input_line) - len(query_toks) - 2)
         return inp, mask, seg
 
-    def id2vec(self, qid, posid, negid=None, label=None):
+    def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs):
         """
         See parent class for docstring
         """
+        training = kwargs.get("training", True)  # defaults to training mode when "training" is not passed
+
         assert label is not None
         maxseqlen = self.config["maxseqlen"]
         numpassages = self.config["numpassages"]
 
         query_toks = self.qid2toks[qid]
-        pos_bert_inputs, pos_bert_masks, pos_bert_segs = [], [], []
 
         # N.B: The passages in self.docid2passages are not bert tokenized
         pos_passages = self._get_passages(posid)
-        for tokenized_passage in pos_passages:
-            inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
-            pos_bert_inputs.append(inp)
-            pos_bert_masks.append(mask)
-            pos_bert_segs.append(seg)
+        pos_bert_inputs, pos_bert_masks, pos_bert_segs, n_valid_psg = self._encode_inputs(query_toks, pos_passages)
+        if training:
+            pos_bert_inputs, pos_bert_masks, pos_bert_segs = self._filter_inputs(
+                pos_bert_inputs, pos_bert_masks, pos_bert_segs, n_valid_psg
+            )
+        else:
+            assert len(pos_bert_inputs) == numpassages
+
+        pos_bert_inputs, pos_bert_masks, pos_bert_segs = map(
+            lambda lst: np.array(lst, dtype=np.long), [pos_bert_inputs, pos_bert_masks, pos_bert_segs]
+        )
 
         # TODO: Rename the posdoc key in the below dict to 'pos_bert_input'
         data = {
             "qid": qid,
             "posdocid": posid,
-            "pos_bert_input": np.array(pos_bert_inputs, dtype=np.long),
-            "pos_mask": np.array(pos_bert_masks, dtype=np.long),
-            "pos_seg": np.array(pos_bert_segs, dtype=np.long),
+            "pos_bert_input": pos_bert_inputs,
+            "pos_mask": pos_bert_masks,
+            "pos_seg": pos_bert_segs,
             "negdocid": "",
-            "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long),
-            "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long),
-            "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long),
-            "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0),
+            "neg_bert_input": np.zeros_like(pos_bert_inputs, dtype=np.long),
+            "neg_mask": np.zeros_like(pos_bert_masks, dtype=np.long),
+            "neg_seg": np.zeros_like(pos_bert_segs, dtype=np.long),
+            "label": np.array(label, dtype=np.float32),
+            # ^^^ do not change the shape of the label, as it is only needed during training
         }
 
         if not negid:
             return data
 
-        neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], []
         neg_passages = self._get_passages(negid)
-
-        for tokenized_passage in neg_passages:
-            inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
-            neg_bert_inputs.append(inp)
-            neg_bert_masks.append(mask)
-            neg_bert_segs.append(seg)
+        neg_bert_inputs, neg_bert_masks, neg_bert_segs, n_valid_psg = self._encode_inputs(query_toks, neg_passages)
+        if training:
+            neg_bert_inputs, neg_bert_masks, neg_bert_segs = self._filter_inputs(
+                neg_bert_inputs, neg_bert_masks, neg_bert_segs, n_valid_psg
+            )
+        else:
+            assert len(neg_bert_inputs) == numpassages
 
         if not neg_bert_inputs:
             raise MissingDocError(qid, negid)
diff --git a/capreolus/extractor/berttext.py b/capreolus/extractor/berttext.py
index a835bf51..feeb7720 100644
--- a/capreolus/extractor/berttext.py
+++ b/capreolus/extractor/berttext.py
@@ -117,7 +117,7 @@ def preprocess(self, qids, docids, topics):
 
         self._build_vocab(qids, docids, topics)
 
-    def id2vec(self, qid, posid, negid=None):
+    def id2vec(self, qid, posid, negid=None, *args, **kwargs):
         tokenizer = self.tokenizer
         qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"]
 
diff --git a/capreolus/extractor/birch_bertpassage.py b/capreolus/extractor/birch_bertpassage.py
new file mode 100644
index 00000000..26d78fda
--- /dev/null
+++ b/capreolus/extractor/birch_bertpassage.py
@@ -0,0 +1,71 @@
+import tensorflow as tf
+import numpy as np
+
+from capreolus import get_logger
+from capreolus.utils.exceptions import MissingDocError
+from . import Extractor
+from .bertpassage import BertPassage
+from .common import MultipleTrainingPassagesMixin
+
+logger = get_logger(__name__)
+
+
+@Extractor.register
+class BirchBertPassage(MultipleTrainingPassagesMixin, BertPassage):
+    module_name = "birchbertpassage"
+
+    config_spec = BertPassage.config_spec
+
+    def id2vec(self, qid, posid, negid=None, label=None, **kwargs):
+        """
+        See parent class for docstring
+        """
+        assert label is not None
+        maxseqlen = self.config["maxseqlen"]
+        numpassages = self.config["numpassages"]
+
+        query_toks = self.qid2toks[qid]
+        pos_bert_inputs, pos_bert_masks, pos_bert_segs = [], [], []
+
+        # N.B: The passages in self.docid2passages are not bert tokenized
+        pos_passages = self._get_passages(posid)
+        for tokenized_passage in pos_passages:
+            inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
+            pos_bert_inputs.append(inp)
+            pos_bert_masks.append(mask)
+            pos_bert_segs.append(seg)
+
+        # TODO: Rename the posdoc key in the below dict to 'pos_bert_input'
+        data = {
+            "qid": qid,
+            "posdocid": posid,
+            "pos_bert_input": np.array(pos_bert_inputs, dtype=np.long),
+            "pos_mask": np.array(pos_bert_masks, dtype=np.long),
+            "pos_seg": np.array(pos_bert_segs, dtype=np.long),
+            "negdocid": "",
+            "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long),
+            "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long),
+            "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long),
+            "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0),
+        }
+
+        if not negid:
+            return data
+
+        neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], []
+        neg_passages = self._get_passages(negid)
+
+        for tokenized_passage in neg_passages:
+            inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
+            neg_bert_inputs.append(inp)
+            neg_bert_masks.append(mask)
+            neg_bert_segs.append(seg)
+
+        if not neg_bert_inputs:
+            raise MissingDocError(qid, negid)
+
+        data["negdocid"] = negid
+        data["neg_bert_input"] = np.array(neg_bert_inputs, dtype=np.long)
+        data["neg_mask"] = np.array(neg_bert_masks, dtype=np.long)
+        data["neg_seg"] = np.array(neg_bert_segs, dtype=np.long)
+        return data
diff --git a/capreolus/extractor/common.py b/capreolus/extractor/common.py
index 1e1bf736..c767b8c6 100644
--- a/capreolus/extractor/common.py
+++ b/capreolus/extractor/common.py
@@ -1,5 +1,6 @@
 import numpy as np
 from pymagnitude import Magnitude, MagnitudeUtils
+import tensorflow as tf
 
 from capreolus import constants, get_logger
 
@@ -63,3 +64,168 @@ def save_vocab_file(itos, fn):
     with open(fn, "wt") as outf:
         for idx, term in sorted(itos.items()):
             print(term, file=outf)
+
+
+class MultipleTrainingPassagesMixin:
+    """
+    Prepare and parse TF training features that contain multiple passages per query.
+    That is, the "pos_bert_input" feature prepared by the extractor's `id2vec()` function should have 3 dimensions.
+    """
+
+    def create_tf_train_feature(self, sample):
+        """
+        Returns a set of features from a doc.
+        Of the num_passages passages that are present in a document, we use only a subset of it.
+        params:
+        sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen]
+        Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen].
+        Yes, the output shape is different to the input shape because we sample from the passages.
+        """
+        num_passages = self.config["numpassages"]
+
+        def _bytes_feature(value):
+            """Returns a bytes_list from a string / byte. Our features are multi-dimensional tensors."""
+            if isinstance(value, type(tf.constant(0))):  # if value is a tensor
+                value = value.numpy()  # get value of tensor
+            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+        def transpose_neg_input(neg_inp):
+            return tf.cast(tf.transpose(neg_inp, perm=[1, 0, 2]), tf.int64)
+
+        posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"]
+        posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = (
+            sample["pos_mask"],
+            sample["pos_seg"],
+            sample["neg_mask"],
+            sample["neg_seg"],
+        )
+        label = sample["label"]
+        features = []
+
+        negdoc = transpose_neg_input(negdoc)
+        negdoc_seg = transpose_neg_input(negdoc_seg)
+        negdoc_mask = transpose_neg_input(negdoc_mask)
+
+        for i in range(num_passages):
+            if i > 0 and self.rng.random() > self.config["prob"]:
+                continue
+
+            bert_input_line = posdoc[i]
+            bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line)))
+            passage = bert_input_line.split(self.sep_tok)[-2]
+
+            # Ignore empty passages as well
+            if passage.strip() == self.pad_tok:
+                continue
+
+            feature = {
+                "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])),
+                "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])),
+                "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])),
+                "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])),
+                "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])),
+                "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])),
+                "label": _bytes_feature(tf.io.serialize_tensor(label[i])),
+            }
+            features.append(feature)
+
+        return features
+
+    def parse_tf_train_example(self, example_proto):
+        maxseqlen = self.config["maxseqlen"]
+
+        feature_description = self.get_tf_feature_description()
+        parsed_example = tf.io.parse_example(example_proto, feature_description)
+
+        def parse_tensor_as_int(x):
+            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
+            parsed_tensor.set_shape([maxseqlen])
+            return parsed_tensor
+
+        def parse_neg_tensor_as_int(x):
+            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
+            return parsed_tensor
+
+        def parse_label_tensor(x):
+            parsed_tensor = tf.io.parse_tensor(x, tf.float32)
+            return parsed_tensor
+
+        pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64)
+        pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64)
+        pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64)
+        neg_bert_input = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64)
+        neg_mask = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64)
+        neg_seg = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64)
+        label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32)
+
+        return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
+
+
+class SingleTrainingPassagesMixin:
+    """
+    Prepare and parse TF training features that contain a single passage per query.
+    That is, the "pos_bert_input" feature prepared by the extractor's `id2vec()` function should have 2 dimensions.
+    """
+
+    def create_tf_train_feature(self, sample):
+        """
+        Serializes one training sample into a single TF feature dict.
+        The sample is written as-is: no passage sampling or filtering is done in this method.
+        params:
+        sample - A dict with the keys "pos_bert_input", "pos_mask", "pos_seg", "neg_bert_input", "neg_mask", "neg_seg", "label" and "negdocid"
+
+        Returns a list with exactly one feature. Each value in the feature is the serialized tensor stored under the same key in sample.
+        The "negdocid" entry is read from the sample but is not written to the feature.
+        """
+        num_passages = self.config["numpassages"]
+
+        def _bytes_feature(value):
+            """Returns a bytes_list from a string / byte. Our features are multi-dimensional tensors."""
+            if isinstance(value, type(tf.constant(0))):  # if value is a tensor
+                value = value.numpy()  # get value of tensor
+            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+        posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"]
+        posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = (
+            sample["pos_mask"],
+            sample["pos_seg"],
+            sample["neg_mask"],
+            sample["neg_seg"],
+        )
+        label = sample["label"]
+        feature = {
+            "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc)),
+            "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask)),
+            "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg)),
+            "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc)),
+            "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask)),
+            "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg)),
+            "label": _bytes_feature(tf.io.serialize_tensor(label)),
+        }
+        return [feature]
+
+    def parse_tf_train_example(self, example_proto):
+        feature_description = self.get_tf_feature_description()
+        parsed_example = tf.io.parse_example(example_proto, feature_description)
+
+        def parse_tensor_as_int(x):
+            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
+            parsed_tensor.set_shape([self.config["maxseqlen"]])
+
+            return parsed_tensor
+
+        def parse_label_tensor(x):
+            parsed_tensor = tf.io.parse_tensor(x, tf.float32)
+            parsed_tensor.set_shape([2])
+
+            return parsed_tensor
+
+        pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64)
+        pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64)
+        pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64)
+        neg_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64)
+        neg_mask = tf.map_fn(parse_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64)
+        neg_seg = tf.map_fn(parse_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64)
+        label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32)
+
+        return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
diff --git a/capreolus/extractor/deeptileextractor.py b/capreolus/extractor/deeptileextractor.py
index be3c5d78..da5eddb5 100644
--- a/capreolus/extractor/deeptileextractor.py
+++ b/capreolus/extractor/deeptileextractor.py
@@ -249,7 +249,7 @@ def preprocess(self, qids, docids, topics):
         self._build_vocab(qids, docids, topics)
         self._build_embedding_matrix()
 
-    def id2vec(self, qid, posdocid, negdocid=None, **kwargs):
+    def id2vec(self, qid, posdocid, negdocid=None, *args, **kwargs):
         query_toks = padlist(self.qid2toks[qid], self.config["maxqlen"], pad_token=self.pad_tok)
         posdoc_tilebar = self.create_visualization_matrix(query_toks, self.docid2segments[posdocid], self.embeddings)
 
diff --git a/capreolus/extractor/embedtext.py b/capreolus/extractor/embedtext.py
index 42e5e9c8..d7270d35 100644
--- a/capreolus/extractor/embedtext.py
+++ b/capreolus/extractor/embedtext.py
@@ -125,7 +125,7 @@ def _add_oov_to_vocab(self, tokens):
     def _tok2vec(self, toks):
         return [self.stoi[tok] for tok in toks]
 
-    def id2vec(self, qid, posid, negid=None, **kwargs):
+    def id2vec(self, qid, posid, negid=None, *args, **kwargs):
         query = self.qid2toks[qid]
 
         # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values
diff --git a/capreolus/extractor/lce_bertpassage.py b/capreolus/extractor/lce_bertpassage.py
index 81ed5c7b..c6d49702 100644
--- a/capreolus/extractor/lce_bertpassage.py
+++ b/capreolus/extractor/lce_bertpassage.py
@@ -5,108 +5,22 @@
 from capreolus.utils.exceptions import MissingDocError
 from . import Extractor
 from .bertpassage import BertPassage
+from .common import MultipleTrainingPassagesMixin
 
 logger = get_logger(__name__)
 
 
 @Extractor.register
-class LCEBertPassage(BertPassage):
+class LCEBertPassage(MultipleTrainingPassagesMixin, BertPassage):
     module_name = "LCEbertpassage"
 
     config_spec = BertPassage.config_spec
 
-    def create_tf_train_feature(self, sample):
-        """
-        Returns a set of features from a doc.
-        Of the num_passages passages that are present in a document, we use only a subset of it.
-        params:
-        sample - A dict where each entry has the shape [batch_size, num_passages, maxseqlen]
-        Returns a list of features. Each feature is a dict, and each value in the dict has the shape [batch_size, maxseqlen].
-        Yes, the output shape is different to the input shape because we sample from the passages.
-        """
-        num_passages = self.config["numpassages"]
-
-        def _bytes_feature(value):
-            """Returns a bytes_list from a string / byte. Our features are multi-dimensional tensors."""
-            if isinstance(value, type(tf.constant(0))):  # if value ist tensor
-                value = value.numpy()  # get value of tensor
-            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
-
-        def transpose_neg_input(neg_inp):
-            return tf.cast(tf.transpose(neg_inp, perm=[1, 0, 2]), tf.int64)
-
-        posdoc, negdoc, negdoc_id = sample["pos_bert_input"], sample["neg_bert_input"], sample["negdocid"]
-        posdoc_mask, posdoc_seg, negdoc_mask, negdoc_seg = (
-            sample["pos_mask"],
-            sample["pos_seg"],
-            sample["neg_mask"],
-            sample["neg_seg"],
-        )
-        label = sample["label"]
-        features = []
-
-        negdoc = transpose_neg_input(negdoc)
-        negdoc_seg = transpose_neg_input(negdoc_seg)
-        negdoc_mask = transpose_neg_input(negdoc_mask)
-
-        for i in range(num_passages):
-            if i > 0 and self.rng.random() > self.config["prob"]:
-                continue
-
-            bert_input_line = posdoc[i]
-            bert_input_line = " ".join(self.tokenizer.bert_tokenizer.convert_ids_to_tokens(list(bert_input_line)))
-            passage = bert_input_line.split(self.sep_tok)[-2]
-
-            # Ignore empty passages as well
-            if passage.strip() == self.pad_tok:
-                continue
-
-            feature = {
-                "pos_bert_input": _bytes_feature(tf.io.serialize_tensor(posdoc[i])),
-                "pos_mask": _bytes_feature(tf.io.serialize_tensor(posdoc_mask[i])),
-                "pos_seg": _bytes_feature(tf.io.serialize_tensor(posdoc_seg[i])),
-                "neg_bert_input": _bytes_feature(tf.io.serialize_tensor(negdoc[i])),
-                "neg_mask": _bytes_feature(tf.io.serialize_tensor(negdoc_mask[i])),
-                "neg_seg": _bytes_feature(tf.io.serialize_tensor(negdoc_seg[i])),
-                "label": _bytes_feature(tf.io.serialize_tensor(label[i])),
-            }
-            features.append(feature)
-
-        return features
-
-    def parse_tf_train_example(self, example_proto):
-        maxseqlen = self.config["maxseqlen"]
-
-        feature_description = self.get_tf_feature_description()
-        parsed_example = tf.io.parse_example(example_proto, feature_description)
-
-        def parse_tensor_as_int(x):
-            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
-            parsed_tensor.set_shape([maxseqlen])
-            return parsed_tensor
-
-        def parse_neg_tensor_as_int(x):
-            parsed_tensor = tf.io.parse_tensor(x, tf.int64)
-            return parsed_tensor
-
-        def parse_label_tensor(x):
-            parsed_tensor = tf.io.parse_tensor(x, tf.float32)
-            return parsed_tensor
-
-        pos_bert_input = tf.map_fn(parse_tensor_as_int, parsed_example["pos_bert_input"], dtype=tf.int64)
-        pos_mask = tf.map_fn(parse_tensor_as_int, parsed_example["pos_mask"], dtype=tf.int64)
-        pos_seg = tf.map_fn(parse_tensor_as_int, parsed_example["pos_seg"], dtype=tf.int64)
-        neg_bert_input = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_bert_input"], dtype=tf.int64)
-        neg_mask = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_mask"], dtype=tf.int64)
-        neg_seg = tf.map_fn(parse_neg_tensor_as_int, parsed_example["neg_seg"], dtype=tf.int64)
-        label = tf.map_fn(parse_label_tensor, parsed_example["label"], dtype=tf.float32)
-
-        return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
-
-    def id2vec(self, qid, posid, negids=None, label=None):
+    def id2vec(self, qid, posid, negids=None, label=None, **kwargs):
         """
         See parent class for docstring
         """
+        training = kwargs.get("training", True)  # defaults to training mode when "training" is not passed
         assert label is not None
         maxseqlen = self.config["maxseqlen"]
         numpassages = self.config["numpassages"]
@@ -122,6 +36,7 @@ def id2vec(self, qid, posid, negids=None, label=None):
             pos_bert_masks.append(mask)
             pos_bert_segs.append(seg)
 
+        label = [label] if training else label
         # TODO: Rename the posdoc key in the below dict to 'pos_bert_input'
         data = {
             "qid": qid,
@@ -133,7 +48,7 @@ def id2vec(self, qid, posid, negids=None, label=None):
             "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.long),
             "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.long),
             "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.long),
-            "label": np.repeat(np.array([label], dtype=np.float32), numpassages, 0),
+            "label": np.repeat(np.array(label, dtype=np.float32), numpassages, 0),
         }
 
         if negids is None:
diff --git a/capreolus/extractor/pooled_bertpassage.py b/capreolus/extractor/pooled_bertpassage.py
index 93a0967b..8f8d8fc5 100644
--- a/capreolus/extractor/pooled_bertpassage.py
+++ b/capreolus/extractor/pooled_bertpassage.py
@@ -31,6 +31,7 @@ class PooledBertPassage(BertPassage):
     config_spec = [
         ConfigOption("maxseqlen", 256, "Maximum input length (query+document)"),
         ConfigOption("maxqlen", 20, "Maximum query length"),
+        ConfigOption("padq", False, "Always pad queries to maxqlen"),
         ConfigOption("usecache", False, "Should the extracted features be cached?"),
         ConfigOption("passagelen", 150, "Length of the extracted passage"),
         ConfigOption("stride", 100, "Stride"),
@@ -117,7 +118,7 @@ def parse_label_tensor(x):
 
         return (pos_bert_input, pos_mask, pos_seg, neg_bert_input, neg_mask, neg_seg), label
 
-    def id2vec(self, qid, posid, negid=None, label=None):
+    def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs):
         """
         See parent class for docstring
         """
diff --git a/capreolus/extractor/slowembedtext.py b/capreolus/extractor/slowembedtext.py
index fbc11837..f1f3f001 100644
--- a/capreolus/extractor/slowembedtext.py
+++ b/capreolus/extractor/slowembedtext.py
@@ -168,7 +168,7 @@ def _tok2vec(self, toks):
         # return [self.embeddings[self.stoi[tok]] for tok in toks]
         return [self.stoi[tok] for tok in toks]
 
-    def id2vec(self, qid, posid, negid=None, label=None):
+    def id2vec(self, qid, posid, negid=None, label=None, *args, **kwargs):
         assert label is not None
         query = self.qid2toks[qid]
 
diff --git a/capreolus/reranker/birch.py b/capreolus/reranker/birch.py
index fa4aea9f..9e9b9c47 100644
--- a/capreolus/reranker/birch.py
+++ b/capreolus/reranker/birch.py
@@ -141,7 +141,7 @@ class Birch(Reranker):
         Dependency(
             key="extractor",
             module="extractor",
-            name="bertpassage",
+            name="birchbertpassage",
             default_config_overrides={"tokenizer": {"pretrained": "bert-large-uncased"}},
         ),
         Dependency(key="trainer", module="trainer", name="pytorch"),
diff --git a/capreolus/reranker/common.py b/capreolus/reranker/common.py
index ebdb969c..3c9fbb38 100644
--- a/capreolus/reranker/common.py
+++ b/capreolus/reranker/common.py
@@ -78,9 +78,12 @@ def call(self, y_true, y_pred):
 
 class TFCategoricalCrossEntropyLoss(CategoricalCrossentropy):
     def call(self, ytrue, ypred):
+        """Shape: (batch_size, 2)"""
         tf.debugging.assert_equal(tf.shape(ytrue), tf.shape(ypred))
+        batch_size = tf.shape(ypred)[0]
 
-        return super(TFCategoricalCrossEntropyLoss, self).call(ytrue, ypred)
+        losses = super(TFCategoricalCrossEntropyLoss, self).call(ytrue, ypred)
+        return losses / tf.cast(batch_size, losses.dtype)
 
 
 class TFLCELoss(CategoricalCrossentropy):
diff --git a/capreolus/reranker/ptBERTMaxP.py b/capreolus/reranker/ptBERTMaxP.py
new file mode 100644
index 00000000..7b922e2e
--- /dev/null
+++ b/capreolus/reranker/ptBERTMaxP.py
@@ -0,0 +1,135 @@
+import random
+
+import torch
+from torch import nn
+from transformers import AutoModelForSequenceClassification
+
+from capreolus import ConfigOption, Dependency
+from capreolus.reranker import Reranker
+from capreolus.utils.loginit import get_logger
+
+logger = get_logger(__name__)
+
+
+class ElectraRelevanceHead(nn.Module):
+    """BERT-style ClassificationHead (dropout + out_proj only -- no dense) applied to the first token. See transformers.ElectraClassificationHead"""
+
+    def __init__(self, dropout, out_proj, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dropout = dropout  # dropout module reused from the original classifier head
+        self.out_proj = out_proj  # output projection reused from the original classifier head
+
+    def forward(self, inputs, **kwargs):  # must be `forward` (not Keras-style `call`) so nn.Module.__call__ dispatches here
+        x = inputs[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+class PTBERTMaxP_Class(nn.Module):  # wraps a HuggingFace sequence classifier; in eval mode passage scores are pooled per document
+    def __init__(self, extractor, config, *args, **kwargs):
+        super(PTBERTMaxP_Class, self).__init__(*args, **kwargs)
+        self.extractor = extractor
+
+        # TODO hidden prob missing below?
+        if config["pretrained"] == "electra-base-msmarco":
+            self.bert = AutoModelForSequenceClassification.from_pretrained("Capreolus/electra-base-msmarco")
+            dropout, fc = self.bert.classifier.dropout, self.bert.classifier.out_proj
+            self.bert.classifier = ElectraRelevanceHead(dropout, fc)
+        elif config["pretrained"] == "electra-base":
+            self.bert = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator")
+            dropout, fc = self.bert.classifier.dropout, self.bert.classifier.out_proj
+            self.bert.classifier = ElectraRelevanceHead(dropout, fc)
+        elif config["pretrained"] == "bert-base-msmarco":
+            self.bert = AutoModelForSequenceClassification.from_pretrained("Capreolus/bert-base-msmarco")
+        else:
+            self.bert = AutoModelForSequenceClassification.from_pretrained(
+                config["pretrained"], hidden_dropout_prob=config["hidden_dropout_prob"]
+            )
+
+        self.config = config
+
+    def forward(self, doc_input, doc_mask, doc_seg):
+        """
+        doc_input: (BS, N_PSG, SEQ_LEN) -> [psg-1, psg-2, ..., [PAD], [PAD]]
+        """
+        # Training mode returns the classifier output for the input as given; eval mode pools passage scores (predict_step).
+        if "roberta" in self.config["pretrained"]:
+            doc_seg = torch.zeros_like(doc_mask)  # since roberta does not have segment input
+
+        if self.training:
+            passage_scores = self.bert(doc_input, attention_mask=doc_mask, token_type_ids=doc_seg)[0]
+        else:
+            passage_scores = self.predict_step(doc_input, doc_mask, doc_seg)
+
+        return passage_scores
+
+    def predict_step(self, doc_input, doc_mask, doc_seg):
+        """
+        Scores each passage and pools the passage scores of every document according to config["aggregation"].
+        """
+        batch_size = doc_input.shape[0]
+        num_passages = self.extractor.config["numpassages"]
+        maxseqlen = self.extractor.config["maxseqlen"]
+
+        passage_position = (doc_mask * doc_seg).sum(dim=-1)  # (B, P)
+        passage_mask = (passage_position > 5).long()  # (B, P); presumably flags non-padding passages -- TODO confirm threshold
+
+        doc_input = doc_input.reshape([batch_size * num_passages, maxseqlen])
+        doc_mask = doc_mask.reshape([batch_size * num_passages, maxseqlen])
+        doc_seg = doc_seg.reshape([batch_size * num_passages, maxseqlen])
+
+        passage_scores = self.bert(doc_input, attention_mask=doc_mask, token_type_ids=doc_seg)[0][:, 1]
+        passage_scores = passage_scores.reshape([batch_size, num_passages])
+
+        if self.config["aggregation"] == "max":
+            passage_scores = passage_scores.max(dim=1)[0]  # (batch size, )
+        elif self.config["aggregation"] == "first":
+            passage_scores = passage_scores[:, 0]
+        elif self.config["aggregation"] == "sum":
+            passage_scores = torch.sum(passage_mask * passage_scores, dim=1)
+        elif self.config["aggregation"] == "avg":  # per-document mean; clamp avoids 0/0 when a row has no valid passage
+            passage_scores = torch.sum(passage_mask * passage_scores, dim=1) / passage_mask.sum(dim=1).clamp(min=1)
+        else:
+            raise ValueError("Unknown aggregation method: {}".format(self.config["aggregation"]))
+
+        return passage_scores
+
+
+@Reranker.register
+class PTBERTMaxP(Reranker):
+    """
+    PyTorch implementation of BERT-MaxP.
+
+    Deeper Text Understanding for IR with Contextual Neural Language Modeling. Zhuyun Dai and Jamie Callan. SIGIR 2019.
+    https://arxiv.org/pdf/1905.09217.pdf
+    """
+
+    module_name = "ptBERTMaxP"
+
+    dependencies = [
+        Dependency(key="extractor", module="extractor", name="bertpassage"),
+        Dependency(key="trainer", module="trainer", name="pytorch"),
+    ]
+    config_spec = [
+        ConfigOption(
+            "pretrained",
+            "bert-base-uncased",
+            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base-msmarco, or HuggingFace supported models",
+        ),
+        ConfigOption("aggregation", "max"),  # handled values: "max", "first", "sum", "avg" (see PTBERTMaxP_Class.predict_step)
+        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
+    ]
+
+    def build_model(self):  # builds the PTBERTMaxP_Class network from this module's extractor and config, and returns it
+        self.model = PTBERTMaxP_Class(self.extractor, self.config)
+        return self.model
+
+    def score(self, d):  # returns [pos output, neg output] for the batch dict d, each flattened with view(-1)
+        return [
+            self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1),
+            self.model(d["neg_bert_input"], d["neg_mask"], d["neg_seg"]).view(-1),
+        ]
+
+    def test(self, d):  # runs the model on the "pos_*" inputs of d only; output flattened with view(-1)
+        return self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1)
diff --git a/capreolus/sampler/__init__.py b/capreolus/sampler/__init__.py
index 2c48dfab..5df771f2 100644
--- a/capreolus/sampler/__init__.py
+++ b/capreolus/sampler/__init__.py
@@ -107,7 +107,7 @@ def generate_samples(self):
         """
         Generates triplets infinitely.
         """
-        all_qids = sorted(self.qid_to_reldocs)
+        all_qids = list(self.qid_to_reldocs)
         if len(all_qids) == 0:
             raise RuntimeError("TrainDataset has no valid qids")
 
@@ -121,7 +121,7 @@ def generate_samples(self):
                 try:
                     # Convention for label - [1, 0] indicates that doc belongs to class 1 (i.e relevant
                     # ^ This is used with categorical cross entropy loss
-                    yield self.extractor.id2vec(qid, posdocid, negdocid, label=[1, 0])
+                    yield self.extractor.id2vec(qid, posdocid, negdocid, label=[1, 0], training=True)
                 except MissingDocError:
                     # at training time we warn but ignore on missing docs
                     logger.warning(
@@ -146,19 +146,20 @@ def get_hash(self):
         return "pair_{0}".format(key)
 
     def generate_samples(self):
-        all_qids = sorted(self.qid_to_reldocs)
+        all_qids = list(self.qid_to_reldocs)
         if len(all_qids) == 0:
             raise RuntimeError("TrainDataset has no valid training pairs")
 
         while True:
             self.rng.shuffle(all_qids)
             for qid in all_qids:
+                posdocid = self.rng.choice(self.qid_to_reldocs[qid])
+                negdocid = self.rng.choice(self.qid_to_negdocs[qid])
+
                 # Convention for label - [1, 0] indicates that doc belongs to class 1 (i.e relevant
                 # ^ This is used with categorical cross entropy loss
-                for docid in self.qid_to_reldocs[qid]:
-                    yield self.extractor.id2vec(qid, docid, negid=None, label=[0, 1])
-                for docid in self.qid_to_negdocs[qid]:
-                    yield self.extractor.id2vec(qid, docid, negid=None, label=[1, 0])
+                yield self.extractor.id2vec(qid, posdocid, negid=None, label=[0, 1], training=True)
+                yield self.extractor.id2vec(qid, negdocid, negid=None, label=[1, 0], training=True)
                 # REF-TODO returning all docs in a row does not make sense w/ pytorch
                 #          (with TF the dataset itself is shuffled, so this is okay)
                 # REF-TODO make sure always negid empty is ok
@@ -223,9 +224,9 @@ def generate_samples(self):
             for docid in docids:
                 try:
                     if docid in self.qid_to_reldocs[qid]:
-                        yield self.extractor.id2vec(qid, docid, label=[0, 1])
+                        yield self.extractor.id2vec(qid, docid, label=[0, 1], training=False)
                     else:
-                        yield self.extractor.id2vec(qid, docid, label=[1, 0])
+                        yield self.extractor.id2vec(qid, docid, label=[1, 0], training=False)
                 except MissingDocError:
                     # when predictiong we raise an exception on missing docs, as this may invalidate results
                     logger.error("got none features for prediction: qid=%s posid=%s", qid, docid)
diff --git a/capreolus/tests/test_extractor.py b/capreolus/tests/test_extractor.py
index 7c636307..4eafea30 100644
--- a/capreolus/tests/test_extractor.py
+++ b/capreolus/tests/test_extractor.py
@@ -612,7 +612,8 @@ def get_doc(*args, **kwargs):
 def test_bertpassage_id2vec(monkeypatch):
     benchmark = DummyBenchmark()
     extractor = BertPassage(
-        {"numpassages": 5, "passagelen": 5, "maxseqlen": 15, "stride": 3, "index": {"collection": {"name": "dummy"}}},
+        # cannot test the numpassages > 1 cases, as the passages were randomly selected
+        {"numpassages": 1, "passagelen": 5, "maxseqlen": 15, "stride": 3, "index": {"collection": {"name": "dummy"}}},
         provide=benchmark,
     )
 
@@ -627,7 +628,7 @@ def get_doc(*args, **kwargs):
 
     tokenizer = extractor.tokenizer.bert_tokenizer
 
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][0]) == [
+    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"]) == [
         "[CLS]",
         "sc",
         "##oo",
@@ -644,59 +645,7 @@ def get_doc(*args, **kwargs):
         "we",
         "[SEP]",
     ]
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][1]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "now",
-        "had",
-        "here",
-        "[SEP]",
-    ]
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][2]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "but",
-        "one",
-        "ten",
-        "[SEP]",
-    ]
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][3]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "thousand",
-        "of",
-        "those",
-        "[SEP]",
-    ]
-
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][0]) == [
+    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"]) == [
         "[CLS]",
         "sc",
         "##oo",
@@ -713,63 +662,12 @@ def get_doc(*args, **kwargs):
         "we",
         "[SEP]",
     ]
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][1]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "now",
-        "had",
-        "here",
-        "[SEP]",
-    ]
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][2]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "but",
-        "one",
-        "ten",
-        "[SEP]",
-    ]
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][3]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "thousand",
-        "of",
-        "those",
-        "[SEP]",
-    ]
 
 
 def test_bertpassage_id2vec_with_pad(monkeypatch):
     benchmark = DummyBenchmark()
     extractor = BertPassage(
-        {"numpassages": 5, "passagelen": 5, "maxseqlen": 20, "stride": 3, "index": {"collection": {"name": "dummy"}}},
+        {"numpassages": 1, "passagelen": 5, "maxseqlen": 20, "stride": 3, "index": {"collection": {"name": "dummy"}}},
         provide=benchmark,
     )
 
@@ -784,7 +682,7 @@ def get_doc(*args, **kwargs):
 
     tokenizer = extractor.tokenizer.bert_tokenizer
 
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][0]) == [
+    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"]) == [
         "[CLS]",
         "sc",
         "##oo",
@@ -806,101 +704,15 @@ def get_doc(*args, **kwargs):
         "[PAD]",
         "[PAD]",
     ]
-    tf.debugging.assert_equal(
-        data["pos_mask"][0], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["pos_seg"][0], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
-
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][1]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "now",
-        "had",
-        "here",
-        "but",
-        "one",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
-    tf.debugging.assert_equal(
-        data["pos_mask"][1], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["pos_seg"][1], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
 
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][2]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "but",
-        "one",
-        "ten",
-        "thousand",
-        "of",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
     tf.debugging.assert_equal(
-        data["pos_mask"][2], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
+        data["pos_mask"], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
     )
     tf.debugging.assert_equal(
-        data["pos_seg"][2], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
+        data["pos_seg"], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
     )
 
-    assert tokenizer.convert_ids_to_tokens(data["pos_bert_input"][3]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "thousand",
-        "of",
-        "those",
-        "men",
-        "in",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
-    tf.debugging.assert_equal(
-        data["pos_mask"][3], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["pos_seg"][3], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
-
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][0]) == [
+    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"]) == [
         "[CLS]",
         "sc",
         "##oo",
@@ -923,95 +735,8 @@ def get_doc(*args, **kwargs):
         "[PAD]",
     ]
     tf.debugging.assert_equal(
-        data["neg_mask"][0], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["neg_seg"][0], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
-
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][1]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "now",
-        "had",
-        "here",
-        "but",
-        "one",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
-    tf.debugging.assert_equal(
-        data["neg_mask"][1], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["neg_seg"][1], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
-
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][2]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "but",
-        "one",
-        "ten",
-        "thousand",
-        "of",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
-    tf.debugging.assert_equal(
-        data["neg_mask"][2], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
-    )
-    tf.debugging.assert_equal(
-        data["neg_seg"][2], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
-    )
-
-    assert tokenizer.convert_ids_to_tokens(data["neg_bert_input"][3]) == [
-        "[CLS]",
-        "sc",
-        "##oo",
-        "##by",
-        "doo",
-        "##by",
-        "doo",
-        "where",
-        "are",
-        "you",
-        "[SEP]",
-        "thousand",
-        "of",
-        "those",
-        "men",
-        "in",
-        "[SEP]",
-        "[PAD]",
-        "[PAD]",
-        "[PAD]",
-    ]
-    tf.debugging.assert_equal(
-        data["neg_mask"][3], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
+        data["neg_mask"], tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=tf.int64)
     )
     tf.debugging.assert_equal(
-        data["neg_seg"][3], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
+        data["neg_seg"], tf.constant([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.int64)
     )
diff --git a/capreolus/trainer/pytorch.py b/capreolus/trainer/pytorch.py
index 3e30e89f..a3199cc8 100644
--- a/capreolus/trainer/pytorch.py
+++ b/capreolus/trainer/pytorch.py
@@ -73,7 +73,7 @@ def build(self):
         torch.manual_seed(self.config["seed"])
         torch.cuda.manual_seed_all(self.config["seed"])
 
-    def single_train_iteration(self, reranker, train_dataloader):
+    def single_train_iteration(self, reranker, train_dataloader, cur_iter):
         """Train model for one iteration using instances from train_dataloader.
 
         Args:
@@ -86,6 +86,7 @@ def single_train_iteration(self, reranker, train_dataloader):
         """
 
         iter_loss = []
+        cur_step = cur_iter * self.n_batch_per_iter
         batches_since_update = 0
         batches_per_step = self.config["gradacc"]
 
@@ -112,9 +113,11 @@ def single_train_iteration(self, reranker, train_dataloader):
                 self.optimizer.zero_grad()
 
             if (bi + 1) % self.n_batch_per_iter == 0:
-                # REF-TODO: save scheduler state along with optimizer
-                self.lr_scheduler.step()
                 break
+            # REF-TODO: save scheduler state along with optimizer
+            # hacky: pass the global step instead of the internally calculated epoch to support step-wise lr updates
+            self.lr_scheduler.step(epoch=cur_step)
+            cur_step += 1
 
         return torch.stack(iter_loss).mean()
 
@@ -210,7 +213,8 @@ def train(self, reranker, train_dataset, train_output_path, dev_data, dev_output
 
         # REF-TODO how to handle interactions between fastforward and schedule? --> just save its state
         self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
-            self.optimizer, lambda epoch: self.lr_multiplier(step=epoch * self.n_batch_per_iter)
+            self.optimizer,
+            lambda step: self.lr_multiplier(step=step),
         )
 
         if self.config["softmaxloss"]:
@@ -254,7 +258,7 @@ def train(self, reranker, train_dataset, train_output_path, dev_data, dev_output
             model.train()
 
             iter_start_time = time.time()
-            iter_loss_tensor = self.single_train_iteration(reranker, train_dataloader)
+            iter_loss_tensor = self.single_train_iteration(reranker, train_dataloader, cur_iter=niter)
             logger.info("A single iteration takes {}".format(time.time() - iter_start_time))
             train_loss.append(iter_loss_tensor.item())
             logger.info("iter = %d loss = %f", niter, train_loss[-1])
diff --git a/docs/reproduction/MS_MARCO.md b/docs/reproduction/MS_MARCO.md
index 6ba5d09b..7793b470 100644
--- a/docs/reproduction/MS_MARCO.md
+++ b/docs/reproduction/MS_MARCO.md
@@ -7,11 +7,14 @@ first follow [this](../setup/setup-cc.md) guide to set up the environment on CC
 Once the environment is set, you can verify the installation with [these instructions](./PARADE.md#testing-installation).
 
 ## Running MS MARCO 
-This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. 
-1. Make sure you are in the top-level `capreolus` directory;
-2. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. 
+This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU. Before running anything, make sure you are in the top-level `capreolus` directory.
+### TensorFlow version 
+1. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. 
     ```bash
-    python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco.txt
+    python -m capreolus.run rerank.train with \
+        file=docs/reproduction/config_msmarco.txt \
+        reranker.name=TFBERTMaxP \
+        reranker.trainer.usecache=True
     ``` 
     This would train the monoBERT for only 3k steps with batch size to be 4, then rerank the *top100* documents per query. 
     The script should take no more than 24 hours to finish, and could be fit into a single `v100l`.
@@ -27,6 +30,8 @@ This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU.
 
     python -m capreolus.run rerank.train with \
         file=docs/reproduction/config_msmarco.txt  \
+        reranker.name=TFBERTMaxP \
+        reranker.trainer.bertlr=1e-5 \
         threshold=$threshold \
         reranker.trainer.niters=$niters \
         reranker.trainer.batch=$batch_size \
@@ -38,6 +43,34 @@ This requires GPU(s) with 48GB memory (e.g. 3 V100 or a RTX 8000) or a TPU.
     After data is prepared, it would take 4~6 hours to train and 6～10 hours to inference with *4 V100s* for BERT-base. 
     This should achieve `MRR@10=0.35+`.
 
+### PyTorch version 
+Similar to the TensorFlow reproduction above:
+1. To fine-tune a "mini" version of the MS MARCO:
+    ```bash
+    python -m capreolus.run rerank.train with \
+        file=docs/reproduction/config_msmarco.txt \
+        reranker.name=ptBERTMaxP
+    ``` 
+
+2. To fine-tune a full version on MS MARCO Passage:
+    ```bash
+    niters=10
+    batch_size=16
+    validatefreq=$niters # to ensure the validation is run only at the end of training
+    decayiters=$niters   # either same with $itersize or 0
+    threshold=1000       # the top-k documents to rerank
+
+    python -m capreolus.run rerank.train with \
+        file=docs/reproduction/config_msmarco.txt  \
+        reranker.name=ptBERTMaxP \
+        threshold=$threshold \
+        reranker.trainer.niters=$niters \
+        reranker.trainer.batch=$batch_size \
+        reranker.trainer.decayiters=$decayiters \
+        reranker.trainer.validatefreq=$validatefreq \
+        fold=s1
+    ```
+
 ### For CC slurm users:
 In case you are new to [slurm](https://slurm.schedmd.com/documentation.html), a sample slurm script for the *full version* fine-tuning could be found under `docs/reproduction/sample_slurm_script.sh`.
 This should work on `cedar` directly via `sbatch sample_slurm_script.sh`.
@@ -52,4 +85,7 @@ To adapt it to the `mini` version, simply change the GPU number and request time
 + Results (with hypperparameter-6) replicated by [@Dahlia-Chehata](https://github.com/Dahlia-Chehata) on 2021-03-29 (commit [`7915aad`](https://github.com/capreolus-ir/capreolus/commit/7915aad75406527a3b88498926cff85259808696)) (Tesla V100 on Compute Canada)
 + Results (with hypperparameter-7) replicated by [@larryli1999](https://github.com/larryli1999) on 2021-05-16 (commit [`6d1aed2`](https://github.com/capreolus-ir/capreolus/commit/6d1aed29de7828ceb94560a8bf7c87f1af5458b5)) (Tesla V100 on Compute Canada)
 + Results (MRR@10=0.356) replicated by [@andrewyguo](https://github.com/andrewyguo) on 2021-05-29 (commit [`1ce71d9`](https://github.com/capreolus-ir/capreolus/commit/1ce71d93ab5473b40d4ae02768fd053261b27320)) (Tesla V100 on Compute Canada)
-+ Results (MRR@10=0.356) reproduced by [@lingwei-gu](https://github.com/lingwei-gu) on 2022-01-20 (commit [`1bbf0f2`](https://github.com/capreolus-ir/capreolus/commit/1bbf0f295b09774e2fb2a1db7dfddef88adec7be)) (Tesla V100 on Compute Canada)
++ Results (MRR@10=0.3513) replicated by [@nimasadri11](https://github.com/nimasadri11) on 2021-09-26 (commit [`2d41e28`](https://github.com/capreolus-ir/capreolus/commit/2d41e28092813c22bfa5e8e2a88ff3ca944a688a)) (Tesla V100 on Compute Canada)
++ Results (MRR@10=0.353 w/ pytorch, MRR@10=0.352 w/ tensorflow) replicated by [@leungjch](https://github.com/leungjch) on 2021-10-19 (commit [`b5e7448`](https://github.com/crystina-z/capreolus/commit/b5e7448ab88aea69ab0df0838424254d0a079e7b)) (Tesla V100 on Compute Canada)
++ Results (MRR@10=0.353 w/ pytorch, MRR@10=0.353 w/ tensorflow) replicated by [@AlexWang000](https://github.com/AlexWang000) on 2021-11-05 (commit [`1c570c3`](https://github.com/crystina-z/capreolus/commit/1c570c34630a984ccff3843d89223effa5d48aba)) (Tesla V100 on Compute Canada)
++ Results (MRR@10=0.356) reproduced by [@lingwei-gu](https://github.com/lingwei-gu) on 2022-01-20 (commit [`1bbf0f2`](https://github.com/capreolus-ir/capreolus/commit/1bbf0f295b09774e2fb2a1db7dfddef88adec7be)) (Tesla V100 on Compute Canada)
\ No newline at end of file
diff --git a/docs/reproduction/config_maxp_pt-robust04_title.txt b/docs/reproduction/config_maxp_pt-robust04_title.txt
new file mode 100644
index 00000000..9ef0cb8f
--- /dev/null
+++ b/docs/reproduction/config_maxp_pt-robust04_title.txt
@@ -0,0 +1,23 @@
+optimize=nDCG@20 
+threshold=100
+testthreshold=100
+
+reranker.name=ptBERTMaxP 
+reranker.aggregation=max
+
+reranker.trainer.niters=36
+reranker.trainer.batch=16
+reranker.trainer.warmupiters=8
+reranker.trainer.decayiters=36
+reranker.trainer.validatefreq=2
+reranker.trainer.lr=0.00002
+reranker.trainer.decay=0.1
+reranker.trainer.decayiters=36
+reranker.trainer.decaytype=linear
+
+reranker.extractor.usecache=True
+reranker.extractor.maxqlen=20
+reranker.extractor.maxseqlen=256
+reranker.extractor.numpassages=16
+reranker.extractor.passagelen=150
+reranker.extractor.stride=75
diff --git a/docs/reproduction/config_msmarco.txt b/docs/reproduction/config_msmarco.txt
index f85790ae..b2cab3c6 100644
--- a/docs/reproduction/config_msmarco.txt
+++ b/docs/reproduction/config_msmarco.txt
@@ -5,16 +5,15 @@ testthreshold=1
 benchmark.name=msmarcopsg
 rank.searcher.name=msmarcopsgbm25
 
-reranker.name=TFBERTMaxP
+# reranker.name=TFBERTMaxP
 reranker.pretrained=bert-base-uncased
 
 reranker.extractor.usecache=True
 reranker.extractor.numpassages=1
-reranker.extractor.maxseqlen=512
+reranker.extractor.maxseqlen=256
 reranker.extractor.maxqlen=50
 reranker.extractor.tokenizer.pretrained=bert-base-uncased
 
-reranker.trainer.usecache=True
 reranker.trainer.niters=1
 reranker.trainer.batch=4
 reranker.trainer.evalbatch=256
@@ -23,6 +22,5 @@ reranker.trainer.warmupiters=1
 reranker.trainer.decay=0.1
 reranker.trainer.decayiters=1
 reranker.trainer.decaytype=linear
-
-reranker.trainer.loss=pairwise_hinge_loss
 reranker.trainer.decay=0.1
+reranker.trainer.lr=1e-5
diff --git a/docs/reproduction/config_msmarco_lce.txt b/docs/reproduction/config_msmarco_lce.txt
index ed4b8a8c..f4604ee5 100644
--- a/docs/reproduction/config_msmarco_lce.txt
+++ b/docs/reproduction/config_msmarco_lce.txt
@@ -1,5 +1,5 @@
 optimize=MRR@10
-threshold=1000
+threshold=100
 testthreshold=1
 
 benchmark.name=msmarcopsg
@@ -26,11 +26,11 @@ reranker.trainer.seed=42
 reranker.trainer.batch=16
 reranker.trainer.evalbatch=256
 reranker.trainer.itersize=48000
-reranker.trainer.niters=10
+reranker.trainer.niters=1
 reranker.trainer.lr=0.00001
 reranker.trainer.bertlr=0.00001
 reranker.trainer.decay=0.1
-reranker.trainer.decayiters=10
+reranker.trainer.decayiters=1
 reranker.trainer.decaytype=linear
 reranker.trainer.warmupiters=1
-reranker.trainer.validatefreq=10
+reranker.trainer.validatefreq=1
diff --git a/docs/reproduction/monoELECTRA+LCE.md b/docs/reproduction/monoELECTRA+LCE.md
index 006a1511..d3d099ba 100644
--- a/docs/reproduction/monoELECTRA+LCE.md
+++ b/docs/reproduction/monoELECTRA+LCE.md
@@ -6,11 +6,25 @@ Basically reproduce the results in [this](to-be-added) paper.
 For the set-up and monoBERT w/ hinge loss experiments, please refer to [this](MS_MARCO.md) page
 
 ## Running MS MARCO 
-The config file (config_msmarco_lce.txt)[config_msmarco_lce.txt] could be used out-of-box, with the following command: 
-
+1. Use the following script to run a "mini" version of the MS MARCO fine-tuning, testing if everything is working. 
 ```bash
 python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco_lce.txt
 ```
+This trains monoBERT for only 3k steps with a batch size of 16,
+then reranks the *top-100* documents per query.
+The script should take no more than 24 hours to finish.
+At the end of execution, it displays a set of metrics, where `MRR@10` should be around `0.359`.
+
+2. Once the above is done, we can fine-tune a full version on MS MARCO Passage using the following script.
+The command-line overrides restore the full-scale settings (`threshold=1000` and 10 training iterations) on top of the mini config:
+```bash
+python -m capreolus.run rerank.train with \
+    file=docs/reproduction/config_msmarco_lce.txt \
+    threshold=1000 \
+    reranker.trainer.niters=10 \
+    reranker.trainer.decayiters=10 \
+    reranker.trainer.validatefreq=10
+```
 
 The config would achieve `MRR@10` around `0.395~0.4` (maybe <0.01 points fluctuation).
 It trains monoELECTRA with the hard negative data prepared from the [TCT-ColBERT](https://cs.uwaterloo.ca/~jimmylin/publications/Lin_etal_2021_RepL4NLP.pdf), and uses LCE loss with 3 hard negative per query.
@@ -18,7 +32,13 @@ To experiments with different hard negative example, simply spcify `sampler.nneg
 For example, the following command would run the same config but with 7 hard negatives per query,
 which should gives `MRR@10` around `0.405~0.41` 
 ```bash
-python -m capreolus.run rerank.train with file=docs/reproduction/config_msmarco_lce.txt sampler.nneg=7
+python -m capreolus.run rerank.train with \
+    file=docs/reproduction/config_msmarco_lce.txt \
+    threshold=1000 \
+    reranker.trainer.niters=10 \
+    reranker.trainer.decayiters=10 \
+    reranker.trainer.validatefreq=10 \
+    sampler.nneg=7
 ```
 
 ## Replication Logs