MSMARCO support: monoBERT (#14)

* add monobert for marco * temp change python3.7 to 3.6 for colab compatibility * fix evaluation options * fix issues * add missing options in evaluate_passage_ranker * working monobert * update transformers, clean code * update tokenizers * add dataclasses if < 3.7 * cleanup todos * update to newer transformers along with syntax, clean up settings * model-name-or-path as str type * fix tokenizer loading for t5
castorini · Apr 30, 2020 · 55e4961 · 55e4961
1 parent 34345c8
commit 55e4961
Show file tree

Hide file tree

Showing 11 changed files with 360 additions and 15 deletions.
diff --git a/pygaggle/data/__init__.py b/pygaggle/data/__init__.py
@@ -1,2 +1,3 @@
 from .kaggle import *
 from .relevance import *
+from .msmarco import *
diff --git a/pygaggle/data/msmarco.py b/pygaggle/data/msmarco.py
@@ -0,0 +1,137 @@
+import os
+from collections import OrderedDict, defaultdict
+from typing import List, Set, DefaultDict
+import json
+import logging
+from itertools import permutations
+
+from pydantic import BaseModel
+import scipy.special as sp
+import numpy as np
+
+from .relevance import RelevanceExample, MsMarcoPassageLoader
+from pygaggle.model.tokenize import SpacySenticizer
+from pygaggle.rerank.base import Query, Text
+from pygaggle.data.unicode import convert_to_unicode
+
+
+__all__ = ['MsMarcoExample', 'MsMarcoDataset']
+
+
# One MS MARCO query together with the passages retrieved for it.
class MsMarcoExample(BaseModel):
    # Query identifier, kept as a string.
    qid: str
    # Raw query text.
    text: str
    # Ids of the candidate passages for this query.
    candidates: List[str]
    # Ids of the passages judged relevant for this query.
    relevant_candidates: Set[str]
+
class MsMarcoDataset(BaseModel):
    """Collection of MS MARCO queries with their candidate runs and qrels."""
    examples: List[MsMarcoExample]

    @classmethod
    def load_qrels(cls, path: str) -> DefaultDict[str, Set[str]]:
        """Read a tab-separated qrels file.

        Every non-blank line must have four fields:
        ``qid``, an ignored field, ``doc_id`` and ``relevance``.
        A document is recorded for its query when ``relevance >= 1``.

        Returns a mapping from query id to the set of relevant doc ids.
        """
        qrels = defaultdict(set)
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                qid, _, doc_id, relevance = line.split('\t')
                if int(relevance) >= 1:
                    qrels[qid].add(doc_id)
        return qrels

    @classmethod
    def load_run(cls, path: str):
        """Returns OrderedDict[str, List[str]]

        Reads a tab-separated run file whose lines are
        ``qid``, ``doc_title`` and ``rank``. Queries keep the order in which
        they first appear in the file; the documents of each query are
        ordered by ascending rank.
        """
        run = OrderedDict()
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    continue
                qid, doc_title, rank = line.split('\t')
                run.setdefault(qid, []).append((doc_title, int(rank)))
        sorted_run = OrderedDict()
        for qid, doc_titles_ranks in run.items():
            # Sort in place: a bare ``sorted(...)`` call would discard its
            # result and leave the candidates in file order.
            doc_titles_ranks.sort(key=lambda pair: pair[1])
            sorted_run[qid] = [doc_title for doc_title, _ in doc_titles_ranks]
        return sorted_run

    @classmethod
    def load_queries(cls,
                     path: str,
                     qrels: DefaultDict[str, Set[str]],
                     run) -> List[MsMarcoExample]:
        """Read a tab-separated ``qid``/``query`` file into examples.

        ``run`` maps a query id to its candidate ids (see ``load_run``) and
        ``qrels`` maps a query id to its relevant ids (see ``load_qrels``).

        Raises ``KeyError`` if a query id is missing from ``run``.
        """
        queries = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    continue
                qid, query = line.split('\t')
                queries.append(MsMarcoExample(qid=qid,
                                              text=query,
                                              candidates=run[qid],
                                              relevant_candidates=qrels[qid]))
        return queries

    @classmethod
    def from_folder(cls,
                    folder: str,
                    split: str = 'dev',
                    is_duo: bool = False) -> 'MsMarcoDataset':
        """Build a dataset from the files of one split inside ``folder``.

        Reads ``queries.{split}.small.tsv``, ``qrels.{split}.small.tsv`` and
        ``run.{split}.small.tsv``; when ``is_duo`` is true the run file is
        ``run.mono.{split}.small.tsv`` instead.
        """
        run_mono = "mono." if is_duo else ""
        query_path = os.path.join(folder, f"queries.{split}.small.tsv")
        qrels_path = os.path.join(folder, f"qrels.{split}.small.tsv")
        run_path = os.path.join(folder, f"run.{run_mono}{split}.small.tsv")
        return cls(examples=cls.load_queries(query_path,
                                             cls.load_qrels(qrels_path),
                                             cls.load_run(run_path)))

    def query_passage_tuples(self, is_duo: bool = False):
        """Lazily yield ``((qid, text, relevant_candidates), passages)``.

        ``passages`` is a tuple of candidate ids: single candidates by
        default, ordered pairs of distinct candidates when ``is_duo`` is true.
        """
        return (((ex.qid, ex.text, ex.relevant_candidates), perm_pas)
                for ex in self.examples
                for perm_pas in permutations(ex.candidates, r=1 + int(is_duo)))

    def _log_baseline_stats(self, example_map) -> None:
        """Log random-ordering and existing-ordering baselines.

        ``example_map`` maps a query id to
        ``[query_text, candidate_ids, candidate_texts, labels]``.
        """
        mean_stats = defaultdict(list)
        for ex in self.examples:
            int_rels = np.array(list(map(int, example_map[ex.qid][3])))
            # Number of relevant candidates. Each candidate is labelled once,
            # so no pair-count normalisation is needed; ``int`` keeps the
            # ``range`` calls below valid.
            p = int(int_rels.sum())
            mean_stats['Random P@1'].append(np.mean(int_rels))
            N = len(ex.candidates)
            n = N - p  # number of non-relevant candidates
            if N <= 1000:
                mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
            # Expected reciprocal rank under a uniformly random ordering:
            # P(first relevant at rank i + 1)
            #     = C(n, i) / C(N, i) * p / (N - i).
            numer = np.array([sp.comb(n, i) / (N - i)
                              for i in range(0, n + 1) if i != N]) * p
            if n == N:
                # No relevant candidate: pad the term skipped above (i == N).
                numer = np.append(numer, 0)
            denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
            rr = 1 / np.arange(1, n + 2)
            rmrr = np.sum(numer * rr / denom)
            mean_stats['Random MRR'].append(rmrr)
            rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
            mean_stats['Random MRR@10'].append(rmrr10)
            # Position of the best-ranked relevant candidate in the run.
            ex_index = N
            for rel_cand in ex.relevant_candidates:
                if rel_cand in ex.candidates:
                    ex_index = min(ex.candidates.index(rel_cand), ex_index)
            mean_stats['Existing MRR'].append(
                1 / (ex_index + 1) if ex_index < N else 0)
            mean_stats['Existing MRR@10'].append(
                1 / (ex_index + 1) if ex_index < 10 else 0)
        for k, v in mean_stats.items():
            logging.info(f'{k}: {np.mean(v)}')

    def to_relevance_examples(self,
                              index_path: str,
                              is_duo: bool = False) -> List[RelevanceExample]:
        """Load candidate passages and build one ``RelevanceExample`` per query.

        Passages are fetched through ``MsMarcoPassageLoader(index_path)``.
        A candidate whose passage cannot be loaded is logged and left out
        entirely, so ids, texts and labels stay aligned. Baseline statistics
        are logged as a side effect.

        ``is_duo`` is accepted for interface compatibility; candidates are
        always processed one at a time here.
        """
        loader = MsMarcoPassageLoader(index_path)
        example_map = {}
        for (qid, text, rel_cands), cands in self.query_passage_tuples():
            if qid not in example_map:
                example_map[qid] = [convert_to_unicode(text), [], [], []]
            cand = cands[0]
            try:
                passage = loader.load_passage(cand)
            except ValueError:
                logging.warning(f'Skipping {cand}')
                continue
            # Record id, text and label together, only after a successful load.
            entry = example_map[qid]
            entry[1].append(cand)
            entry[2].append(convert_to_unicode(passage.all_text))
            entry[3].append(cand in rel_cands)
        self._log_baseline_stats(example_map)
        return [
            RelevanceExample(
                Query(text=query_text, id=qid),
                [Text(cand_text, dict(docid=cand))
                 for cand, cand_text in zip(cands, cands_text)],
                rel_cands)
            for qid, (query_text, cands, cands_text, rel_cands)
            in example_map.items()
        ]
diff --git a/pygaggle/data/relevance.py b/pygaggle/data/relevance.py
@@ -30,6 +30,15 @@ def all_text(self):
  return '\n'.join((self.abstract, self.body_text, self.ref_entries))
 
 
@dataclass
class MsMarcoPassage:
    """A single MS MARCO passage holding only its paragraph text."""
    para_text: str

    @property
    def all_text(self):
        """Full text of the passage, which is just ``para_text``."""
        full_text = self.para_text
        return full_text
+
+
 class Cord19DocumentLoader:
  double_space_pattern = re.compile(r'\s\s+')
 
@@ -50,3 +59,15 @@ def unfold(entries):
  return Cord19Document(unfold(article['abstract']),
  unfold(article['body_text']),
  unfold(ref_entries))
+
+
+class MsMarcoPassageLoader:
+ def __init__(self, index_path: str):
+ self.searcher = pysearch.SimpleSearcher(index_path)
+
+ def load_passage(self, id: str) -> MsMarcoPassage:
+ try:
+ passage = self.searcher.doc(id).lucene_document().get('raw')
+ except AttributeError:
+ raise ValueError('passage unretrievable')
+ return MsMarcoPassage(passage)
diff --git a/pygaggle/data/unicode.py b/pygaggle/data/unicode.py
@@ -0,0 +1,8 @@
def convert_to_unicode(text):
    """Return `text` as a `str`, decoding `bytes` as utf-8.

    Undecodable bytes are dropped. Any input that is neither `str` nor
    `bytes` raises `ValueError`.
    """
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    if not isinstance(text, str):
        raise ValueError("Unsupported string type: %s" % (type(text)))
    return text
diff --git a/pygaggle/model/decode.py b/pygaggle/model/decode.py
@@ -19,7 +19,10 @@ def greedy_decode(model: PreTrainedModel,
  past = model.get_encoder()(input_ids, attention_mask=attention_mask)
  next_token_logits = None
  for _ in range(length):
- model_inputs = model.prepare_inputs_for_generation(decode_ids, past=past, attention_mask=attention_mask)
+ model_inputs = model.prepare_inputs_for_generation(decode_ids,
+ past=past, 
+ attention_mask=attention_mask, 
+ use_cache=True)
  outputs = model(**model_inputs) # (batch_size, cur_len, vocab_size)
  next_token_logits = outputs[0][:, -1, :] # (batch_size, vocab_size)
  decode_ids = torch.cat([decode_ids, next_token_logits.max(1)[1].unsqueeze(-1)], dim=-1)

diff --git a/pygaggle/model/evaluate.py b/pygaggle/model/evaluate.py
@@ -100,6 +100,16 @@ class RecallAt3Metric(TopkMixin, RecallAccumulator):
  top_k = 3
 
 
@register_metric('recall@50')
class RecallAt50Metric(TopkMixin, RecallAccumulator):
    """Recall metric registered as 'recall@50', with ``top_k`` set to 50."""
    top_k = 50
+
+
@register_metric('recall@1000')
class RecallAt1000Metric(TopkMixin, RecallAccumulator):
    """Recall metric registered as 'recall@1000', with ``top_k`` set to 1000."""
    top_k = 1000
+
+
 @register_metric('mrr')
 class MrrMetric(MeanAccumulator):
  def accumulate(self, scores: List[float], gold: RelevanceExample):
@@ -108,6 +118,13 @@ def accumulate(self, scores: List[float], gold: RelevanceExample):
  self.scores.append(rr)
 
 
@register_metric('mrr@10')
class MrrAt10Metric(MeanAccumulator):
    """Reciprocal rank of the first relevant item, counted only in the top 10."""

    def accumulate(self, scores: List[float], gold: RelevanceExample):
        """Append the reciprocal rank for one example to ``self.scores``.

        Items are ranked by descending score. The value is ``1 / rank`` of the
        first item whose ``gold.labels`` entry is truthy and whose rank is
        within the first ten, or 0 when there is none.
        """
        ranked = sorted(list(enumerate(scores)),
                        key=lambda pair: pair[1],
                        reverse=True)
        rr = 0
        for rank_idx, (idx, _) in enumerate(ranked):
            if gold.labels[idx] and rank_idx < 10:
                rr = 1 / (rank_idx + 1)
                break
        self.scores.append(rr)
+
 class ThresholdedRecallMetric(DynamicThresholdingMixin, RecallAccumulator):
  threshold = 0.5
 

diff --git a/pygaggle/rerank/base.py b/pygaggle/rerank/base.py
@@ -17,9 +17,12 @@ class Query:
  ----------
  text : str
  The query text.
+ id : Optional[str]
+ The query id.
  """
- def __init__(self, text: str):
+ def __init__(self, text: str, id: Optional[str] = None):
  self.text = text
+ self.id = id
 
 
 class Text:

diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py
@@ -16,10 +16,10 @@
 from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider
 from pygaggle.model import SimpleBatchTokenizer, CachedT5ModelLoader, T5BatchTokenizer, RerankerEvaluator, metric_names
 from pygaggle.data import LitReviewDataset
-from pygaggle.settings import Settings
+from pygaggle.settings import Cord19Settings
 
 
-SETTINGS = Settings()
+SETTINGS = Cord19Settings()
 METHOD_CHOICES = ('transformer', 'bm25', 't5', 'seq_class_transformer', 'qa_transformer', 'random')