1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
@@ -55,3 +55,4 @@
from . import hc4
from . import neuclir # must be after hc4
from . import sara
from . import trec_ikat23
123 changes: 123 additions & 0 deletions ir_datasets/datasets/trec_ikat23.py
@@ -0,0 +1,123 @@
import bz2
import json
import os
from typing import NamedTuple, Dict
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels, BaseDocs, BaseQueries, GenericDoc
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore

NAME = 'trec-ikat'

QRELS_DEFS = {
    4: "Fully meets. The passage is a perfect answer for the turn. It includes all of the information needed to fully answer the turn in the conversation context. It focuses only on the subject and contains little extra information.",
    3: "Highly meets. The passage answers the question and is focused on the turn. It would be a satisfactory answer if Google Assistant or Alexa returned this passage in response to the query. It may contain limited extraneous information.",
    2: "Moderately meets. The passage answers the turn, but is focused on other information that is unrelated to the question. The passage may contain the answer, but users will need extra effort to pick the correct portion. The passage may be relevant, but it may only partially answer the turn, missing a small aspect of the context.",
    1: "Slightly meets. The passage includes some information about the turn, but does not directly answer it. Users will find some useful information in the passage that may lead to the correct answer, perhaps after additional rounds of conversation (better than nothing).",
    0: "Fails to meet. The passage is not relevant to the question. The passage is unrelated to the target query.",
}

QRELS_PTKB_DEFS = {
    1: "Relevant",
    0: "Irrelevant",
}

class iKATDocs(BaseDocs):
    def __init__(self, dlc):
        super().__init__()
        self._dlc = dlc

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Iterate over the bz2-compressed JSONL passage chunks placed in the docs directory
        for chunk_file in os.listdir(self._dlc.path()):
            if ".bz2" not in chunk_file:
                continue
            chunk = os.path.join(self._dlc.path(), chunk_file)

            with bz2.open(chunk, "rt") as bzinput:
                for line in bzinput:
                    data = json.loads(line)
                    yield GenericDoc(data['id'], data['contents'])

    def docs_cls(self):
        return GenericDoc

    def docs_store(self, field='doc_id'):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/doc.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
        )

    def docs_count(self):
        # The exact count is only available once the docstore has been built
        if self.docs_store().built():
            return self.docs_store().count()


class iKATQuery(NamedTuple):
    query_id: str
    topic_title: str
    topic_ptkb: Dict[str, str]
    topic_number: int
    turn_id: int
    utterance: str
    resolved_utterance: str
    response: str

    def default_text(self):
        """
        utterance
        """
        return self.utterance


class iKATQueries(BaseQueries):
    def __init__(self, dlc_list):
        super().__init__()
        self._dlc_list = dlc_list

    def queries_iter(self):
        for _dlc in self._dlc_list:
            with _dlc.stream() as stream:
                topics = json.load(stream)
                for topic in topics:
                    topic_number = topic['number']
                    topic_title = topic['title']
                    topic_ptkb = topic['ptkb']
                    for turn in topic['turns']:
                        turn_id = turn['turn_id']
                        # query_id combines the topic number and the turn id
                        yield iKATQuery(f'{topic_number}_{turn_id}', topic_title, topic_ptkb, topic_number, turn_id, turn['utterance'], turn['resolved_utterance'], turn['response'])

    def queries_namespace(self):
        return NAME


# An initialization function is used to keep the namespace clean
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))
    subsets = {}

    docs = iKATDocs(dlc['docs'])
    queries = iKATQueries([dlc['train_queries'], dlc['test_queries']])
    qrels = TrecQrels(dlc['qrels'], QRELS_DEFS)
    subsets['2023'] = Dataset(docs, queries, qrels, documentation('2023'))

    judged_queries = iKATQueries([dlc['test_queries']])
    subsets['2023/judged'] = Dataset(docs, judged_queries, qrels, documentation('2023/judged'))

    ptkb = TrecQrels(dlc['ptkb'], QRELS_PTKB_DEFS)
    subsets['2023/judged/ptkb'] = Dataset(judged_queries, ptkb, documentation('2023/judged/ptkb'))

    for s in subsets:
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
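For orientation, a minimal usage sketch of the subsets registered by _init() once the passage files are in place; the dataset ID and field names come from the code above, the snippet itself is only illustrative:

import ir_datasets

# Load the 2023 subset registered above; the passage docstore is built
# lazily on first access from the local bz2 chunks.
dataset = ir_datasets.load('trec-ikat/2023')

# Queries are flattened conversation turns; query_id is '<topic_number>_<turn_id>'.
for query in dataset.queries_iter():
    print(query.query_id, query.utterance)
    break

# Passage-level judgments use the 0-4 scale from QRELS_DEFS.
for qrel in dataset.qrels_iter():
    print(qrel.query_id, qrel.doc_id, qrel.relevance)
    break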
53 changes: 53 additions & 0 deletions ir_datasets/docs/trec_ikat23.yaml
@@ -0,0 +1,53 @@
_:
  pretty_name: 'TREC iKAT'
  desc: '
    <p>
    A conversational passage ranking resource from the TREC iKAT track. Relevance judgments come
    from NIST assessors, based on the runs submitted to TREC iKAT.
    </p>
    '

2023:
  desc: '
    <p>
    Official TREC iKAT collection from 2023.
    </p>
    <p>
    The passage collection contains 116,838,987 passages extracted from ClueWeb22-B.
    Follow the iKAT guidelines (<a href="https://www.trecikat.com/">link</a>) to obtain the
    licence and access to the collection.
    </p>
    <p>
    Queries and qrels are freely accessible. Queries are the flattened conversation turns,
    where <kbd>query_id</kbd> combines the topic number and the turn id.
    </p>
    '
  data_access: '
    <p>
    To use this dataset, you need a local copy of the processed passage corpus provided by
    the iKAT organizers. To get this copy, first apply for the ClueWeb22 licence at
    https://lemurproject.org/clueweb22/obtain.php and send the form to CMU for approval ([email protected]).
    Once you have the licence, email Andrew Ramsay <[email protected]> to obtain a
    download link for the preprocessed iKAT collection.
    </p>
    <p>
    Download the 16 subfiles [Passages (JSONL)] into a folder
    at <kbd>~/.ir_datasets/trec-ikat/2023/passage_jsonl/*</kbd>.
    </p>
    '

2023/judged:
  desc: '
    <p>
    The subset of queries that have relevance assessments.
    </p>
    '

2023/judged/ptkb:
  desc: '
    <p>
    Relevance judgments for the PTKB (Personal Textual Knowledge Base) statements.
    </p>
    <p>
    Each topic contains a list of PTKB statements; within the conversation, each turn has one or
    several relevant PTKB statements.
    </p>
    '
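Before building the docstore, it can be worth sanity-checking the local copy of the corpus. A small sketch under the layout described in data_access above; the directory path is the one quoted there (hypothetical for any given install, adjust as needed), and the expected record schema is the one read by iKATDocs:

import bz2
import json
import os

# Path quoted in the data_access text above; hypothetical, adjust to your setup.
passage_dir = os.path.expanduser('~/.ir_datasets/trec-ikat/2023/passage_jsonl')

# The organizers distribute the corpus as 16 bz2-compressed JSONL files.
chunks = [f for f in os.listdir(passage_dir) if f.endswith('.bz2')]
print(f'{len(chunks)} passage chunks found')

# Peek at one record to confirm the schema expected by iKATDocs ('id', 'contents').
with bz2.open(os.path.join(passage_dir, chunks[0]), 'rt') as f:
    record = json.loads(next(f))
    print(record['id'], record['contents'][:80])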
27 changes: 26 additions & 1 deletion ir_datasets/etc/downloads.json
@@ -5790,7 +5790,32 @@
"cache_path": "2020/2020qrels.txt"
}
},


"trec-ikat": {
"docs": {
"instructions": "Please download the dataset. Instructions: <https://www.trecikat.com/>.\nTo proceed, symlink the processed passages source files here: {path}",
"cache_path": "TREC-Ikat-CW22-passage/"
},
"test_queries": {
"url": "https://docs.google.com/uc?export=download&id=1zPSiAqLmbx9QFGm6walnuMUl7xoJmRB7",
"expected_md5": "684fa0197cdec8c3cfb6a2e586ab83f6",
"cache_path": "2023_test_topics.json"
},
"qrels": {
"url": "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
"expected_md5": "79dc121bab25b2245e52a53263e5ad1f",
"cache_path": "2023-qrels.all-turns.txt"
},
"train_queries": {
"url": "https://docs.google.com/uc?export=download&id=1sNHmVYO9PVG2kFxLscPGhN-uCCUuDAu9",
"cache_path": "2023_train_topics.json"
},
"ptkb": {
"url": "https://trec.nist.gov/data/ikat/2023-ptkb-qrels.txt",
"cache_path": "2023-ptkb-qrels.txt"
}
},

"trec-mandarin": {
"docs": {
"instructions": "The dataset is based on the TREC Mandarin corpus. It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2000T52>\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-mandarin.html#DataAccess>.\nTo proceed, symlink the source file here: {path}",
3 changes: 3 additions & 0 deletions ir_datasets/etc/metadata.json
@@ -645,6 +645,9 @@
"trec-fair/2021/train": {"docs": {"_ref": "trec-fair/2021"}, "queries": {"count": 57}, "qrels": {"count": 2185446, "fields": {"relevance": {"counts_by_value": {"1": 2185446}}}}},
"trec-fair/2022": {"docs": {"count": 6475537, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-fair/2022/train": {"docs": {"_ref": "trec-fair/2022"}, "queries": {"count": 50}, "qrels": {"count": 2088306, "fields": {"relevance": {"counts_by_value": {"1": 2088306}}}}},
"trec-ikat/2023": {"docs": {"count": 116838987, "fields": {"doc_id": {"max_len": 29, "common_prefix": "clueweb22-en00"}}}, "queries": {"count": 427}, "qrels": {"count": 34183, "fields": {"relevance": {"counts_by_value": {"0": 25467, "2": 2733, "3": 1542, "1": 4131, "4": 310}}}}},
"trec-ikat/2023/judged": {"docs": {"_ref": "trec-ikat/2023"}, "queries": {"count": 332}, "qrels": {"_ref": "trec-ikat/2023"}},
"trec-ikat/2023/judged/ptkb": {"queries": {"_ref": "trec-ikat/2023/judged"}, "qrels": {"count": 1030, "fields": {"relevance": {"counts_by_value": {"0": 806, "1": 224}}}}},
"trec-mandarin": {"docs": {"count": 164789, "fields": {"doc_id": {"max_len": 22, "common_prefix": ""}}}},
"trec-mandarin/trec5": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 28}, "qrels": {"count": 15588, "fields": {"relevance": {"counts_by_value": {"0": 13406, "1": 2182}}}}},
"trec-mandarin/trec6": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 26}, "qrels": {"count": 9236, "fields": {"relevance": {"counts_by_value": {"1": 2958, "0": 6278}}}}},