1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
@@ -55,3 +55,4 @@
from . import hc4
from . import neuclir # must be after hc4
from . import sara
from . import trec_ikat23
123 changes: 123 additions & 0 deletions ir_datasets/datasets/trec_ikat23.py
@@ -0,0 +1,123 @@
import bz2
import json
import os
from typing import NamedTuple, Dict
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels, BaseDocs, BaseQueries, GenericDoc
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.indices import PickleLz4FullStore

NAME = 'trec-ikat'

QRELS_DEFS = {
    4: "Fully meets. The passage is a perfect answer for the turn. It includes all of the information needed to fully answer the turn in the conversation context. It focuses only on the subject and contains little extra information.",
    3: "Highly meets. The passage answers the question and is focused on the turn. It would be a satisfactory answer if Google Assistant or Alexa returned this passage in response to the query. It may contain limited extraneous information.",
    2: "Moderately meets. The passage answers the turn, but is focused on other information that is unrelated to the question. The passage may contain the answer, but users will need extra effort to pick the correct portion. The passage may be relevant, but it may only partially answer the turn, missing a small aspect of the context.",
    1: "Slightly meets. The passage includes some information about the turn, but does not directly answer it. Users will find some useful information in the passage that may lead to the correct answer, perhaps after additional rounds of conversation (better than nothing).",
    0: "Fails to meet. The passage is not relevant to the question. The passage is unrelated to the target query.",
}

QRELS_PTKB_DEFS = {
    1: "Relevant",
    0: "Irrelevant",
}

class iKATDocs(BaseDocs):
    def __init__(self, dlc):
        super().__init__()
        self._dlc = dlc

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        # Iterate over the bz2-compressed JSONL passage chunks placed in the docs directory
        for chunk_file in os.listdir(self._dlc.path()):
            if ".bz2" not in chunk_file:
                continue
            chunk = os.path.join(self._dlc.path(), chunk_file)

            with bz2.open(chunk, "rt") as bzinput:
                for line in bzinput:
                    data = json.loads(line)
                    yield GenericDoc(data['id'], data['contents'])

    def docs_cls(self):
        return GenericDoc

    def docs_store(self, field='doc_id'):
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/doc.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
        )

    def docs_count(self):
        # The exact count is only available once the docstore has been built
        if self.docs_store().built():
            return self.docs_store().count()


class iKATQuery(NamedTuple):
    query_id: str
    topic_title: str
    topic_ptkb: Dict[str, str]
    topic_number: int
    turn_id: int
    utterance: str
    resolved_utterance: str
    response: str

    def default_text(self):
        """
        utterance
        """
        return self.utterance


class iKATQueries(BaseQueries):
    def __init__(self, dlc_list):
        super().__init__()
        self._dlc_list = dlc_list

    def queries_iter(self):
        for _dlc in self._dlc_list:
            with _dlc.stream() as stream:
                topics = json.load(stream)
                for topic in topics:
                    topic_number = topic['number']
                    topic_title = topic['title']
                    topic_ptkb = topic['ptkb']
                    for turn in topic['turns']:
                        turn_id = turn['turn_id']
                        # query_id combines the topic number and the turn id
                        yield iKATQuery(f'{topic_number}_{turn_id}', topic_title, topic_ptkb, topic_number, turn_id, turn['utterance'], turn['resolved_utterance'], turn['response'])

    def queries_namespace(self):
        return NAME


# An initialization function is used to keep the namespace clean
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))
    subsets = {}

    docs = iKATDocs(dlc['docs'])
    queries = iKATQueries([dlc['train_queries'], dlc['test_queries']])
    qrels = TrecQrels(dlc['qrels'], QRELS_DEFS)
    subsets['2023'] = Dataset(docs, queries, qrels, documentation('2023'))

    judged_queries = iKATQueries([dlc['test_queries']])
    subsets['2023/judged'] = Dataset(docs, judged_queries, qrels, documentation('2023/judged'))

    ptkb = TrecQrels(dlc['ptkb'], QRELS_PTKB_DEFS)
    subsets['2023/judged/ptkb'] = Dataset(judged_queries, ptkb, documentation('2023/judged/ptkb'))

    for s in subsets:
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
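For orientation, a minimal usage sketch of the subsets registered by _init() once the passage files are in place; the dataset ID and field names come from the code above, the snippet itself is only illustrative:

import ir_datasets

# Load the 2023 subset registered above; the passage docstore is built
# lazily on first access from the local bz2 chunks.
dataset = ir_datasets.load('trec-ikat/2023')

# Queries are flattened conversation turns; query_id is '<topic_number>_<turn_id>'.
for query in dataset.queries_iter():
    print(query.query_id, query.utterance)
    break

# Passage-level judgments use the 0-4 scale from QRELS_DEFS.
for qrel in dataset.qrels_iter():
    print(qrel.query_id, qrel.doc_id, qrel.relevance)
    break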
53 changes: 53 additions & 0 deletions ir_datasets/docs/trec_ikat23.yaml
@@ -0,0 +1,53 @@
_:
  pretty_name: 'TREC iKAT'
  desc: '
    <p>
    A conversational passage ranking resource from the TREC iKAT track. Relevance judgments come
    from NIST assessors, based on the runs submitted to TREC iKAT.
    </p>
    '

2023:
  desc: '
    <p>
    Official TREC iKAT collection from 2023.
    </p>
    <p>
    The passage collection contains 116,838,987 passages extracted from ClueWeb22-B.
    Follow the iKAT guidelines (<a href="https://www.trecikat.com/">link</a>) to obtain the
    licence and access to the collection.
    </p>
    <p>
    Queries and qrels are freely accessible. Queries are the flattened conversation turns,
    where <kbd>query_id</kbd> combines the topic number and the turn id.
    </p>
    '
  data_access: '
    <p>
    To use this dataset, you need a local copy of the processed passage corpus provided by
    the iKAT organizers. To get this copy, first apply for the ClueWeb22 licence at
    https://lemurproject.org/clueweb22/obtain.php and send the form to CMU for approval ([email protected]).
    Once you have the licence, email Andrew Ramsay <[email protected]> to obtain a
    download link for the preprocessed iKAT collection.
    </p>
    <p>
    Download the 16 subfiles [Passages (JSONL)] into a folder
    at <kbd>~/.ir_datasets/trec-ikat/2023/passage_jsonl/*</kbd>.
    </p>
    '

2023/judged:
  desc: '
    <p>
    The subset of queries that have relevance assessments.
    </p>
    '

2023/judged/ptkb:
  desc: '
    <p>
    Relevance judgments for the PTKB (Personal Textual Knowledge Base) statements.
    </p>
    <p>
    Each topic contains a list of PTKB statements; within the conversation, each turn has one or
    several relevant PTKB statements.
    </p>
    '
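Before building the docstore, it can be worth sanity-checking the local copy of the corpus. A small sketch under the layout described in data_access above; the directory path is the one quoted there (hypothetical for any given install, adjust as needed), and the expected record schema is the one read by iKATDocs:

import bz2
import json
import os

# Path quoted in the data_access text above; hypothetical, adjust to your setup.
passage_dir = os.path.expanduser('~/.ir_datasets/trec-ikat/2023/passage_jsonl')

# The organizers distribute the corpus as 16 bz2-compressed JSONL files.
chunks = [f for f in os.listdir(passage_dir) if f.endswith('.bz2')]
print(f'{len(chunks)} passage chunks found')

# Peek at one record to confirm the schema expected by iKATDocs ('id', 'contents').
with bz2.open(os.path.join(passage_dir, chunks[0]), 'rt') as f:
    record = json.loads(next(f))
    print(record['id'], record['contents'][:80])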
27 changes: 26 additions & 1 deletion ir_datasets/etc/downloads.json
@@ -5790,7 +5790,32 @@
"cache_path": "2020/2020qrels.txt"
}
},


"trec-ikat": {
"docs": {
"instructions": "Please download the dataset. Instructions: <https://www.trecikat.com/>.\nTo proceed, symlink the processed passages source files here: {path}",
"cache_path": "TREC-Ikat-CW22-passage/"
},
"test_queries": {
"url": "https://docs.google.com/uc?export=download&id=1zPSiAqLmbx9QFGm6walnuMUl7xoJmRB7",
"expected_md5": "684fa0197cdec8c3cfb6a2e586ab83f6",
"cache_path": "2023_test_topics.json"
},
"qrels": {
"url": "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
"expected_md5": "79dc121bab25b2245e52a53263e5ad1f",
"cache_path": "2023-qrels.all-turns.txt"
},
"train_queries": {
"url": "https://docs.google.com/uc?export=download&id=1sNHmVYO9PVG2kFxLscPGhN-uCCUuDAu9",
"cache_path": "2023_train_topics.json"
},
"ptkb": {
"url": "https://trec.nist.gov/data/ikat/2023-ptkb-qrels.txt",
"cache_path": "2023-ptkb-qrels.txt"
}
},

"trec-mandarin": {
"docs": {
"instructions": "The dataset is based on the TREC Mandarin corpus. It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2000T52>\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-mandarin.html#DataAccess>.\nTo proceed, symlink the source file here: {path}",
3 changes: 3 additions & 0 deletions ir_datasets/etc/metadata.json
@@ -645,6 +645,9 @@
"trec-fair/2021/train": {"docs": {"_ref": "trec-fair/2021"}, "queries": {"count": 57}, "qrels": {"count": 2185446, "fields": {"relevance": {"counts_by_value": {"1": 2185446}}}}},
"trec-fair/2022": {"docs": {"count": 6475537, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-fair/2022/train": {"docs": {"_ref": "trec-fair/2022"}, "queries": {"count": 50}, "qrels": {"count": 2088306, "fields": {"relevance": {"counts_by_value": {"1": 2088306}}}}},
"trec-ikat/2023": {"docs": {"count": 116838987, "fields": {"doc_id": {"max_len": 29, "common_prefix": "clueweb22-en00"}}}, "queries": {"count": 427}, "qrels": {"count": 34183, "fields": {"relevance": {"counts_by_value": {"0": 25467, "2": 2733, "3": 1542, "1": 4131, "4": 310}}}}},
"trec-ikat/2023/judged": {"docs": {"_ref": "trec-ikat/2023"}, "queries": {"count": 332}, "qrels": {"_ref": "trec-ikat/2023"}},
"trec-ikat/2023/judged/ptkb": {"queries": {"_ref": "trec-ikat/2023/judged"}, "qrels": {"count": 1030, "fields": {"relevance": {"counts_by_value": {"0": 806, "1": 224}}}}},
"trec-mandarin": {"docs": {"count": 164789, "fields": {"doc_id": {"max_len": 22, "common_prefix": ""}}}},
"trec-mandarin/trec5": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 28}, "qrels": {"count": 15588, "fields": {"relevance": {"counts_by_value": {"0": 13406, "1": 2182}}}}},
"trec-mandarin/trec6": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 26}, "qrels": {"count": 9236, "fields": {"relevance": {"counts_by_value": {"1": 2958, "0": 6278}}}}},