diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index c5298d1..5bc3e92 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -55,3 +55,4 @@ from . import hc4 from . import neuclir # must be after hc4 from . import sara +from . import trec_ikat23 diff --git a/ir_datasets/datasets/trec_ikat23.py b/ir_datasets/datasets/trec_ikat23.py new file mode 100644 index 0000000..09964e4 --- /dev/null +++ b/ir_datasets/datasets/trec_ikat23.py @@ -0,0 +1,123 @@ +import json +import itertools +import os +from typing import NamedTuple, Tuple, Dict +import ir_datasets +from ir_datasets.util import DownloadConfig, Bz2Extract +from ir_datasets.formats import TrecQrels, TrecScoredDocs, BaseDocs, BaseQueries, GenericDoc, TsvDocs, TsvQueries +from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs +from ir_datasets.indices import PickleLz4FullStore +import bz2 + +NAME = 'trec-ikat' + +QRELS_DEFS = { + 4: "Fully meets. The passage is a perfect answer for the turn. It includes all of the information needed to fully answer the turn in the conversation context. It focuses only on the subject and contains little extra information.", + 3: "Highly meets. The passage answers the question and is focused on the turn. It would be a satisfactory answer if Google Assistant or Alexa returned this passage in response to the query. It may contain limited extraneous information.", + 2: "Moderately meets. The passage answers the turn, but is focused on other information that is unrelated to the question. The passage may contain the answer, but users will need extra effort to pick the correct portion. The passage may be relevant, but it may only partially answer the turn, missing a small aspect of the context.", + 1: "Slightly meets. The passage includes some information about the turn, but does not directly answer it. 
Users will find some useful information in the passage that may lead to the correct answer, perhaps after additional rounds of conversation (better than nothing).", + 0: "Fails to meet. The passage is not relevant to the question. The passage is unrelated to the target query.", +} + +QRELS_PTKB_DEFS = { + 1: "Relevant", + 0: "Irrelevant", +} + +class iKATDocs(BaseDocs): + def __init__(self, dlc): + super().__init__() + self._dlc = dlc + + @ir_datasets.util.use_docstore + def docs_iter(self): + for chunck_file in os.listdir(self._dlc.path()): + if ".bz2" not in chunck_file: + continue + chunck = os.path.join(self._dlc.path(), chunck_file) + + with bz2.open(chunck, "rt") as bzinput: + for line in bzinput: + data = json.loads(line) + yield GenericDoc(data['id'], data['contents']) + + def docs_cls(self): + return GenericDoc + + def docs_store(self, field='doc_id'): + return PickleLz4FullStore( + path=f'{ir_datasets.util.home_path()/NAME}/doc.pklz4', + init_iter_fn=self.docs_iter, + data_cls=self.docs_cls(), + lookup_field=field, + index_fields=['doc_id'] + ) + + def docs_count(self): + if self.docs_store().built(): + return self.docs_store().count() + + +class iKATQuery(NamedTuple): + query_id: str + topic_title: str + topic_ptkb: Dict[str, str] + topic_number: int + turn_id: int + utterance: str + resolved_utterance: str + response: str + def default_text(self): + """ + raw_utterance + """ + return self.utterance + + +class iKATQueries(BaseQueries): + def __init__(self, dlc_list): + super().__init__() + self._dlc_list = dlc_list + + def queries_iter(self): + for _dlc in self._dlc_list: + with _dlc.stream() as stream: + topics = json.load(stream) + for topic in topics: + topic_number = topic['number'] + topic_title = topic['title'] + topic_ptkb = topic['ptkb'] + for turn in topic['turns']: + turn_id = turn['turn_id'] + yield iKATQuery(f'{topic_number}_{turn_id}', topic_title, topic_ptkb, topic_number, turn_id, turn['utterance'], turn['resolved_utterance'], 
turn['response']) + + def queries_namespace(self): + return NAME + + +# An initialization function is used to keep the namespace clean +def _init(): + base_path = ir_datasets.util.home_path() / NAME + documentation = YamlDocumentation(f'docs/{NAME}.yaml') + dlc = DownloadConfig.context(NAME, base_path) + + base = Dataset(documentation('_')) + subsets = {} + + docs = iKATDocs(dlc['docs']) + queries = iKATQueries([dlc['train_queries'], dlc['test_queries']]) + qrels = TrecQrels(dlc['qrels'], QRELS_DEFS) + subsets['2023'] = Dataset(docs, queries, qrels, documentation('2023')) + + judged_queries = iKATQueries([dlc['test_queries']]) + subsets['2023/judged'] = Dataset(docs, judged_queries, qrels, documentation('2023/judged')) + + ptkb = TrecQrels(dlc['ptkb'], QRELS_PTKB_DEFS) + subsets['2023/judged/ptkb'] = Dataset(judged_queries, ptkb, documentation('2023/judged/ptkb')) + + for s in subsets: + ir_datasets.registry.register(f'{NAME}/{s}', subsets[s]) + + return base, subsets + +base, subsets = _init() \ No newline at end of file diff --git a/ir_datasets/docs/trec_ikat23.yaml b/ir_datasets/docs/trec_ikat23.yaml new file mode 100644 index 0000000..9388c81 --- /dev/null +++ b/ir_datasets/docs/trec_ikat23.yaml @@ -0,0 +1,53 @@ +_: + pretty_name: 'TREC iKAT' + desc: ' +

+A conversational passage ranking resource from the TREC iKAT task. Relevance judgments come +from NIST assessors, based on the TREC iKAT runs. +

+' + +2023: + desc: ' +

+Official TREC iKAT Collection from 2023. +

+

+The passage collection contains 116,838,987 passages extracted from ClueWeb22-B; +follow the iKAT guidelines to obtain the licence and access to the dataset +(iKAT guidelines link). +

+

+Queries and qrels are freely accessible. Queries are the flattened conversations, where each query_id combines +a topic number and a turn id (e.g., 9-1_3). +

+' + data_access: ' +

+To use this dataset, you need a local copy of the processed passage corpus, provided by +the iKAT organizers. To get this copy, first apply for the ClueWeb licence on +https://lemurproject.org/clueweb22/obtain.php and send the form to CMU for approval (jlm4@andrew.cmu.edu). +Once you have the licence, send an email to Andrew Ramsay to request access to a +download link for the preprocessed iKAT collection. +

+Download the 16 subfiles [Passages (JSONL)] and place (or symlink) them, still bz2-compressed, in the folder +~/.ir_datasets/trec-ikat/TREC-Ikat-CW22-passage/. +

+' + +2023/judged: + desc: ' +

+Subset of queries with assessed relevance (the test topics). +

+' + +2023/judged/ptkb: + desc: ' +

+PTKB: Relevance for the PTKB (Personal Textual Knowledge Base). +

+

+Each topic contains a list of PTKB statements; within the conversation, each turn has one or more relevant PTKB statements. +

+' diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index 7aaa26c..7304a67 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -5790,7 +5790,32 @@ "cache_path": "2020/2020qrels.txt" } }, - + + "trec-ikat": { + "docs": { + "instructions": "Please download the dataset. Instructions: .\nTo proceed, symlink the processed passages source files here: {path}", + "cache_path": "TREC-Ikat-CW22-passage/" + }, + "test_queries": { + "url": "https://docs.google.com/uc?export=download&id=1zPSiAqLmbx9QFGm6walnuMUl7xoJmRB7", + "expected_md5": "684fa0197cdec8c3cfb6a2e586ab83f6", + "cache_path": "2023_test_topics.json" + }, + "qrels": { + "url": "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt", + "expected_md5": "79dc121bab25b2245e52a53263e5ad1f", + "cache_path": "2023-qrels.all-turns.txt" + }, + "train_queries": { + "url": "https://docs.google.com/uc?export=download&id=1sNHmVYO9PVG2kFxLscPGhN-uCCUuDAu9", + "cache_path": "2023_train_topics.json" + }, + "ptkb": { + "url": "https://trec.nist.gov/data/ikat/2023-ptkb-qrels.txt", + "cache_path": "2023-ptkb-qrels.txt" + } + }, + "trec-mandarin": { "docs": { "instructions": "The dataset is based on the TREC Mandarin corpus. 
It is available from the LDC via: \nMore details about the procedure can be found here: .\nTo proceed, symlink the source file here: {path}", diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 60bd7c8..84884f4 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -645,6 +645,9 @@ "trec-fair/2021/train": {"docs": {"_ref": "trec-fair/2021"}, "queries": {"count": 57}, "qrels": {"count": 2185446, "fields": {"relevance": {"counts_by_value": {"1": 2185446}}}}}, "trec-fair/2022": {"docs": {"count": 6475537, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair/2022/train": {"docs": {"_ref": "trec-fair/2022"}, "queries": {"count": 50}, "qrels": {"count": 2088306, "fields": {"relevance": {"counts_by_value": {"1": 2088306}}}}}, + "trec-ikat/2023": {"docs": {"count": 116838987, "fields": {"doc_id": {"max_len": 29, "common_prefix": "clueweb22-en00"}}}, "queries": {"count": 427}, "qrels": {"count": 34183, "fields": {"relevance": {"counts_by_value": {"0": 25467, "2": 2733, "3": 1542, "1": 4131, "4": 310}}}}}, + "trec-ikat/2023/judged": {"docs": {"_ref": "trec-ikat/2023"}, "queries": {"count": 332}, "qrels": {"_ref": "trec-ikat/2023"}}, + "trec-ikat/2023/judged/ptkb": {"queries": {"_ref": "trec-ikat/2023/judged"}, "qrels": {"count": 1030, "fields": {"relevance": {"counts_by_value": {"0": 806, "1": 224}}}}}, "trec-mandarin": {"docs": {"count": 164789, "fields": {"doc_id": {"max_len": 22, "common_prefix": ""}}}}, "trec-mandarin/trec5": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 28}, "qrels": {"count": 15588, "fields": {"relevance": {"counts_by_value": {"0": 13406, "1": 2182}}}}}, "trec-mandarin/trec6": {"docs": {"_ref": "trec-mandarin"}, "queries": {"count": 26}, "qrels": {"count": 9236, "fields": {"relevance": {"counts_by_value": {"1": 2958, "0": 6278}}}}}, diff --git a/test/integration/trec_ikat23.py b/test/integration/trec_ikat23.py new file mode 100644 index 0000000..0a01808 --- /dev/null 
+++ b/test/integration/trec_ikat23.py @@ -0,0 +1,45 @@ +import re +import unittest +from ir_datasets.formats import TrecQrel, GenericDoc +from .base import DatasetIntegrationTest +from ir_datasets.datasets.trec_ikat23 import iKATQuery + +class TestTrecIkat(DatasetIntegrationTest): + def test_trec_ikat_2023_queries(self): + self._test_queries('trec-ikat/2023', count=427, items={ + 0: iKATQuery(query_id='1-1_1', topic_title='Finding a university', topic_ptkb={'1': 'I graduated from Tilburg university.', '2': 'I live in the Netherlands.', '3': "I'm allergic to peanuts.", '4': 'I worked as a web developer for 2 years.', '5': "I have a bachelor's degree in computer science.", '6': 'I like Indian food.', '7': "My bachelor's GPA is 5.6.", '8': "I'm 26 years old.", '9': 'My TOEFL SCORE is 91.', '10': 'My interesting bachelor courses are data structure, algorithm, data mining, and artificial intelligence.', '11': "I didn't like computer architecture and logical circuits courses."}, topic_number='1-1', turn_id=1, utterance="I want to start my master's degree, can you help me with finding a university?", resolved_utterance="I want to start my master's degree, can you help me with finding a university?", response="Do you want to continue your bachelor's studies and obtain a degree in computer science?"), + 9: iKATQuery(query_id='1-2_2', topic_title='Finding a university', topic_ptkb={'1': "I don't like crazy cold weather.", '2': "I don't have a driver's license.", '3': 'I plan to move to Canada.', '4': "I'm from the Netherlands.", '5': "I'm used to heavy rains in the Netherlands.", '6': 'I graduated from UvA.', '7': "I have bachelor's degree in computer science.", '8': 'I speak English fluently.'}, topic_number='1-2', turn_id=2, utterance='Yes, I want to pursue the same major. Can you tell me the name of the best universities?', resolved_utterance='Yes, I want to pursue computer science. 
Can you tell me the name of the best computer science universities in Canada?', response='Here are the top universities for computer science in Canada: 1) University of British Columbia, 2) University of Alberta, 3)Concordia University, 4) Simon Fraser University, 5)The University of Toronto'), + 243: iKATQuery(query_id='15-1_2', topic_title='Find a movie', topic_ptkb={'1': 'My sister is following the "West World", but I don\'t like it.', '2': 'Johnny Depp made the Pirates of the Caribbean excellent.', '3': 'My friend suggested to me the "Now you see me" movie, it was fantastic.', '4': 'I went on a biking trip last year.', '5': 'I usually like to drink coffee in the morning.', '6': 'I watched the proposal and enjoyed it. Ryan Reynolds is my favorite!.', '7': 'The "Friends" series was terrific, Jennifer Aniston and Lisa Kudrow were the best stars!.'}, topic_number='15-1', turn_id=2, utterance='What is it?', resolved_utterance='What is the Academy Awards?', response="The Academy Awards is a once-a-year event put on by the Academy of Motion Pictures Art and Sciences (AMPAS). Typically, the Academy, which is manufactured up of several elite filmmakers in addition to film professionals, votes about the winners for every single category ahead of each year's ceremony. it is a prestigious award that recognizes personalities with artistic and technical merit in the movie industry. 
It was founded in 1927, but the presentation was first started in 1929, and winners since then receive a gold-plated statuette commonly called Oscar.") + }) + + def test_trec_ikat_2023_qrels(self): + self._test_qrels('trec-ikat/2023', count=34183, items={ + 0: TrecQrel(query_id='9-1_3', doc_id='clueweb22-en0000-00-00251:12', relevance=0, iteration='0'), + 9: TrecQrel(query_id='9-1_3', doc_id='clueweb22-en0003-65-04965:0', relevance=3, iteration='0'), + 243: TrecQrel(query_id='9-1_4', doc_id='clueweb22-en0015-14-11947:6', relevance=0, iteration='0') + }) + + def test_trec_ikat_2023_judged_queries(self): + self._test_queries('trec-ikat/2023/judged', count=332, items={ + 0: iKATQuery(query_id='9-1_1', topic_title='Finding a diet', topic_ptkb={'1': "I don't like the new spin-off; because I keep comparing the two and it has lower quality.", '2': 'Because of my kidney problem, I have to drink water frequently to stay hydrated.', '3': "I'm going to change my phone.", '4': "I can't exercise too much because of the heart problem that I have.", '5': "I'm vegetarian.", '6': "I'm lactose intolerant.", '7': "I'm allergic to soybeans.", '8': 'I just finished watching the Game of Thrones.', '9': "I didn't like how the series ended, especially the war scenes.", '10': "I'm an Android user."}, topic_number='9-1', turn_id=1, utterance='Can you help me find a diet for myself?', resolved_utterance="Can you help me find a diet for myself considering that I'm vegetarian, allergic to soybeans, lactose intolerant, can't exercise too much, and should drink water regularly?", response='Sure, these diets fit your condition and preference: 1) Vegan Mediterranean combines the principles of the Mediterranean diet with veganism and emphasizes whole grains, vegetables, beans, nuts, seeds, herbs, and spices. It excludes animal foods while incorporating plant-based alternatives, offering a sustainable and compassionate approach. 
(2) Vegan Keto: is a high-fat, low-carbohydrate, and moderate-protein diet which is also exclusively plant-based. It suits individuals following a vegan lifestyle who wish to adopt a ketogenic eating plan. (3) Eco-Atkins is a vegetarian variation of the Atkins diet which focuses on whole grains, vegetables, tofu, legumes, nuts, and seeds while minimizing animal products. By reducing carbohydrate intake and incorporating plant-based protein sources, it promotes sustainability and healthier eating. (4) The Ornish Diet (with limited exercise) is designed to improve overall health, support weight loss, and reverse cardiovascular disease progression, the Ornish diet is low in fat. It emphasizes exercise, stress management, and healthy relationships alongside a low-fat eating plan. It is worth noting that vegan diets, in general, have been linked to a lower risk of chronic health conditions, including heart disease. These diet options provide alternatives for individuals with specific dietary preferences and goals.'), + 9: iKATQuery(query_id='9-2_4', topic_title='Finding a diet', topic_ptkb={'1': "I'm quite into cars.", '2': "I'm having a social dinner with colleagues tonight.", '3': 'Traveling to Florida next week.', '4': "I'm 155 cm tall.", '5': "I have gained weight and now I'm 100 Kg.", '6': 'I have a knee injury.', '7': 'I had a big fight with my wife recently.', '8': "I'm American.", '9': "I'm a beer guy.", '10': "I'm a 56 years old.", '11': "I don't like surgical solutions."}, topic_number='9-2', turn_id=4, utterance="Ok, I didn't expect it, now what can I do to lose some weight?", resolved_utterance="Ok, I didn't expect it, now what can I do to lose some weight?", response='There are surgical methods you might not like and nonsurgical methods like healthy diet and exercise. 
Which one do you prefer?'), + 243: iKATQuery(query_id='17-3_16', topic_title='Buying a phone', topic_ptkb={'1': "I'm going to replace my glasses soon.", '2': 'I have an old iPhone.', '3': 'I value ethical consumption and am mindful about the products I buy.', '4': 'I live and work in the UK.', '5': "I don't want to spend a lot of money on a new phone.", '6': 'I went on a trip to Switzerland in the summer.', '7': "I'm interested in the UK heritage.", '8': "I've read a lot about colonialism.", '9': 'I am specifically interested in the processor speed and camera quality of my new phone.', '10': 'I usually buy secondhand.', '11': "I'm traveling to the US for my vacation.", '12': "I'm a fan of Greta Thunberg."}, topic_number='17-3', turn_id=16, utterance='Sorry, that was not what I was looking for. I mean which country is better?', resolved_utterance='Sorry, that was not what I was looking for. I mean which of the US or UK is better for buying a refurbished iPhone??', response="I apologize for the confusion. Based on my web search results, the average price of an iPhone 8 Plus in the US is around $209. In the UK, you can find an iPhone 8 Plus for around PS149 ($188) on Amazon UK. So according to your preferences, buying the phone in the UK might be better for you. However, prices may vary depending on the condition of the phone (new or refurbished), storage capacity, and color. It's always a good idea to compare prices from different retailers to find the best deal. 
Is there anything else you would like to know?") + }) + + def test_trec_ikat_2023_judged_ptkb_qrels(self): + self._test_qrels('trec-ikat/2023/judged/ptkb', count=1030, items={ + 0: TrecQrel(query_id='9-1_3', doc_id='1', relevance=0, iteration='0'), + 9: TrecQrel(query_id='9-1_3', doc_id='9', relevance=0, iteration='0'), + 243: TrecQrel(query_id='10-3_9', doc_id='13', relevance=0, iteration='0') + }) + + def test_trec_ikat_2023_docs(self): + self._test_docs('trec-ikat/2023', count=116838987, items={ + 0: GenericDoc(doc_id='clueweb22-en0004-50-00555:0', text='Saddleback Church: Events: Light of the World: Christmas Lights at Saddleback! Home > Events > Light of the World: Christmas Lights at Saddleback! Date & Time Date Monday, December 13, 2021 Time 6:00 PM - 7:00 PM Location Campus Lake Forest Venue Central Park Street Address 1 Saddleback Pkwy Lake Forest, CA 92630 Cost Free Event Contact registration@saddleback.com +19496098000 Light of the World: Christmas Lights at Saddleback! Description Feel the magic that comes with being together during Saddleback’s annual Christmas light experience! During this walk-through event, share the joy of community while sipping on hot chocolate, strolling through Christmas-themed lands, and snapping photos of the illuminated night sky. Invite your friends and family to enjoy the festive spirit of Christmas at Saddleback this season!'), + 9: GenericDoc(doc_id='clueweb22-en0004-50-00599:8', text="This movie is considered a classic now but my girls and I enjoy it very much. I'm so happy they put it on DVD. Look for the part where he is in the tub trying on the crown. It's hilarious! I highly recommend this one. The quality is nice and clear and the colors are vivid. Make sure you order one made for the US. Enjoy! 
6 people found this helpful Report abuse See all reviews Amazon Music Stream millions of songs Amazon Advertising Find, attract, and engage customers Amazon Drive Cloud storage from Amazon 6pm Score deals on fashion brands AbeBooks Books, art & collectibles ACX Audiobook Publishing Made Easy Alexa Actionable Analytics for the Web Sell on Amazon Start a Selling Account Amazon Business Everything For Your Business Amazon Fresh Groceries & More Right To Your Door AmazonGlobal Ship Orders Internationally Home Services Experienced Pros Happiness Guarantee Amazon Ignite Sell your original Digital Educational Resources Amazon Web Services Scalable Cloud Computing Services Audible Listen to Books & Original Audio Performances Book Depository Books With Free Delivery Worldwide Box Office Mojo Find Movie Box Office Data ComiXology Thousands of Digital Comics DPReview Digital Photography Fabric Sewing, Quilting & Knitting Goodreads Book reviews & recommendations IMDb Movies, TV & Celebrities IMDbPro Get Info Entertainment Professionals Need Kindle Direct Publishing Indie Digital & Print Publishing Made Easy Amazon Photos Unlimited Photo Storage Free With Prime Prime Video Direct Video Distribution Made Easy Shopbop Designer Fashion Brands Amazon Warehouse Great Deals on Quality Used Products Whole Foods Market America’s Healthiest Grocery Store Woot! Deals and Shenanigans Zappos Shoes & Clothing Ring Smart Home Security Systems eero WiFi Stream 4K Video in Every Room Blink Smart Security for Every Home Neighbors App Real-Time Crime & Safety Alerts Amazon Subscription Boxes Top subscription boxes – right to your door PillPack Pharmacy Simplified Amazon Renewed Like-new products you can trust"), + 243: GenericDoc(doc_id='clueweb22-en0004-50-03089:8', text="Lesson 11: Keeping your data in check with the Replit database Now that you are starting to build larger and more complicated applications, it is time to start using databases to keep your data clean and secure. 
Lesson 12: Repl audio - control (or create) your music with code Find, download, play, and control the volume of your music, all in code. If that's not enough, create your own music too. This is the part where you realize that the possibilities are endless while you learn how to control your music with code. Part 2 - Build your own projects Lesson 13: Steganography - hiding messages in images Share secret messages with your friends by hiding them inside images with steganography, Python and Replit. Lesson 14: 2D platform game with PyGame and Replit Build a fun 2D platform game while learning all about Python game development. It's is easily expandable with endless possibilities so let your imagination run wild! Lesson 15: Building a Chat App with Node.js and Replit Build a Replit-to-Replit chat app - one server will coordinate between multiple clients so you can chat oldschool style (in the terminal) with your friends. Lesson 16: Build a static site generator in Python Build your own static site generator to build a custom blog from scratch. Lesson 17: Build a predictive text engine Build an autocomplete bot that can generate its own sentences.") + }) + + +if __name__ == '__main__': + unittest.main()