32 changes: 19 additions & 13 deletions README.md
@@ -21,30 +21,36 @@ pip install ir-datasets-longeval

## Usage

Using this extension is simple. Just register the additional datasets by calling `register()`. Then you can load the datasets with [ir_datasets](https://ir-datasets.com/python.html) as usual:
The `ir_datasets_longeval` extension provides a `load` method that returns a LongEval `ir_dataset`, letting you load the official versions of the LongEval datasets as well as modified versions that you have on your local filesystem:

```python
from ir_datasets import load
from ir_datasets_longeval import register
from ir_datasets_longeval import load

# Register the longeval datasets.
register()
# Use ir_datasets as usual.
# Load an official version of the LongEval dataset.
dataset = load("longeval-web/2022-06")

# Load a local copy of a LongEval dataset.
# E.g., so that you can easily run your approach on modified data.
dataset = load("<PATH-TO-A-DIRECTORY-ON-YOUR-MACHINE>")

# From now on, you can use dataset as any ir_dataset
```
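
The returned dataset then behaves like any other `ir_dataset`. For example (a minimal sketch; it assumes the queries follow the default `GenericQuery` layout with `query_id` and `text` fields):

```python
from ir_datasets_longeval import load

dataset = load("longeval-web/2022-06")

# Standard ir_datasets accessors work as usual.
for query in dataset.queries_iter():
    print(query.query_id, query.text)
```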

You can also register only the `longeval-web` or `longeval-sci` dataset:
LongEval datasets provide a set of temporal properties that you can use:

```python
from ir_datasets import load
from ir_datasets_longeval import register

# Register the longeval datasets.
register("longeval-web")
# At what point in time does/did the dataset take place?
dataset.get_timestamp()

# Each dataset can have a list of zero or more past datasets/interactions.
# You can incorporate them in your retrieval system:
for past_dataset in dataset.get_past_datasets():
# `past_dataset` is a LongEval `ir_dataset` with the same functionality as `dataset`
past_dataset.get_timestamp()
```
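
For example, you could pool the relevance judgments of all past sub-collections as training data (a minimal sketch; `has_qrels()` and `qrels_iter()` are standard ir_datasets accessors, and the sketch assumes the past datasets ship qrels):

```python
# Collect historic relevance judgments, e.g., to train a
# retrieval model for the current snapshot.
historic_qrels = []
for past_dataset in dataset.get_past_datasets():
    if past_dataset.has_qrels():
        historic_qrels.extend(past_dataset.qrels_iter())
```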


If you want to use the [CLI](https://ir-datasets.com/cli.html), just use the `ir_datasets_longeval` instead of `ir_datasets`. All CLI commands will work as usual, e.g., to list the available datasets:
If you want to use the [CLI](https://ir-datasets.com/cli.html), just use `ir_datasets_longeval` instead of `ir_datasets`. All CLI commands will work as usual, e.g., to list the officially available datasets:

```shell
ir_datasets_longeval list
```
34 changes: 32 additions & 2 deletions ir_datasets_longeval/__init__.py
@@ -1,9 +1,39 @@
from ir_datasets import main_cli as irds_main_cli
from ir_datasets import main_cli as irds_main_cli, registry as irds_registry

from ir_datasets_longeval.longeval_sci import register as register_longeval_sci
from ir_datasets_longeval.longeval_sci import register as register_longeval_sci, LongEvalSciDataset
from ir_datasets_longeval.longeval_web import register as register_longeval_web
from typing import Union
from pathlib import Path


def load(longeval_ir_dataset: Union[str, Path]):
    """Load a LongEval ir_dataset. Can point to the official ID of a LongEval dataset or to a local directory with the same structure.

    Args:
        longeval_ir_dataset (Union[str, Path]): the ID of a LongEval ir_dataset or a local path.
    """
    if longeval_ir_dataset is None:
        raise ValueError('Please pass either a string or a Path.')

    # Normalize to str so that Path arguments also support the prefix checks below.
    longeval_ir_dataset = str(longeval_ir_dataset)

    if longeval_ir_dataset.startswith("longeval-sci"):
        register_longeval_sci()
    if longeval_ir_dataset.startswith("longeval-web"):
        register_longeval_web()

    exists_locally = bool(longeval_ir_dataset) and Path(longeval_ir_dataset).is_dir()
    exists_in_irds = longeval_ir_dataset in irds_registry and irds_registry[longeval_ir_dataset]

    if exists_locally and exists_in_irds:
        raise ValueError(f'The passed {longeval_ir_dataset} is ambiguous, as it is both a valid official ir_datasets ID and a local directory.')

    if exists_locally:
        return LongEvalSciDataset(Path(longeval_ir_dataset))

    if exists_in_irds:
        return irds_registry[longeval_ir_dataset]

    raise ValueError(f'I could not find a dataset with the ID {longeval_ir_dataset}.')
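
# Example usage (a sketch; the local path below is hypothetical):
#   load("longeval-web/2022-06")          # an official LongEval dataset ID
#   load(Path("/data/my-longeval-copy"))  # a local directory in the LongEval format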

def register(dataset=None) -> None:
    if dataset:
        dataset = dataset.split("/")[0]
1 change: 1 addition & 0 deletions ir_datasets_longeval/downloads.json
@@ -8,6 +8,7 @@
},
"longeval-sci": {
"longeval_sci_training_2025": {
"url": "https://researchdata.tuwien.ac.at/records/r643n-yc044/files/longeval_sci_training_2025_abstract.zip?download=1&preview=1&token=eyJhbGciOiJIUzUxMiJ9.eyJpZCI6IjcwM2Y4MzQ0LTFlMDEtNDYxNy1iNDc4LTI5MmQ5MzYwNTU3NyIsImRhdGEiOnt9LCJyYW5kb20iOiI4NjYxMWFkODQzNDk2ZDk0NzllMDNlOWIyYWM1Zjc4NCJ9.YhnRV6WzWfQiuLQcGyTrA3gyI_5UBe9rtUAV6qKk5U7tqGEmD4NUdyfjGo2-U7tnBIlD7iTwUUDi0nw3GcXPmA",
"instructions": "TBD",
"cache_path": "longeval_sci_training_2025.zip"
}
152 changes: 95 additions & 57 deletions ir_datasets_longeval/longeval_sci.py
@@ -4,7 +4,12 @@
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset
from ir_datasets.formats import JsonlDocs, TrecQrels, TsvQueries
from ir_datasets.util import RelativePath, ZipExtractCache, home_path
from ir_datasets.util import ZipExtractCache, home_path
from datetime import datetime

import contextlib
import json
from pathlib import Path

from ir_datasets_longeval.util import DownloadConfig, YamlDocumentation

@@ -15,23 +20,21 @@
0: "not relevant",
}
SUB_COLLECTIONS = ["2024-11"]
MAPPING = (
    {
        "doc_id": "id",
        "title": "title",
        "abstract": "abstract",
        "authors": "authors",
        "createdDate": "createdDate",
        "doi": "doi",
        "arxivId": "arxivId",
        "pubmedId": "pubmedId",
        "magId": "magId",
        "oaiIds": "oaiIds",
        "links": "links",
        "publishedDate": "publishedDate",
        "updatedDate": "updatedDate",
    },
)
MAPPING = {
    "doc_id": "id",
    "title": "title",
    "abstract": "abstract",
    "authors": "authors",
    "createdDate": "createdDate",
    "doi": "doi",
    "arxivId": "arxivId",
    "pubmedId": "pubmedId",
    "magId": "magId",
    "oaiIds": "oaiIds",
    "links": "links",
    "publishedDate": "publishedDate",
    "updatedDate": "updatedDate",
}


class LongEvalSciDoc(NamedTuple):
@@ -50,55 +53,90 @@ class LongEvalSciDoc(NamedTuple):
    updatedDate: str

    def default_text(self):
        return self.title + self.abstract
        return self.title + " " + self.abstract

class ExtractedPath:
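    """Adapter exposing the ``stream()`` context manager that the ir_datasets
    format readers (JsonlDocs, TsvQueries, TrecQrels) expect, backed by a plain
    file on the local filesystem."""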
    def __init__(self, path):
        self._path = path

    @contextlib.contextmanager
    def stream(self):
        with open(self._path, 'rb') as f:
            yield f

class LongEvalSciDataset(Dataset):
    def __init__(self, base_path: Path, yaml_documentation: str = "longeval_sci.yaml", timestamp: Optional[str] = None, prior_datasets: Optional[List[str]] = None):
        documentation = YamlDocumentation(yaml_documentation)
        self.base_path = base_path

        if not base_path or not base_path.exists() or not base_path.is_dir():
            raise ValueError(f'Expected the directory {base_path} to exist, but it does not.')

        if not timestamp:
            timestamp = self.read_property_from_metadata("timestamp")

        self.timestamp = datetime.strptime(timestamp, "%Y-%m")

        if prior_datasets is None:
            prior_datasets = self.read_property_from_metadata("prior-datasets")

        self.prior_datasets = prior_datasets

        docs_path = base_path / 'documents'
        if not docs_path.exists() or not docs_path.is_dir():
            raise ValueError(f'Expected the directory {docs_path} to exist, but it does not.')

        jsonl_doc_files = os.listdir(docs_path)
        if len(jsonl_doc_files) == 0:
            raise ValueError(f'The directory {docs_path} has no jsonl files. This is likely an error.')

        docs = JsonlDocs(
            [ExtractedPath(base_path / "documents" / split) for split in jsonl_doc_files],
            doc_cls=LongEvalSciDoc,
            docstore_path=f"{docs_path}/docstore.pklz4",
            mapping=MAPPING,
        )

        queries_path = base_path / 'queries.txt'
        if not queries_path.exists() or not queries_path.is_file():
            raise ValueError(f'Expected the file {queries_path} to exist, but it does not.')

        queries = TsvQueries(ExtractedPath(queries_path))
        qrels = None
        qrels_path = base_path / 'qrels.txt'

        if qrels_path.exists() and qrels_path.is_file():
            qrels = TrecQrels(ExtractedPath(qrels_path), QREL_DEFS)

        super().__init__(docs, queries, qrels, documentation)

    def get_timestamp(self):
        return self.timestamp

    def get_past_datasets(self):
        return [LongEvalSciDataset(self.base_path / i) for i in self.prior_datasets]

    def read_property_from_metadata(self, property):
        with open(self.base_path / "metadata.json", "r") as f:
            return json.load(f)[property]
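
# For orientation: a minimal sketch of the directory layout that
# LongEvalSciDataset expects, as implied by the constructor above
# (the directory name is hypothetical):
#
#   my-longeval-dataset/
#       metadata.json    # e.g. {"timestamp": "2024-11", "prior-datasets": []}
#       documents/       # one or more *.jsonl files with LongEvalSciDoc fields
#       queries.txt      # TSV: <query_id> <TAB> <query text>
#       qrels.txt        # TREC qrels: <query_id> <iteration> <doc_id> <relevance>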


def register():
    if NAME in registry:
    if f'{NAME}/2024-11/train' in registry:
        # Already registered.
        return
    documentation = YamlDocumentation("longeval_sci.yaml")

    base_path = home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation("_"))
    base_path = home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    training_2025_data_cache = ZipExtractCache(
    data_path = ZipExtractCache(
        dlc["longeval_sci_training_2025"], base_path / "longeval_sci_training_2025"
    )
    docs_path = training_2025_data_cache.path() / "longeval_sci_training_2025/documents"
    collection_2024_11 = JsonlDocs(
        [
            RelativePath(
                training_2025_data_cache,
                f"longeval_sci_training_2025/documents/{split}",
            )
            for split in os.listdir(docs_path)
        ],
        doc_cls=LongEvalSciDoc,
        docstore_path=f"{docs_path}.pklz4",
        mapping=MAPPING,
    )
    ).path()

    subsets = {}
    subsets["2024-11"] = Dataset(collection_2024_11, documentation("2024-11"))

    subsets["2024-11/train"] = Dataset(
        collection_2024_11,
        TsvQueries(
            RelativePath(
                training_2025_data_cache, "longeval_sci_training_2025/queries.txt"
            )
        ),
        TrecQrels(
            RelativePath(
                training_2025_data_cache, "longeval_sci_training_2025/qrels.txt"
            ),
            QREL_DEFS,
        ),
        documentation("2024-11/train"),
    )

    registry.register(NAME, base)

    subsets["2024-11/train"] = LongEvalSciDataset(data_path, "2024-11/train", "2024-11", [])

    for s in sorted(subsets):
        registry.register(f"{NAME}/{s}", subsets[s])
Empty file added tests/__init__.py
@@ -0,0 +1 @@
doc_id title abstract authors createdDate doi arxivId pubmedId magId oaiIds links publishedDate updatedDate
@@ -0,0 +1 @@
9 4
@@ -0,0 +1,2 @@
{"id": "77444382", "title": "Pelan strategik untuk menangani isu dan cabaran dalam pengurusan zakat perniagaan di Kedah", "abstract": "Sesebuah organisasi memerlukan strategi yang berkesan untuk bersaing dengan jayanya di\n\npasaran. Ini kerana persekitaran berubah dengan pantas dan menjadi lebih berdaya saing. Untuk mengatasi perubahan perlu dapat membangunkan strategi yang berkesan. Pelan strategik adalah kerangka kerja untuk melaksanakan pemikiran, arah tuju dan tindakan strategik yang membawa kepada pencapaian hasil yang konsisten dan dirancang. Dalam konteks zakat, dengan pendekatan strategik salah satunya seperti Pengurusan Perhubungan\n\nPelanggan dapat meningkatkan pengumpulan dana zakat sehingga dapat memberi beberapa kelebihan seperti kesinambungan kepuasan dan kesetiaan muzakki, pengedaran zakat yang berkesan dan pengurusan zakat dapat memberi lebih fokus terhadap visi, misi, dan objektif yang telah ditetapkan. Selain itu, tahap pengumpulan zakat yang rendah ini dipercayai dikaitkan dengan kaedah pengumpulan yang terhad dan perancangan strategi pemungutan dan pengumpulan dana yang lemah yang dilaksanakan oleh institusi zakat di kebanyakan negara Islam. Selain itu, kerana kekurangan penyelidikan tentang hubungan pematuhan pembayaran\n\nzakat dengan perancangan strategik, membenarkan mengapa penyelidik ingin menyelidiki kajian ini. Matlamat utama kajian ini adalah untuk mengukur tahap kepatuhan membayar zakat perniagaan di kalangan peniaga kecil ini dijalankan bagi meningkatkan dan memperbaiki pelan strategik pihak Lembaga Zakat Negeri Kedah dan meningkatkan hasil\n\nkutipan zakat. Hasil dari kajian ini menunjukkan tahap kesedaran membayar zakat perniagaan di kalangan peniaga kecil negeri Kedah berada di tahap sederhana. Sampel kajian terdiri daripada golongan peniaga Muslim di sekitar bandar Alor setar, Jitra, Changloon dan Sintok. Hasil kajian ini boleh dimanfaatkan oleh pelbagai pihak dalam usaha meningkatkan kutipan zakat pada masa akan datang. Bagi meningkatkan jumlah kepatuhan peniaga dalam membayar zakat perniagaan, pejabat zakat perlu menggiatkan lagi promosi melalui jariangan sosial media dan menggiatkan aktiviti keusahawanan zakat agar dapat membantu golongan yang memerlukan serta meningkatkan ekonomi negara", "authors": [{"name": "Abdul Rahman, Maria"}, {"name": "Nik Mat, Nik Kamariah"}, {"name": "Sulaiman, Yaty"}], "createdDate": "2020-01-04T19:09:45", "doi": null, "arxivId": null, "pubmedId": null, "magId": null, "oaiIds": ["oai:repo.uum.edu.my:26683"], "links": [{"type": "download", "url": "https://core.ac.uk/download/286034309.pdf"}, {"type": "reader", "url": "https://core.ac.uk/reader/286034309"}, {"type": "thumbnail_m", "url": "https://core.ac.uk/image/286034309/large"}, {"type": "thumbnail_l", "url": "https://core.ac.uk/image/286034309/large"}, {"type": "display", "url": "https://core.ac.uk/works/77444382"}], "publishedDate": "2019-01-01T00:00:00", "updatedDate": "2022-05-16T21:54:31"}
{"id": "140120179", "title": "Shareholder Litigation in Mergers and Acquisitions", "abstract": "Using hand-collected data, we examine the targeting of shareholder class action lawsuits in merger & acquisition (M & A) transactions, and the associations of these lawsuits with offer completion rates and takeover premia. We find that M & A offers subject to shareholder lawsuits are completed at a significantly lower rate than offers not subject to litigation, after controlling for selection bias, different judicial standards, major offer characteristics, M & A financial and legal advisor reputations as well as industry and year fixed effects. M & A offers subject to shareholder lawsuits have significantly higher takeover premia in completed deals, after controlling for the same factors. Economically, the expected rise in takeover premia more than offsets the fall in the probability of deal completion, resulting in a positive expected gain to target shareholders. However, in general, target stock price reactions to bid announcements do not appear to fully anticipate the positive expected gain from potential litigation. We find that during a merger wave characterized by friendly single-bidder offers, shareholder litigation substitutes for the presence of a rival bidder by policing low-ball bids and forcing offer price improvement by the bidder", "authors": [{"name": "Krishnan, C. N.V."}, {"name": "Masulis, Ronald W."}, {"name": "Thomas, Randall S."}], "createdDate": "2023-03-05T21:40:31", "doi": null, "arxivId": null, "pubmedId": null, "magId": null, "oaiIds": ["oai:scholarship.law.vanderbilt.edu:faculty-publications-2334"], "links": [{"type": "download", "url": "https://core.ac.uk/download/555356360.pdf"}, {"type": "reader", "url": "https://core.ac.uk/reader/555356360"}, {"type": "thumbnail_m", "url": "https://core.ac.uk/image/555356360/large"}, {"type": "thumbnail_l", "url": "https://core.ac.uk/image/555356360/large"}, {"type": "display", "url": "https://core.ac.uk/works/140120179"}], "publishedDate": "2012-01-01T08:00:00", "updatedDate": "2023-03-05T21:40:31"}
@@ -0,0 +1,2 @@
{"id": "44934830", "title": "The aesthetics of the tangible: haptic motifs and sensory contagion in gothic terror films", "abstract": null, "authors": [{"name": "Ramalho, JR"}], "createdDate": "2017-11-03T14:16:30", "doi": null, "arxivId": null, "pubmedId": null, "magId": null, "oaiIds": ["oai:eprints.ucl.ac.uk.oai2:1551521"], "links": [{"type": "download", "url": "https://core.ac.uk/download/111003174.pdf"}, {"type": "reader", "url": "https://core.ac.uk/reader/111003174"}, {"type": "thumbnail_m", "url": "https://core.ac.uk/image/111003174/large"}, {"type": "thumbnail_l", "url": "https://core.ac.uk/image/111003174/large"}, {"type": "display", "url": "https://core.ac.uk/works/44934830"}], "publishedDate": "2016-10-01T01:00:00", "updatedDate": "2021-05-30T13:15:20"}
{"id": "34195138", "title": "Pitfalls in the use of randomised controlled trials for fish oil studies with cardiac patients", "abstract": "Randomised controlled trials (RCT) examining the effects of fish oil supplementation on cardiac outcomes have yielded varying results over time. Although RCT are placed at the top of the evidence hierarchy, this methodology arose in the framework of pharmaceutical development. RCT with pharmaceuticals differ in important ways from RCT involving fish oil interventions. In particular, in pharmaceutical RCT, the test agent is present only in the intervention group and not in the control group, whereas in fish oil RCT, n-3 fats are present in the diet and in the tissues of both groups. Also, early phase studies with pharmaceuticals determine pharmacokinetics and pharmacodynamics to design the dose of the RCT intervention so that it is in a predicted linear dose-response range. None of this happens in fish oil RCT, and there is evidence that both baseline n-3 intake and tissue levels may be sufficiently high in the dose-response range that it is not possible to demonstrate a clinical effect with a RCT. When these issues are considered, it is possible that the changing pattern of fish consumption and fish oil use over time, especially in cardiac patients, can explain the disparity where benefit was observed in the early fish oil trials but not in the more recent trials.Michael J. James, Thomas R. Sullivan, Robert G. Metcalf and Leslie G. Clelan", "authors": [{"name": "Cleland, L."}, {"name": "James, M."}, {"name": "Metcalf, R."}, {"name": "Sullivan, T."}], "createdDate": "2017-03-08T19:18:45", "doi": "10.1017/s0007114514001408", "arxivId": null, "pubmedId": null, "magId": null, "oaiIds": ["oai:digital.library.adelaide.edu.au:2440/90480"], "links": [{"type": "display", "url": "https://core.ac.uk/works/34195138"}], "publishedDate": "2014-01-01T00:00:00", "updatedDate": "2023-02-16T01:18:14"}
@@ -0,0 +1 @@
{"timestamp": "2024-11", "prior-datasets": []}
@@ -0,0 +1 @@
1234-1234-1234-1234-1234 2024-11 140120179 2
@@ -0,0 +1 @@
1234-1234-1234-1234-1234 selection bias
@@ -0,0 +1 @@
{"timestamp": "2025-01", "prior-datasets": ["../example-local-dataset-no-prior-datasets/", "../example-local-dataset-no-prior-datasets/"]}
@@ -0,0 +1 @@
1 some cool query