Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from . import natural_questions
from . import nyt
from . import pmc
from . import pt_image_ir_dataset
from . import touche_image
from . import touche # must be after argsme,clueweb12,touche_image
from . import trec_arabic
Expand Down
120 changes: 120 additions & 0 deletions ir_datasets/datasets/pt_image_ir_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import ir_datasets

from ir_datasets.formats import TsvDocs
from ir_datasets.formats import TrecQrels
from ir_datasets.formats import TsvQueries
from ir_datasets.formats.tsv import _TsvBase
from ir_datasets.formats.base import BaseQueries
from ir_datasets.formats.base import GenericQuery

from .base import Dataset
from .base import YamlDocumentation

from ir_datasets.util import DownloadConfig

from typing import NamedTuple

NAME = "pt-image-ir-dataset"


class PtImageIrArticle(NamedTuple):
    """A news article from the Portuguese Presidency website.

    Parsed from a 6-column TSV row; field order must match the file's columns.
    """
    doc_id: str   # article identifier, e.g. "art001"
    url: str      # source URL of the article
    title: str    # article headline
    text: str     # article body (the TSV "content" column)
    date: str     # publication date string, e.g. "2023-12-01"
    images: str   # identifiers of the images associated with this article


class PtImageIrImage(NamedTuple):
    """An image document: an identifier plus its URL.

    The URL lives in ``text`` (ir_datasets' conventional content field name),
    mirroring the TSV "url" column.
    """
    doc_id: str  # image identifier, e.g. "img00001"
    text: str    # image URL (the TSV "url" column)


# TsvQueries variant that can skip a leading header row in the queries file.
class TsvQueriesWithHeader(TsvQueries):
    def __init__(
        self,
        queries_dlc,
        query_cls=None,
        namespace=None,
        lang=None,
        skip_first_line=False,
    ):
        # Bypass TsvQueries.__init__ (which does not accept skip_first_line)
        # and initialise the two bases directly, mirroring what TsvQueries
        # itself would do with the same arguments.
        _TsvBase.__init__(
            self,
            queries_dlc,
            GenericQuery if query_cls is None else query_cls,
            "queries",
            skip_first_line=skip_first_line,
        )
        BaseQueries.__init__(self)
        self._queries_lang = lang
        self._queries_namespace = namespace


# Meaning of each relevance level in the qrels (binary judgments).
QREL_DEFS = {
    1: "relevant - the image is relevant to the query",
    0: "not relevant - the image is not relevant to the query",
}

# Data Usage Agreement: shown to the user before any download starts.
# The dataset is distributed under CC BY 4.0.
DUA = (
    "This work is licensed under the Creative Commons Attribution 4.0 International License. "
    "To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/. "
    "By using this dataset, you agree to the terms and conditions of this license."
)


def _init():
    """Build, register, and return the pt-image-ir datasets.

    Returns a tuple ``(dataset, images_dataset)``: the article-based dataset
    (registered as ``pt-image-ir-dataset``) and the image-based variant
    (registered as ``pt-image-ir-dataset/images``). Both share the same
    queries and qrels.
    """
    # All cached files for this dataset live under this directory.
    base_path = ir_datasets.util.home_path() / NAME

    # Documentation entries come from the YAML file keyed by section name.
    docs = YamlDocumentation(f"docs/{NAME}.yaml")

    # Downloadable-content context; the DUA is shown before downloads begin.
    downloads = DownloadConfig.context(NAME, base_path, dua=DUA)

    # Articles: 6-column TSV with a header row to skip.
    # NOTE(review): count_hint (4678) disagrees with the integration test,
    # which expects 4743 articles — confirm against the published data.
    articles = TsvDocs(
        downloads["articles"],
        doc_cls=PtImageIrArticle,
        namespace=NAME,
        lang="pt",
        count_hint=4678,
        skip_first_line=True,
    )

    # Images: 2-column TSV (id, url) with a header row to skip.
    # NOTE(review): count_hint (42333) also disagrees with the test's 42920.
    images = TsvDocs(
        downloads["images"],
        doc_cls=PtImageIrImage,
        namespace=f"{NAME}/images",
        lang="pt",
        count_hint=42333,
        skip_first_line=True,
    )

    # Queries use the custom class so the header row is skipped as well.
    queries = TsvQueriesWithHeader(
        downloads["queries"], namespace=NAME, lang="pt", skip_first_line=True
    )

    # Relevance judgments are standard TREC-format qrels.
    qrels = TrecQrels(downloads["qrels"], QREL_DEFS)

    # Assemble the two dataset variants: article docs vs. image docs,
    # sharing queries and qrels.
    dataset = Dataset(articles, queries, qrels, docs("_"))
    images_dataset = Dataset(images, queries, qrels, docs("images"))

    # Make both datasets discoverable through the ir_datasets registry.
    ir_datasets.registry.register(NAME, dataset)
    ir_datasets.registry.register(f"{NAME}/images", images_dataset)

    return dataset, images_dataset


# Instantiate and register both datasets at import time.
dataset, images_dataset = _init()
23 changes: 23 additions & 0 deletions ir_datasets/docs/pt-image-ir-dataset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
_: # matches documentation key above
pretty_name: 'PT Image IR Dataset' # a more human-readable way to present this dataset than the dataset-id
desc: '
<p>
A Dataset for Image Information Retrieval in European Portuguese. The data is sourced from the
<a href="https://www.presidencia.pt/">Portuguese Presidency</a> website. It contains 4,678 articles, 42,333 images, and 80 queries related to the Portuguese Presidency. Over 5,000 images were annotated by three annotators.
</p>
<p>
The dataset includes:
</p>
<ul>
<li><strong>Articles:</strong> 4,678 articles with URL, title, content, date, and associated images</li>
<li><strong>Images:</strong> 42,333 images with URLs</li>
<li><strong>Queries:</strong> 80 queries in Portuguese created by the dataset authors</li>
<li><strong>Relevance judgments:</strong> Over 5,000 image-query relevance annotations (binary: relevant/not relevant)</li>
</ul>
<p>
The dataset was annotated manually by three annotators following specific annotation rules. The relevance judgments are in TREC format, where each line contains a query ID, a zero, an image ID, and a relevance score (0 or 1).
</p>
<ul>
<li><a href="https://github.com/LIAAD/pt-image-ir-dataset">Dataset Repository</a></li>
</ul>
'
22 changes: 22 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -6490,5 +6490,27 @@
"expected_md5": "49589ab65d1eaf78dbbadfc5ae56ef72",
"cache_path": "qrels.txt"
}
},
"pt-image-ir-dataset": {
"articles": {
"url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/articles.tsv",
"expected_md5": "ebbce9e470f683918d526b44849ad97c",
"cache_path": "articles.tsv"
},
"images": {
"url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/images.tsv",
"expected_md5": "eaefd26a5b2ba1e18c48715d3363d8a1",
"cache_path": "images.tsv"
},
"queries": {
"url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/queries.tsv",
"expected_md5": "2e094149f0ba84e2eb8d5dedc574c3e2",
"cache_path": "queries.tsv"
},
"qrels": {
"url": "https://raw.githubusercontent.com/LIAAD/pt-image-ir-dataset/main/data/qrels.txt",
"expected_md5": "53187432192e9989b913e5c3259322ca",
"cache_path": "qrels.txt"
}
}
}
103 changes: 103 additions & 0 deletions test/integration/pt_image_ir_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import unittest
import ir_datasets

from ir_datasets.formats import TrecQrel
from ir_datasets.formats import GenericQuery

from test.integration.base import DatasetIntegrationTest

from ir_datasets.datasets.pt_image_ir_dataset import PtImageIrImage


class TestPtImageIr(DatasetIntegrationTest):
    """Integration tests for pt-image-ir-dataset and its /images variant."""

    # Fixtures shared between the main dataset and the images-only variant,
    # which expose identical queries and qrels.
    _QUERY_ITEMS = {
        0: GenericQuery("q01", "Emoções de tristeza em rostos"),
        39: GenericQuery("q40", "Brexit"),
        79: GenericQuery("q80", "Algarve"),
    }
    _QREL_ITEMS = {
        0: TrecQrel("q01", "img40494", 0, "0"),
        2600: TrecQrel("q40", "img22242", 0, "0"),
        5200: TrecQrel("q80", "img24820", 1, "0"),
    }

    def test_articles(self):
        # 4743 articles expected (header row excluded); spot-check the first.
        articles = list(ir_datasets.load("pt-image-ir-dataset").docs_iter())
        self.assertEqual(len(articles), 4743)

        first = articles[0]
        self.assertEqual(first.doc_id, "art001")
        self.assertEqual(first.date, "2023-12-01")
        self.assertTrue(first.title.startswith("Comemorações"))
        self.assertTrue(first.text.startswith("O Presidente"))
        self.assertTrue("img00001" in first.images)

    def test_images(self):
        # 42920 images expected; check start, middle, and end entries.
        self._test_docs(
            "pt-image-ir-dataset/images",
            count=42920,
            items={
                0: PtImageIrImage(
                    doc_id="img00001",
                    text="https://www.presidencia.pt/media/bspfpsfp/231201-prmrs-mfl-0461-4542.jpg",
                ),
                21459: PtImageIrImage(
                    doc_id="img21460",
                    text="https://www.presidencia.pt/media/c5wbrtqc/191219-prmrs-ro-0017-8746.jpg",
                ),
                42919: PtImageIrImage(
                    doc_id="img42920",
                    text="https://www.presidencia.pt/media/dw1kvy3f/170602-prmrs-ro-0002-1624.jpg",
                ),
            },
        )

    def test_queries(self):
        # 80 queries expected (header row excluded).
        self._test_queries("pt-image-ir-dataset", count=80, items=self._QUERY_ITEMS)

    def test_qrels(self):
        # 5201 relevance judgments expected.
        self._test_qrels("pt-image-ir-dataset", count=5201, items=self._QREL_ITEMS)

    def test_images_queries(self):
        # The images-only variant reuses the same queries.
        self._test_queries(
            "pt-image-ir-dataset/images", count=80, items=self._QUERY_ITEMS
        )

    def test_images_qrels(self):
        # The images-only variant reuses the same qrels.
        self._test_qrels(
            "pt-image-ir-dataset/images", count=5201, items=self._QREL_ITEMS
        )


if __name__ == "__main__":
    # Allow running this integration-test module directly, outside a runner.
    unittest.main()
Loading