-
Notifications
You must be signed in to change notification settings - Fork 3
Automatic categorization (labelling) of reports based on asking LLM questions. #196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
1371f40
a5e8611
9ab56e0
34c1c41
7125830
578f9e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,33 @@ | ||||||
| from django.contrib import admin | ||||||
|
|
||||||
| from .models import LabelChoice, LabelGroup, LabelQuestion, ReportLabel | ||||||
|
|
||||||
|
|
||||||
class LabelChoiceInline(admin.TabularInline):
    """Tabular inline for editing ``LabelChoice`` rows on a parent admin page."""

    extra = 0  # do not render additional blank rows
    model = LabelChoice
|
|
||||||
|
|
||||||
@admin.register(LabelGroup)
class LabelGroupAdmin(admin.ModelAdmin):
    """Admin configuration for label groups."""

    # Change-list behaviour: sorted by explicit order, then alphabetically.
    ordering = ("order", "name")
    search_fields = ("name",)
    list_filter = ("is_active",)
    list_display = ("name", "is_active", "order")
|
|
||||||
|
|
||||||
@admin.register(LabelQuestion)
class LabelQuestionAdmin(admin.ModelAdmin):
    """Admin configuration for label questions, with their choices edited inline."""

    inlines = (LabelChoiceInline,)

    # Change-list behaviour: grouped by the group's order, then the question's own order.
    ordering = ("group__order", "order", "label")
    search_fields = ("label", "question")
    list_filter = ("group", "is_active")
    list_display = ("label", "question", "group", "is_active", "order")
|
|
||||||
|
|
||||||
@admin.register(ReportLabel)
class ReportLabelAdmin(admin.ModelAdmin):
    """Admin configuration for the labels assigned to reports."""

    list_display = ("report", "question", "choice", "confidence", "verified", "created_at")
    list_filter = ("verified", "question__group")
    # Review finding: LabelQuestion exposes `label` and `question` but no `name`
    # field, so the related lookup must be `question__label`. The previous
    # `question__name` raised FieldError as soon as the admin search box was used.
    search_fields = ("report__document_id", "question__label", "choice__label")
    ordering = ("-created_at",)  # newest labels first
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| from django.apps import AppConfig | ||
|
|
||
|
|
||
class LabelsConfig(AppConfig):
    """Django app configuration for ``radis.labels``."""

    name = "radis.labels"

    def ready(self) -> None:
        """Register the menu entry and hook the labels app into report events."""
        register_app()

        # Local imports: resolved only when ready() runs.
        from radis.reports.site import (
            ReportsCreatedHandler,
            ReportsUpdatedHandler,
            register_reports_created_handler,
            register_reports_updated_handler,
        )

        # Imported only for its import-time side effects (hence the unused-import waiver).
        from . import signals  # noqa: F401
        from .site import handle_reports_created, handle_reports_updated

        created_handler = ReportsCreatedHandler(name="Labels", handle=handle_reports_created)
        register_reports_created_handler(created_handler)

        updated_handler = ReportsUpdatedHandler(name="Labels", handle=handle_reports_updated)
        register_reports_updated_handler(updated_handler)
|
|
||
|
|
||
def register_app() -> None:
    """Add the "Auto Labels" entry, pointing at ``label_group_list``, to the main menu."""
    from adit_radis_shared.common.site import MainMenuItem, register_main_menu_item

    menu_item = MainMenuItem(url_name="label_group_list", label="Auto Labels")
    register_main_menu_item(menu_item)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
# Default answer choices as (value, label, is_unknown); "order" is the
# 1-based position of each entry in this sequence.
DEFAULT_LABEL_CHOICES = [
    {"value": value, "label": label, "is_unknown": is_unknown, "order": order}
    for order, (value, label, is_unknown) in enumerate(
        (
            ("yes", "Yes", False),
            ("no", "No", False),
            ("cannot_decide", "Cannot decide", True),
        ),
        start=1,
    )
]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| from crispy_forms.helper import FormHelper | ||
| from crispy_forms.layout import Column, Layout, Row | ||
| from django import forms | ||
|
|
||
| from .models import LabelGroup, LabelQuestion | ||
|
|
||
|
|
||
class LabelGroupForm(forms.ModelForm):
    """Model form for creating and editing a label group."""

    class Meta:
        model = LabelGroup
        fields = ["name", "description", "is_active", "order"]

    def __init__(self, *args, **kwargs):
        """Build the form and attach a two-column crispy-forms layout."""
        super().__init__(*args, **kwargs)

        text_column = Column("name", "description")
        settings_column = Column("is_active", "order", css_class="col-3")

        helper = FormHelper()
        helper.form_tag = False
        helper.layout = Layout(Row(text_column, settings_column))
        self.helper = helper
|
|
||
|
|
||
class LabelQuestionForm(forms.ModelForm):
    """Model form for a label question, optionally scoped to a label group.

    An optional ``group`` keyword argument is consumed by ``__init__`` and used
    by ``clean_label`` to reject labels that already exist in that group.
    """

    class Meta:
        model = LabelQuestion
        fields = ["label", "question", "is_active", "order"]

    def __init__(self, *args, **kwargs):
        """Pop the ``group`` kwarg, build the form and configure its layout."""
        # "group" is removed here and therefore not forwarded to ModelForm.
        self.group = kwargs.pop("group", None)
        super().__init__(*args, **kwargs)

        helper = FormHelper()
        helper.form_tag = False
        helper.layout = Layout(
            "label",
            "question",
            Row(Column("is_active"), Column("order", css_class="col-3")),
        )
        self.helper = helper

        question_field = self.fields["question"]
        question_field.required = False
        question_field.help_text = "Optional. If left empty, the label is used."

    def clean_label(self):
        """Return the label, rejecting case-insensitive duplicates within the group.

        The check is skipped when the label is empty or no group was supplied.

        Raises:
            forms.ValidationError: If another question in the group has the same label.
        """
        label = self.cleaned_data.get("label", "")
        if not (label and self.group):
            return label

        duplicates = LabelQuestion.objects.filter(group=self.group, label__iexact=label)
        # When editing an existing question, it must not count as its own duplicate.
        own_pk = self.instance.pk if self.instance else None
        if own_pk:
            duplicates = duplicates.exclude(pk=own_pk)

        if duplicates.exists():
            raise forms.ValidationError("A question with this label already exists in this group.")
        return label
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from itertools import batched | ||
|
|
||
| from django.core.management.base import BaseCommand, CommandError | ||
|
|
||
| from radis.reports.models import Report | ||
|
|
||
| from ...models import LabelGroup | ||
| from ...tasks import process_label_group | ||
|
|
||
|
|
||
class Command(BaseCommand):
    """Enqueue labeling tasks for reports that already exist.

    One ``process_label_group`` task is deferred per label group and per batch
    of report IDs.
    """

    help = "Enqueue labeling tasks for existing reports."

    def add_arguments(self, parser):
        """Declare the ``--group``, ``--batch-size`` and ``--limit`` options."""
        parser.add_argument(
            "--group",
            dest="group",
            help="Label group name or ID. If omitted, all active groups are used.",
        )
        parser.add_argument(
            "--batch-size",
            dest="batch_size",
            type=int,
            default=None,
            help="Override the task batch size.",
        )
        parser.add_argument(
            "--limit",
            dest="limit",
            type=int,
            default=None,
            help="Limit the number of reports to enqueue.",
        )

    def handle(self, *args, **options):
        """Resolve the target groups and defer one task per group and report batch.

        Raises:
            CommandError: If ``--group`` cannot be resolved, ``--limit`` is
                negative, or the effective batch size is smaller than 1.
        """
        group_value = options.get("group")
        batch_size = options.get("batch_size")
        limit = options.get("limit")

        if limit is not None and limit < 0:
            # Fail with a clear message instead of an opaque ORM slicing error.
            raise CommandError("--limit must not be negative.")

        if group_value:
            groups = [self._get_group(group_value)]
        else:
            groups = list(LabelGroup.objects.filter(is_active=True))

        if not groups:
            self.stdout.write(self.style.WARNING("No active label groups found."))
            return

        report_ids = Report.objects.order_by("id").values_list("id", flat=True)
        if limit:  # None and 0 both mean "no limit", as before
            report_ids = report_ids[:limit]

        num_reports = report_ids.count()
        if not num_reports:
            self.stdout.write(self.style.WARNING("No reports found."))
            return

        if batch_size is None:
            from django.conf import settings

            batch_size = settings.LABELING_TASK_BATCH_SIZE
        if batch_size < 1:
            # itertools.batched would otherwise raise a bare ValueError mid-run.
            raise CommandError("Batch size must be at least 1.")

        for group in groups:
            # Review finding: stream the IDs from the database rather than
            # materialising every report ID in memory up front.
            for report_batch in batched(report_ids.iterator(), batch_size):
                process_label_group.defer(
                    label_group_id=group.id,
                    report_ids=list(report_batch),
                )

        self.stdout.write(
            self.style.SUCCESS(
                f"Enqueued labeling for {num_reports} reports across {len(groups)} group(s)."
            )
        )

    def _get_group(self, value: str) -> LabelGroup:
        """Return the label group identified by a numeric ID or by its name.

        An all-digit value is tried as an ID first and then falls back to a
        name lookup, so groups whose name consists only of digits stay reachable.

        Raises:
            CommandError: If no group matches, or several groups share the name.
        """
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, on which int() raises ValueError.
        if value.isdecimal():
            group = LabelGroup.objects.filter(id=int(value)).first()
            if group is not None:
                return group

        # Two rows are enough to tell "none", "one" and "ambiguous" apart in one query.
        matches = list(LabelGroup.objects.filter(name=value)[:2])
        if len(matches) > 1:
            raise CommandError(
                f"Multiple label groups named '{value}' exist. Use the numeric ID."
            )
        if not matches:
            raise CommandError(f"Label group '{value}' not found.")

        return matches[0]
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,92 @@ | ||||||||||||||||||||||||||
| from __future__ import annotations | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| import json | ||||||||||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| from django.core.management.base import BaseCommand, CommandError | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| from ...models import LabelChoice, LabelGroup, LabelQuestion | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
class Command(BaseCommand):
    """Create or update label groups and their questions from a JSON seed file.

    The file is expected to hold a JSON object with a ``groups`` list; each
    group may carry a ``questions`` list. Custom ``choices`` are rejected.
    """

    help = "Seed label groups, questions, and choices from a JSON file."

    def add_arguments(self, parser):
        """Declare the ``--file`` option pointing at the seed JSON file."""
        parser.add_argument(
            "--file",
            dest="file",
            default="resources/labels/seed.json",
            help="Path to the seed JSON file.",
        )

    def handle(self, *args, **options):
        """Load the seed file and upsert every group and question it describes.

        All writes happen in one transaction, so a failure leaves the database
        unchanged instead of partially seeded.

        Raises:
            CommandError: If the file is missing, is not valid UTF-8 JSON, is not
                a JSON object, or contains invalid group/question/choice data.
        """
        from django.db import transaction

        seed_path = Path(options["file"]).resolve()
        if not seed_path.exists():
            raise CommandError(f"Seed file not found: {seed_path}")

        try:
            # JSON is UTF-8 by specification; do not depend on the locale encoding.
            payload = json.loads(seed_path.read_text(encoding="utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as err:
            raise CommandError(f"Seed file is not valid JSON: {err}") from err
        if not isinstance(payload, dict):
            raise CommandError("Seed file must contain a JSON object with a 'groups' list.")

        groups = payload.get("groups", [])
        if not groups:
            self.stdout.write(self.style.WARNING("No groups found in seed file."))
            return

        with transaction.atomic():
            for group_data in groups:
                group = self._upsert_group(group_data)
                for question_data in group_data.get("questions", []):
                    question = self._upsert_question(group, question_data)
                    # Review finding: hand over each real choice entry rather
                    # than an empty dict, so the data is not silently dropped.
                    for choice_data in question_data.get("choices") or []:
                        self._upsert_choice(question, choice_data)

        self.stdout.write(self.style.SUCCESS("Label seed import completed."))

    def _upsert_group(self, data: dict) -> LabelGroup:
        """Create the group named in ``data`` or update its other fields.

        Raises:
            CommandError: If ``name`` is missing or several groups share it.
        """
        name = data.get("name")
        if not name:
            raise CommandError("Label group requires 'name'.")

        # Two rows are enough to tell "none", "one" and "ambiguous" apart in one query.
        matches = list(LabelGroup.objects.filter(name=name)[:2])
        if len(matches) > 1:
            raise CommandError(
                f"Multiple label groups named '{name}' exist. Use unique names before seeding."
            )

        defaults = {
            "description": data.get("description", ""),
            "is_active": data.get("is_active", True),
            "order": data.get("order", 0),
        }

        if not matches:
            return LabelGroup.objects.create(name=name, **defaults)

        group = matches[0]
        for key, value in defaults.items():
            setattr(group, key, value)
        group.save(update_fields=list(defaults))
        return group

    def _upsert_question(self, group: LabelGroup, data: dict) -> LabelQuestion:
        """Create or update the question identified by ``group`` and ``label``.

        Raises:
            CommandError: If ``label`` is missing.
        """
        label = data.get("label")
        if not label:
            raise CommandError("Label question requires 'label'.")

        question, _ = LabelQuestion.objects.update_or_create(
            group=group,
            label=label,
            defaults={
                "question": data.get("question", ""),
                "is_active": data.get("is_active", True),
                "order": data.get("order", 0),
            },
        )
        return question

    def _upsert_choice(self, question: LabelQuestion, data: dict) -> LabelChoice:
        """Reject custom choices; this method always raises.

        Raises:
            CommandError: Always.
        """
        raise CommandError(
            "Custom choices are not supported. Labels use fixed Yes/No/Cannot decide choices."
        )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `search_fields` for `ReportLabelAdmin` includes `question__name`, but the `LabelQuestion` model does not have a `name` field. This will raise a `FieldError` when using the search functionality in the Django admin for Report Labels. You should probably search on the `label` or `question` field of the `LabelQuestion` model instead.