diff --git a/kobo/apps/subsequences/schemas.py b/kobo/apps/subsequences/schemas.py
index 333ff79c32..2ef53865c4 100644
--- a/kobo/apps/subsequences/schemas.py
+++ b/kobo/apps/subsequences/schemas.py
@@ -29,7 +29,7 @@
 
 
 def validate_submission_supplement(asset: 'kpi.models.Asset', supplement: dict):
-    jsonschema.validate(get_submission_supplement_schema(asset), supplement)
+    jsonschema.validate(supplement, get_submission_supplement_schema(asset))
 
 
 def get_submission_supplement_schema(asset: 'kpi.models.Asset') -> dict:
diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py
new file mode 100644
index 0000000000..b84efc1824
--- /dev/null
+++ b/kobo/apps/subsequences/tests/test_versioning.py
@@ -0,0 +1,320 @@
+from datetime import datetime, timedelta
+from datetime import timezone as dt_timezone
+from unittest.mock import patch
+
+from ddt import data, ddt, unpack
+from django.test import TestCase
+from django.utils import timezone
+from freezegun import freeze_time
+
+from kobo.apps.subsequences.utils.versioning import (
+    _determine_source_transcript,
+    _new_revision_from_old,
+    _separate_manual_and_automatic_versions,
+    migrate_submission_supplementals,
+)
+
+
+@ddt
+class TestVersioning(TestCase):
+    def setUp(self):
+        super().setUp()
+        # works for translations or transcriptions
+        self.now = timezone.now().isoformat()
+        self.yesterday = (timezone.now() - timedelta(days=1)).isoformat()
+        self.action_dict = {
+            'dateCreated': None,
+            'dateModified': self.now,
+            'languageCode': 'en',
+            'revisions': [
+                {
+                    'dateModified': self.yesterday,
+                    'languageCode': 'en',
+                    'value': 'Old value',
+                }
+            ],
+            'value': 'Latest value',
+        }
+
+    def test_new_revision_from_old(self):
+        now = timezone.now()
+        old = {
+            'dateCreated': None,
+            'dateModified': '2025-10-22 17:09:38',
+            'languageCode': 'en',
+            'value': 'Transcribed new',
+        }
+        with freeze_time(now):
+            result = _new_revision_from_old(old)
+        assert result['value'] == old['value']
+        assert result['language'] == old['languageCode']
+        assert result['_dateCreated'] == old['dateModified']
+        assert result['_uuid'] is not None
+        assert result['_dateAccepted'] == now.isoformat()
+
+    def test_new_transcript_revision_from_old_returns_none_for_bad_data(self):
+        old = {'badly': 'formatted'}
+        assert _new_revision_from_old(old) is None
+
+    @data(True, False)
+    def test_separate_automatic_and_manual(self, latest_is_automated):
+        automated_transcription_value = (
+            'Latest value' if latest_is_automated else 'Old value'
+        )
+        manual, automated = _separate_manual_and_automatic_versions(
+            self.action_dict, 'en', automated_transcription_value
+        )
+        new_automated_transcript = automated[0]
+        new_manual_transcript = manual[0]
+        expected_most_recent_transcript = (
+            new_automated_transcript if latest_is_automated else new_manual_transcript
+        )
+        expected_old_transcript = (
+            new_manual_transcript if latest_is_automated else new_automated_transcript
+        )
+
+        assert expected_most_recent_transcript['_dateCreated'] == self.now
+        assert expected_most_recent_transcript['value'] == 'Latest value'
+        assert expected_old_transcript['_dateCreated'] == self.yesterday
+        assert expected_old_transcript['value'] == 'Old value'
+
+    def test_separate_automatic_and_manual_forces_language_if_given(self):
+        manual, automated = _separate_manual_and_automatic_versions(
+            self.action_dict, None, None, language='en'
+        )
+        for formatted_item in manual:
+            assert formatted_item['language'] == 'en'
+
+    def test_separate_automatic_and_manual_without_automatic_value(self):
+        manual, automatic = _separate_manual_and_automatic_versions(
+            self.action_dict, None, None
+        )
+        assert len(manual) == 2
+        assert len(automatic) == 0
+
+    def test_separate_automatic_and_manual_without_old_data(self):
+        # `action_results.get('transcript')` is None for a question without a
+        # transcript entry
+        for missing in (None, {}):
+            manual, automatic = _separate_manual_and_automatic_versions(
+                missing, 'en', 'Latest value'
+            )
+            assert manual == []
+            assert automatic == []
+
+    def test_separate_automatic_and_manual_does_not_mutate_input(self):
+        action_dict = {
+            'dateModified': self.now,
+            'value': 'Latest value',
+            'revisions': [{'dateModified': self.yesterday, 'value': 'Old value'}],
+        }
+        manual, _ = _separate_manual_and_automatic_versions(
+            action_dict, None, None, language='en'
+        )
+        assert len(manual) == 2
+        assert 'languageCode' not in action_dict['revisions'][0]
+
+    @data(
+        # known language, date created, expected result uuid
+        # there is a transcript of the same language with an older date
+        ('de', '2024-12-31', 'uuid4'),
+        # there are transcripts of the same language but none older than the translation
+        ('de', '2023-01-01', 'uuid3'),
+        # there are no transcripts of the same language
+        ('fr', '2024-12-31', 'uuid1'),
+        # we don't know the source language but there are older transcripts
+        (None, '2024-12-31', 'uuid2'),
+        # we don't know the source language and there are no older transcripts
+        (None, '2023-01-01', 'uuid1'),
+    )
+    @unpack
+    def test_determine_source_transcription(
+        self, source_language, date_created, expected_source_uuid
+    ):
+        now = timezone.now()
+        one_day_ago = now - timedelta(days=1)
+        jan_1_2024 = datetime(2024, 1, 1, tzinfo=dt_timezone.utc)
+        jan_2_2024 = datetime(2024, 1, 2, tzinfo=dt_timezone.utc)
+        transcripts = [
+            {
+                '_uuid': 'uuid1',
+                '_dateCreated': now.isoformat(),
+                'language': 'en',
+                '_actionId': 'manual_transcription',
+            },
+            {
+                '_uuid': 'uuid2',
+                '_dateCreated': jan_1_2024.isoformat(),
+                'language': 'en',
+                '_actionId': 'automatic_transcription',
+            },
+            {
+                '_uuid': 'uuid3',
+                '_dateCreated': one_day_ago.isoformat(),
+                'language': 'de',
+                '_actionId': 'manual_transcription',
+            },
+            {
+                '_uuid': 'uuid4',
+                '_dateCreated': jan_2_2024.isoformat(),
+                'language': 'de',
+                '_actionId': 'automatic_transcription',
+            },
+        ]
+        translation_revision = {'_dateCreated': date_created}
+        source_transcript = _determine_source_transcript(
+            translation_revision, transcripts, automatic_source_language=source_language
+        )
+        assert source_transcript['_uuid'] == expected_source_uuid
+
+    def test_determine_source_transcription_without_transcripts(self):
+        translation_revision = {'_dateCreated': self.now}
+        assert _determine_source_transcript(translation_revision, []) is None
+        assert (
+            _determine_source_transcript(
+                translation_revision, [], automatic_source_language='en'
+            )
+            is None
+        )
+
+    # test the whole transformation process
+    def test_migrate_submission_extra_to_supplemental(self):
+        now = timezone.now()
+        one_day_ago = (now - timedelta(days=1)).isoformat()
+        one_year_ago = (now - timedelta(days=365)).isoformat()
+        a_year_and_a_day_ago = (now - timedelta(days=366)).isoformat()
+        old_version = {
+            'Audio_question': {
+                'googlets': {
+                    'languageCode': 'en',
+                    'regionCode': None,
+                    'status': 'complete',
+                    'value': 'This is audio that I am trying to ' 'transcribe.',
+                },
+                'googletx': {
+                    'languageCode': 'es',
+                    'source': 'en',
+                    'status': 'complete',
+                    'value': 'Esto es un audio que estoy ' 'intentando a transcribir.',
+                },
+                'transcript': {
+                    'dateCreated': one_day_ago,
+                    'dateModified': one_day_ago,
+                    'languageCode': 'en',
+                    'revisions': [
+                        {
+                            'dateModified': a_year_and_a_day_ago,
+                            'languageCode': 'en',
+                            'value': 'This is audio that '
+                            'I am trying to '
+                            'transcribe.',
+                        },
+                        {},
+                    ],
+                    'value': 'This is audio that I am trying to '
+                    'transcribe but i edited it.',
+                },
+                'translation': {
+                    'es': {
+                        'dateCreated': one_year_ago,
+                        'dateModified': now.isoformat(),
+                        'languageCode': 'es',
+                        'revisions': [
+                            {
+                                'dateModified': one_year_ago,
+                                'languageCode': 'es',
+                                'value': 'Esto es un '
+                                'audio que '
+                                'estoy '
+                                'intentando a '
+                                'transcribir.',
+                            }
+                        ],
+                        'value': 'Esto es un audio que '
+                        'estoy intentando '
+                        'transcribir pero yo lo edité',
+                    }
+                },
+            }
+        }
+
+        with patch(
+            'kobo.apps.subsequences.utils.versioning.uuid.uuid4',
+            side_effect=['uuid1', 'uuid2', 'uuid3', 'uuid4'],
+        ):
+            with freeze_time(now):
+                migrated = migrate_submission_supplementals(old_version)
+
+        new_version = {
+            '_version': '20250820',
+            'Audio_question': {
+                'automatic_google_transcription': {
+                    '_dateCreated': a_year_and_a_day_ago,
+                    '_dateModified': a_year_and_a_day_ago,
+                    '_versions': [
+                        {
+                            '_dateCreated': a_year_and_a_day_ago,
+                            '_dateAccepted': now.isoformat(),
+                            '_uuid': 'uuid2',
+                            'language': 'en',
+                            'value': 'This is audio that I am trying to transcribe.',
+                            'status': 'complete',
+                        }
+                    ]
+                },
+                'automatic_google_translation': {
+                    'es': {
+                        '_dateCreated': one_year_ago,
+                        '_dateModified': one_year_ago,
+                        '_versions': [
+                            {
+                                '_dateCreated': one_year_ago,
+                                '_dateAccepted': now.isoformat(),
+                                '_dependency': {
+                                    '_actionId': 'automatic_google_transcription',
+                                    '_uuid': 'uuid2',
+                                },
+                                '_uuid': 'uuid4',
+                                'language': 'es',
+                                'value': 'Esto es un audio que estoy intentando a'
+                                ' transcribir.',
+                                'status': 'complete',
+                            }
+                        ]
+                    }
+                },
+                'manual_transcription': {
+                    '_dateCreated': one_day_ago,
+                    '_dateModified': one_day_ago,
+                    '_versions': [
+                        {
+                            '_dateCreated': one_day_ago,
+                            '_dateAccepted': now.isoformat(),
+                            '_uuid': 'uuid1',
+                            'language': 'en',
+                            'value': 'This is audio that I am trying to '
+                            'transcribe but i edited it.',
+                        }
+                    ]
+                },
+                'manual_translation': {
+                    'es': {
+                        '_dateCreated': now.isoformat(),
+                        '_dateModified': now.isoformat(),
+                        '_versions': [
+                            {
+                                '_dateCreated': now.isoformat(),
+                                '_dateAccepted': now.isoformat(),
+                                '_dependency': {'_actionId': 'manual_transcription',
+                                                '_uuid': 'uuid1'},
+                                '_uuid': 'uuid3',
+                                'language': 'es',
+                                'value': 'Esto es un audio que estoy intentando'
+                                ' transcribir pero yo lo edité',
+                            }
+                        ]
+                    }
+                },
+            }
+        }
+        assert migrated == new_version
diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py
index aba7b21852..9d29d38b9c 100644
--- a/kobo/apps/subsequences/utils/versioning.py
+++ b/kobo/apps/subsequences/utils/versioning.py
@@ -1,3 +1,7 @@
+import uuid
+
+from django.utils import timezone
+
 from ..constants import SCHEMA_VERSIONS
 
 
@@ -6,10 +10,7 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None:
     if advanced_features.get('_version') == SCHEMA_VERSIONS[0]:
         return
 
-    migrated_advanced_features = {
-        '_version': SCHEMA_VERSIONS[0],
-        '_actionConfigs': {}
-    }
+    migrated_advanced_features = {'_version': SCHEMA_VERSIONS[0], '_actionConfigs': {}}
 
     actionConfigs = migrated_advanced_features['_actionConfigs']
     for key, value in advanced_features.items():
@@ -39,6 +40,273 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None:
     return migrated_advanced_features
 
 
+def migrate_submission_supplementals(supplemental_data: dict) -> dict | None:
+    """
+    Convert old-style submission supplemental data to the current schema.
+
+    For each question, the old `transcript` and `translation` entries are split
+    into manual and automatic (Google) versions, summarized per action, and
+    every translation version is linked to its most likely source transcript.
+
+    Returns the migrated dict, or None if `supplemental_data` is already at the
+    current schema version.
+    """
+    if supplemental_data.get('_version') == SCHEMA_VERSIONS[0]:
+        return None
+
+    supplemental = {
+        '_version': SCHEMA_VERSIONS[0],
+    }
+    for question_xpath, action_results in supplemental_data.items():
+        question_results_by_action = {}
+
+        # get all the automatic result data
+        automatic_transcript = action_results.get('googlets', {})
+        automatic_translation = action_results.get('googletx', {})
+
+        # divide transcripts into manual and automatic
+        manual_transcripts, automatic_transcripts = (
+            _separate_manual_and_automatic_versions(
+                action_results.get('transcript'),
+                automatic_transcript.get('languageCode'),
+                automatic_transcript.get('value'),
+            )
+        )
+        if manual_transcripts:
+            question_results_by_action['manual_transcription'] = (
+                _version_list_to_summary_dict(manual_transcripts)
+            )
+        if automatic_transcripts:
+            question_results_by_action['automatic_google_transcription'] = (
+                _version_list_to_summary_dict(automatic_transcripts)
+            )
+
+        # the candidate source transcripts are the same for every translation
+        # language, so build the list once per question
+        all_tagged_transcripts = _combine_source_transcripts(
+            manual_transcripts, automatic_transcripts
+        )
+
+        # divide translations into manual and automatic by language
+        automatic_translations = {}
+        manual_translations = {}
+        translations_dict = action_results.get('translation', {})
+        for language_code, translations in translations_dict.items():
+            manual_for_language, automatic_for_language = (
+                _separate_manual_and_automatic_versions(
+                    translations,
+                    automatic_translation.get('languageCode'),
+                    automatic_translation.get('value'),
+                    language_code,
+                )
+            )
+            if automatic_for_language:
+                _add_translation_sources(
+                    automatic_for_language,
+                    all_tagged_transcripts,
+                    automatic_translation.get('source'),
+                )
+                automatic_translations[language_code] = _version_list_to_summary_dict(
+                    automatic_for_language
+                )
+            if manual_for_language:
+                _add_translation_sources(manual_for_language, all_tagged_transcripts)
+                manual_translations[language_code] = _version_list_to_summary_dict(
+                    manual_for_language
+                )
+        if automatic_translations:
+            question_results_by_action['automatic_google_translation'] = (
+                automatic_translations
+            )
+        if manual_translations:
+            question_results_by_action['manual_translation'] = manual_translations
+        supplemental[question_xpath] = question_results_by_action
+
+    return supplemental
+
+
 def set_version(schema: dict) -> dict:
     schema['_version'] = SCHEMA_VERSIONS[0]
     return schema
+
+
+def _date_sort_key(version: dict) -> str:
+    # dates are ISO strings; a missing or None date sorts as the oldest
+    return version.get('_dateCreated') or ''
+
+
+def _add_translation_sources(
+    version_list, all_tagged_transcripts, automatic_translation_source_language=None
+):
+    """
+    Record on each translation version the transcript it most likely came from.
+
+    Adds a `_dependency` (`_uuid` and `_actionId` of the chosen transcript) to the
+    dicts in `version_list` in place.
+    """
+    for translation in version_list:
+        # determine and record the most likely source transcript
+        source = _determine_source_transcript(
+            translation,
+            all_tagged_transcripts,
+            automatic_source_language=automatic_translation_source_language,
+        )
+        if source is None:
+            # NOTE(review): there is no transcript to depend on; confirm the new
+            # schema accepts a translation version without `_dependency`
+            continue
+        translation['_dependency'] = {
+            '_uuid': source['_uuid'],
+            '_actionId': source['_actionId'],
+        }
+
+
+def _combine_source_transcripts(manual_transcripts, automatic_transcripts):
+    """
+    Return copies of all transcripts, newest first, tagged with their `_actionId`.
+    """
+    # tag them with the action so we don't lose track
+    all_tagged_transcripts = [
+        *(
+            {**transcript, '_actionId': 'manual_transcription'}
+            for transcript in manual_transcripts
+        ),
+        *(
+            {**transcript, '_actionId': 'automatic_google_transcription'}
+            for transcript in automatic_transcripts
+        ),
+    ]
+    all_tagged_transcripts.sort(reverse=True, key=_date_sort_key)
+    return all_tagged_transcripts
+
+
+def _determine_source_transcript(
+    translation_revision, all_transcripts, automatic_source_language=None
+):
+    """
+    Pick the transcript a translation was most likely made from.
+
+    Candidates are the transcripts in `automatic_source_language` if it is known
+    and any transcript has that language, otherwise all transcripts. The first
+    candidate created before the translation is returned or, failing that, the
+    first candidate; callers pass `all_transcripts` sorted newest first. Dates are
+    compared as strings.
+
+    Returns None if there are no transcripts at all.
+    """
+    if not all_transcripts:
+        return None
+
+    translation_date = _date_sort_key(translation_revision)
+    candidates = all_transcripts
+    if automatic_source_language:  # we know the source language
+        transcripts_matching_language = [
+            transcript
+            for transcript in all_transcripts
+            if transcript['language'] == automatic_source_language
+        ]
+        # without any transcript in the source language, fall back to the most
+        # recent transcript of any language
+        if not transcripts_matching_language:
+            return all_transcripts[0]
+        candidates = transcripts_matching_language
+
+    # is there a transcript created earlier than the translation?
+    for transcript in candidates:
+        if _date_sort_key(transcript) < translation_date:
+            return transcript
+    # if not, default to the most recent candidate
+    return candidates[0]
+
+
+def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None:
+    """
+    Convert one old-style revision into a new-style version.
+
+    Returns None for bad data, i.e. a revision without `languageCode` or `value`.
+    """
+    # ignore bad data
+    if (
+        'languageCode' not in old_transcript_revision_dict
+        or 'value' not in old_transcript_revision_dict
+    ):
+        return None
+    return {
+        '_dateCreated': old_transcript_revision_dict.get('dateModified'),
+        'language': old_transcript_revision_dict['languageCode'],
+        'value': old_transcript_revision_dict['value'],
+        '_uuid': str(uuid.uuid4()),
+        # all preexisting translations/transcripts are considered accepted
+        '_dateAccepted': timezone.now().isoformat(),
+    }
+
+
+def _separate_manual_and_automatic_versions(
+    old_action_dictionary,
+    automatic_result_language,
+    automatic_result_value,
+    # translations have an expected language
+    language=None,
+):
+    """
+    Split an old-style transcript/translation entry into new-style versions.
+
+    The entry's latest value and all of its `revisions` are converted; those whose
+    language and value match the automatic result are assumed to have been
+    generated automatically.
+
+    Returns `(manual_versions, automatic_versions)`, both newest first. Both are
+    empty if `old_action_dictionary` is None or empty.
+    """
+    if not old_action_dictionary:
+        return [], []
+
+    automatic_versions = []
+    manual_versions = []
+    latest_revision = {
+        key: val
+        for key, val in old_action_dictionary.items()
+        if key in ('value', 'languageCode', 'dateModified')
+    }
+    # add the latest revision to the list of all revisions for easier processing
+    all_revisions = [latest_revision, *(old_action_dictionary.get('revisions') or [])]
+    for revision in all_revisions:
+        if language:
+            # force the expected language if given, on a copy so that the caller's
+            # data is left untouched
+            revision = {**revision, 'languageCode': language}
+        revision_formatted = _new_revision_from_old(revision)
+        if revision_formatted is None:
+            continue
+        # if the language and value match that of the automatic result,
+        # assume this one was generated automatically
+        if (
+            revision_formatted['language'] == automatic_result_language
+            and revision_formatted['value'] == automatic_result_value
+        ):
+            # automatic versions also need a status
+            revision_formatted['status'] = 'complete'
+            automatic_versions.append(revision_formatted)
+        else:
+            manual_versions.append(revision_formatted)
+
+    # they should be sorted anyway, but just make sure in case the input values
+    # weren't sorted correctly
+    manual_versions.sort(reverse=True, key=_date_sort_key)
+    automatic_versions.sort(reverse=True, key=_date_sort_key)
+
+    return manual_versions, automatic_versions
+
+
+def _version_list_to_summary_dict(list_of_versions: list[dict]) -> dict:
+    """
+    Wrap a non-empty, newest-first list of versions in its summary dict.
+
+    `_dateCreated` is the date of the oldest version and `_dateModified` the date
+    of the newest one.
+    """
+    return {
+        '_dateCreated': list_of_versions[-1]['_dateCreated'],
+        '_dateModified': list_of_versions[0]['_dateCreated'],
+        '_versions': list_of_versions,
+    }