From 4ea3f8cc74795d9cd67bc48194330425eade6b6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Sj=C3=B6qvist?= <anders@sjoqvi.st>
Date: Fri, 16 Aug 2024 17:16:16 +0200
Subject: [PATCH] Start implementing a problem migration tool

---
 problemtools/migrateproblem.py                | 569 ++++++++++++++++++
 problemtools/schemas/problem-yaml-schema.json | 196 ++++++
 2 files changed, 765 insertions(+)
 create mode 100644 problemtools/migrateproblem.py
 create mode 100644 problemtools/schemas/problem-yaml-schema.json
diff --git a/problemtools/migrateproblem.py b/problemtools/migrateproblem.py
new file mode 100644
index 00000000..1e3ed325
--- /dev/null
+++ b/problemtools/migrateproblem.py
@@ -0,0 +1,569 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+from enum import IntFlag
+import argparse
+import io
+import os
+import re
+import shutil
+import sys
+import tempfile
+import uuid
+import yaml
+import yaml.parser
+
+def dict_add_unless_none(dict, key, value):
+    """Set dict[key] = value, except that a value of None leaves dict untouched."""
+    if value is not None: dict[key] = value
+
+def parser_warning(msg):  # non-fatal diagnostic: written to stderr, execution continues
+    sys.stderr.write(f'PARSER WARNING: {msg}' + '\n')
+
+def parser_error(msg):  # fatal: terminates the interpreter with msg as the exit status text
+    raise SystemExit(f'PARSER ERROR: {msg}')
+
+def parser_unimplemented(msg):  # fatal: the input needs a feature this tool does not handle yet
+    raise SystemExit(f'PARSER NOT IMPLEMENTED: {msg}')
+
+class ProblemFormatVersion(IntFlag):  # bit 0b010 is set in both legacy values, so `v & LEGACY_ICPC` means "any legacy format"
+    LEGACY_ICPC = 0b010
+    LEGACY = 0b011
+    V2023_07 = 0b100
+
+class Validation(IntFlag):  # one bit per keyword accepted in the "validation" string
+    NONE = 0
+    DEFAULT = 1 << 0
+    CUSTOM = 1 << 1
+    SCORE = 1 << 2
+    INTERACTIVE = 1 << 3
+
+class ProblemYaml:
+    """Model of problem.yaml: setters ingest input-format values, getters render them for the target format."""
+    def __init__(self, in_version, out_version):
+        # names without spaces and/or with special characters are suspicious
+        self._suspicious_name = re.compile(r'^([^ ]*|.*[!"#$%&()*+,./0-9:;<=>?@[\\\]\^_{|}~].*)$')
+        # split a string on the format "Full Name <address>" or "Full Name"
+        self._fullname_address = re.compile(r'^(|.*[^\s>])\s*(?:<([^<>]*)>)?\s*$')
+
+        if in_version is None or in_version == 'legacy':
+            self._in_version = ProblemFormatVersion.LEGACY
+        else:
+            parser_unimplemented(f'problem format version: {in_version}')
+        self._out_version = out_version
+        self._type = None
+        self._name = None
+        self._uuid = None
+        self._version = None
+        self._credits = None
+        self._source = None
+        self._license = None
+        self._rights_owner = None
+        self._limits = None
+        self._validation = None
+        self._validator_flags = None
+        self._scoring = None
+        self._keywords = None
+
+    # present in all versions; rendered as the string that identifies the target format
+    @property
+    def problem_format_version(self):
+        match self._out_version:
+            case ProblemFormatVersion.LEGACY:
+                return 'legacy'
+            case ProblemFormatVersion.LEGACY_ICPC:
+                return 'legacy-icpc'
+            case ProblemFormatVersion.V2023_07:
+                return '2023-07-draft'
+            case _:
+                parser_error('unexpected target problem format version')
+
+    # not present in legacy-icpc
+    @property
+    def type(self):
+        if self._out_version is ProblemFormatVersion.LEGACY_ICPC and self._type is not None:
+            parser_error('legacy-icpc format doesn\'t support property "type"')
+        if self._out_version is ProblemFormatVersion.LEGACY and self._type not in [None, 'pass-fail', 'scoring']:  # None: key is optional
+            parser_error(f'unsupported type property in legacy format: "{self._type}"')
+        return self._type
+
+    @type.setter
+    def type(self, value):
+        if value is None: return
+        # todo: implement 2023-07-draft
+        if not isinstance(value, str): parser_error(f'unexpected type of property "type": {type(value)}')
+        if value not in ['pass-fail', 'scoring']:
+            parser_error(f'unknown problem type: {value}')
+        self._type = value
+
+    # present in all versions; mandatory from 2023-07-draft on
+    @property
+    def name(self):
+        if self._out_version >= ProblemFormatVersion.V2023_07 and self._name is None:
+            parser_error('the target format version requires the problem "name" property')
+        return self._name
+
+    @name.setter
+    def name(self, value):
+        if value is None: return
+        # todo: implement 2023-07-draft
+        if not isinstance(value, str): parser_error(f'unexpected type of property "name": {type(value)}')
+        self._name = value
+
+    # present in all versions; generated on demand when a 2023-07-draft (or later) target lacks one
+    @property
+    def uuid(self):
+        if self._out_version >= ProblemFormatVersion.V2023_07 and self._uuid is None:
+            self._uuid = uuid.uuid4()
+            parser_warning('generated a new UUID for the "uuid" property as the input didn\'t contain one')
+        return None if self._uuid is None else str(self._uuid)  # never emit the literal string 'None'
+
+    @uuid.setter
+    def uuid(self, value):
+        if value is None: return
+        try: self._uuid = uuid.UUID(str(value))
+        except ValueError: parser_error(f'malformed "uuid" property: {value}')
+
+    # present since 2023-07-draft
+    @property
+    def version(self):
+        if self._out_version & ProblemFormatVersion.LEGACY_ICPC and self._version is not None:  # truthy for both legacy targets
+            parser_warning('dropping "version" property because of target problem format version')
+            return None
+        return self._version
+
+    @version.setter
+    def version(self, value):
+        if value is None: return
+        # todo: check format version
+        self._version = value
+
+    # not present since 2023-07-draft (see credits); rendered as a comma separated list of names
+    @property
+    def author(self):
+        if self._out_version >= ProblemFormatVersion.V2023_07: return None
+        if self._credits is None or 'authors' not in self._credits: return None
+        return ', '.join([author.get('name', None) for author in self._credits['authors']])
+
+    @author.setter
+    def author(self, value):
+        if value is None: return
+        if not isinstance(value, str): parser_error(f'unexpected type of property "author": {type(value)}')
+
+        # adapted from kattisd/addproblem.py: split on commas, " and " and " & ", then drop empty pieces
+        names = (x.strip(' \t\r\n') for x in re.split(r',|\s+and\s+|\s+&\s+', value))
+        authors = [{ 'name': x } for x in names if len(x) > 0]
+
+        for author in authors:
+            name = author['name']
+            if self._suspicious_name.search(name):
+                parser_warning(f'the author name "{name}" may have been incorrectly parsed')
+
+        self._credits = { 'authors': authors }
+
+    # present since 2023-07-draft (see author for older syntax)
+    @property
+    def credits(self):
+        # todo: check if credits are dropped because of version downgrade
+        if self._out_version & ProblemFormatVersion.LEGACY_ICPC: return None
+        if self._credits is None: return None
+        # todo: flatten if array is of length 1
+        rendered = {}
+        if 'authors' in self._credits:
+            rendered['authors'] = [(f'{author["name"]} <{author["address"]}>' if 'address' in author else author['name']) for author in self._credits['authors']]
+        return rendered if rendered else None
+
+    @credits.setter
+    def credits(self, value):
+        if value is None: return
+        parser_unimplemented('parsing of credits has not yet been implemented')
+
+    # present in all versions; legacy targets get only the name of the first source
+    @property
+    def source(self):
+        if self._out_version & ProblemFormatVersion.LEGACY_ICPC:
+            if self._source is None or not self._source: return None
+            return self._source[0].get('name', None)
+        return self._source
+
+    @source.setter
+    def source(self, value):
+        if value is None: return
+        if self._in_version & ProblemFormatVersion.LEGACY_ICPC:
+            if not isinstance(value, str): parser_error(f'unexpected type of property "source": {type(value)}')
+            if self._source is None: self._source = [{}]
+            self._source[0]['name'] = value
+        else:
+            # value could be
+            # 1. a string
+            # 2. an object of 'name' and possibly 'url'
+            # 3. an array of strings and/or objects of 'name' and possibly 'url'
+            # this code unifies 1 and 2 into 3 (building a new list so the caller's data is left alone)
+            if not isinstance(value, list):
+                value = [value]
+            self._source = [{ 'name': x } if isinstance(x, str) else x for x in value]
+
+    # not present since 2023-07-draft (see source)
+    @property
+    def source_url(self):
+        if self._out_version & ProblemFormatVersion.LEGACY_ICPC:
+            if self._source is None or not self._source: return None
+            return self._source[0].get('url', None)
+        return None
+
+    @source_url.setter
+    def source_url(self, value):
+        if value is None: return
+        if self._in_version & ProblemFormatVersion.LEGACY_ICPC:
+            if not isinstance(value, str): parser_error(f'unexpected type of property "source_url": {type(value)}')
+            if self._source is None: self._source = [{}]
+            self._source[0]['url'] = value
+        else:
+            parser_error('property "source_url" is not allowed in this source problem format version')
+
+    # present in all versions
+    @property
+    def license(self):
+        return self._license
+
+    @license.setter
+    def license(self, value):
+        if value is None: return
+        if value not in ['unknown', 'public domain', 'cc0', 'cc by', 'cc by-sa', 'educational', 'permission']:
+            parser_error(f'illegal license: {value}')
+        self._license = value
+
+    # present in all versions; the getter cross-checks the owner against license, authors and source
+    @property
+    def rights_owner(self):
+        if self._rights_owner is not None and self._license == 'public domain':
+            parser_error('"rights_owner" given although license is "public domain"')
+        if self._license is not None and self._license not in ['unknown', 'public domain'] and self._rights_owner is None and (self._credits is None or 'authors' not in self._credits) and self._source is None:
+            parser_error(f'no owner can be identified although license is "{self._license}"')
+        return self._rights_owner
+
+    @rights_owner.setter
+    def rights_owner(self, value):
+        if value is None: return
+        self._rights_owner = value
+
+    # present in all versions
+    @property
+    def limits(self):
+        return self._limits
+
+    @limits.setter
+    def limits(self, value):
+        if value is None: return
+        self._limits = value
+
+    # not present since 2023-07-draft; rendered back as a space separated keyword string
+    @property
+    def validation(self):
+        if self._validation is None: return None
+        flags = []
+        if self._validation & Validation.DEFAULT:
+            flags.append('default')
+        if self._validation & Validation.CUSTOM:
+            flags.append('custom')
+        if self._validation & Validation.SCORE:
+            flags.append('score')
+        if self._validation & Validation.INTERACTIVE:
+            flags.append('interactive')
+        return ' '.join(flags)
+
+    @validation.setter
+    def validation(self, value):
+        if value is None: return
+        if not isinstance(value, str): parser_error(f'unexpected type of property "validation": {type(value)}')
+        flags = Validation.NONE
+        for s in value.split():
+            match s:
+                case 'default':
+                    flags |= Validation.DEFAULT
+                case 'custom':
+                    flags |= Validation.CUSTOM
+                case 'score':
+                    flags |= Validation.SCORE
+                case 'interactive':
+                    flags |= Validation.INTERACTIVE
+                case _:
+                    parser_error(f'unknown validation "{s}"')
+        if flags & Validation.DEFAULT and flags & ~Validation.DEFAULT:  # "default" must stand alone
+            parser_error(f'forbidden validation combination "{value}"')
+        self._validation = flags
+
+    # not present since 2023-07-draft
+    @property
+    def validator_flags(self):
+        return self._validator_flags
+
+    @validator_flags.setter
+    def validator_flags(self, value):
+        if value is None: return
+        self._validator_flags = value
+
+    # only present in legacy
+    @property
+    def scoring(self):
+        return self._scoring
+
+    @scoring.setter
+    def scoring(self, value):
+        if value is None: return
+        self._scoring = value
+
+    def grading(self, value):  # plain method, not a property: deprecated input key that feeds "scoring"
+        if value is None: return
+        parser_warning('"grading" is deprecated, use "scoring" instead')
+        self._scoring = value
+
+    # present in all versions
+    @property
+    def keywords(self):
+        return self._keywords
+
+    @keywords.setter
+    def keywords(self, value):
+        if value is None: return
+        self._keywords = value
+
+    # todo: add languages (since 2023-07-draft)
+    # todo: add constants (since 2023-07-draft)
+
+    def generate_dict(self):  # reads every getter once; keys whose getter yields None are left out
+        result = {
+            "problem_format_version": self.problem_format_version,
+        }
+        dict_add_unless_none(result, 'type', self.type)
+        dict_add_unless_none(result, 'name', self.name)
+        dict_add_unless_none(result, 'uuid', self.uuid)
+        dict_add_unless_none(result, 'version', self.version)
+        dict_add_unless_none(result, 'author', self.author)
+        dict_add_unless_none(result, 'credits', self.credits)
+        dict_add_unless_none(result, 'source', self.source)
+        dict_add_unless_none(result, 'source_url', self.source_url)
+        dict_add_unless_none(result, 'license', self.license)
+        dict_add_unless_none(result, 'rights_owner', self.rights_owner)
+        dict_add_unless_none(result, 'limits', self.limits)
+        dict_add_unless_none(result, 'validation', self.validation)
+        dict_add_unless_none(result, 'validator_flags', self.validator_flags)
+        dict_add_unless_none(result, 'scoring', self.scoring)
+        dict_add_unless_none(result, 'keywords', self.keywords)
+        return result
+
+class TestdataYaml:
+    """Holder for test data settings (on_reject, grading, scoring, ...); presumably models testdata.yaml.
+
+    Every backing attribute starts out as None, so a getter can be read before its key was ever assigned.
+    """
+    def __init__(self, in_version, out_version):
+        if in_version is None or in_version == 'legacy':
+            self._in_version = ProblemFormatVersion.LEGACY
+        else:
+            parser_unimplemented(f'problem format version: {in_version}')
+        self._out_version = out_version
+        # without these defaults, reading any property before assigning it raised AttributeError
+        self._on_reject = self._scoring = self._grading = self._grader_flags = None
+        self._input_validator_args = self._static_validation = self._full_feedback = None
+        self._accept_score = self._reject_score = self._range = None
+
+    # only present in legacy
+    @property
+    def on_reject(self):
+        return self._on_reject
+
+    @on_reject.setter
+    def on_reject(self, value):
+        if value is None: return
+        if value not in ['break', 'continue']:
+            parser_error(f'illegal on_reject: {value}')
+        self._on_reject = value
+
+    # only present since 2023-07-draft
+    @property
+    def scoring(self):
+        return self._scoring
+
+    @scoring.setter
+    def scoring(self, value):
+        if value is not None: self._scoring = value
+
+    # only present in legacy
+    @property
+    def grading(self):
+        return self._grading
+
+    @grading.setter
+    def grading(self, value):
+        if value is None: return
+        if value not in ['default', 'custom']:
+            parser_error(f'illegal grading: {value}')
+        self._grading = value
+
+    # only present in legacy
+    @property
+    def grader_flags(self):
+        return self._grader_flags
+
+    @grader_flags.setter
+    def grader_flags(self, value):
+        if value is not None: self._grader_flags = value
+
+    # only present in legacy; stored in the same field as input_validator_args
+    @property
+    def input_validator_flags(self):
+        return self._input_validator_args
+
+    @input_validator_flags.setter
+    def input_validator_flags(self, value):
+        if value is not None: self._input_validator_args = value
+
+    # only present since 2023-07-draft
+    @property
+    def input_validator_args(self):
+        return self._input_validator_args
+
+    @input_validator_args.setter
+    def input_validator_args(self, value):
+        if value is not None: self._input_validator_args = value
+
+    # only present since 2023-07-draft
+    @property
+    def static_validation(self):
+        return self._static_validation
+
+    @static_validation.setter
+    def static_validation(self, value):
+        if value is not None: self._static_validation = value
+
+    # only present since 2023-07-draft
+    @property
+    def full_feedback(self):
+        return self._full_feedback
+
+    @full_feedback.setter
+    def full_feedback(self, value):
+        if value is not None: self._full_feedback = value
+
+    # only present in legacy
+    # todo: should this really be a string as specified?
+    @property
+    def accept_score(self):
+        return self._accept_score
+
+    @accept_score.setter
+    def accept_score(self, value):
+        if value is not None: self._accept_score = value
+
+    # only present in legacy
+    # todo: should this really be a string as specified?
+    @property
+    def reject_score(self):
+        return self._reject_score
+
+    @reject_score.setter
+    def reject_score(self, value):
+        if value is not None: self._reject_score = value
+
+    # only present in legacy
+    @property
+    def range(self):
+        return self._range
+
+    @range.setter
+    def range(self, value):
+        if value is not None: self._range = value
+
+def arg_inputdir(path):  # argparse type callback: accept path only if it is an existing directory
+    if os.path.isdir(path):
+        return path
+    raise argparse.ArgumentTypeError(f'inputdir: {path} is not a valid path')
+
+def arg_outputdir(path):
+    """argparse type callback: resolve path and create its parent directory if it is missing."""
+    #if os.path.lexists(path):
+    #    raise argparse.ArgumentTypeError(f'outputdir: {path} already exists')
+    resolved = os.path.realpath(path)
+    try:
+        os.makedirs(os.path.dirname(resolved), exist_ok=True)
+    except OSError as error:
+        raise argparse.ArgumentTypeError(f'outputdir: could not create: {error}')
+    return resolved
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Evaluate (and optionally perform) problem package migration from legacy to current format.')
+    parser.add_argument('inputdir', type=arg_inputdir, help='the path to a problem package in legacy format')
+    parser.add_argument('-o', '--outputdir', type=arg_outputdir, help='folder of the output package (to be created)')
+    parser.add_argument('-f', '--format', choices=['legacy', 'legacy-icpc', '2023-07-draft'], default='2023-07-draft', help='problem version format of the target')
+    options = parser.parse_args()
+    # map the --format choice onto the internal version enum
+    match options.format:
+        case 'legacy':
+            target_format = ProblemFormatVersion.LEGACY
+        case 'legacy-icpc':
+            target_format = ProblemFormatVersion.LEGACY_ICPC
+        case '2023-07-draft':
+            target_format = ProblemFormatVersion.V2023_07
+        case _:
+            sys.exit(f'unexpected target problem format version: {options.format}')
+    # the working copy is created next to the output directory (system temp dir when -o is absent)
+    if options.outputdir is None:
+        parent = None
+    else:
+        parent, _ = os.path.split(options.outputdir)
+
+    tempdir = tempfile.mkdtemp(prefix='migrateproblem-', dir=parent)
+    print(tempdir)
+    shutil.copytree(options.inputdir, tempdir, dirs_exist_ok=True)  # NOTE(review): tempdir is never renamed to outputdir nor removed
+    # load the metadata; every failure mode ends in parser_error instead of a traceback
+    problem_yaml_path = os.path.join(options.inputdir, 'problem.yaml')
+    try:
+        with open(problem_yaml_path, 'r', encoding='utf-8') as problem_yaml_stream:
+            problem_yaml_object = yaml.safe_load(problem_yaml_stream)
+    except FileNotFoundError:
+        parser_error(f'problem metadata not found in inputdir ({problem_yaml_path})')
+    except (yaml.YAMLError, UnicodeDecodeError) as error:  # YAMLError also covers scanner/constructor errors
+        parser_error(f'problem metadata parsing failed: {error}')
+    if problem_yaml_object is None:  # an empty file loads as None; treat it as "no keys given"
+        problem_yaml_object = {}
+    if not isinstance(problem_yaml_object, dict): parser_error(f'problem metadata must be a mapping, got {type(problem_yaml_object).__name__}')
+    # pop each known key; whatever remains afterwards is reported as superfluous
+    problem_yaml = ProblemYaml(problem_yaml_object.pop('problem_format_version', None), target_format)
+    problem_yaml.type = problem_yaml_object.pop('type', None)
+    problem_yaml.name = problem_yaml_object.pop('name', None)
+    problem_yaml.uuid = problem_yaml_object.pop('uuid', None)
+    problem_yaml.version = problem_yaml_object.pop('version', None)
+    problem_yaml.author = problem_yaml_object.pop('author', None)
+    problem_yaml.credits = problem_yaml_object.pop('credits', None)
+    problem_yaml.source = problem_yaml_object.pop('source', None)
+    problem_yaml.source_url = problem_yaml_object.pop('source_url', None)
+    problem_yaml.license = problem_yaml_object.pop('license', None)
+    problem_yaml.rights_owner = problem_yaml_object.pop('rights_owner', None)
+    problem_yaml.limits = problem_yaml_object.pop('limits', None)
+    problem_yaml.validation = problem_yaml_object.pop('validation', None)
+    problem_yaml.validator_flags = problem_yaml_object.pop('validator_flags', None)
+    problem_yaml.scoring = problem_yaml_object.pop('scoring', None)
+    problem_yaml.grading(problem_yaml_object.pop('grading', None))
+    problem_yaml.keywords = problem_yaml_object.pop('keywords', None)
+
+    if problem_yaml_object:
+        parser_warning(f'superfluous keys in "problem.yaml": {problem_yaml_object}')
+
+    print(f'{problem_yaml.generate_dict()}')
+
+    print(problem_yaml_object)
+    # key_grader_flags = problem_yaml_object.pop('grader_flags', None)
+    # match key_grader_flags:
+    #     case None:
+    #         pass
+    #     case 'min' | 'sum':
+    #         problem_yaml_object['aggregation'] = key_grader_flags
+    #     case 'accept_if_any_accepted' | 'always_accept' | 'first_error' | 'ignore_sample' | 'max' | 'worst_error':
+    #         parser_unimplemented(f'grader_flags value "{key_grader_flags}"')
+    #     case 'avg':
+    #         parser_error(f'unsupported grader_flags value "{key_grader_flags}" - this package cannot be migrated')
+    #     case _:
+    #         parser_error(f'unknown grader_flags value "{key_grader_flags}"')
+
+    print(type(problem_yaml_object))
+    print(problem_yaml_object)
diff --git a/problemtools/schemas/problem-yaml-schema.json b/problemtools/schemas/problem-yaml-schema.json
new file mode 100644
index 00000000..31f7957f
--- /dev/null
+++ b/problemtools/schemas/problem-yaml-schema.json
@@ -0,0 +1,196 @@
+{
+  "$id": "https://www.kattis.com/problem-package-format/spec/legacy.html",
+  "$defs": {
+    "license-type": {
+      "type": "string",
+      "enum": [
+        "unknown",
+        "public domain",
+        "cc0",
+        "cc by",
+        "cc by-sa",
+        "educational",
+        "permission"
+      ]
+    },
+    "limits-type": {
+      "type": "object",
+      "properties": {
+        "time_multiplier": {
+          "type": "number",
+          "default": 5
+        },
+        "time_safety_margin": {
+          "type": "number",
+          "default": 2
+        },
+        "memory": {
+          "type": "integer"
+        },
+        "output": {
+          "type": "integer"
+        },
+        "code": {
+          "type": "integer"
+        },
+        "compilation_time": {
+          "type": "integer"
+        },
+        "compilation_memory": {
+          "type": "integer"
+        },
+        "validation_time": {
+          "type": "integer"
+        },
+        "validation_memory": {
+          "type": "integer"
+        },
+        "validation_output": {
+          "type": "integer"
+        }
+      },
+      "additionalProperties": false
+    },
+    "scoring-type": {
+      "type": "object",
+      "properties": {
+        "objective": {
+          "type": "string",
+          "enum": [
+            "min",
+            "max"
+          ],
+          "default": "max"
+        },
+        "show_test_data_groups": {
+          "type": "boolean",
+          "default": false
+        }
+      },
+      "additionalProperties": false
+    },
+    "type-type": {
+      "type": "string",
+      "enum": [
+        "pass-fail",
+        "scoring"
+      ]
+    },
+    "validation-type": {
+      "type": "string",
+      "enum": [
+        "default",
+        "custom",
+        "custom interactive",
+        "custom interactive score",
+        "custom score",
+        "custom score interactive"
+      ],
+      "default": "default"
+    },
+    "version-type": {
+      "type": "string",
+      "const": "legacy"
+    }
+  },
+  "type": "object",
+  "properties": {
+    "problem_format_version": {
+      "$ref": "#/$defs/version-type",
+      "default": "legacy"
+    },
+    "type": {
+      "$ref": "#/$defs/type-type",
+      "default": "pass-fail"
+    },
+    "name": {
+      "type": "string"
+    },
+    "author": {
+      "type": "string"
+    },
+    "source": {
+      "type": "string"
+    },
+    "source_url": {
+      "type": "string"
+    },
+    "license": {
+      "$ref": "#/$defs/license-type",
+      "default": "unknown"
+    },
+    "rights_owner": {
+      "type": "string"
+    },
+    "limits": {
+      "$ref": "#/$defs/limits-type"
+    },
+    "validation": {
+      "$ref": "#/$defs/validation-type"
+    },
+    "validator_flags": {
+      "type": "string"
+    },
+    "scoring": {
+      "$ref": "#/$defs/scoring-type"
+    },
+    "grading": {
+      "$ref": "#/$defs/scoring-type",
+      "deprecated": true
+    },
+    "keywords": {
+      "type": "string"
+    }
+  },
+  "dependentRequired": {
+    "source_url": [
+      "source"
+    ]
+  },
+  "allOf": [
+    {
+      "$comment": "Guarded by 'required': without it an absent 'license' satisfies 'if' vacuously and 'rights_owner' becomes mandatory.",
+      "if": {
+        "required": ["license"],
+        "properties": {
+          "license": {
+            "not": {
+              "enum": [
+                "unknown",
+                "public domain"
+              ]
+            }
+          }
+        }
+      },
+      "then": {
+        "required": ["rights_owner"]
+      }
+    },
+    {
+      "$comment": "Guarded by 'required': without it an absent 'license' satisfies 'if' vacuously and 'rights_owner' becomes forbidden.",
+      "if": {
+        "required": ["license"],
+        "properties": {
+          "license": {
+            "const": "public domain"
+          }
+        }
+      },
+      "then": {
+        "not": {
+          "required": ["rights_owner"]
+        }
+      }
+    },
+    {
+      "not": {
+        "required": [
+          "scoring",
+          "grading"
+        ]
+      }
+    }
+  ],
+  "additionalProperties": false
+}
\ No newline at end of file