Skip to content

Commit f9f00ad

Browse files
gustavocidornelas and whoseoyster
authored and committed
Completes OPEN-4010 Refactor validators
1 parent 46258e8 commit f9f00ad

12 files changed

+1493
-1528
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2020

2121
### Changed
2222

23+
* Renamed `predictionsColumnName` argument from the datasets' configuration YAML to `predictionScoresColumnName`.
2324
* Migrated package name from [unboxapi](https://pypi.org/project/unboxapi/) to [openlayer](https://pypi.org/project/openlayer/) due to a company name change.
2425
* Required Python version `>=3.7` and `<3.9`.

docs/source/reference/validate.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ Models
1616
:toctree: api/
1717
:template: class.rst
1818

19-
validators.ModelValidator
19+
validators.model_validators.ModelValidator
2020

2121
Datasets
2222
--------
2323
.. autosummary::
2424
:toctree: api/
2525
:template: class.rst
2626

27-
validators.DatasetValidator
27+
validators.dataset_validators.DatasetValidator
2828

openlayer/__init__.py

+68-22
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,18 @@
99
import pandas as pd
1010
import yaml
1111

12-
from . import api, exceptions, utils, validators
12+
from . import api, exceptions, utils
1313
from .projects import Project
1414
from .schemas import BaselineModelSchema, DatasetSchema, ModelSchema
1515
from .tasks import TaskType
16+
17+
# from validators import models as model_validators
18+
from .validators import (
19+
commit_validators,
20+
dataset_validators,
21+
model_validators,
22+
project_validators,
23+
)
1624
from .version import __version__ # noqa: F401
1725

1826
OPENLAYER_DIR = os.path.join(os.path.expanduser("~"), ".openlayer")
@@ -91,7 +99,9 @@ def create_project(
9199
"description": description,
92100
"task_type": task_type,
93101
}
94-
project_validator = validators.ProjectValidator(project_config=project_config)
102+
project_validator = project_validators.ProjectValidator(
103+
project_config=project_config
104+
)
95105
failed_validations = project_validator.validate()
96106

97107
if failed_validations:
@@ -101,7 +111,11 @@ def create_project(
101111
) from None
102112

103113
endpoint = "projects"
104-
payload = dict(name=name, description=description, taskType=task_type.value)
114+
payload = {
115+
"name": name,
116+
"description": description,
117+
"taskType": task_type.value,
118+
}
105119
project_data = self.api.post_request(endpoint, body=payload)
106120

107121
project = Project(project_data, self.api.upload, self)
@@ -232,22 +246,29 @@ def add_model(
232246
233247
The model configuration YAML file must contain the following fields:
234248
235-
- ``name`` : str
249+
name : str
236250
Name of the model.
237-
- ``architectureType`` : str
251+
architectureType : str
238252
The model's framework. Must be one of the supported frameworks
239253
on :obj:`ModelType`.
240-
- ``classNames`` : List[str]
254+
classNames : List[str]
241255
List of class names corresponding to the outputs of your predict function.
242256
E.g. ``['positive', 'negative']``.
243-
- ``featureNames`` : List[str], default []
257+
featureNames : List[str], default []
244258
List of input feature names. Only applicable if your ``task_type`` is
245259
:obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
246-
- ``categoricalFeatureNames`` : List[str], default []
260+
categoricalFeatureNames : List[str], default []
247261
A list containing the names of all categorical features used by the model.
248262
E.g. ``["Gender", "Geography"]``. Only applicable if your ``task_type`` is
249263
:obj:`TaskType.TabularClassification` or :obj:`TaskType.TabularRegression`.
250-
- ``metadata`` : Dict[str, any], default {}
264+
predictionThreshold : float, default None
265+
The threshold used to determine the predicted class. Only applicable if you
266+
are using a binary classifier and you provided the ``predictionScoresColumnName``
267+
with the lists of class probabilities in your datasets (refer to :obj:`add_dataframe`).
268+
269+
If you provided ``predictionScoresColumnName`` but not ``predictionThreshold``,
270+
the predicted class is defined by the argmax of the lists in ``predictionScoresColumnName``.
271+
metadata : Dict[str, any], default {}
251272
Dictionary containing metadata about the model. This is the metadata that
252273
will be displayed on the Openlayer platform.
253274
@@ -263,8 +284,8 @@ def add_model(
263284
- ``prediction_interface.py``
264285
The prediction interface file.
265286
- ``model artifacts``
266-
The model artifacts. This can be a single file or a directory containing
267-
multiple files. The model artifacts must be compatible with the
287+
The model artifacts. This can be a single file, multiple files or a directory.
288+
The model artifacts must be compatible with the
268289
prediction interface file.
269290
- ``requirements.txt``
270291
The requirements file. This file contains the dependencies needed to run
@@ -389,7 +410,7 @@ def add_model(
389410
)
390411

391412
# Validate model package
392-
model_package_validator = validators.ModelValidator(
413+
model_package_validator = model_validators.ModelValidator(
393414
model_package_dir=model_package_dir,
394415
model_config_file_path=model_config_file_path,
395416
sample_data=sample_data,
@@ -467,7 +488,8 @@ def add_baseline_model(
467488
)
468489

469490
# Validate the baseline model
470-
baseline_model_validator = validators.BaselineModelValidator(
491+
492+
baseline_model_validator = model_validators.BaselineModelValidator(
471493
model_config_file_path=model_config_file_path,
472494
)
473495
failed_validations = baseline_model_validator.validate()
@@ -536,15 +558,27 @@ def add_dataset(
536558
Column header in the csv containing the input text. Only applicable if
537559
your ``task_type`` is :obj:`TaskType.TextClassification`.
538560
predictionsColumnName : str, default None
539-
Column header in the csv containing the predictions. Only applicable if you
540-
are uploading a model as well with the :obj:`add_model` method.
561+
Column header in the csv containing the model's predictions as **zero-indexed
562+
integers**. Only applicable if you are uploading a model as well with the
563+
:obj:`add_model` method.
564+
565+
This is optional if you provide a ``predictionScoresColumnName``.
566+
567+
.. important::
568+
The values in this column must be zero-indexed integer values.
569+
predictionScoresColumnName : str, default None
570+
Column header in the csv containing the model's predictions as **lists of
571+
class probabilities**. Only applicable if you are uploading a model as well with
572+
the :obj:`add_model` method.
573+
574+
This is optional if you provide a ``predictionsColumnName``.
541575
542576
.. important::
543577
Each cell in this column must contain a list of
544578
class probabilities. For example, for a binary classification
545579
task, the column with the predictions should look like this:
546580
547-
**predictions**
581+
**prediction_scores**
548582
549583
``[0.1, 0.9]``
550584
@@ -684,7 +718,7 @@ class probabilities. For example, for a binary classification
684718
>>> project.push()
685719
"""
686720
# Validate dataset
687-
dataset_validator = validators.DatasetValidator(
721+
dataset_validator = dataset_validators.DatasetValidator(
688722
dataset_config_file_path=dataset_config_file_path,
689723
dataset_file_path=file_path,
690724
)
@@ -752,15 +786,27 @@ def add_dataframe(
752786
Column header in the dataframe containing the input text. Only applicable if
753787
your ``task_type`` is :obj:`TaskType.TextClassification`.
754788
predictionsColumnName : str, default None
755-
Column header in the dataframe containing the predictions. Only applicable if you
756-
are uploading a model as well with the :obj:`add_model` method.
789+
Column header in the dataframe containing the model's predictions as **zero-indexed
790+
integers**. Only applicable if you are uploading a model as well with the
791+
:obj:`add_model` method.
792+
793+
This is optional if you provide a ``predictionScoresColumnName``.
794+
795+
.. important::
796+
The values in this column must be zero-indexed integer values.
797+
predictionScoresColumnName : str, default None
798+
Column header in the dataframe containing the model's predictions as **lists of
799+
class probabilities**. Only applicable if you are uploading a model as well with
800+
the :obj:`add_model` method.
801+
802+
This is optional if you provide a ``predictionsColumnName``.
757803
758804
.. important::
759805
Each cell in this column must contain a list of
760806
class probabilities. For example, for a binary classification
761807
task, the column with the predictions should look like this:
762808
763-
**predictions**
809+
**prediction_scores**
764810
765811
``[0.1, 0.9]``
766812
@@ -950,7 +996,7 @@ def commit(self, message: str, project_id: int, force: bool = False):
950996
>>> project.push()
951997
"""
952998
# Validate commit
953-
commit_validator = validators.CommitValidator(commit_message=message)
999+
commit_validator = commit_validators.CommitValidator(commit_message=message)
9541000
failed_validations = commit_validator.validate()
9551001

9561002
if failed_validations:
@@ -1039,7 +1085,7 @@ def push(self, project_id: int):
10391085
commit = yaml.safe_load(commit_file)
10401086

10411087
# Validate bundle resources
1042-
commit_bundle_validator = validators.CommitBundleValidator(
1088+
commit_bundle_validator = commit_validators.CommitBundleValidator(
10431089
bundle_path=project_dir,
10441090
skip_dataset_validation=True,
10451091
skip_model_validation=False, # Don't skip because the sample data is different

openlayer/api.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ def _http_request(
9797

9898
@staticmethod
9999
def _raise_on_respose(res: Response):
100-
101100
try:
102101
message = res.json().get("error", res.text)
103102
except ValueError:
@@ -204,7 +203,12 @@ def upload_blob_s3(
204203
e, lambda monitor: t.update(min(t.total, monitor.bytes_read) - t.n)
205204
)
206205
headers = {"Content-Type": m.content_type}
207-
res = requests.post(presigned_json["url"], data=m, headers=headers, verify=VERIFY_REQUESTS)
206+
res = requests.post(
207+
presigned_json["url"],
208+
data=m,
209+
headers=headers,
210+
verify=VERIFY_REQUESTS,
211+
)
208212

209213
if res.ok:
210214
body["storageUri"] = presigned_json["storageUri"]
@@ -232,7 +236,7 @@ def upload_blob_gcs(
232236
presigned_json["url"],
233237
data=wrapped_file,
234238
headers={"Content-Type": "application/x-gzip"},
235-
verify=VERIFY_REQUESTS
239+
verify=VERIFY_REQUESTS,
236240
)
237241
if res.ok:
238242
body["storageUri"] = presigned_json["storageUri"]
@@ -263,7 +267,7 @@ def upload_blob_azure(
263267
"Content-Type": "application/x-gzip",
264268
"x-ms-blob-type": "BlockBlob",
265269
},
266-
verify=VERIFY_REQUESTS
270+
verify=VERIFY_REQUESTS,
267271
)
268272
if res.ok:
269273
body["storageUri"] = presigned_json["storageUri"]

openlayer/schemas.py

+6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
COLUMN_NAME_REGEX,
2828
]
2929

30+
3031
# ---------------------------------- Schemas --------------------------------- #
3132
class BaselineModelSchema(ma.Schema):
3233
"""Schema for baseline models."""
@@ -86,6 +87,11 @@ class DatasetSchema(ma.Schema):
8687
allow_none=True,
8788
load_default=None,
8889
)
90+
predictionScoresColumnName = ma.fields.Str(
91+
validate=COLUMN_NAME_VALIDATION_LIST,
92+
allow_none=True,
93+
load_default=None,
94+
)
8995
sep = ma.fields.Str(load_default=",")
9096
textColumnName = ma.fields.Str(
9197
validate=COLUMN_NAME_VALIDATION_LIST,

0 commit comments

Comments
 (0)