Skip to content

feat: Handle large schemas by stripping titles #628

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 45 additions & 57 deletions google/genai/_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,42 @@ def handle_null_fields(schema: dict[str, Any]):
del schema['anyOf']


def _is_schema_too_large(schema: dict[str, Any]) -> bool:
"""Checks if the schema is too large.

Args:
schema: The schema to check.

Returns:
True if the schema is too large, False otherwise.
"""
# The maximum size of the schema is 10000 characters.
# This is a conservative estimate based on the "too many states for serving" error.
schema_str = str(schema)
return len(schema_str) > 10000


def _strip_titles(schema: dict[str, Any]) -> None:
"""Recursively strips titles from a schema and its sub-schemas.

Args:
schema: The schema to strip titles from.
"""
if 'title' in schema:
del schema['title']

if schema.get('type', '').upper() == 'OBJECT':
if (properties := schema.get('properties')) is not None:
for sub_schema in properties.values():
_strip_titles(sub_schema)
elif schema.get('type', '').upper() == 'ARRAY':
if (items := schema.get('items')) is not None:
_strip_titles(items)
elif 'anyOf' in schema:
for sub_schema in schema['anyOf']:
_strip_titles(sub_schema)


def process_schema(
schema: dict[str, Any],
client: _api_client.BaseApiClient,
Expand All @@ -591,63 +627,11 @@ def process_schema(
):
"""Updates the schema and each sub-schema inplace to be API-compatible.

- Inlines the $defs.

Example of a schema before and after (with mldev):
Before:

`schema`

{
'items': {
'$ref': '#/$defs/CountryInfo'
},
'title': 'Placeholder',
'type': 'array'
}


`defs`

{
'CountryInfo': {
'properties': {
'continent': {
'title': 'Continent',
'type': 'string'
},
'gdp': {
'title': 'Gdp',
'type': 'integer'}
},
}
'required':['continent', 'gdp'],
'title': 'CountryInfo',
'type': 'object'
}
}

After:

`schema`
{
'items': {
'properties': {
'continent': {
'title': 'Continent',
'type': 'string'
},
'gdp': {
'title': 'Gdp',
'type': 'integer'
},
}
'required':['continent', 'gdp'],
'title': 'CountryInfo',
'type': 'object'
},
'type': 'array'
}
Args:
schema: The schema to process.
client: The API client.
defs: The definitions.
order_properties: Whether to order the properties.
"""
if not client.vertexai:
if schema.get('default') is not None:
Expand Down Expand Up @@ -727,6 +711,10 @@ def _recurse(sub_schema: dict[str, Any]) -> dict[str, Any]:
if (items := schema.get('items')) is not None:
schema['items'] = _recurse(items)

# If the schema is too large, strip the titles from it and from its sub-schemas.
if _is_schema_too_large(schema):
_strip_titles(schema)


def _process_enum(
enum: EnumMeta, client: _api_client.BaseApiClient
Expand Down
186 changes: 186 additions & 0 deletions google/genai/tests/transformers/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,3 +560,189 @@ def test_t_schema_does_not_set_property_ordering_for_schema_type(client):
with pytest.raises(ValueError) as e:
_transformers.t_schema(client, schema)
assert 'Default value is not supported' in str(e)


def test_is_schema_too_large():
  """Checks that _is_schema_too_large tells small schemas from oversized ones."""

  def build_schema(foo_title, bar_title):
    # Two-property object schema; only the titles vary between the cases.
    return {
        'type': 'object',
        'properties': {
            'foo': {'type': 'string', 'title': foo_title},
            'bar': {'type': 'integer', 'title': bar_title},
        },
    }

  # Short titles keep the schema well under the limit.
  small_schema = build_schema('Foo', 'Bar')
  assert not _transformers._is_schema_too_large(small_schema)

  # Very long titles push the schema over the limit.
  oversized_schema = build_schema('Foo' * 5000, 'Bar' * 5000)
  assert _transformers._is_schema_too_large(oversized_schema)


def test_process_schema_strips_titles_if_too_large():
  """Checks that process_schema drops property titles only for oversized schemas."""

  def build_schema(foo_title, bar_title):
    # Two-property object schema; only the titles vary between the cases.
    return {
        'type': 'object',
        'properties': {
            'foo': {'type': 'string', 'title': foo_title},
            'bar': {'type': 'integer', 'title': bar_title},
        },
    }

  property_names = ('foo', 'bar')

  # A small schema keeps its property titles.
  schema = build_schema('Foo', 'Bar')
  client = google_genai_client_module.Client(api_key='test-api-key')
  _transformers.process_schema(schema, client)
  for name in property_names:
    assert 'title' in schema['properties'][name]

  # An oversized schema has its property titles removed.
  large_schema = build_schema('Foo' * 5000, 'Bar' * 5000)
  _transformers.process_schema(large_schema, client)
  for name in property_names:
    assert 'title' not in large_schema['properties'][name]


def test_strip_titles():
  """Verifies that _strip_titles drops the title at every nesting level."""
  quux_variants = [
      {'type': 'STRING', 'title': 'QuuxString'},
      {'type': 'INTEGER', 'title': 'QuuxInt'},
  ]
  props = {
      'foo': {'type': 'STRING', 'title': 'Foo'},
      'bar': {
          'type': 'OBJECT',
          'title': 'Bar',
          'properties': {'baz': {'type': 'INTEGER', 'title': 'Baz'}},
      },
      'qux': {
          'type': 'ARRAY',
          'title': 'Qux',
          'items': {'type': 'STRING', 'title': 'QuxItem'},
      },
      'quux': {'title': 'Quux', 'anyOf': quux_variants},
  }
  schema = {'type': 'OBJECT', 'title': 'Root', 'properties': props}

  _transformers._strip_titles(schema)

  # Every schema node that carried a title before the call: the root, each
  # property, the nested object property, the array items and both anyOf
  # branches.
  stripped_nodes = [
      schema,
      schema['properties']['foo'],
      schema['properties']['bar'],
      schema['properties']['bar']['properties']['baz'],
      schema['properties']['qux'],
      schema['properties']['qux']['items'],
      schema['properties']['quux'],
      schema['properties']['quux']['anyOf'][0],
      schema['properties']['quux']['anyOf'][1],
  ]
  for node in stripped_nodes:
    assert 'title' not in node


def test_process_schema_strips_titles_when_too_large():
  """Checks that process_schema removes root and property titles when oversized."""
  client = google_genai_client_module.Client(api_key='test-api-key')

  # The long repeated titles are what make this schema oversized.
  oversized_properties = {
      'foo': {'type': 'STRING', 'title': 'Foo' * 1000},
      'bar': {'type': 'INTEGER', 'title': 'Bar' * 1000},
  }
  large_schema = {
      'type': 'OBJECT',
      'title': 'Root' * 1000,
      'properties': oversized_properties,
  }

  _transformers.process_schema(large_schema, client)

  # Neither the root nor any property may still carry a title.
  assert 'title' not in large_schema
  for property_name in ('foo', 'bar'):
    assert 'title' not in large_schema['properties'][property_name]


def test_process_schema_preserves_titles_when_not_too_large():
  """Checks that process_schema leaves titles alone on a small schema."""
  client = google_genai_client_module.Client(api_key='test-api-key')

  small_properties = {
      'foo': {'type': 'STRING', 'title': 'Foo'},
      'bar': {'type': 'INTEGER', 'title': 'Bar'},
  }
  schema = {'type': 'OBJECT', 'title': 'Root', 'properties': small_properties}

  _transformers.process_schema(schema, client)

  # The root title and each property title must survive unchanged.
  assert schema['title'] == 'Root'
  expected_titles = {'foo': 'Foo', 'bar': 'Bar'}
  for property_name, expected_title in expected_titles.items():
    assert schema['properties'][property_name]['title'] == expected_title