diff --git a/.github/workflows/python-pytest.yml b/.github/workflows/python-pytest.yml index 4b1ad344..9c724b79 100644 --- a/.github/workflows/python-pytest.yml +++ b/.github/workflows/python-pytest.yml @@ -56,4 +56,4 @@ jobs: API_KEY: ${{ secrets.API_KEY }} VI_USERNAME: ${{ secrets.VI_USERNAME }} VI_API_KEY: ${{ secrets.VI_API_KEY }} - run: python -m pytest ${{ matrix.test-path }} --use_client --reruns 5 --reruns-delay 1 + run: python -m pytest ${{ matrix.test-path }} --use_client --reruns 5 --reruns-delay 10 diff --git a/vectorai/api/api.py b/vectorai/api/api.py index 1c75fe81..50f1d725 100644 --- a/vectorai/api/api.py +++ b/vectorai/api/api.py @@ -123,14 +123,29 @@ def _delete_collection(self,collection_name, **kwargs): @retry() @return_curl_or_response('json') - def _list_collections(self,sort_by_created_at_date=False, reverse=True, **kwargs): + def _list_collections(self,sort_by_created_at_date=False, asc=False, **kwargs): return requests.get( url=self.url+'/project/list_collections', params=dict( username=self.username, api_key=self.api_key, sort_by_created_at_date=sort_by_created_at_date, - reverse=reverse, + asc=asc, + )) + + @retry() + @return_curl_or_response('json') + def list_collections_info(self,collection_names=[], sort_by_created_at_date=False, asc=False, page_size=20, page=1, **kwargs): + return requests.get( + url=self.url+'/project/list_collections_info', + params=dict( + username=self.username, + api_key=self.api_key, + collection_names=collection_names, + sort_by_created_at_date=sort_by_created_at_date, + asc=asc, + page_size=page_size, + page=page, )) @retry() @@ -169,13 +184,14 @@ def collection_vector_health(self,collection_name, **kwargs): @retry() @return_curl_or_response('json') - def collection_schema_stats(self,collection_name, **kwargs): + def collection_schema_stats(self,collection_name, include_zero_vectors=True, **kwargs): return requests.get( url=self.url+'/project/collection_schema_stats', params=dict( username=self.username, api_key=self.api_key, collection_name=collection_name, + include_zero_vectors=include_zero_vectors, )) @retry() @@ -201,6 +217,31 @@ def add_collection_metadata(self, collection_name, metadata, **kwargs): metadata=metadata, )) + @retry() + @return_curl_or_response('json') + def search_collections(self, collection_search_query, sort_by_created_at_date=False, reverse=False, **kwargs): + """Search collections +Search collections + +Args +======== +username: Username +api_key: Api Key, you can request it from request_api_key +collection_search_query: The collection search query +sort_by_created_at_date: Sort by created at date. By default shows the newest collections. Set reverse=False to get oldest collection. +reverse: Sort by created at date. By default shows the newest collections. Set reverse=False to get oldest collection. + +""" + return requests.post( + url=self.url+'/project/search_collections', + json=dict( + username=self.username, + api_key=self.api_key, + collection_search_query=collection_search_query, + sort_by_created_at_date=sort_by_created_at_date, + reverse=reverse, + )) + @retry() @return_curl_or_response('json') def collection_metadata(self,collection_name, **kwargs): @@ -254,7 +295,7 @@ def job_status(self,job_id, **kwargs): @retry() @return_curl_or_response('json') - def insert(self, collection_name, document={}, insert_date=True, overwrite=True, update_schema=True, quick=False, **kwargs): + def insert(self, collection_name, document={}, insert_date=True, overwrite=True, update_schema=True, quick=False, pipeline=[], **kwargs): """Insert a document into a Collection When inserting the document you can specify your own id for a document by using the field name **"\_id"**. For specifying your own vector use the suffix (ends with) **"\_vector\_"** for the field name. @@ -270,6 +311,7 @@ def insert(self, collection_name, document={}, insert_date=True, overwrite=True, overwrite: Whether to overwrite document if it exists. update_schema: Whether the api should check the documents for vector datatype to update the schema. quick: This will run the quickest insertion possible, which means there will be no schema checks or collection checks. +pipeline: This will run pipelines for the insert. example: pipeline=["encoders"] """ return requests.post( @@ -283,32 +325,49 @@ def insert(self, collection_name, document={}, insert_date=True, overwrite=True, overwrite=overwrite, update_schema=update_schema, quick=quick, + pipeline=pipeline, )) @retry() @return_curl_or_response('json') - def insert_and_encode(self, collection_name, encode_models, document={}, insert_date=True, overwrite=True, update_schema=True, quick=False, **kwargs): + def insert_and_encode(self, encoders, collection_name, document={}, insert_date=True, overwrite=True, update_schema=True, quick=False, store_to_pipeline=True, **kwargs): """Insert and encode document into a Collection -Insert a document and encode specified fields into vectors with provided model urls or model names. { - "thumbnail" : {"model_url" : ""https://a_vector_model_url.com/encode_image_url"", "body" : "url"}, +Insert a document and encode specified fields into vectors with provided model urls or model names. + + { + "thumbnail" : {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url"}, "short_description" : {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text"}, "short_description" : {"model_url" : "bert", "alias" : "bert"}, } + +This primarily uses deployed models. + Args ======== username: Username api_key: Api Key, you can request it from request_api_key +encoders: An array structure of models to encode fields with. + Encoders can be a `model_url` or a `model_name`. + For model_name, the options are: `image_text`, `image`, `text`. `text_multi`, `text_image`. + Note: image_text encodes images for text to image search whereas text_image encodes texts + for text to image search (text to image search/image to text search works both ways). + For model_url, you are free to deploy your own model and specify the required body as such. + + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_name" : "text", "body" : "text", "field": "short_description", "alias":"bert"}, + {"model_name" : "image_text", "body" : "url", "field" : "thumbnail"}, + ] + collection_name: Name of Collection document: A Document is a JSON-like data that we store our metadata and vectors with. For specifying id of the document use the field '\_id', for specifying vector field use the suffix of '\_vector\_' insert_date: Whether to include insert date as a field 'insert_date_'. overwrite: Whether to overwrite document if it exists. update_schema: Whether the api should check the documents for vector datatype to update the schema. quick: This will run the quickest insertion possible, which means there will be no schema checks or collection checks. -encode_models: A json structure of models to encode fields with. { - "thumbnail" : {"model_url" : ""https://a_vector_model_url.com/encode_image_url"", "body" : "url"}, - "short_description" : {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text"}, - } +store_to_pipeline: Whether to store the encoders to pipeline """ return requests.post( @@ -316,18 +375,19 @@ def insert_and_encode(self, collection_name, encode_models, document={}, insert_ json=dict( username=self.username, api_key=self.api_key, + encoders=encoders, collection_name=collection_name, document=document, insert_date=insert_date, overwrite=overwrite, update_schema=update_schema, quick=quick, - encode_models=encode_models, + store_to_pipeline=store_to_pipeline, )) @retry() @return_curl_or_response('json') - def bulk_insert(self, collection_name, documents={}, insert_date=True, overwrite=True, update_schema=True, quick=False, **kwargs): + def bulk_insert(self, collection_name, documents={}, insert_date=True, overwrite=True, update_schema=True, quick=False, pipeline=[], **kwargs): """Insert multiple documents into a Collection When inserting the document you can specify your own id for a document by using the field name **"\_id"**. For specifying your own vector use the suffix (ends with) **"\_vector\_"** for the field name. @@ -343,6 +403,7 @@ def bulk_insert(self, collection_name, documents={}, insert_date=True, overwrite overwrite: Whether to overwrite document if it exists. update_schema: Whether the api should check the documents for vector datatype to update the schema. quick: This will run the quickest insertion possible, which means there will be no schema checks or collection checks. +pipeline: This will run pipelines for the insert. example: pipeline=["encoders"] """ return requests.post( @@ -356,11 +417,12 @@ def bulk_insert(self, collection_name, documents={}, insert_date=True, overwrite overwrite=overwrite, update_schema=update_schema, quick=quick, + pipeline=pipeline, )) @retry() @return_curl_or_response('json') - def bulk_insert_and_encode(self, collection_name, encode_models, documents={}, insert_date=True, overwrite=True, update_schema=True, quick=False, **kwargs): + def bulk_insert_and_encode(self, encoders, collection_name, documents={}, insert_date=True, overwrite=True, update_schema=True, quick=False, store_to_pipeline=True, **kwargs): """Insert and encode multiple documents into a Collection Insert multiple document and encode specified fields into vectors with provided model urls or model names. { "thumbnail" : {"model_url" : ""https://a_vector_model_url.com/encode_image_url"", "body" : "url"}, @@ -372,16 +434,27 @@ def bulk_insert_and_encode(self, collection_name, encode_models, documents={}, i ======== username: Username api_key: Api Key, you can request it from request_api_key +encoders: An array structure of models to encode fields with. + Encoders can be a `model_url` or a `model_name`. + For model_name, the options are: `image_text`, `image`, `text`. `text_multi`, `text_image`. + Note: image_text encodes images for text to image search whereas text_image encodes texts + for text to image search (text to image search/image to text search works both ways). + For model_url, you are free to deploy your own model and specify the required body as such. + + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_name" : "text", "body" : "text", "field": "short_description", "alias":"bert"}, + {"model_name" : "image_text", "body" : "url", "field" : "thumbnail"}, + ] + collection_name: Name of Collection documents: A list of documents. Document is a JSON-like data that we store our metadata and vectors with. For specifying id of the document use the field '\_id', for specifying vector field use the suffix of '\_vector\_' insert_date: Whether to include insert date as a field 'insert_date_'. overwrite: Whether to overwrite document if it exists. update_schema: Whether the api should check the documents for vector datatype to update the schema. quick: This will run the quickest insertion possible, which means there will be no schema checks or collection checks. -encode_models: A json structure of models to encode fields with. { - "thumbnail" : {"model_url" : ""https://a_vector_model_url.com/encode_image_url"", "body" : "url"}, - "short_description" : {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text"}, - } +store_to_pipeline: Whether to store the encoders to pipeline """ return requests.post( @@ -389,13 +462,49 @@ def bulk_insert_and_encode(self, collection_name, encode_models, documents={}, i json=dict( username=self.username, api_key=self.api_key, + encoders=encoders, collection_name=collection_name, documents=documents, insert_date=insert_date, overwrite=overwrite, update_schema=update_schema, quick=quick, - encode_models=encode_models, + store_to_pipeline=store_to_pipeline, + )) + + @retry() + @return_curl_or_response('json') + def store_encoders_pipeline(self, encoders, collection_name, **kwargs): + """Store encoder to the collection's pipeline + +Args +======== +username: Username +api_key: Api Key, you can request it from request_api_key +encoders: An array structure of models to encode fields with. + Encoders can be a `model_url` or a `model_name`. + For model_name, the options are: `image_text`, `image`, `text`. `text_multi`, `text_image`. + Note: image_text encodes images for text to image search whereas text_image encodes texts + for text to image search (text to image search/image to text search works both ways). + For model_url, you are free to deploy your own model and specify the required body as such. + + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_name" : "text", "body" : "text", "field": "short_description", "alias":"bert"}, + {"model_name" : "image_text", "body" : "url", "field" : "thumbnail"}, + ] + +collection_name: Name of Collection + +""" + return requests.post( + url=self.url+'/collection/store_encoders_pipeline', + json=dict( + username=self.username, + api_key=self.api_key, + encoders=encoders, + collection_name=collection_name, )) @retry() @@ -898,6 +1007,90 @@ def search_with_positive_negative_ids_as_history(self,vector, positive_document_ asc=asc, )) + @retry() + @return_curl_or_response('json') + def encode(self, encoders, document, **kwargs): + """Encode document into vectors +Get a document and encode specified fields into vectors with provided model urls or model names. { + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_url" : "bert", "body" : "text", "field": "short_description", "alias":"bert"}, + ] + } + +Args +======== +username: Username +api_key: Api Key, you can request it from request_api_key +encoders: An array structure of models to encode fields with. + Encoders can be a `model_url` or a `model_name`. + For model_name, the options are: `image_text`, `image`, `text`. `text_multi`, `text_image`. + Note: image_text encodes images for text to image search whereas text_image encodes texts + for text to image search (text to image search/image to text search works both ways). + For model_url, you are free to deploy your own model and specify the required body as such. + + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_name" : "text", "body" : "text", "field": "short_description", "alias":"bert"}, + {"model_name" : "image_text", "body" : "url", "field" : "thumbnail"}, + ] + +document: A json document to encode. + +""" + return requests.post( + url=self.url+'/collection/encode', + json=dict( + username=self.username, + api_key=self.api_key, + encoders=encoders, + document=document, + )) + + @retry() + @return_curl_or_response('json') + def bulk_encode(self, encoders, documents, **kwargs): + """Bulk encode document into vectors +Get a document and encode specified fields into vectors with provided model urls or model names. { + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_url" : "bert", "body" : "text", "field": "short_description", "alias":"bert"}, + ] + } + +Args +======== +username: Username +api_key: Api Key, you can request it from request_api_key +encoders: An array structure of models to encode fields with. + Encoders can be a `model_url` or a `model_name`. + For model_name, the options are: `image_text`, `image`, `text`. `text_multi`, `text_image`. + Note: image_text encodes images for text to image search whereas text_image encodes texts + for text to image search (text to image search/image to text search works both ways). + For model_url, you are free to deploy your own model and specify the required body as such. + + [ + {"model_url" : "https://a_vector_model_url.com/encode_image_url", "body" : "url", "field": "thumbnail"}, + {"model_url" : "https://a_vector_model_url.com/encode_text", "body" : "text", "field": "short_description"}, + {"model_name" : "text", "body" : "text", "field": "short_description", "alias":"bert"}, + {"model_name" : "image_text", "body" : "url", "field" : "thumbnail"}, + ] + +documents: Json documents to encode. + +""" + return requests.post( + url=self.url+'/collection/bulk_encode', + json=dict( + username=self.username, + api_key=self.api_key, + encoders=encoders, + documents=documents, + )) + @retry() @return_curl_or_response('json') def facets(self,collection_name, facets_fields=[], date_interval="monthly", page_size=1000, page=1, asc=False, **kwargs): @@ -1557,9 +1750,20 @@ def job_status(self,job_id, collection_name, **kwargs): @retry() @return_curl_or_response('json') - def list_jobs(self,collection_name, show_active_only=True, **kwargs): + def list_jobs(self,show_active_only=True, **kwargs): return requests.get( url=self.url+'/collection/list_jobs', + params=dict( + show_active_only=show_active_only, + username=self.username, + api_key=self.api_key, + )) + + @retry() + @return_curl_or_response('json') + def list_collection_jobs(self,collection_name, show_active_only=True, **kwargs): + return requests.get( + url=self.url+'/collection/list_collection_jobs', params=dict( show_active_only=show_active_only, username=self.username, @@ -1569,11 +1773,12 @@ def list_jobs(self,collection_name, show_active_only=True, **kwargs): @retry() @return_curl_or_response('json') - def encode_image_field(self,image_field, collection_name, refresh=True, **kwargs): + def encode_image_field(self,image_field, collection_name, alias="default", refresh=True, **kwargs): return requests.get( url=self.url+'/collection/encode_image_field', params=dict( image_field=image_field, + alias=alias, refresh=refresh, username=self.username, api_key=self.api_key, @@ -1582,29 +1787,16 @@ def encode_image_field(self,image_field, collection_name, refresh=True, **kwargs @retry() @return_curl_or_response('json') - def encode_field(self, collection_name, task, field, image_field, **kwargs): - """Start job to encode field -Encode image field - -Args -======== -username: Username -api_key: Api Key, you can request it from request_api_key -collection_name: Name of Collection -task: The name of the task for the job -field: -image_field: - -""" - return requests.post( - url=self.url+'/collection/encode_field', - json=dict( - username=self.username, - api_key=self.api_key, + def encode_text_field(self,text_field, collection_name, refresh=True, alias="default", **kwargs): + return requests.get( + url=self.url+'/collection/encode_text_field', + params=dict( + text_field=text_field, + refresh=refresh, + alias=alias, + username=self.username, + api_key=self.api_key, collection_name=collection_name, - task=task, - field=field, - image_field=image_field, )) @retry() @@ -1643,6 +1835,33 @@ def tag_image_field(self, collection_name, image_field, num_of_tags=5, only_rele refresh=refresh, )) + @retry() + @return_curl_or_response('json') + def chunk_text_field(self, collection_name, field, model_url, refresh=False, **kwargs): + """Chunk a text field +Split text into separate sentences. Encode each sentence to create chunkvectors. +These are stored as _chunkvector_. The chunk field created is `field` + _chunk_. +Args +======== +username: Username +api_key: Api Key, you can request it from request_api_key +collection_name: Name of Collection +field: Field to text +refresh: If True, Re-encodes from scratch. +model_url: Model URL for encoding + +""" + return requests.post( + url=self.url+'/collection/chunk_text_field', + json=dict( + username=self.username, + api_key=self.api_key, + collection_name=collection_name, + field=field, + refresh=refresh, + model_url=model_url, + )) + @retry() @return_curl_or_response('json') def copy_collection_from_another_user(self, collection_name, source_collection_name, source_username, source_api_key, **kwargs): @@ -2621,7 +2840,7 @@ def advanced_cluster_search(self, collection_name, multivector_query, vector_fie @retry() @return_curl_or_response('json') - def advanced_search_post_cluster(self, collection_name, multivector_query, cluster_field, page=1, page_size=20, approx=0, sum_fields=True, metric="cosine", filters=[], facets=[], min_score=None, include_fields=[], include_vector=False, include_count=True, include_facets=False, hundred_scale=False, include_search_relevance=False, search_relevance_cutoff_aggressiveness=1, asc=False, keep_search_history=False, n_clusters=0, n_init=5, n_iter=10, return_as_clusters=False, **kwargs): + def advanced_search_post_cluster(self, collection_name, multivector_query, cluster_vector_field, page=1, page_size=20, approx=0, sum_fields=True, metric="cosine", filters=[], facets=[], min_score=None, include_fields=[], include_vector=False, include_count=True, include_facets=False, hundred_scale=False, include_search_relevance=False, search_relevance_cutoff_aggressiveness=1, asc=False, keep_search_history=False, n_clusters=0, n_init=5, n_iter=10, return_as_clusters=False, **kwargs): """Performs Clustering on Top X search results This will first perform an advanced search and then cluster the top X (page_size) search results. Results are returned as such: @@ -2672,7 +2891,7 @@ def advanced_search_post_cluster(self, collection_name, multivector_query, clust asc: Whether to sort results by ascending or descending order keep_search_history: Whether to store the history of search or not multivector_query: Query for advance search that allows for multiple vector and field querying -cluster_field: Vector field to perform clustering on +cluster_vector_field: Vector field to perform clustering on n_clusters: Number of clusters n_init: Number of runs to run with different centroid seeds n_iter: Number of iterations in each run @@ -2703,7 +2922,7 @@ def advanced_search_post_cluster(self, collection_name, multivector_query, clust asc=asc, keep_search_history=keep_search_history, multivector_query=multivector_query, - cluster_field=cluster_field, + cluster_vector_field=cluster_vector_field, n_clusters=n_clusters, n_init=n_init, n_iter=n_iter, @@ -2772,19 +2991,6 @@ def dimensionality_reduce(self, collection_name, vectors, vector_field, alias="d n_components=n_components, )) - @retry() - @return_curl_or_response('json') - def encode_text_field(self,text_field, collection_name, refresh=True, **kwargs): - return requests.get( - url=self.url+'/collection/encode_text_field', - params=dict( - text_field=text_field, - refresh=refresh, - username=self.username, - api_key=self.api_key, - collection_name=collection_name, - )) - @retry() @return_curl_or_response('json') def encode_text(self,text, collection_name, **kwargs):