
Commit 84cabf7

Fix image-text-to-text provided kwargs to skip tokenizer (#100)
* Skip `tokenizer` if `HF_TASK='image-text-to-text'`
* Add unit tests for 'image-text-to-text' pipelines
* Fix `pyproject.toml` warnings on `tool.ruff.lint`
* Run `make style` to fix CI
* Add `validate_image_text_to_text` function in `utils.py`
1 parent e0abd4b commit 84cabf7

File tree

5 files changed: +58 −75 lines changed


pyproject.toml

+6 −7

@@ -4,6 +4,12 @@ no_implicit_optional = true
 scripts_are_modules = true
 
 [tool.ruff]
+# Same as Black.
+line-length = 119
+# Assume Python 3.11
+target-version = "py311"
+
+[tool.ruff.lint]
 select = [
     "E", # pycodestyle errors
     "W", # pycodestyle warnings
@@ -17,15 +23,8 @@ ignore = [
     "B008", # do not perform function calls in argument defaults
     "C901", # too complex
 ]
-# Same as Black.
-line-length = 119
-
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
-
-# Assume Python 3.11
-target-version = "py311"
-
 per-file-ignores = { "__init__.py" = ["F401"] }
 
 [tool.isort]

src/huggingface_inference_toolkit/utils.py

+1 −1

@@ -237,7 +237,7 @@ def get_pipeline(
         "zero-shot-image-classification",
     }:
         kwargs["feature_extractor"] = model_dir
-    elif task not in {"image-to-text", "text-to-image"}:
+    elif task not in {"image-text-to-text", "image-to-text", "text-to-image"}:
         kwargs["tokenizer"] = model_dir
 
     if is_sentence_transformers_available() and task in [
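
The gist of the fix: multimodal pipelines resolve their own processor (tokenizer plus image processor) from the model repository, so injecting a separate `tokenizer=model_dir` kwarg for "image-text-to-text" breaks pipeline construction. A minimal sketch of the behavior this change enables, using the model from the integration tests below; the call itself is illustrative, not part of the commit:

from transformers import pipeline

# With the fix, the toolkit no longer passes `tokenizer=model_dir` for
# "image-text-to-text"; the pipeline loads its processor from the model itself.
pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")

output = pipe(
    images="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    text="A photo of",
)
print(output)  # expected shape: [{"input_text": "A photo of", "generated_text": "..."}]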

tests/integ/config.py

+23 −30

@@ -7,6 +7,7 @@
     validate_custom,
     validate_feature_extraction,
     validate_fill_mask,
+    validate_image_text_to_text,
     validate_ner,
     validate_object_detection,
     validate_question_answering,
@@ -108,6 +109,10 @@
         "pytorch": "hf-internal-testing/tiny-random-beit-pipeline",
         "tensorflow": None,
     },
+    "image-text-to-text": {
+        "pytorch": "Salesforce/blip-image-captioning-base",
+        "tensorflow": None,
+    },
 }
 
 
@@ -134,24 +139,12 @@
         "inputs": "question: What is 42 context: 42 is the answer to life, the universe and everything."
     },
     "text-generation": {"inputs": "My name is philipp and I am"},
-    "image-classification": open(
-        os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb"
-    ).read(),
-    "zero-shot-image-classification": open(
-        os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb"
-    ).read(),
-    "object-detection": open(
-        os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb"
-    ).read(),
-    "image-segmentation": open(
-        os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb"
-    ).read(),
-    "automatic-speech-recognition": open(
-        os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb"
-    ).read(),
-    "audio-classification": open(
-        os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb"
-    ).read(),
+    "image-classification": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "zero-shot-image-classification": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "object-detection": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "image-segmentation": open(os.path.join(os.getcwd(), "tests/resources/image/tiger.jpeg"), "rb").read(),
+    "automatic-speech-recognition": open(os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb").read(),
+    "audio-classification": open(os.path.join(os.getcwd(), "tests/resources/audio/sample1.flac"), "rb").read(),
     "table-question-answering": {
         "inputs": {
             "query": "How many stars does the transformers repository have?",
@@ -175,11 +168,15 @@
         }
     },
     "sentence-embeddings": {"inputs": "Lets create an embedding"},
-    "sentence-ranking": {
-        "inputs": ["Lets create an embedding", "Lets create an embedding"]
-    },
+    "sentence-ranking": {"inputs": ["Lets create an embedding", "Lets create an embedding"]},
     "text-to-image": {"inputs": "a man on a horse jumps over a broken down airplane."},
     "custom": {"inputs": "this is a test"},
+    "image-text-to-text": {
+        "inputs": {
+            "images": "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+            "text": "A photo of",
+        }
+    },
 }
 
 task2output = {
@@ -213,15 +210,9 @@
         "end": 77,
         "answer": "sagemaker",
     },
-    "summarization": [
-        {"summary_text": " The A The The ANew York City has been installed in the US."}
-    ],
-    "translation_xx_to_yy": [
-        {"translation_text": "Mein Name ist Sarah und ich lebe in London"}
-    ],
-    "text2text-generation": [
-        {"generated_text": "42 is the answer to life, the universe and everything"}
-    ],
+    "summarization": [{"summary_text": " The A The The ANew York City has been installed in the US."}],
+    "translation_xx_to_yy": [{"translation_text": "Mein Name ist Sarah und ich lebe in London"}],
+    "text2text-generation": [{"generated_text": "42 is the answer to life, the universe and everything"}],
     "feature-extraction": None,
     "fill-mask": None,
     "text-generation": None,
@@ -269,6 +260,7 @@
     "sentence-embeddings": {"embeddings": ""},
    "sentence-ranking": {"scores": ""},
     "text-to-image": bytes,
+    "image-text-to-text": [{"input_text": "A photo of", "generated_text": "..."}],
     "custom": {"inputs": "this is a test"},
 }
 
@@ -296,5 +288,6 @@
     "sentence-embeddings": validate_zero_shot_classification,
     "sentence-ranking": validate_zero_shot_classification,
     "text-to-image": validate_text_to_image,
+    "image-text-to-text": validate_image_text_to_text,
     "custom": validate_custom,
 }
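
For reference, the new `task2input` entry corresponds to a request like the following against a running test container. This is a hedged sketch: the port is assumed to be 5000, while the tests actually pick a random one in [5000, 6000).

import requests

# Hypothetical request mirroring task2input["image-text-to-text"];
# the container URL and port are assumptions for illustration.
payload = {
    "inputs": {
        "images": "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        "text": "A photo of",
    }
}
prediction = requests.post("http://localhost:5000", json=payload).json()
# task2expectation documents the shape:
# [{"input_text": "A photo of", "generated_text": "..."}]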

tests/integ/helpers.py

+20 −37

@@ -34,7 +34,6 @@ def make_sure_other_containers_are_stopped(client: DockerClient, container_name:
 # reraise = True
 # )
 def wait_for_container_to_be_ready(base_url, time_between_retries=3, max_retries=30):
-
     retries = 0
     error = None
 
@@ -46,9 +45,7 @@ def wait_for_container_to_be_ready(base_url, time_between_retries=3, max_retries
                 logging.info("Container ready!")
                 return True
             else:
-                raise ConnectionError(
-                    f"Couldn'start container, Error: {response.status_code}"
-                )
+                raise ConnectionError(f"Couldn'start container, Error: {response.status_code}")
         except Exception as exception:
             error = exception
             logging.warning(f"Container at {base_url} not ready, trying again...")
@@ -62,7 +59,6 @@ def verify_task(
     # container: DockerClient,
     task: str,
     port: int = 5000,
-    framework: str = "pytorch",
 ):
     BASE_URL = f"http://localhost:{port}"
     logging.info(f"Base URL: {BASE_URL}")
@@ -90,10 +86,7 @@ def verify_task(
                 headers={"content-type": "audio/x-audio"},
             ).json()
         elif task == "text-to-image":
-            prediction = requests.post(
-                f"{BASE_URL}", json=input, headers={"accept": "image/png"}
-            ).content
-
+            prediction = requests.post(f"{BASE_URL}", json=input, headers={"accept": "image/png"}).content
         else:
             prediction = requests.post(f"{BASE_URL}", json=input).json()
 
@@ -119,6 +112,8 @@ def verify_task(
 @pytest.mark.parametrize(
     "task",
     [
+        # transformers
+        # TODO: "visual-question-answering" and "zero-shot-image-classification" not supported yet due to multimodality input
         "text-classification",
         "zero-shot-classification",
         "token-classification",
@@ -136,25 +131,22 @@ def verify_task(
         "image-segmentation",
         "table-question-answering",
         "conversational",
-        # TODO currently not supported due to multimodality input
-        # "visual-question-answering",
-        # "zero-shot-image-classification",
+        "image-text-to-text",
+        # sentence-transformers
         "sentence-similarity",
         "sentence-embeddings",
         "sentence-ranking",
         # diffusers
         "text-to-image",
     ],
 )
-def test_pt_container_remote_model(task) -> None:
+def test_pt_container_remote_model(task: str) -> None:
     container_name = f"integration-test-{task}"
     container_image = f"starlette-transformers:{DEVICE}"
     framework = "pytorch"
     model = task2model[task][framework]
     port = random.randint(5000, 6000)
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
 
     make_sure_other_containers_are_stopped(client, container_name)
     container = client.containers.run(
@@ -177,6 +169,8 @@ def test_pt_container_remote_model(task) -> None:
 @pytest.mark.parametrize(
     "task",
     [
+        # transformers
+        # TODO: "visual-question-answering" and "zero-shot-image-classification" not supported yet due to multimodality input
         "text-classification",
         "zero-shot-classification",
         "token-classification",
@@ -194,29 +188,26 @@ def test_pt_container_remote_model(task) -> None:
         "image-segmentation",
         "table-question-answering",
         "conversational",
-        # TODO currently not supported due to multimodality input
-        # "visual-question-answering",
-        # "zero-shot-image-classification",
+        "image-text-to-text",
+        # sentence-transformers
         "sentence-similarity",
         "sentence-embeddings",
         "sentence-ranking",
         # diffusers
         "text-to-image",
     ],
 )
-def test_pt_container_local_model(task) -> None:
+def test_pt_container_local_model(task: str) -> None:
     container_name = f"integration-test-{task}"
     container_image = f"starlette-transformers:{DEVICE}"
     framework = "pytorch"
     model = task2model[task][framework]
     port = random.randint(5000, 6000)
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
     make_sure_other_containers_are_stopped(client, container_name)
     with tempfile.TemporaryDirectory() as tmpdirname:
         # https://github.com/huggingface/infinity/blob/test-ovh/test/integ/utils.py
-        _storage_dir = _load_repository_from_hf(model, tmpdirname, framework="pytorch")
+        _load_repository_from_hf(model, tmpdirname, framework="pytorch")
         container = client.containers.run(
             container_image,
             name=container_name,
@@ -241,9 +232,7 @@ def test_pt_container_local_model(task) -> None:
 def test_pt_container_custom_handler(repository_id) -> None:
     container_name = "integration-test-custom"
     container_image = f"starlette-transformers:{DEVICE}"
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
     port = random.randint(5000, 6000)
 
     make_sure_other_containers_are_stopped(client, container_name)
@@ -277,12 +266,10 @@ def test_pt_container_custom_handler(repository_id) -> None:
     "repository_id",
     ["philschmid/custom-pipeline-text-classification"],
 )
-def test_pt_container_legacy_custom_pipeline(repository_id) -> None:
+def test_pt_container_legacy_custom_pipeline(repository_id: str) -> None:
     container_name = "integration-test-custom"
     container_image = f"starlette-transformers:{DEVICE}"
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
     port = random.randint(5000, 6000)
 
     make_sure_other_containers_are_stopped(client, container_name)
@@ -345,9 +332,7 @@ def test_tf_container_remote_model(task) -> None:
     container_image = f"starlette-transformers:{DEVICE}"
     framework = "tensorflow"
     model = task2model[task][framework]
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
    if model is None:
        pytest.skip("no supported TF model")
    port = random.randint(5000, 6000)
@@ -401,9 +386,7 @@ def test_tf_container_local_model(task) -> None:
     container_image = f"starlette-transformers:{DEVICE}"
     framework = "tensorflow"
     model = task2model[task][framework]
-    device_request = (
-        [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
-    )
+    device_request = [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])] if IS_GPU else []
     if model is None:
         pytest.skip("no supported TF model")
     port = random.randint(5000, 6000)
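
With the task added to both parametrize lists, the new case can be exercised in isolation with something like `pytest tests/integ/helpers.py -k image-text-to-text`, assuming a `starlette-transformers:{DEVICE}` image has already been built and a recent pytest that accepts hyphenated `-k` expressions.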

tests/integ/utils.py

+8

@@ -6,6 +6,7 @@ def validate_classification(result=None, snapshot=None):
         assert result[idx].keys() == snapshot[idx].keys()
     return True
 
+
 def validate_conversational(result=None, snapshot=None):
     assert len(result[0]["generated_text"]) >= len(snapshot)
 
@@ -82,6 +83,13 @@ def validate_text_to_image(result=None, snapshot=None):
     assert isinstance(result, snapshot)
     return True
 
+
+def validate_image_text_to_text(result=None, snapshot=None):
+    assert isinstance(result, list)
+    assert all(isinstance(d, dict) and d.keys() == {"input_text", "generated_text"} for d in result)
+    return True
+
+
 def validate_custom(result=None, snapshot=None):
     logging.info(f"Validate custom task - result: {result}, snapshot: {snapshot}")
     assert result == snapshot
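
As a quick usage sketch, the new validator accepts exactly the result shape listed in `task2expectation`; the sample result below is made up for illustration:

# Hypothetical pipeline result in the expected shape; any list of dicts with
# exactly the keys {"input_text", "generated_text"} passes the assertions.
result = [{"input_text": "A photo of", "generated_text": "A photo of two parrots"}]
assert validate_image_text_to_text(result=result) is True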
