Enable passing images along with text for models that support multimodal input (#254)

hillary-mutisya · web-flow · commit 76c7edf52c79 · 2024-08-06T12:43:34.000-07:00
* Enable passing images along with text for models that support multimodal input

* Update comments for image inputs

* Fix spaces and comments
diff --git a/typescript/src/model.ts b/typescript/src/model.ts
@@ -13,9 +13,42 @@ export interface PromptSection {
     /**
      * Specifies the content of this section.
      */
-    content: string;
+    content: string | MultimodalPromptContent[];
 }
 
+/**
+ * GPT-4-vision, GPT-4-omni and GPT-4-turbo allow multimodal input, where images and text can both
+ * be part of the prompt. To support this, a prompt section's content may be an array of content parts.
+ */
+export type MultimodalPromptContent =
+    | string
+    | TextPromptContent
+    | ImagePromptContent;
+
+export type TextPromptContent = {
+    type: "text";
+    text: string;
+};
+
+export type ImagePromptContent = {
+    type: "image_url";
+    image_url: ImageUrl;
+};
+
+export type ImageUrl = {
+    /**
+     * Either the URL of a hosted image or a data URL embedding the base64-encoded image content.
+     */
+    url: string;
+    
+    /**
+     * Controls how the model processes the image and generates its textual understanding.
+     * In "low" mode the model treats the image as 512x512px, while in "high" mode it considers
+     * the image at full size.
+     */
+    detail?: "auto" | "low" | "high";
+};
+
 /**
  * Represents a AI language model that can complete prompts. TypeChat uses an implementation of this
  * interface to communicate with an AI service that can translate natural language requests to JSON
@@ -135,7 +168,11 @@ function createFetchLanguageModel(url: string, headers: object, defaultParams: o
             const response = await fetch(url, options);
             if (response.ok) {
                 const json = await response.json() as { choices: { message: PromptSection }[] };
-                return success(json.choices[0].message.content ?? "");
+                if (typeof json.choices[0].message.content === "string") {
+                    return success(json.choices[0].message.content ?? "");
+                } else {
+                    return error(`REST API unexpected response format: ${JSON.stringify(json.choices[0].message.content)}`);
+                }
             }
             if (!isTransientHttpError(response.status) || retryCount >= retryMaxAttempts) {
                 return error(`REST API error ${response.status}: ${response.statusText}`);