
Commit 1b46073

Adding example that shows how to pass images in multi-modal prompts (#258)
* Enable passing images along with text for models that support multimodal input
* Add example to demonstrate multimodal input
* Update package-lock.json
1 parent 76c7edf commit 1b46073

11 files changed: +242 -3 lines changed
typescript/examples/crossword/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Crossword

The Crossword example shows how to include an image in a multimodal prompt and use the image to answer a user's question. The responses follow the [`CrosswordActions`](./src/crosswordSchema.ts) type.

## Target models

This example explores multimodal input. To run it, you will need a model that accepts images as input. The example has been tested with the **gpt-4-vision** and **gpt-4-omni** models.

## Try Crossword

To run the Crossword example, follow the instructions in the [examples README](../README.md#step-1-configure-your-development-environment).

## Usage

Example prompts can be found in [`src/input.txt`](./src/input.txt).

For example, given the following input statement:

**Input**:
```
🏁> What is the clue for 61 across
```

**Output**:
```
"Monogram in French fashion"
```
typescript/examples/crossword/package.json

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
{
    "name": "crossword",
    "version": "0.0.1",
    "private": true,
    "description": "",
    "main": "dist/main.js",
    "scripts": {
        "build": "tsc -p src",
        "postbuild": "copyfiles -u 1 src/**/*Schema.ts src/**/*.txt src/**/*.jpeg dist"
    },
    "author": "",
    "license": "MIT",
    "dependencies": {
        "dotenv": "^16.3.1",
        "find-config": "^1.0.0",
        "typechat": "^0.1.0",
        "typescript": "^5.3.3"
    },
    "devDependencies": {
        "@types/find-config": "1.0.4",
        "@types/node": "^20.10.4",
        "copyfiles": "^2.4.1"
    }
}
typescript/examples/crossword/src/crosswordSchema.ts

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
// The following is a schema definition for the actions that can be taken while solving a crossword puzzle.

export type GetClueText = {
    actionName: "getClueText";
    parameters: {
        clueNumber: number;
        clueDirection: "across" | "down";
        value: string;
    };
};

// This gives the answer for the requested crossword clue
export type GetAnswerValue = {
    actionName: "getAnswerValue";
    parameters: {
        proposedAnswer: string;
        clueNumber: number;
        clueDirection: "across" | "down";
    };
};

export type UnknownAction = {
    actionName: "unknown";
    parameters: {
        // text typed by the user that the system did not understand
        text: string;
    };
};

export type CrosswordActions =
    | GetClueText
    | GetAnswerValue
    | UnknownAction;
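
For illustration only: a request such as "What is the clue for 61 across" (from the README above) should translate into a `getClueText` action. A hand-written sketch of such a value, reusing the clue text shown in the README's sample output:

```
import { CrosswordActions } from "./crosswordSchema";

// Illustrative value only: a possible translation of the request
// "What is the clue for 61 across".
const example: CrosswordActions = {
    actionName: "getClueText",
    parameters: {
        clueNumber: 61,
        clueDirection: "across",
        value: "Monogram in French fashion",
    },
};

console.log(JSON.stringify(example, undefined, 2));
```
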
typescript/examples/crossword/src/input.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
What is the clue for 1 down
Give me a hint for solving 4 down
typescript/examples/crossword/src/main.ts

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import assert from "assert";
import dotenv from "dotenv";
import findConfig from "find-config";
import fs from "fs";
import path from "path";
import { createLanguageModel } from "typechat";
import { processRequests } from "typechat/interactive";
import { createTypeScriptJsonValidator } from "typechat/ts";
import { CrosswordActions } from "./crosswordSchema";
import { createCrosswordActionTranslator } from "./translator";

const dotEnvPath = findConfig(".env");
assert(dotEnvPath, ".env file not found!");
dotenv.config({ path: dotEnvPath });

const model = createLanguageModel(process.env);
const schema = fs.readFileSync(path.join(__dirname, "crosswordSchema.ts"), "utf8");

const rawImage = fs.readFileSync(path.join(__dirname, "puzzleScreenshot.jpeg"), "base64");
const screenshot = `data:image/jpeg;base64,${rawImage}`;

const validator = createTypeScriptJsonValidator<CrosswordActions>(schema, "CrosswordActions");
const translator = createCrosswordActionTranslator(model, validator, screenshot);

// Process requests interactively or from the input file specified on the command line
processRequests("🏁> ", process.argv[2], async (request) => {
    const response = await translator.translate(request);
    if (!response.success) {
        console.log(response.message);
        return;
    }

    console.log(JSON.stringify(response.data));
});
typescript/examples/crossword/src/puzzleScreenshot.jpeg

Binary file added: 366 KB
typescript/examples/crossword/src/translator.ts

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import {
    TypeChatLanguageModel,
    createJsonTranslator,
    TypeChatJsonTranslator,
    MultimodalPromptContent,
    PromptContent,
} from "typechat";
import { TypeScriptJsonValidator } from "typechat/ts";

export function createCrosswordActionTranslator<T extends object>(
    model: TypeChatLanguageModel,
    validator: TypeScriptJsonValidator<T>,
    crosswordImage: string
): TypeChatJsonTranslator<T> {
    const _imageContent = crosswordImage;

    const _translator = createJsonTranslator(model, validator);
    _translator.createRequestPrompt = createRequestPrompt;

    return _translator;

    function createRequestPrompt(request: string): PromptContent {
        const screenshotSection = getScreenshotPromptSection(_imageContent);
        const contentSections = [
            {
                type: "text",
                text: "You are a virtual assistant that can help users to complete requests by interacting with the UI of a webpage.",
            },
            ...screenshotSection,
            {
                type: "text",
                text: `
Use the layout information provided to answer user queries.
The responses should be translated into JSON objects of type ${_translator.validator.getTypeName()} using the typescript schema below:

'''
${_translator.validator.getSchemaText()}
'''
`,
            },
            {
                type: "text",
                text: `
The following is a user request:
'''
${request}
'''
The following is the assistant's response translated into a JSON object with 2 spaces of indentation and no properties with the value undefined:
`,
            },
        ] as MultimodalPromptContent[];

        return contentSections;
    }

    function getScreenshotPromptSection(screenshot: string | undefined) {
        let screenshotSection = [];
        if (screenshot) {
            screenshotSection.push({
                type: "text",
                text: "Here is a screenshot of the currently visible webpage",
            });

            screenshotSection.push({
                type: "image_url",
                image_url: {
                    url: screenshot,
                    detail: "high"
                },
            });

            screenshotSection.push({
                type: "text",
                text: `Use the top left corner as coordinate 0,0 and draw a virtual grid of 1x1 pixels,
where x values increase for each pixel as you go from left to right, and y values increase
as you go from top to bottom.
`,
            });
        }
        return screenshotSection;
    }
}
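
The essential move in this file is overriding the translator's `createRequestPrompt` so that every call to `translate` sends an array of text and image sections rather than a single string. Stripped of the crossword-specific instructions, the pattern looks roughly like the sketch below; it reuses the same typechat imports as the file above, and the prompt wording is illustrative rather than the exact text used by the example:

```
import {
    createJsonTranslator,
    MultimodalPromptContent,
    PromptContent,
    TypeChatJsonTranslator,
    TypeChatLanguageModel,
} from "typechat";
import { TypeScriptJsonValidator } from "typechat/ts";

// Sketch: attach an image (as a data: URL) to every request prompt of a translator.
export function createImageBackedTranslator<T extends object>(
    model: TypeChatLanguageModel,
    validator: TypeScriptJsonValidator<T>,
    imageDataUrl: string
): TypeChatJsonTranslator<T> {
    const translator = createJsonTranslator(model, validator);
    translator.createRequestPrompt = (request: string): PromptContent =>
        [
            { type: "image_url", image_url: { url: imageDataUrl, detail: "high" } },
            {
                type: "text",
                text: `Use the image to answer the request and respond with JSON of type ${translator.validator.getTypeName()} that follows this schema:\n${translator.validator.getSchemaText()}\nThe request is:\n${request}`,
            },
        ] as MultimodalPromptContent[];
    return translator;
}
```
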
typescript/examples/crossword/src/tsconfig.json

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
{
    "compilerOptions": {
        "target": "es2021",
        "lib": ["es2021"],
        "module": "node16",
        "types": ["node"],
        "outDir": "../dist",
        "esModuleInterop": true,
        "forceConsistentCasingInFileNames": true,
        "strict": true,
        "noUnusedLocals": true,
        "noUnusedParameters": true,
        "exactOptionalPropertyTypes": true,
        "inlineSourceMap": true
    }
}

typescript/package-lock.json

Lines changed: 19 additions & 0 deletions

typescript/src/model.ts

Lines changed: 5 additions & 1 deletion
@@ -13,9 +13,13 @@ export interface PromptSection {
     /**
      * Specifies the content of this section.
      */
-    content: string | MultimodalPromptContent[];
+    content: PromptContent;
 }
 
+export type PromptContent =
+    | string
+    | MultimodalPromptContent[];
+
 /**
  * GPT-4-vision, GPT-4-omni and GPT-4-turbo allow multi-modal input, where images and text can
  * be part of the prompt. To support this, the content section of the prompt has an array of objects.