
Commit 1b46073

Adding example that shows how to pass images in multi-modal prompts (#258)
* Enable passing images along with text for models that support multimodal input
* Add example to demonstrate multimodal input
* Update package-lock.json
1 parent 76c7edf commit 1b46073

11 files changed: +242 -3 lines changed
typescript/examples/crossword/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Crossword

The Crossword example shows how to include an image in a multimodal prompt and use the image to answer a user's question. The responses follow the [`CrosswordActions`](./src/crosswordSchema.ts) type.

## Target models

This example explores multimodal input. To run it, you will need a model that accepts images as input. The example has been tested with the **gpt-4-vision** and **gpt-4-omni** models.

## Try Crossword

To run the Crossword example, follow the instructions in the [examples README](../README.md#step-1-configure-your-development-environment).

## Usage

Example prompts can be found in [`src/input.txt`](./src/input.txt).

For example, given the following input statement:

**Input**:
```
🏁> What is the clue for 61 across
```

**Output**:
```
"Monogram in French fashion"
```
typescript/examples/crossword/package.json

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
{
    "name": "crossword",
    "version": "0.0.1",
    "private": true,
    "description": "",
    "main": "dist/main.js",
    "scripts": {
        "build": "tsc -p src",
        "postbuild": "copyfiles -u 1 src/**/*Schema.ts src/**/*.txt src/**/*.jpeg dist"
    },
    "author": "",
    "license": "MIT",
    "dependencies": {
        "dotenv": "^16.3.1",
        "find-config": "^1.0.0",
        "typechat": "^0.1.0",
        "typescript": "^5.3.3"
    },
    "devDependencies": {
        "@types/find-config": "1.0.4",
        "@types/node": "^20.10.4",
        "copyfiles": "^2.4.1"
    }
}
typescript/examples/crossword/src/crosswordSchema.ts

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
// The following is a schema definition for the actions that can be taken while solving a crossword puzzle.

export type GetClueText = {
    actionName: "getClueText";
    parameters: {
        clueNumber: number;
        clueDirection: "across" | "down";
        value: string;
    };
};

// This gives the answer for the requested crossword clue
export type GetAnswerValue = {
    actionName: "getAnswerValue";
    parameters: {
        proposedAnswer: string;
        clueNumber: number;
        clueDirection: "across" | "down";
    };
};

export type UnknownAction = {
    actionName: "unknown";
    parameters: {
        // text typed by the user that the system did not understand
        text: string;
    };
};

export type CrosswordActions =
    | GetClueText
    | GetAnswerValue
    | UnknownAction;
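
For illustration only: a request such as "What is the clue for 61 across" (from the README above) should translate into a `getClueText` action. A hand-written sketch of such a value, reusing the clue text shown in the README's sample output:

```
import { CrosswordActions } from "./crosswordSchema";

// Illustrative value only: a possible translation of the request
// "What is the clue for 61 across".
const example: CrosswordActions = {
    actionName: "getClueText",
    parameters: {
        clueNumber: 61,
        clueDirection: "across",
        value: "Monogram in French fashion",
    },
};

console.log(JSON.stringify(example, undefined, 2));
```
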
typescript/examples/crossword/src/input.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
What is the clue for 1 down
Give me a hint for solving 4 down
typescript/examples/crossword/src/main.ts

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import assert from "assert";
import dotenv from "dotenv";
import findConfig from "find-config";
import fs from "fs";
import path from "path";
import { createLanguageModel } from "typechat";
import { processRequests } from "typechat/interactive";
import { createTypeScriptJsonValidator } from "typechat/ts";
import { CrosswordActions } from "./crosswordSchema";
import { createCrosswordActionTranslator } from "./translator";

const dotEnvPath = findConfig(".env");
assert(dotEnvPath, ".env file not found!");
dotenv.config({ path: dotEnvPath });

const model = createLanguageModel(process.env);
const schema = fs.readFileSync(path.join(__dirname, "crosswordSchema.ts"), "utf8");

const rawImage = fs.readFileSync(path.join(__dirname, "puzzleScreenshot.jpeg"), "base64");
const screenshot = `data:image/jpeg;base64,${rawImage}`;

const validator = createTypeScriptJsonValidator<CrosswordActions>(schema, "CrosswordActions");
const translator = createCrosswordActionTranslator(model, validator, screenshot);

// Process requests interactively or from the input file specified on the command line
processRequests("🏁> ", process.argv[2], async (request) => {
    const response = await translator.translate(request);
    if (!response.success) {
        console.log(response.message);
        return;
    }

    console.log(JSON.stringify(response.data));
});
typescript/examples/crossword/src/puzzleScreenshot.jpeg

Binary file added: 366 KB
typescript/examples/crossword/src/translator.ts

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import {
    TypeChatLanguageModel,
    createJsonTranslator,
    TypeChatJsonTranslator,
    MultimodalPromptContent,
    PromptContent,
} from "typechat";
import { TypeScriptJsonValidator } from "typechat/ts";

export function createCrosswordActionTranslator<T extends object>(
    model: TypeChatLanguageModel,
    validator: TypeScriptJsonValidator<T>,
    crosswordImage: string
): TypeChatJsonTranslator<T> {
    const _imageContent = crosswordImage;

    const _translator = createJsonTranslator(model, validator);
    _translator.createRequestPrompt = createRequestPrompt;

    return _translator;

    function createRequestPrompt(request: string): PromptContent {
        const screenshotSection = getScreenshotPromptSection(_imageContent);
        const contentSections = [
            {
                type: "text",
                text: "You are a virtual assistant that can help users to complete requests by interacting with the UI of a webpage.",
            },
            ...screenshotSection,
            {
                type: "text",
                text: `
Use the layout information provided to answer user queries.
The responses should be translated into JSON objects of type ${_translator.validator.getTypeName()} using the typescript schema below:

'''
${_translator.validator.getSchemaText()}
'''
`,
            },
            {
                type: "text",
                text: `
The following is a user request:
'''
${request}
'''
The following is the assistant's response translated into a JSON object with 2 spaces of indentation and no properties with the value undefined:
`,
            },
        ] as MultimodalPromptContent[];

        return contentSections;
    }

    function getScreenshotPromptSection(screenshot: string | undefined) {
        let screenshotSection = [];
        if (screenshot) {
            screenshotSection.push({
                type: "text",
                text: "Here is a screenshot of the currently visible webpage",
            });

            screenshotSection.push({
                type: "image_url",
                image_url: {
                    url: screenshot,
                    detail: "high"
                },
            });

            screenshotSection.push({
                type: "text",
                text: `Use the top left corner as coordinate 0,0 and draw a virtual grid of 1x1 pixels,
where x values increase for each pixel as you go from left to right, and y values increase
as you go from top to bottom.
`,
            });
        }
        return screenshotSection;
    }
}
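
The essential move in this file is overriding the translator's `createRequestPrompt` so that every call to `translate` sends an array of text and image sections rather than a single string. Stripped of the crossword-specific instructions, the pattern looks roughly like the sketch below; it reuses the same typechat imports as the file above, and the prompt wording is illustrative rather than the exact text used by the example:

```
import {
    createJsonTranslator,
    MultimodalPromptContent,
    PromptContent,
    TypeChatJsonTranslator,
    TypeChatLanguageModel,
} from "typechat";
import { TypeScriptJsonValidator } from "typechat/ts";

// Sketch: attach an image (as a data: URL) to every request prompt of a translator.
export function createImageBackedTranslator<T extends object>(
    model: TypeChatLanguageModel,
    validator: TypeScriptJsonValidator<T>,
    imageDataUrl: string
): TypeChatJsonTranslator<T> {
    const translator = createJsonTranslator(model, validator);
    translator.createRequestPrompt = (request: string): PromptContent =>
        [
            { type: "image_url", image_url: { url: imageDataUrl, detail: "high" } },
            {
                type: "text",
                text: `Use the image to answer the request and respond with JSON of type ${translator.validator.getTypeName()} that follows this schema:\n${translator.validator.getSchemaText()}\nThe request is:\n${request}`,
            },
        ] as MultimodalPromptContent[];
    return translator;
}
```
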
typescript/examples/crossword/src/tsconfig.json

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
{
    "compilerOptions": {
        "target": "es2021",
        "lib": ["es2021"],
        "module": "node16",
        "types": ["node"],
        "outDir": "../dist",
        "esModuleInterop": true,
        "forceConsistentCasingInFileNames": true,
        "strict": true,
        "noUnusedLocals": true,
        "noUnusedParameters": true,
        "exactOptionalPropertyTypes": true,
        "inlineSourceMap": true
    }
}

typescript/package-lock.json

Lines changed: 19 additions & 0 deletions

typescript/src/model.ts

Lines changed: 5 additions & 1 deletion
@@ -13,9 +13,13 @@ export interface PromptSection {
     /**
      * Specifies the content of this section.
      */
-    content: string | MultimodalPromptContent[];
+    content: PromptContent;
 }
 
+export type PromptContent =
+    | string
+    | MultimodalPromptContent[];
+
 /**
  * GPT-4-vision, GPT-4-omni and GPT-4-turbo allow multi-modal input, where images and text can
  * be part of the prompt. To support this, the content section of the prompt has an array of objects.