From 5557d7b5ada6a5f1ab0cb2b3ec0a2ee36c50b544 Mon Sep 17 00:00:00 2001 From: Dev Team Date: Tue, 3 Dec 2024 20:34:14 +0100 Subject: [PATCH] V9.3.8 - with Anthropic Computer Use --- extension/manifest.json | 2 +- src/common/command.ts | 3 + src/common/constant.ts | 4 + src/common/ts_utils.ts | 5 + src/components/header.js | 2 +- src/components/settings_modal/tabs/ai.tsx | 11 + src/config/preinstall_macros.js | 337 +++++++++++++++++- src/containers/dashboard/bottom/index.js | 6 + src/containers/dashboard/editor.js | 10 +- .../sidepanel/components/logs/index.js | 5 + src/index.js | 78 ++-- src/init_player.js | 261 +++++++++++++- src/services/ai/computer-use/computer-use.ts | 263 ++++++++++++++ src/services/ai/computer-use/model.ts | 25 ++ src/services/ai/computer-use/sampling.ts | 246 +++++++++++++ src/services/anthropic/anthropic.service.ts | 138 +------ 16 files changed, 1206 insertions(+), 190 deletions(-) create mode 100644 src/services/ai/computer-use/computer-use.ts create mode 100644 src/services/ai/computer-use/model.ts create mode 100644 src/services/ai/computer-use/sampling.ts diff --git a/extension/manifest.json b/extension/manifest.json index 860c9e3..3710e60 100644 --- a/extension/manifest.json +++ b/extension/manifest.json @@ -5,7 +5,7 @@ "description": "__MSG_description__", "short_name": "__MSG_short_name__", "default_locale": "en", - "version": "9.3.7", + "version": "9.3.8", "icons": { "128": "logo128.png" diff --git a/src/common/command.ts b/src/common/command.ts index d486a7c..9f0dffd 100644 --- a/src/common/command.ts +++ b/src/common/command.ts @@ -110,6 +110,7 @@ export const commandScopes = { 'aiPrompt': CommandScope.All, 'aiScreenXY': CommandScope.All, + 'aiComputerUse': CommandScope.All, 'setProxy': CommandScope.All, 'run': CommandScope.All, @@ -294,6 +295,7 @@ export function doesCommandSupportTargetOptions (str: string) { case 'verifyNotChecked': case 'aiPrompt': case 'aiScreenXY': + case 'aiComputerUse': return true default: @@ -360,6 +362,7 @@ export function canCommandFind (str: string): boolean { case 'forEach': case 'OCRExtractScreenshot': case 'aiPrompt': + case 'aiComputerUse': return false default: diff --git a/src/common/constant.ts b/src/common/constant.ts index dd4771b..eb36e4e 100644 --- a/src/common/constant.ts +++ b/src/common/constant.ts @@ -62,3 +62,7 @@ export const SCREENSHOT_DELAY = /Linux/i.test(self.navigator.userAgent) ? 200 : export const CS_IPC_TIMEOUT = 4000 export const STATE_STORAGE_KEY = 'background_state' + +export const ANTHROPIC = { + COMPUTER_USE_MODEL: 'claude-3-5-sonnet-20241022' +} diff --git a/src/common/ts_utils.ts b/src/common/ts_utils.ts index 6151273..bbe6064 100644 --- a/src/common/ts_utils.ts +++ b/src/common/ts_utils.ts @@ -1149,6 +1149,11 @@ export function isMac (): boolean { return !!/macintosh/i.test(userAgent) || (/mac os x/i.test(userAgent) && !/like mac os x/i.test(userAgent)) } +export function isWindows (): boolean { + const userAgent = window.navigator.userAgent + return !!/windows/i.test(userAgent) +} + export function resolvePath (path: any, basePath: string, relativePath: string): string { const dirPath = path.dirname(basePath) diff --git a/src/components/header.js b/src/components/header.js index 81f269e..11c314c 100644 --- a/src/components/header.js +++ b/src/components/header.js @@ -576,7 +576,7 @@ class Header extends React.Component { // preset #210 // uncomment the following line to activate it - // applyPresetLicense('ENTERPRISE LICENSE HERE') + // applyPresetLicense('LICENSE KEY HERE') window.addEventListener("beforeunload", this.beforeUnloadHandler); } diff --git a/src/components/settings_modal/tabs/ai.tsx b/src/components/settings_modal/tabs/ai.tsx index 58c5bf0..2a854ca 100644 --- a/src/components/settings_modal/tabs/ai.tsx +++ b/src/components/settings_modal/tabs/ai.tsx @@ -130,6 +130,17 @@ class AITab extends React.Component {
{this.state.promptResponse}
+
+ aiComputerUse: Max loops before stopping: + onConfigChange('aiComputerUseMaxLoops', e.target.value)} + placeholder="20" + /> +
{this.state.error}
diff --git a/src/config/preinstall_macros.js b/src/config/preinstall_macros.js index 72c7767..1187408 100644 --- a/src/config/preinstall_macros.js +++ b/src/config/preinstall_macros.js @@ -1,4 +1,337 @@ export default { + "AI(Beta)/CU_PlayTicTacToe": { + "CreationDate": "2024-12-02", + "Commands": [ + { + "Command": "XDesktopAutomation", + "Target": "false", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This demo macro uses an external website which is not affiliated with Ui.Vision.", + "Value": "blue", + "Description": "" + }, + { + "Command": "bringBrowserToForeground", + "Target": "true", + "Value": "", + "Description": "" + }, + { + "Command": "open", + "Target": "https://www.gamepix.com/play/tic-tac-toe-html5", + "Value": "", + "Description": "" + }, + { + "Command": "aiComputerUse", + "Target": "You are playing a game of tic tac toe against the computer.\n\nYou are Player 1. \n\nIf you win, end with message 'GAMEWIN'. \n\nIf you lose, end with 'GAMELOST'. \n\nIf the game draws, end with 'GAMEDRAW'. \n\nIf you encounter invalid game state or cannot make a move, end with 'ERROR'. \n\nTool use instructions: Do not use mouse move commands, only click commands.", + "Value": "s", + "Description": "" + }, + { + "Command": "echo", + "Target": "Computer Use Result = ${s}", + "Value": "blue", + "Description": "" + }, + { + "Command": "if", + "Target": "${s}.lastIndexOf(\"GAMEWIN\") >= 0", + "Value": "", + "Description": "Search for substring in string: The return value of .lastIndexOf() is -1 if the substring is not found in the string at all." + }, + { + "Command": "echo", + "Target": "We won !!! :)", + "Value": "#shownotification", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"GAMELOST\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "We lost", + "Value": "cyan", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"GAMEDRAW\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "A draw", + "Value": "blue", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"ERROR\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "An error happened", + "Value": "brown", + "Description": "" + }, + { + "Command": "else", + "Target": "", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This state should never happen. String should contain one of the keywords.", + "Value": "orange", + "Description": "" + }, + { + "Command": "end", + "Target": "", + "Value": "", + "Description": "" + } + ] + }, + "AI(Beta)/CU_UseWebCalculator": { + "CreationDate": "2024-12-02", + "Commands": [ + { + "Command": "XDesktopAutomation", + "Target": "false", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This demo macro uses an external website which is not affiliated with Ui.Vision.", + "Value": "blue", + "Description": "" + }, + { + "Command": "bringBrowserToForeground", + "Target": "true", + "Value": "", + "Description": "" + }, + { + "Command": "open", + "Target": "https://www.theonlinecalculator.com/", + "Value": "", + "Description": "" + }, + { + "Command": "aiComputerUse", + "Target": "Use the calculator to compute 8 + 9 by clicking the buttons.\nVerify the result. \n\nEnd with SUCCESS, or ERROR if problems occur. \n\n", + "Value": "s", + "Description": "" + }, + { + "Command": "echo", + "Target": "Computer Use Result = ${s}", + "Value": "blue", + "Description": "" + }, + { + "Command": "if", + "Target": "${s}.lastIndexOf(\"SUCCESS\") >= 0", + "Value": "", + "Description": "Search for substring in string: The return value of .lastIndexOf() is -1 if the substring is not found in the string at all." + }, + { + "Command": "echo", + "Target": "All worked fine", + "Value": "green", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"ERROR\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "An error happened", + "Value": "brown", + "Description": "" + }, + { + "Command": "else", + "Target": "", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This state should never happen. String should contain one of the key words.", + "Value": "orange", + "Description": "" + }, + { + "Command": "end", + "Target": "", + "Value": "", + "Description": "" + } + ] + }, + "AI(Beta)/CU_FillForm": { + "CreationDate": "2024-12-02", + "Commands": [ + { + "Command": "XDesktopAutomation", + "Target": "false", + "Value": "", + "Description": "" + }, + { + "Command": "open", + "Target": "https://ui.vision/contact", + "Value": "", + "Description": "" + }, + { + "Command": "bringBrowserToForeground", + "Target": "true", + "Value": "s", + "Description": "" + }, + { + "Command": "aiComputerUse", + "Target": "Fill out this web form with artificial data and submit it. \n\nTwo fields have specific content: \n\nFor Topic, select 'General Inquiry' from the dropdown (press 'G' when dropdown is open and then ENTER). \n\nFor the Subject use 'Test. Ignore this message. Filter me out'. \n\nIf successful, end with 'SUCCESS'. \nIf you encounter any errors, end with 'ERROR'. \n\nTool use instructions: \n\nSaves time: Skip the mouse_move before doing left_click\n\nAfter scrolling, take a fresh screenshot\n", + "Value": "s", + "Description": "" + }, + { + "Command": "echo", + "Target": "Computer Use Result = ${s}", + "Value": "blue", + "Description": "" + }, + { + "Command": "if", + "Target": "${s}.lastIndexOf(\"SUCCESS\") >= 0", + "Value": "", + "Description": "Search for substring in string: The return value of .lastIndexOf() is -1 if the substring is not found in the string at all." + }, + { + "Command": "echo", + "Target": "All worked fine", + "Value": "green", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"ERROR\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "An error happened", + "Value": "brown", + "Description": "" + }, + { + "Command": "else", + "Target": "", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This state should never happen. The final LLM output should contain one of the keywords.", + "Value": "orange", + "Description": "" + }, + { + "Command": "end", + "Target": "", + "Value": "", + "Description": "" + } + ] + }, + "AI(Beta)/CU_PressClear_Desktop": { + "CreationDate": "2024-12-03", + "Commands": [ + { + "Command": "XDesktopAutomation", + "Target": "true", + "Value": "", + "Description": "" + }, + { + "Command": "aiComputerUse", + "Target": "Automate the Ui.Vision IDE. \n\nFind and press the Clear button. \n\nTo save time, do not use mouse move. Only do CLICK.\n\nTry only once. It is successful, if log tab is less than half full by the time you take a screenshot.\n\nEnd with SUCCESS, or ERROR if problems occur.", + "Value": "s", + "Description": "" + }, + { + "Command": "echo", + "Target": "Computer Use Result = ${s}", + "Value": "blue", + "Description": "" + }, + { + "Command": "if", + "Target": "${s}.lastIndexOf(\"SUCCESS\") >= 0", + "Value": "", + "Description": "Parse the LLM output for SUCCESS substring" + }, + { + "Command": "echo", + "Target": "All worked fine", + "Value": "green", + "Description": "" + }, + { + "Command": "elseif", + "Target": "${s}.lastIndexOf(\"ERROR\") >= 0", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "An error happened", + "Value": "brown", + "Description": "" + }, + { + "Command": "else", + "Target": "", + "Value": "", + "Description": "" + }, + { + "Command": "echo", + "Target": "This state should not happen. String should contain one of the keywords.", + "Value": "orange", + "Description": "" + }, + { + "Command": "end", + "Target": "", + "Value": "", + "Description": "" + } + ] + }, + "AI(Beta)/Prompt_CompareImages": { "CreationDate": "2024-11-11", "Commands": [ @@ -87,7 +420,7 @@ export default { } ] }, - "AI(Beta)/ScreenXY_Browser": { + "AI(Beta)/ScreenXY_SearchForum": { "CreationDate": "2024-11-22", "Commands": [ { @@ -158,7 +491,7 @@ export default { } ] }, - "AI(Beta)/ScreenXY_Desktop": { + "AI(Beta)/ScreenXY_PressClear_Desktop": { "CreationDate": "2024-11-22", "Commands": [ { diff --git a/src/containers/dashboard/bottom/index.js b/src/containers/dashboard/bottom/index.js index 80d2f6e..6b1d7cc 100644 --- a/src/containers/dashboard/bottom/index.js +++ b/src/containers/dashboard/bottom/index.js @@ -386,6 +386,12 @@ class DashboardBottom extends React.Component { } logStyle (log) { + // console.log('logStyle:>> ', log) + // this comes from 'aiComputerUse' + if (log.type === 'a') { + return { color: 'green' } + } + if (log.options && log.options.color) { return { color: log.options.color } } diff --git a/src/containers/dashboard/editor.js b/src/containers/dashboard/editor.js index 893a3da..8628900 100644 --- a/src/containers/dashboard/editor.js +++ b/src/containers/dashboard/editor.js @@ -184,7 +184,8 @@ class DashboardEditor extends React.Component { case 'OCRExtractbyTextRelative': case 'OCRSearch': case 'aiPrompt': - case 'aiScreenXY': { + case 'aiScreenXY': + case 'aiComputerUse': { const selectedIndex = this.props.editing.meta.selectedIndex const run = () => { // Note: run visionFind/visualSearch as single line command, but without timeout waiting @@ -286,6 +287,9 @@ class DashboardEditor extends React.Component { case 'aiScreenXY': throw new Error('No select possible in aiScreenXY mode') + case 'aiComputerUse': + throw new Error('No select possible in aiComputerUse mode') + case 'XClickText': case 'XClickTextRelative': case 'XClick': { @@ -1520,7 +1524,7 @@ class DashboardEditor extends React.Component { const isFindEnabled = selectedCmd && selectedCmd.cmd && canCommandFind(selectedCmd.cmd) const shouldUseSelectInputForTarget = selectedCmd && selectedCmd.targetOptions && selectedCmd.targetOptions.length && doesCommandSupportTargetOptions(selectedCmd.cmd) - const shouldUseTextareaForTarget = selectedCmd && ['executeScript', 'executeScript_Sandbox', 'aiPrompt', 'aiScreenXY'].indexOf(selectedCmd.cmd) !== -1 + const shouldUseTextareaForTarget = selectedCmd && ['executeScript', 'executeScript_Sandbox', 'aiPrompt', 'aiScreenXY', 'aiComputerUse'].indexOf(selectedCmd.cmd) !== -1 const shouldUseNormalInputForTarget = !shouldUseSelectInputForTarget && !shouldUseTextareaForTarget return ( @@ -1558,7 +1562,7 @@ class DashboardEditor extends React.Component { const input = this.cmdInputRef.current.querySelector('input') if(/^[a-zA-Z0-9]$/.test(e.key)) { this.setState({ userInputCmdValue: input.value + e.key }) - } + } }} onBlur={() => { let value = this.state.userInputCmdValue diff --git a/src/containers/sidepanel/components/logs/index.js b/src/containers/sidepanel/components/logs/index.js index d1c2cb8..644a24d 100644 --- a/src/containers/sidepanel/components/logs/index.js +++ b/src/containers/sidepanel/components/logs/index.js @@ -295,6 +295,11 @@ class Logs extends React.Component { } logStyle (log) { + // this comes from 'aiComputerUse' + if (log.type === 'a') { + return { color: 'green' } + } + if (log.options && log.options.color) { return { color: log.options.color } } diff --git a/src/index.js b/src/index.js index 9216e05..7b5ebc2 100644 --- a/src/index.js +++ b/src/index.js @@ -227,6 +227,7 @@ const restoreConfig = () => { useDarkTheme: false, sidePanelOnLeft: false, anthropicAPIKey: '', + aiComputerUseMaxLoops: 20, showSettingsOnStart: false, showSidebar: false, showBottomArea: true, @@ -287,7 +288,7 @@ const restoreConfig = () => { defaultProxy: '', defaultProxyAuth: '', turnOffProxyAfterReplay: true, - ...config + ...config, } store.dispatch(updateConfig(cfg)) return cfg @@ -353,48 +354,41 @@ const genPlayerPlayCallback = ({ options,installed}) => { const logContent = logs.map(log => renderLog(log, false)) const text = [logTitle, '###', ...logContent].join('\n') - if(isFullPath){ - - const ua = window.navigator.userAgent - const path = options.savelog; + if (isFullPath) { + const ua = window.navigator.userAgent + const path = options.savelog; - function os(){ - if (/windows/i.test(ua)) return 'windows' - if (/mac/i.test(ua)) return 'mac' - return 'linux' - } - - if (installed && installed!=undefined ) { - let osType = os(); - runDownloadLog(text,path,osType) - .then(data => { - return getDownloadMan().prepareDownload(options.savelog) - }) - - }else{ - pSaveLog = delay(() => {}, 500).then(() => { - downloadTextFile(text, decodeURIComponent(options.savelog)) - // Note: We have to wait until savelog download completes if there is any - return getDownloadMan().prepareDownload(options.savelog) - }) - - } - - }else{ - if (!isFullPath || !getStorageManager().isXFileMode()) { - pSaveLog = delay(() => {}, 500).then(() => { - downloadTextFile(text, decodeURIComponent(options.savelog)) - // Note: We have to wait until savelog download completes if there is any - return getDownloadMan().prepareDownload(options.savelog) - }) - } else { - pSaveLog = getLogService().logTo(options.savelog, text) - } - } - - - } - + function os() { + if (/windows/i.test(ua)) return 'windows' + if (/mac/i.test(ua)) return 'mac' + return 'linux' + } + + if (installed && installed!=undefined ) { + let osType = os(); + runDownloadLog(text,path,osType) + .then(data => { + return getDownloadMan().prepareDownload(options.savelog) + }) + } else { + pSaveLog = delay(() => {}, 500).then(() => { + downloadTextFile(text, decodeURIComponent(options.savelog)) + // Note: We have to wait until savelog download completes if there is any + return getDownloadMan().prepareDownload(options.savelog) + }) + } + } else { + if (!isFullPath || !getStorageManager().isXFileMode()) { + pSaveLog = delay(() => {}, 500).then(() => { + downloadTextFile(text, decodeURIComponent(options.savelog)) + // Note: We have to wait until savelog download completes if there is any + return getDownloadMan().prepareDownload(options.savelog) + }) + } else { + pSaveLog = getLogService().logTo(options.savelog, text) + } + } + } const closeBrowser = parseBoolLike(options.closeBrowser, false) const closeRPA = parseBoolLike(options.closeRPA !== undefined ? options.closeRPA : options.closeKantu, true) diff --git a/src/init_player.js b/src/init_player.js index 57f316f..916842e 100644 --- a/src/init_player.js +++ b/src/init_player.js @@ -20,7 +20,7 @@ import { getNativeCVAPI } from './services/desktop' import { getXUserIO } from './services/xmodules/x_user_io' import { getXLocal } from './services/xmodules/xlocal' import { runOCR, runDownloadLog, runOCRLocal, runOCRTesseractC, searchTextInOCRResponse, ocrMatchCenter, allWordsWithPosition, scaleOcrResponseCoordinates, scaleOcrTextSearchMatch, isOcrSpaceFreeKey } from './services/ocr' -import { compose, flatten, safeUpdateIn, parseBoolLike, clone, milliSecondsToStringInSecond, id, strictParseBoolLike, withCountDown, countDown, isMac } from './common/ts_utils' +import { compose, flatten, safeUpdateIn, parseBoolLike, clone, milliSecondsToStringInSecond, id, strictParseBoolLike, withCountDown, countDown, isMac, isWindows } from './common/ts_utils' import { readableSize } from './services/storage/flat/storage' import { OcrHighlightType } from './services/ocr/types' import { Counter } from './common/counter/counter' @@ -57,6 +57,7 @@ import { clearTimerForTimeoutStatus, startSendingTimeoutStatus } from './ext/pop import { convertOcrLanguageToTesseractLanguage } from './services/ocr/languages' import AnthropicService from '@/services/anthropic/anthropic.service' import { parseAiVisionTarget, aiScreenXYImageBuffers, aiPromptGetPromptAndImageArrayBuffers, getFileBufferFromScreenshotStorage } from './common/ai_vision' +import Sampling from '@/services/ai/computer-use/sampling' const REPLAY_SPEED_DELAY = { NODISPLAYV1: 1, @@ -1546,7 +1547,6 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm // found the target - const hit = true const newVars = (() => { vars.set( { @@ -1566,7 +1566,6 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm )({ vars: newVars, byPass: true, - // best: hit }) }).catch((error) => { throw new Error(error.message) @@ -1580,6 +1579,249 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm } + case 'aiComputerUse':{ + console.log('aiComputerUse...') + + if (!target || !target.length) { + throw new Error('target is required') + } + + + // useOrAi = 'user' | 'ai' + // isActionOrResult = 'action' | 'result' + const logMessage = (message, userOrAi = null, isActionOrResult = false) => { + if (userOrAi === 'ai') { + if (isActionOrResult === 'action') { + store.dispatch(act.addLog('a', `Action: ${message}`)) + } else { + store.dispatch(act.addLog('a', `${message}`)) + } + } else if(userOrAi === 'user') { + if (isActionOrResult === 'result') { + store.dispatch(act.addLog('u', `Result: ${message}`)) + } else { + store.dispatch(act.addLog('u', `${message}`)) + } + } else { + store.dispatch(act.addLog('info', `${message}`)) + } + } + + const isDesktop = isCVTypeForDesktop(vars.get('!CVSCOPE')) + + const captureScreenShotFunction = () => { + const storedImageRect = vars.get('!storedImageRect') + const searchArea = vars.get('!visualSearchArea') || 'viewport' + + return (isDesktop ? Promise.resolve() : csIpc.ask('PANEL_CLEAR_OCR_MATCHES_ON_PLAYING_PAGE')) + // Note: add 1s delay here to make sure old OCR overlayed are cleared before taking new screenshot + .then(() => delay(() => {}, 1000)) + .then(() => { + return captureImage({ + isDesktop, + storedImageRect, + searchArea: /\.png/i.test(searchArea) ? 'rect' : searchArea, + scaleDpi: true, + devicePixelRatio: window.devicePixelRatio + }) + }) + .then(() => delay(() => {}, 1000)) + .then(() => { + const screenshotFileName = isDesktop + ? ensureExtName('.png', C.LAST_DESKTOP_SCREENSHOT_FILE_NAME) + : ensureExtName('.png', C.LAST_SCREENSHOT_FILE_NAME) + logMessage('Screenshot taken', 'user', 'result') + return getFileBufferFromScreenshotStorage(screenshotFileName).then((imageBuffer) => { + return imageBuffer + }) + }) + } + + const handleMouseAction = async (action, scaleFactor) => { + // console.log('handleMouseAction:>> action::', action) + console.log('isDesktop:>> ', isDesktop) + console.log('scaleFactor:>> ', scaleFactor) + + const originalCoords = isWindows() && !isDesktop ? + { + x: Math.round(action.x / scaleFactor / window.devicePixelRatio), + y: Math.round(action.y / scaleFactor / window.devicePixelRatio) + }: + { + x: Math.round(action.x / scaleFactor), + y: Math.round(action.y / scaleFactor) + } + + console.log('originalCoords:>> ', originalCoords) + + const executeMouseCommand = (command) => { + console.log('executeMouseCommand:>> command:>> ', command) + + const target = `${originalCoords.x },${originalCoords.y}` + switch (command) { + case 'mouse_move': + store.dispatch(act.addLog('info', `Running XMove command target: ${target}`)) + return runCsFreeCommands({ + cmd: 'XMove', + target: `${originalCoords.x},${originalCoords.y}`, + }) + case 'left_click': + store.dispatch(act.addLog('info', `Running XClick command target: ${target}`)) + return runCsFreeCommands({ + cmd: 'XClick', + target: `${originalCoords.x},${originalCoords.y}`, + }) + case 'right_click': + store.dispatch(act.addLog('info', `Running XClick (#right) command target: ${target}`)) + return runCsFreeCommands({ + cmd: 'XClick', + target: `${originalCoords.x},${originalCoords.y}`, + value: '#right', + }) + default: + console.log('handleMouseAction:>> unknown command:>> ', command) + return Promise.resolve() + } + } + + const uiVisionCmd = action.command === 'mouse_move' ? 'XMove' : 'XClick' + + return executeMouseCommand(action.command).then ((result) => { + console.log('handleMouseAction:>> result:>> ', result) + + logMessage(`${uiVisionCmd} ${originalCoords.x},${originalCoords.y} (Scale factor: ${scaleFactor.toFixed(5)})`, 'user', 'result') + return { + success: true + } + }).then ((result) => { + if (result.success) { + const actionText = action.command === 'mouse_move' ? 'Moved' : + action.command === 'left_click' ? 'Left clicked' : + 'Right clicked'; + return { + success: true, + message: `${actionText} at ${originalCoords.x},${originalCoords.y}`, + coordinates: originalCoords + }; + } + }) + } + + const handleKeyboardAction = async (action) => { + console.log('handleKeyboardAction:>> action::', action) + + const executeKeyboardCommand = (action) => { + console.log('executeKeyboardCommand:>> action:>> ', action) + switch (action.type) { + case 'keyboard': + case 'text': + store.dispatch(act.addLog('info', `Running XType command, value: ${action.value}`)) + + return runCsFreeCommands({ + cmd: 'XType', + target: action.value, + }) + default: + console.error('executeKeyboardCommand:>> unknown command:>> ', action.type) + return Promise.resolve() + } + } + + return executeKeyboardCommand(action).then ((result) => { + return { + success: true + } + }) + } + + let currentLoop = 0 + const getTerminationRequest = (_currentLoop) => { + const state = store.getState() + currentLoop = _currentLoop + const maxLoop = state.config.aiComputerUseMaxLoops + if (_currentLoop > maxLoop) { + return 'max_loop_reached'; + } + if (state.player.status === Player.C.STATUS.STOPPED) { + return 'player_stopped'; + } + } + + const promptText = target // `You are using a web browser. All click and move actions must include coordinates. If you need to scroll down the page, use the keyboard e. g. PageDown.` + try { + // console.log('Creating Sampling instance...') + let anthropicAPIKey = store.getState().config.anthropicAPIKey; + console.log('anthropicAPIKey :>> ', anthropicAPIKey); + + const sampling = new Sampling(anthropicAPIKey, C.ANTHROPIC.COMPUTER_USE_MODEL, promptText, + captureScreenShotFunction, handleMouseAction, handleKeyboardAction, + getTerminationRequest, logMessage + ); + + logMessage('Computer Use sequence start:') + + const userPrompt = target + logMessage(userPrompt, 'user') + + console.log('Running sampling...') + // const result = await sampling.run('Use the calculator to calculate 5 + 8 and verify the result. Then stop.'); + // const result = await sampling.run('You see a web form. Fill out all fields that you see. Use random but realistic data for names and email. Ignore drop downs. Scroll down with keyboard if needed. Submit the page. Then stop.'); + // anti spam stops this. good. const result = await sampling.run('You see a website of a forum. Sign up for a new account. Fill out all fields that you see. Use random but realistic data for names and email. Ignore drop downs. Scroll down with keyboard if needed. Submit the page. Then stop. Skip all MOUSE MOVE commands. Just use CLICK.'); + // const result = await sampling.run('You see a website. Look for big firefox icon. If not found, use Page_down to scroll down. Look again. Do this until you found the Firefox or at the end of the page. Then stop.'); + // const result = await sampling.run('Look at the desktop and find the Firefox icon. Click it to open Firefox. When Firefox is open, use CTRL+L to jump to the Firefox address bar (this is where the URL is). Then enter https://ui.vision into the address bar. Press Enter to load the website. Verify the website has loaded. Then stop. Always return x y coordinates with the CLICK and MOVE commands.'); + // const result = await sampling.run('Type CTRL+L in Ui.Vision syntax. That is ${KEY_CTRL+KEY_L}.Then stop.'); + // const result = await sampling.run('All left_click amd move actions must include coordinates. A calculator is open on the desktop. Use it to calculate 5 + 8 and verify the result. Then stop.'); + // return sampling.run('You see a website. A tic tac toe game is open. You are Player 1. Play the game and win. Then stop.', getTerminationRequest).then((result) => { + return sampling.run(userPrompt).then((result) => { + console.log('Sampling completed. Result:>>', JSON.stringify(result, null, 2)) + + if(result.stopReason === 'max_loop_reached') { + throw new Error('E501: Loop Limit Reached. Increase if needed.') + } else if (result.stopReason === 'player_stopped') { + logMessage(`Computer Use sequence ended (${currentLoop + 1} loops)`) + return { + byPass: true, + log: { + info: 'Player stopped manually.' + } + } + } else { + + const messages = result//.content[0].text + const aiMessages = messages.filter((message) => message.role === 'assistant') + const aiResponse = aiMessages[aiMessages.length - 1]?.content?.[0]?.text + + // found the target + const newVars = (() => { + vars.set( + { + [value]: aiResponse, + }, + true + ) + return { + [value]: aiResponse, + } + })() + + return compose( + )({ + vars: newVars, + byPass: true, + }) + } + + }).catch((error) => { + console.error('Error in aiComputerUse:', error) + throw error + }) + + } catch (error) { + console.error('Error in aiComputerUse:', error) + throw error + } + } + case 'aiScreenXY': { console.log('aiScreenXY...') @@ -1591,7 +1833,6 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm const storedImageRect = vars.get('!storedImageRect') const searchArea = vars.get('!visualSearchArea') || 'viewport' - return (isDesktop ? Promise.resolve() : csIpc.ask('PANEL_CLEAR_OCR_MATCHES_ON_PLAYING_PAGE')) // Note: add 1s delay here to make sure old OCR overlayed are cleared before taking new screenshot .then(() => delay(() => {}, 1000)) @@ -1613,7 +1854,6 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm return getFileBufferFromScreenshotStorage(screenshotFileName).then((imageBuffer) => { - let anthropicAPIKey = store.getState().config.anthropicAPIKey; console.log('anthropicAPIKey :>> ', anthropicAPIKey); @@ -1654,11 +1894,8 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm console.log('newVars:>> ', newVars) console.log(`newVars['!ai1'] === undefined: ${newVars['!ai1'] === undefined}`) - if (extra && extra.debugVisual) { - if (isDesktop) { - console.log('debugVisual extra:>>', extra) captureImage({ @@ -1768,6 +2005,8 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm } + + case 'OCRExtractScreenshot': guardOcrSettings() @@ -3700,7 +3939,7 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm // console.log('isDesktop:>> ', isDesktop) if (extra && extra.debugVisual) { - console.log('debugVisual extra:>>', extra) + console.log('desktop_coordinates debugVisual extra:>>', extra) return captureImage({ isDesktop : true, @@ -3730,8 +3969,8 @@ const interpretCsFreeCommands = ({ store, vars, getTcPlayer, getInterpreter, xCm height: screen.availHeight }, coordinates: { - x: x, - y: y + x: x / window.devicePixelRatio, + y: y / window.devicePixelRatio } }) .then(() => { diff --git a/src/services/ai/computer-use/computer-use.ts b/src/services/ai/computer-use/computer-use.ts new file mode 100644 index 0000000..d4eaad4 --- /dev/null +++ b/src/services/ai/computer-use/computer-use.ts @@ -0,0 +1,263 @@ +import { Jimp } from 'jimp' +import { ComputerUseActionResult } from './model' + +type ProcessedImageResult = { + originalMetadata: any + scaledBuffer: ArrayBuffer + scaleFactor: number + scaledWidth: number + scaledHeight: number + originalWidth: number + originalHeight: number +} + + + +class ComputerUse { + screenshotCount: number + lastCoords: any + MAX_PIXELS: number + config: any + keyMap: any + scaleFactor: number + captureScreenShotFunction: () => Promise + handleMouseAction: (action: any, scaleFactor: number) => Promise + handleKeyboardAction: (action: any) => Promise + logMessage: (message: string, userOrAi?: 'user' | 'ai', isActionOrResult?: 'action' | 'result') => void + + constructor( + captureScreenShotFunction: () => Promise, + handleMouseAction: (action: any, scaleFactor: number) => Promise, + handleKeyboardAction: (action: any) => Promise, + logMessage: (message: string, userOrAi?: 'user' | 'ai', isActionOrResult?: 'action' | 'result') => void + ) { + this.captureScreenShotFunction = captureScreenShotFunction + this.handleMouseAction = handleMouseAction + this.handleKeyboardAction = handleKeyboardAction + this.logMessage = logMessage + + this.screenshotCount = 0 + this.lastCoords = null + this.MAX_PIXELS = 1191888 + this.scaleFactor = 1 + this.config = { + timeoutSeconds: 30, + closeRPA: false, + closeBrowser: false + } + + // Map API key names to UI.Vision format + this.keyMap = { + Return: '${KEY_ENTER}', + Enter: '${KEY_ENTER}', + Tab: '${KEY_TAB}', + Escape: '${KEY_ESC}', + Backspace: '${KEY_BACK}', + Delete: '${KEY_DELETE}', + ArrowUp: '${KEY_UP}', + Arrow_Down: '${KEY_DOWN}', + Arrow_Left: '${KEY_LEFT}', + Arrow_Right: '${KEY_RIGHT}', + Home: '${KEY_HOME}', + End: '${KEY_END}', + Page_Up: '${KEY_PGUP}', + Page_Down: '${KEY_PGDN}', + Space: '${KEY_SPACE}', + Control: '${KEY_CTRL}', + Alt: '${KEY_ALT}', + Shift: '${KEY_SHIFT}', + F1: '${KEY_F1}', + F2: '${KEY_F2}', + F3: '${KEY_F3}', + F4: '${KEY_F4}', + F5: '${KEY_F5}', + F6: '${KEY_F6}', + F7: '${KEY_F7}', + F8: '${KEY_F8}', + F9: '${KEY_F9}', + F10: '${KEY_F10}', + F11: '${KEY_F11}', + F12: '${KEY_F12}' + } + } + + async processImage(imageBuffer: ArrayBuffer): Promise { + const image = await Jimp.read(imageBuffer) + + const metadata = { + width: image.bitmap.width, + height: image.bitmap.height + } + + console.log(`Original dimensions: ${metadata.width} x ${metadata.height}`) + + let scaledBuffer = imageBuffer + let scaleFactor = 1 + let scaledWidth = metadata.width + let scaledHeight = metadata.height + + const totalPixels = metadata.width * metadata.height + + if (totalPixels > this.MAX_PIXELS) { + scaleFactor = Math.sqrt(this.MAX_PIXELS / totalPixels) + scaledWidth = Math.round(metadata.width * scaleFactor) + scaledHeight = Math.round(metadata.height * scaleFactor) + + console.log(`Scaling image by factor ${scaleFactor.toFixed(3)}`) + console.log(`Scaled dimensions: ${scaledWidth} x ${scaledHeight}`) + + image.resize({ w: scaledWidth, h: scaledHeight }) + + scaledBuffer = await image.getBuffer('image/png') + } + + return { + originalMetadata: metadata, + scaledBuffer, + scaleFactor, + scaledWidth, + scaledHeight, + originalWidth: metadata.width, + originalHeight: metadata.height + } + } + + async processAction(action: any) { + console.log('Processing action:', action) + + // Convert API action format to UI.Vision format + const uiVisionAction = this.convertToUIVisionFormat(action) + if (!uiVisionAction.success) { + return uiVisionAction // Return error if conversion failed + } + + console.log('Converted action:', uiVisionAction.action) + + // Use the converted action + return await this.executeUIVisionAction(uiVisionAction.action) + } + + convertToUIVisionFormat(action: any) { + try { + // Handle key/text input conversion + if ((action.action === 'key' && action.text) || (action.action === 'key_press' && action.key)) { + const keyValue = action.text || action.key + return { + success: true, + action: { + command: 'xtype', + value: this.keyMap[keyValue] || keyValue, + type: 'keyboard' + } + } + } + + // Handle text typing + if (action.action === 'type' && action.text) { + return { + success: true, + action: { + command: 'xtype', + value: action.text, + type: 'text' + } + } + } + + // Handle mouse actions + if (['mouse_move', 'left_click', 'right_click'].includes(action.action)) { + let coords = null + if (action.coordinates) { + coords = action.coordinates + } else if (action.coordinate && Array.isArray(action.coordinate)) { + coords = { x: action.coordinate[0], y: action.coordinate[1] } + } + + if (coords) { + return { + success: true, + action: { + command: action.action, + x: coords.x, + y: coords.y, + type: 'mouse' + } + } + } + } + + // Handle screenshot + if (action.action === 'screenshot') { + return { + success: true, + action: { + command: 'screenshot', + type: 'capture' + } + } + } + + return { + success: false, + error: `Unable to convert action: ${JSON.stringify(action)}` + } + } catch (error: any) { + console.error('Error converting action:', error) + return { + success: false, + error: `Conversion error: ${error.message}` + } + } + } + + async executeUIVisionAction(action: any) { + try { + switch (action.type) { + case 'keyboard': + case 'text': + const keyResult = await this.handleKeyboardAction(action) + return { + success: keyResult.success, + message: `Typed: ${action.value}` + } + + case 'mouse': + return await this.handleMouseAction(action, this.scaleFactor) + + case 'capture': + return await this.handleScreenshot() + + default: + return { + success: false, + error: `Unknown action type: ${action.type}` + } + } + } catch (error: any) { + console.error('Error executing action:', error) + return { + success: false, + error: error.message + } + } + } + + async handleScreenshot() { + const imageBuffer = await this.captureScreenShotFunction() + console.log('imageBuffer:>> ', imageBuffer) + + const processedImage = await this.processImage(imageBuffer) + this.scaleFactor = processedImage.scaleFactor || 1 + + console.log('processedImage:>> ', processedImage) + + const base64Image = Buffer.from( processedImage.scaledBuffer).toString('base64') + return { + success: true, + message: 'Screenshot taken with UI.Vision', + base64Image: base64Image + } + } +} + +export default ComputerUse diff --git a/src/services/ai/computer-use/model.ts b/src/services/ai/computer-use/model.ts new file mode 100644 index 0000000..325336b --- /dev/null +++ b/src/services/ai/computer-use/model.ts @@ -0,0 +1,25 @@ + +export interface RunUIVisionMacroParams { + macroName: string + xType: string + x: string + y: string +} + +export interface RunUIVisionMacroResult { + success: boolean + message: string +} + +export interface ComputerUseActionResult { + success: boolean + message: string + coordinates: string[] | number[] +} + +export interface HandleScreenshotResult { + success: boolean + message: string + base64Image: string + filepath: string +} \ No newline at end of file diff --git a/src/services/ai/computer-use/sampling.ts b/src/services/ai/computer-use/sampling.ts new file mode 100644 index 0000000..1cef616 --- /dev/null +++ b/src/services/ai/computer-use/sampling.ts @@ -0,0 +1,246 @@ +import Anthropic from '@anthropic-ai/sdk' +import ComputerUse from './computer-use' +import { ComputerUseActionResult } from './model' + +class Sampling { + model: string + systemPrompt: string + messages: any[] + computer: any + anthropic: any + captureScreenShotFunction: () => Promise + handleMouseAction: (action: any, scaleFactor: number) => Promise + handleKeyboardAction: (action: any) => Promise + getTerminationRequest: (loopCount: number) => string | null + logMessage: (message: string, userOrAi?: 'user' | 'ai', isActionOrResult?: 'action' | 'result') => void + loopCount = 0 + constructor( + anthropicApiKey: string, + model: string, + systemPrompt: string, + captureScreenShotFunction: () => Promise, + handleMouseAction: (action: any, scaleFactor: number) => Promise, + handleKeyboardAction: (action: any) => Promise, + getTerminationRequest: (loopCount: number) => string | null, + logMessage: (message: string, userOrAi?: 'user' | 'ai', isActionOrResult?: 'action' | 'result') => void + ) { + this.model = model + this.systemPrompt = systemPrompt + this.messages = [] + this.captureScreenShotFunction = captureScreenShotFunction + this.handleMouseAction = handleMouseAction + this.handleKeyboardAction = handleKeyboardAction + + this.getTerminationRequest = getTerminationRequest + this.logMessage = logMessage + + this.computer = new ComputerUse(captureScreenShotFunction, handleMouseAction, handleKeyboardAction, logMessage) + + this.anthropic = new Anthropic({ + apiKey: anthropicApiKey, + dangerouslyAllowBrowser: true + }) + } + async processToolUse(toolUse: any, toolUseId: any) { + try { + const toolResults = [] + + for (const action of toolUse) { + // Ensure coordinate format is consistent + if (action.coordinate && Array.isArray(action.coordinate)) { + action.coordinates = { x: action.coordinate[0], y: action.coordinate[1] } + delete action.coordinate + } + + // Ensure any click action has coordinates + if ((action.action === 'left_click' || action.action === 'right_click') && !action.coordinates) { + throw new Error(`${action.action} action requires coordinates`) + } + + console.log('Processing tool action:', action) + const result = await this.computer.processAction(action) + + toolResults.push({ + type: 'tool_result', + tool_use_id: toolUseId, + content: result.success + ? result.base64Image + ? [ + { type: 'text', text: result.message }, + { + type: 'image', + source: { + type: 'base64', + media_type: 'image/png', + data: result.base64Image + } + } + ] + : [{ type: 'text', text: result.message }] + : [{ type: 'text', text: result.error }], + is_error: !result.success + }) + } + + return toolResults + } catch (error: any) { + console.error('Error in processToolUse:', error) + return [ + { + type: 'tool_result', + tool_use_id: toolUseId, + content: [{ type: 'text', text: error.message }], + is_error: true + } + ] + } + } + + async callAPI(params: any) { + try { + const { width, height } = { + width: window.screen.availWidth, + height: window.screen.availHeight + } + + // console.log('Calling API with messages:', JSON.stringify(params.messages, null, 2)) + const userPrompt = params.messages[0].content[0].text + + // this.logMessage(userPrompt, 'user') + + const response = await this.anthropic.beta.messages.create({ + model: params.model, + max_tokens: 1024, + tools: [ + { + type: 'computer_20241022', + name: 'computer', + display_width_px: width, + display_height_px: height, + display_number: 1 + } + ], + messages: params.messages, + system: params.system, + betas: ['computer-use-2024-10-22'] + }) + + console.log('Raw API response:', JSON.stringify(response, null, 2)) + + const aiResponseText = response.content[0]?.text + if(aiResponseText) { + this.logMessage(aiResponseText, 'ai', ) + } + + const coordinate = response.content[1]?.input?.coordinate + const coordinateText = coordinate ? ` ${coordinate[0]}, ${coordinate[1]}` : '' + const aiToolUseInputAction = response.content[1]?.input?.action + if(aiToolUseInputAction) { + const message = aiToolUseInputAction + coordinateText + this.logMessage(message, 'ai', 'action') + } + + const toolUse = [] + let toolUseId = null + + for (const content of response.content) { + if (content.type === 'tool_use') { + console.log('Found tool_use:', content) + toolUseId = content.id + if (content.input) { + toolUse.push(content.input) + } + } + } + + console.log('Extracted tool use:', toolUse) + + return { + content: response.content, + tool_use: toolUse, + tool_use_id: toolUseId + } + } catch (error) { + console.error('API call failed:', error) + throw error + } + } + + async run(userMessage: any): Promise { + try { + // Add user message + this.messages.push({ + role: 'user', + content: [ + { + type: 'text', + text: userMessage + } + ] + }) + + const stopReason = this.getTerminationRequest(this.loopCount) + if (stopReason) { + if (stopReason === 'max_loop_reached') { + console.log('max_loop_reached:>> ', this.messages) + return { messages: this.messages, stopReason: stopReason } + } else if (stopReason === 'player_stopped') { + console.log('player_stopped:>> ', this.messages) + return { messages: this.messages, stopReason: stopReason } + } + return { messages: this.messages, stopReason: stopReason } + } + + console.log('run this.messages:>> ', this.messages) + // this.logMessage(`Calling Anthropic API with user message: ${userMessage}`) + + const response = await this.callAPI({ + model: this.model, + messages: this.messages, + system: this.systemPrompt + }) + + this.loopCount++ + + // Add assistant's response with tool_use blocks + this.messages.push({ + role: 'assistant', + content: response.content + }) + + // Process tool use if present + if (response.tool_use && response.tool_use.length > 0) { + const toolResults = await this.processToolUse(response.tool_use, response.tool_use_id) + + console.log('response:>> ', response) + console.log('Tool results:>> ', toolResults) + + // Add tool results + this.messages.push({ + role: 'user', + content: toolResults + }) + + // Check for task completion in the API's response + const completionIndicator = response.content.find( + (content: any) => content.type === 'text' && content.text.toLowerCase().includes('task completed') + ) + + if (completionIndicator) { + console.log('=== Task completed as indicated by the API ===\n task completion messages:>> ', this.messages) + return this.messages + } + + // Continue with the task + return this.run('Continue with the task...') + } + + return this.messages + } catch (error) { + console.error('Error in run:', error) + throw error + } + } +} + +export default Sampling diff --git a/src/services/anthropic/anthropic.service.ts b/src/services/anthropic/anthropic.service.ts index 520c299..3ccd95a 100644 --- a/src/services/anthropic/anthropic.service.ts +++ b/src/services/anthropic/anthropic.service.ts @@ -7,10 +7,13 @@ import { getStorageManager } from '../storage' import { dataURItoBlob } from '@/common/utils' import { isMac } from '@/common/ts_utils' import { getNativeXYAPI } from '../xy' +import { ANTHROPIC } from '@/common/constant' + interface Coordinates { coords: Array<{ x: number; y: number }> isSinglePoint?: boolean } + type ScaleImageIfNeededResult = { buffer: ArrayBuffer scaleFactor: number @@ -31,7 +34,7 @@ class AnthropicService { private anthropic: Anthropic MAX_WIDTH = 1280 MAX_HEIGHT = 800 - AI_MODEL = 'claude-3-5-sonnet-20241022' + // AI_MODEL = 'claude-3-5-sonnet-20241022' MAX_TOKENS = 1024 MAX_PIXELS = 1191888 // Maximum total pixels @@ -87,7 +90,7 @@ class AnthropicService { async getPromptResponse(promptText: string): Promise { try { const message = await this.anthropic.messages.create({ - model: this.AI_MODEL, + model: ANTHROPIC.COMPUTER_USE_MODEL, max_tokens: this.MAX_TOKENS, messages: [{ role: 'user', content: promptText }] }) @@ -112,7 +115,7 @@ class AnthropicService { // Call Anthropic API const message = await this.anthropic.messages.create({ - model: this.AI_MODEL, + model: ANTHROPIC.COMPUTER_USE_MODEL, max_tokens: this.MAX_TOKENS, messages: [ { @@ -193,131 +196,6 @@ class AnthropicService { } } - async findCoordinates(mainImageBuffer: ArrayBuffer, promptText: string): Promise { - try { - // Scale images if needed - const mainImageData = await this.scaleImageIfNeeded(mainImageBuffer) - const mainImageBase64 = Buffer.from(mainImageBuffer).toString('base64') - - console.log('promptText:>>', promptText) - - const message = await this.anthropic.messages.create({ - model: 'claude-3-5-sonnet-20241022', - max_tokens: 1024, - messages: [ - { - role: 'user', - content: [ - { - type: 'text', - //Only screenshot input. -> Find based on description - text: promptText // 'Find the center of the Sign Up button. Answer with x,y|||' - }, - { - type: 'image', - source: { - type: 'base64', - media_type: 'image/png', - data: mainImageBase64 - } - } - ] - } - ] - }) - - const responseText = message.content[0].type === 'text' ? message.content[0].text : '' - console.log("Claude's response:", responseText) - - // Check if image was found - if (responseText.toLowerCase().includes('not found')) { - console.log('Reference image could not be found in the main image') - throw new Error('Reference image could not be found in the main image') - } - - // Parse coordinates - const { coords: scaledCoords, isSinglePoint } = this.parseCoordinates(responseText) - if (scaledCoords.length === 0) { - console.log('Could not parse coordinates') - throw new Error('Could not parse coordinates') - } - - // Scale coordinates back to original image size - const originalCoords = scaledCoords.map((coord) => ({ - x: Math.round(coord.x / mainImageData.scaleFactor), - y: Math.round(coord.y / mainImageData.scaleFactor) - })) - - // Log coordinates based on mode - console.log('Found coordinates:') - if (isSinglePoint) { - console.log(`Center (scaled): ${scaledCoords[0].x}, ${scaledCoords[0].y}`) - console.log(`Center (original): ${originalCoords[0].x}, ${originalCoords[0].y}`) - } else { - console.log(`Top-left (scaled): ${scaledCoords[0].x}, ${scaledCoords[0].y}`) - console.log(`Top-left (original): ${originalCoords[0].x}, ${originalCoords[0].y}`) - console.log(`Bottom-right (scaled): ${scaledCoords[1].x}, ${scaledCoords[1].y}`) - console.log(`Bottom-right (original): ${originalCoords[1].x}, ${originalCoords[1].y}`) - } - console.log(`Scale factor used: ${mainImageData.scaleFactor}`) - - // Create SVG marker based on mode - let svg - if (isSinglePoint) { - svg = ` - - - X - - ` - } else { - svg = ` - - - - - TL - BR - - ` - } - - // TODO: uncomment and use the following code - // // Composite the SVG marker onto the original image - // const outputPath = path.join(path.dirname(mainImagePath), `matched_${path.basename(mainImagePath)}`) - // await sharp(mainImageBuffer) - // .composite([ - // { - // input: Buffer.from(svg), - // top: 0, - // left: 0 - // } - // ]) - // .toFile(outputPath) - - // // If scaling was applied, save the scaled version for reference - // if (mainImageData.scaleFactor < 1) { - // const scaledOutputPath = path.join(path.dirname(mainImagePath), `scaled_${path.basename(mainImagePath)}`) - // // await fs.writeFile(scaledOutputPath, mainImageData.buffer); - // // console.log(`Scaled image saved to: ${scaledOutputPath}`); - // } - - console.log(`Image processed successfully!`) - return { coords: originalCoords, isSinglePoint } - // console.log(`Output file: ${outputPath}`) - } catch (error) { - console.error('Error getting response from Anthropic:', error) - throw this.uivError(error) - } - } /* * Process image * @@ -371,7 +249,7 @@ class AnthropicService { // Call Anthropic API const message = await this.anthropic.messages.create({ - model: 'claude-3-5-sonnet-20241022', + model: ANTHROPIC.COMPUTER_USE_MODEL, max_tokens: 1024, messages: [ { @@ -498,7 +376,7 @@ class AnthropicService { // Call Computer Use API const computerUseResponse = await this.anthropic.beta.messages.create({ - model: 'claude-3-5-sonnet-20241022', + model: ANTHROPIC.COMPUTER_USE_MODEL, max_tokens: 1024, tools: [ {