Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inferring JSON schemas #1007

Merged
merged 3 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/src/components/BuiltinAgents.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { LinkCard } from '@astrojs/starlight/components';

### Builtin Agents

<LinkCard title="agent data" description="Infer the schema of data, query data from files" href="/genaiscript/reference/scripts/system#systemagent_data" />
<LinkCard title="agent docs" description="query the documentation" href="/genaiscript/reference/scripts/system#systemagent_docs" />
<LinkCard title="agent fs" description="query files to accomplish tasks" href="/genaiscript/reference/scripts/system#systemagent_fs" />
<LinkCard title="agent git" description="query a repository using Git to accomplish tasks. Provide all the context information available to execute git queries." href="/genaiscript/reference/scripts/system#systemagent_git" />
Expand Down
2 changes: 2 additions & 0 deletions docs/src/components/BuiltinTools.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ import { LinkCard } from '@astrojs/starlight/components';
### Builtin tools

<LinkCard title="fs_ask_file" description="Runs a LLM query over the content of a file. Use this tool to extract information from a file." href="/genaiscript/reference/scripts/system#systemfs_ask_file" />
<LinkCard title="fs_data_query" description="Query data in a file using GROQ syntax" href="/genaiscript/reference/scripts/system#systemfs_data_query" />
<LinkCard title="fs_diff_files" description="Computes a diff between two different files. Use git diff instead to compare versions of a file." href="/genaiscript/reference/scripts/system#systemfs_diff_files" />
<LinkCard title="fs_find_files" description="Finds file matching a glob pattern. Use pattern to specify a regular expression to search for in the file content. Be careful about asking too many files." href="/genaiscript/reference/scripts/system#systemfs_find_files" />
<LinkCard title="fs_infer_schema" description="Infers the JSON schema of a file" href="/genaiscript/reference/scripts/system#systemfs_infer_schema" />
<LinkCard title="fs_read_file" description="Reads a file as text from the file system. Returns undefined if the file does not exist." href="/genaiscript/reference/scripts/system#systemfs_read_file" />
<LinkCard title="git_branch_default" description="Gets the default branch using git." href="/genaiscript/reference/scripts/system#systemgit" />
<LinkCard title="git_branch_current" description="Gets the current branch using git." href="/genaiscript/reference/scripts/system#systemgit" />
Expand Down
121 changes: 121 additions & 0 deletions docs/src/content/docs/reference/scripts/system.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,43 @@ $`- You are concise, no yapping, no extra sentences, do not suggest to share tho
`````


### `system.agent_data`



Agent that can query data in files



`````js wrap title="system.agent_data"
// Script metadata: a short description of this system script.
system({
description: "Agent that can query data in files",
})

// Declares an agent through defAgent with, in order: its name ("data"),
// a description, the instruction prompt, and an options object.
defAgent(
"data",
"Infer the schema of data, query data from files",
`You are an expert data scientist that can answer questions about data in files.
Answer the question in <QUERY>.`,
{
// Identifiers of the system scripts handed to the agent.
system: [
"system",
"system.assistant",
"system.tools",
"system.python_code_interpreter",
"system.fs_find_files",
"system.fs_read_file",
// The two file tools this agent is built around: schema inference and data query.
"system.fs_infer_schema",
"system.fs_data_query",
"system.safety_harmful_content",
"system.safety_protected_material",
],
}
)

`````


### `system.agent_docs`

Agent that can query on the documentation.
Expand Down Expand Up @@ -399,6 +436,8 @@ defAgent(
"system.explanations",
"system.transcribe",
"system.video",
"system.safety_harmful_content",
"system.safety_protected_material",
],
}
)
Expand Down Expand Up @@ -863,6 +902,47 @@ defTool(
`````


### `system.fs_data_query`



A tool that can query data in a file

- tool `fs_data_query`: Query data in a file using GROQ syntax

`````js wrap title="system.fs_data_query"
system({
description: "A tool that can query data in a file",
})

defTool(
"fs_data_query",
"Query data in a file using GROQ syntax",
{
type: "object",
properties: {
filename: {
type: "string",
description: "The filename to query data from",
},
query: {
type: "string",
description: "The GROQ query to run on the data",
},
},
},
async (args) => {
const { context, query, filename } = args
context.log(`query ${query} in ${filename}`)
const data = await workspace.readData(filename)
const res = await parsers.GROQ(query, data)
return res
}
)

`````


### `system.fs_diff_files`

File Diff Files
Expand Down Expand Up @@ -1016,6 +1096,47 @@ defTool(
`````


### `system.fs_infer_schema`



Tools to infer the schema of a dataset or file

- tool `fs_infer_schema`: Infers the JSON schema of a file

`````js wrap title="system.fs_infer_schema"
system({
description: "Tools to infer the schema of a dataset or file",
})

defTool(
"fs_infer_schema",
"Infers the JSON schema of a file",
{
type: "object",
properties: {
filename: {
type: "string",
description: "The filename to infer the schema from",
},
},
required: ["filename"],
},
async (args) => {
const { context, filename } = args
context.log(`infer schema of ${filename}`)
const jsonData = await workspace.readData(filename)
if (jsonData === undefined)
return "Unabled to infer the schema from the data."

const schema = await JSONSchema.infer(jsonData)
return JSON.stringify(schema)
}
)

`````


### `system.fs_read_file`

File Read File
Expand Down
43 changes: 8 additions & 35 deletions packages/cli/src/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,18 @@ import { HTMLToText } from "../../core/src/html"
import { isJSONLFilename, JSONLTryParse } from "../../core/src/jsonl"
import { parsePdf } from "../../core/src/pdf"
import { estimateTokens } from "../../core/src/tokens"
import { YAMLParse, YAMLStringify } from "../../core/src/yaml"
import { YAMLStringify } from "../../core/src/yaml"
import { resolveTokenEncoder } from "../../core/src/encoders"
import {
CSV_REGEX,
INI_REGEX,
JSON5_REGEX,
MD_REGEX,
MDX_REGEX,
PROMPTY_REGEX,
TOML_REGEX,
XLSX_REGEX,
XML_REGEX,
YAML_REGEX,
} from "../../core/src/constants"
import { MD_REGEX, PROMPTY_REGEX } from "../../core/src/constants"
import { promptyParse, promptyToGenAIScript } from "../../core/src/prompty"
import { basename, join } from "node:path"
import { CSVParse, CSVStringify, CSVToMarkdown } from "../../core/src/csv"
import { INIParse, INIStringify } from "../../core/src/ini"
import { TOMLParse } from "../../core/src/toml"
import { JSON5parse, JSON5Stringify } from "../../core/src/json5"
import { XLSXParse } from "../../core/src/xlsx"
import { CSVStringify, CSVToMarkdown } from "../../core/src/csv"
import { INIStringify } from "../../core/src/ini"
import { JSON5Stringify } from "../../core/src/json5"
import { jinjaRender } from "../../core/src/jinja"
import { splitMarkdown } from "../../core/src/frontmatter"
import { parseOptionsVars } from "./vars"
import { XMLParse } from "../../core/src/xml"
import { dataTryParse } from "../../core/src/data"
import { resolveFileContent } from "../../core/src/file"

/**
Expand Down Expand Up @@ -132,22 +119,8 @@ export async function parseAnyToJSON(
file: string,
options: { format: string }
) {
let data: any
if (XLSX_REGEX.test(file)) data = await XLSXParse(await readFile(file))
else {
const src = await readFile(file, { encoding: "utf-8" })
if (CSV_REGEX.test(file)) data = CSVParse(src)
else if (INI_REGEX.test(file)) data = INIParse(src)
else if (TOML_REGEX.test(file)) data = TOMLParse(src)
else if (JSON5_REGEX.test(file))
data = JSON5parse(src, { repair: true })
else if (YAML_REGEX.test(file)) data = YAMLParse(src)
else if (XML_REGEX.test(file)) data = XMLParse(src)
else if (MD_REGEX.test(file) || MDX_REGEX.test(file))
data = YAMLParse(splitMarkdown(src).frontmatter)
else throw new Error("Unsupported file format")
}

const data = await dataTryParse({ filename: file })
if (!data) throw new Error(`Unknown data format for ${file}`)
let out: string
switch (options?.format?.toLowerCase() || "") {
case "yaml":
Expand Down
55 changes: 55 additions & 0 deletions packages/core/src/data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { readFile } from "fs/promises"
import {
XLSX_REGEX,
CSV_REGEX,
INI_REGEX,
TOML_REGEX,
JSON5_REGEX,
YAML_REGEX,
XML_REGEX,
MD_REGEX,
MDX_REGEX,
} from "./constants"
import { CSVParse, CSVTryParse } from "./csv"
import { splitMarkdown } from "./frontmatter"
import { INIParse, INITryParse } from "./ini"
import { JSON5parse } from "./json5"
import { TOMLParse } from "./toml"
import { XLSXParse } from "./xlsx"
import { XMLParse } from "./xml"
import { YAMLParse } from "./yaml"
import { resolveFileContent } from "./file"
import { TraceOptions } from "./trace"
import { host } from "./host"
import { fromBase64 } from "./base64"

/**
 * Loads a workspace file and parses it into a data value, choosing the
 * parser from the file name.
 *
 * Supported formats, tested in this order: XLSX, CSV, INI, TOML, JSON5,
 * YAML, XML, and Markdown/MDX (only the frontmatter is parsed, as YAML).
 *
 * NOTE(review): only the CSV and INI branches use `*TryParse` helpers; the
 * other branches call the plain parse functions, which presumably throw on
 * malformed input despite this function's name — confirm.
 *
 * @param file the file to parse; it is passed to `resolveFileContent` before
 * `content` and `encoding` are read
 * @param options forwarded to the CSV, INI and XML parsers
 * @returns the parsed data, or `undefined` when the file name matches none of
 * the known formats
 */
export async function dataTryParse(
    file: WorkspaceFile,
    options?: TraceOptions & XMLParseOptions & INIParseOptions & CSVParseOptions
): Promise<any> {
    await resolveFileContent(file)
    const { filename: name, content: text, encoding } = file

    // Spreadsheets are binary: use the base64 payload when present,
    // otherwise read the file through the host.
    if (XLSX_REGEX.test(name)) {
        const bytes =
            encoding === "base64" ? fromBase64(text) : await host.readFile(name)
        return await XLSXParse(bytes)
    }

    // Text formats, matched on the file name.
    if (CSV_REGEX.test(name)) return CSVTryParse(text, options)
    if (INI_REGEX.test(name)) return INITryParse(text, options)
    if (TOML_REGEX.test(name)) return TOMLParse(text)
    if (JSON5_REGEX.test(name)) return JSON5parse(text, { repair: true })
    if (YAML_REGEX.test(name)) return YAMLParse(text)
    if (XML_REGEX.test(name)) return XMLParse(text, options)

    // Markdown flavours: the data is the YAML frontmatter.
    if (MD_REGEX.test(name) || MDX_REGEX.test(name))
        return YAMLParse(splitMarkdown(text).frontmatter)

    // unknown format
    return undefined
}
24 changes: 18 additions & 6 deletions packages/core/src/filesystem.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { JSONLineCache } from "./cache"
import { DOT_ENV_REGEX } from "./constants"
import { CSVParse, CSVTryParse } from "./csv"
import { dataTryParse } from "./data"
import { NotSupportedError, errorMessage } from "./error"
import { resolveFileContent } from "./file"
import { readText, writeText } from "./fs"
import { resolveFileContent, toWorkspaceFile } from "./file"
import { filePathOrUrlToWorkspaceFile, readText, writeText } from "./fs"
import { host } from "./host"
import { INITryParse } from "./ini"
import { JSON5parse, JSON5TryParse } from "./json5"
Expand All @@ -12,7 +13,7 @@ import { XMLParse, XMLTryParse } from "./xml"
import { YAMLParse, YAMLTryParse } from "./yaml"

export function createFileSystem(): Omit<WorkspaceFileSystem, "grep"> {
const fs: Omit<WorkspaceFileSystem, "grep"> = {
const fs = {
findFiles: async (glob, options) => {
const { readText } = options || {}
const names = (
Expand Down Expand Up @@ -61,12 +62,16 @@ export function createFileSystem(): Omit<WorkspaceFileSystem, "grep"> {
}
return file
},
readJSON: async (f: string | Awaitable<WorkspaceFile>) => {
readJSON: async (
f: string | Awaitable<WorkspaceFile>
): Promise<any> => {
const file = await fs.readText(f)
const res = JSON5TryParse(file.content, undefined)
return res
},
readYAML: async (f: string | Awaitable<WorkspaceFile>) => {
readYAML: async (
f: string | Awaitable<WorkspaceFile>
): Promise<any> => {
const file = await fs.readText(f)
const res = YAMLTryParse(file.content)
return res
Expand Down Expand Up @@ -95,12 +100,19 @@ export function createFileSystem(): Omit<WorkspaceFileSystem, "grep"> {
const res = INITryParse(file.content, options?.defaultValue)
return res
},
// Awaits the file reference, converts it with toWorkspaceFile and hands it
// to dataTryParse; resolves to whatever dataTryParse returns.
readData: async (
    f: string | Awaitable<WorkspaceFile>
): Promise<any> => {
    const resolved = await f
    return await dataTryParse(toWorkspaceFile(resolved))
},
cache: async (name: string) => {
if (!name) throw new NotSupportedError("missing cache name")
const res = JSONLineCache.byName<any, any>(name)
return res
},
}
} satisfies Omit<WorkspaceFileSystem, "grep">
;(fs as any).readFile = readText
return Object.freeze(fs)
}
24 changes: 24 additions & 0 deletions packages/core/src/genaisrc/system.agent_data.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Script metadata: a short description of this system script.
system({
description: "Agent that can query data in files",
})

// Declares an agent through defAgent with, in order: its name ("data"),
// a description, the instruction prompt, and an options object.
defAgent(
"data",
"Infer the schema of data, query data from files",
`You are an expert data scientist that can answer questions about data in files.
Answer the question in <QUERY>.`,
{
// Identifiers of the system scripts handed to the agent.
system: [
"system",
"system.assistant",
"system.tools",
"system.python_code_interpreter",
"system.fs_find_files",
"system.fs_read_file",
// The two file tools this agent is built around: schema inference and data query.
"system.fs_infer_schema",
"system.fs_data_query",
"system.safety_harmful_content",
"system.safety_protected_material",
],
}
)
2 changes: 2 additions & 0 deletions packages/core/src/genaisrc/system.agent_video.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ defAgent(
"system.explanations",
"system.transcribe",
"system.video",
"system.safety_harmful_content",
"system.safety_protected_material",
],
}
)
Loading
Loading