Merged

Changes from all commits

52 commits
7dcbf2f
tmp
leehuwuj Oct 8, 2024
1e2502a
update private file handler
leehuwuj Oct 11, 2024
e12bd29
enhance code
leehuwuj Oct 11, 2024
1cef23c
reduce complexity
leehuwuj Oct 11, 2024
5bd3591
fix mypy
leehuwuj Oct 11, 2024
c8a9472
fix mypy
leehuwuj Oct 11, 2024
5fd25f6
remove comment
leehuwuj Oct 11, 2024
a4d3d36
support upload file and enhance interpreter tool
leehuwuj Oct 14, 2024
6efadd4
fix blocking stream event
leehuwuj Oct 14, 2024
3e82be7
fix mypy
leehuwuj Oct 14, 2024
393a926
Merge remote-tracking branch 'origin/main' into feat/upload-file-sandbox
leehuwuj Oct 14, 2024
9602c6c
add changeset and fix mypy after merge
leehuwuj Oct 14, 2024
985cb26
fix mypy
leehuwuj Oct 14, 2024
9a4c0a3
enhance code
leehuwuj Oct 14, 2024
2efc727
typing
leehuwuj Oct 14, 2024
249edf5
wording
leehuwuj Oct 15, 2024
22cd958
exclude indexing private csv file if code executor tool is enabled
leehuwuj Oct 15, 2024
30e408b
remove file content and duplicated file id
leehuwuj Oct 15, 2024
94b338a
simpler file upload
leehuwuj Oct 15, 2024
6bb7a30
support for TS
leehuwuj Oct 15, 2024
bbf321f
support file upload for artifact in TS
leehuwuj Oct 15, 2024
852e6ec
enhance file path
leehuwuj Oct 15, 2024
5ae6b57
enhance code
leehuwuj Oct 15, 2024
c64e2ba
revise vercel streaming
leehuwuj Oct 15, 2024
36cdb1e
remove redundant id
leehuwuj Oct 15, 2024
e0921fe
add show file widget to the
leehuwuj Oct 15, 2024
a3c1c55
allow upload file with empty index store
leehuwuj Oct 15, 2024
bae12e6
Merge branch 'main' into feat/upload-file-sandbox
marcusschiesser Oct 15, 2024
7d9dee2
add data scientist use case
marcusschiesser Oct 15, 2024
3b91e7b
use GPT4o model for data scientist and code artifact
marcusschiesser Oct 15, 2024
954113e
update comments
leehuwuj Oct 15, 2024
624aea7
use previewcard to render documents
marcusschiesser Oct 15, 2024
788fab0
fix: UI overlap, key warning, wrong filename and url in markdown
thucpn Oct 16, 2024
0f56092
use div as tag wrapper for message
thucpn Oct 16, 2024
051b3cf
update ui
leehuwuj Oct 16, 2024
4f92ce2
update for python
leehuwuj Oct 16, 2024
db025d9
Merge remote-tracking branch 'origin/main' into lee/enhance-code
leehuwuj Oct 16, 2024
c5a381f
fix missing change for FE
leehuwuj Oct 16, 2024
f825a8c
update python
leehuwuj Oct 16, 2024
8b8afa0
fix python typing
leehuwuj Oct 17, 2024
659d0c7
update for ts
leehuwuj Oct 17, 2024
0655a4e
add missing filetype and filesize
leehuwuj Oct 17, 2024
9eb8468
update comments
leehuwuj Oct 17, 2024
c4e2325
refactor code
leehuwuj Oct 17, 2024
7dd9c86
remove redundant code
leehuwuj Oct 17, 2024
3a82df0
remove redundant code
leehuwuj Oct 17, 2024
af83b50
fix duplicated file ext in file name
leehuwuj Oct 17, 2024
a8c819b
fix llamacloud log
leehuwuj Oct 17, 2024
f84db97
enhance code
leehuwuj Oct 17, 2024
e4748df
show error if uploading to sandbox failed
leehuwuj Oct 17, 2024
98dc945
add change set
leehuwuj Oct 17, 2024
48dcc51
Update .changeset/selfish-tips-lie.md
marcusschiesser Oct 17, 2024
5 changes: 5 additions & 0 deletions .changeset/selfish-tips-lie.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Simplify and unify handling file uploads
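For context, here is a minimal sketch (not part of the diff) of the unified save path this changeset refers to. It is based on the FileService.save_file(...) calls and the DocumentFile fields visible in the diffs below; the payload, file name, and output directory are illustrative assumptions.

import base64
import uuid

# Import path taken from the Python diff below (app.services.file).
from app.services.file import DocumentFile, FileService

# Decode a base64 payload (e.g. an uploaded file or an e2b artifact)...
buffer = base64.b64decode("aGVsbG8sIHdvcmxkIQ==")  # illustrative payload

# ...and persist it through the single service call used across the templates.
document_file: DocumentFile = FileService.save_file(
    buffer,
    file_name=f"example_{uuid.uuid4()}.txt",  # illustrative name
    save_dir="output/tools",
)

# DocumentFile exposes at least name and url, which the interpreter tool
# below passes on in its results.
print(document_file.name, document_file.url)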
22 changes: 12 additions & 10 deletions templates/components/engines/python/agent/tools/interpreter.py
@@ -4,7 +4,7 @@
import uuid
from typing import List, Optional

from app.engine.utils.file_helper import FileMetadata, save_file
from app.services.file import DocumentFile, FileService
from e2b_code_interpreter import CodeInterpreter
from e2b_code_interpreter.models import Logs
from llama_index.core.tools import FunctionTool
@@ -32,7 +32,7 @@ class E2BCodeInterpreter:
output_dir = "output/tools"
uploaded_files_dir = "output/uploaded"

def __init__(self, api_key: str = None):
def __init__(self, api_key: Optional[str] = None):
if api_key is None:
api_key = os.getenv("E2B_API_KEY")
filesever_url_prefix = os.getenv("FILESERVER_URL_PREFIX")
@@ -72,15 +72,17 @@ def _init_interpreter(self, sandbox_files: List[str] = []):
self.interpreter.files.write(file_path, content)
logger.info(f"Uploaded {len(sandbox_files)} files to sandbox")

def _save_to_disk(self, base64_data: str, ext: str) -> FileMetadata:
def _save_to_disk(self, base64_data: str, ext: str) -> DocumentFile:
buffer = base64.b64decode(base64_data)

filename = f"{uuid.uuid4()}.{ext}" # generate a unique filename
output_path = os.path.join(self.output_dir, filename)
# Output from e2b doesn't have a name. Create a random name for it.
filename = f"e2b_file_{uuid.uuid4()}.{ext}"

file_metadata = save_file(buffer, file_path=output_path)
document_file = FileService.save_file(
buffer, file_name=filename, save_dir=self.output_dir
)

return file_metadata
return document_file

def _parse_result(self, result) -> List[InterpreterExtraResult]:
"""
@@ -99,12 +101,12 @@ def _parse_result(self, result) -> List[InterpreterExtraResult]:
for ext, data in zip(formats, results):
match ext:
case "png" | "svg" | "jpeg" | "pdf":
file_metadata = self._save_to_disk(data, ext)
document_file = self._save_to_disk(data, ext)
output.append(
InterpreterExtraResult(
type=ext,
filename=file_metadata.name,
url=file_metadata.url,
filename=document_file.name,
url=document_file.url,
)
)
case _:
@@ -111,13 +111,16 @@ export class InterpreterTool implements BaseTool<InterpreterParameter> {
// upload files to sandbox
if (input.sandboxFiles) {
console.log(`Uploading ${input.sandboxFiles.length} files to sandbox`);
for (const filePath of input.sandboxFiles) {
const fileName = path.basename(filePath);
const localFilePath = path.join(this.uploadedFilesDir, fileName);
const content = fs.readFileSync(localFilePath);
await this.codeInterpreter?.files.write(filePath, content);
try {
for (const filePath of input.sandboxFiles) {
const fileName = path.basename(filePath);
const localFilePath = path.join(this.uploadedFilesDir, fileName);
const content = fs.readFileSync(localFilePath);
await this.codeInterpreter?.files.write(filePath, content);
}
} catch (error) {
console.error("Got error when uploading files to sandbox", error);
}
console.log(`Uploaded ${input.sandboxFiles.length} files to sandbox`);
}
return this.codeInterpreter;
}
31 changes: 14 additions & 17 deletions templates/components/llamaindex/typescript/documents/helper.ts
@@ -3,6 +3,7 @@ import crypto from "node:crypto";
import fs from "node:fs";
import path from "node:path";
import { getExtractors } from "../../engine/loader";
import { DocumentFile } from "../streaming/annotations";

const MIME_TYPE_TO_EXT: Record<string, string> = {
"application/pdf": "pdf",
@@ -14,43 +15,38 @@ const MIME_TYPE_TO_EXT: Record<string, string> = {

const UPLOADED_FOLDER = "output/uploaded";

export type FileMetadata = {
id: string;
name: string;
url: string;
refs: string[];
};

export async function storeAndParseFile(
filename: string,
name: string,
fileBuffer: Buffer,
mimeType: string,
): Promise<FileMetadata> {
const fileMetadata = await storeFile(filename, fileBuffer, mimeType);
const documents: Document[] = await parseFile(fileBuffer, filename, mimeType);
): Promise<DocumentFile> {
const file = await storeFile(name, fileBuffer, mimeType);
const documents: Document[] = await parseFile(fileBuffer, name, mimeType);
// Update document IDs in the file metadata
fileMetadata.refs = documents.map((document) => document.id_ as string);
return fileMetadata;
file.refs = documents.map((document) => document.id_ as string);
return file;
}

export async function storeFile(
filename: string,
name: string,
fileBuffer: Buffer,
mimeType: string,
) {
const fileExt = MIME_TYPE_TO_EXT[mimeType];
if (!fileExt) throw new Error(`Unsupported document type: ${mimeType}`);

const fileId = crypto.randomUUID();
const newFilename = `${fileId}_${sanitizeFileName(filename)}`;
const newFilename = `${sanitizeFileName(name)}_${fileId}.${fileExt}`;
const filepath = path.join(UPLOADED_FOLDER, newFilename);
const fileUrl = await saveDocument(filepath, fileBuffer);
return {
id: fileId,
name: newFilename,
size: fileBuffer.length,
type: fileExt,
url: fileUrl,
refs: [] as string[],
} as FileMetadata;
} as DocumentFile;
}

export async function parseFile(
@@ -104,5 +100,6 @@ export async function saveDocument(filepath: string, content: string | Buffer) {
}

function sanitizeFileName(fileName: string) {
return fileName.replace(/[^a-zA-Z0-9_.-]/g, "_");
// Remove file extension and sanitize
return fileName.split(".")[0].replace(/[^a-zA-Z0-9_-]/g, "_");
}
38 changes: 19 additions & 19 deletions templates/components/llamaindex/typescript/documents/upload.ts
@@ -2,40 +2,40 @@ import { Document, LLamaCloudFileService, VectorStoreIndex } from "llamaindex";
import { LlamaCloudIndex } from "llamaindex/cloud/LlamaCloudIndex";
import fs from "node:fs/promises";
import path from "node:path";
import { FileMetadata, parseFile, storeFile } from "./helper";
import { DocumentFile } from "../streaming/annotations";
import { parseFile, storeFile } from "./helper";
import { runPipeline } from "./pipeline";

export async function uploadDocument(
index: VectorStoreIndex | LlamaCloudIndex | null,
filename: string,
name: string,
raw: string,
): Promise<FileMetadata> {
): Promise<DocumentFile> {
const [header, content] = raw.split(",");
const mimeType = header.replace("data:", "").replace(";base64", "");
const fileBuffer = Buffer.from(content, "base64");

// Store file
const fileMetadata = await storeFile(filename, fileBuffer, mimeType);
const fileMetadata = await storeFile(name, fileBuffer, mimeType);

// If the file is csv and has codeExecutorTool, we don't need to index the file.
if (mimeType === "text/csv" && (await hasCodeExecutorTool())) {
return fileMetadata;
}

let documentIds: string[] = [];
if (index instanceof LlamaCloudIndex) {
// trigger LlamaCloudIndex API to upload the file and run the pipeline
const projectId = await index.getProjectId();
const pipelineId = await index.getPipelineId();
try {
const documentId = await LLamaCloudFileService.addFileToPipeline(
projectId,
pipelineId,
new File([fileBuffer], filename, { type: mimeType }),
{ private: "true" },
);
// Update file metadata with document IDs
fileMetadata.refs = [documentId];
return fileMetadata;
documentIds = [
await LLamaCloudFileService.addFileToPipeline(
projectId,
pipelineId,
new File([fileBuffer], name, { type: mimeType }),
{ private: "true" },
),
];
} catch (error) {
if (
error instanceof ReferenceError &&
@@ -47,14 +47,14 @@ export async function uploadDocument(
}
throw error;
}
} else {
// run the pipeline for other vector store indexes
const documents: Document[] = await parseFile(fileBuffer, name, mimeType);
documentIds = await runPipeline(index, documents);
}

// run the pipeline for other vector store indexes
const documents: Document[] = await parseFile(fileBuffer, filename, mimeType);
// Update file metadata with document IDs
fileMetadata.refs = documents.map((document) => document.id_ as string);
// Run the pipeline
await runPipeline(index, documents);
fileMetadata.refs = documentIds;
return fileMetadata;
}

29 changes: 12 additions & 17 deletions templates/components/llamaindex/typescript/streaming/annotations.ts
@@ -3,17 +3,13 @@ import { MessageContent, MessageContentDetail } from "llamaindex";

export type DocumentFileType = "csv" | "pdf" | "txt" | "docx";

export type UploadedFileMeta = {
export type DocumentFile = {
id: string;
name: string;
url?: string;
refs?: string[];
};

export type DocumentFile = {
type: DocumentFileType;
size: number;
type: string;
url: string;
metadata: UploadedFileMeta;
refs?: string[];
Comment on lines +6 to +12

💡 Codebase verification

Inconsistency Found: id Field Missing in Python DocumentFile Class

The TypeScript DocumentFile type includes the id field, but the corresponding Python DocumentFile class does not. This discrepancy can lead to inconsistencies in data handling between the TypeScript and Python components.

Affected files:

  • templates/types/streaming/fastapi/app/services/file.py: class DocumentFile(BaseModel):
🔗 Analysis chain

Streamlined DocumentFile type, but consider cross-language consistency

The DocumentFile type has been updated to include more explicit properties, which improves clarity and usability. However, there's an existing question about the presence of the id field in TypeScript but not in Python.

To ensure consistency across languages, please verify if the id field is necessary in both TypeScript and Python implementations. Run the following script to check for DocumentFile or similar types in Python files:

If there are discrepancies, consider aligning the implementations or documenting the reason for the difference.
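As one possible alignment, here is a minimal sketch of a Pydantic model mirroring the TypeScript DocumentFile shape above. Only the class name DocumentFile(BaseModel) and its location in templates/types/streaming/fastapi/app/services/file.py come from the verification output; every field listed here, including whether id should exist on the Python side, is an assumption to verify against the actual implementation.

from typing import List, Optional

from pydantic import BaseModel


class DocumentFile(BaseModel):
    # Mirrors the TypeScript DocumentFile type; id is the field the
    # verification flags as missing on the Python side.
    id: str
    name: str
    type: str
    size: int
    url: str
    refs: Optional[List[str]] = None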

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for DocumentFile or similar types in Python files
rg --type python -i "class\s+(DocumentFile|UploadedFile|FileMetadata)"

Length of output: 103


Script:

#!/bin/bash
# Search for DocumentFile or similar types in Python files
rg -g "*.py" -i "class\s+(DocumentFile|UploadedFile|FileMetadata)"

Length of output: 343

};

type Annotation = {
@@ -30,7 +26,7 @@ export function isValidMessages(messages: Message[]): boolean {
export function retrieveDocumentIds(messages: Message[]): string[] {
// retrieve document Ids from the annotations of all messages (if any)
const documentFiles = retrieveDocumentFiles(messages);
return documentFiles.map((file) => file.metadata?.refs || []).flat();
return documentFiles.map((file) => file.refs || []).flat();
}

export function retrieveDocumentFiles(messages: Message[]): DocumentFile[] {
@@ -63,16 +59,15 @@ export function retrieveMessageContent(messages: Message[]): MessageContent {
}

function getFileContent(file: DocumentFile): string {
const fileMetadata = file.metadata;
let defaultContent = `=====File: ${fileMetadata.name}=====\n`;
let defaultContent = `=====File: ${file.name}=====\n`;
// Include file URL if it's available
const urlPrefix = process.env.FILESERVER_URL_PREFIX;
let urlContent = "";
if (urlPrefix) {
if (fileMetadata.url) {
urlContent = `File URL: ${fileMetadata.url}\n`;
if (file.url) {
urlContent = `File URL: ${file.url}\n`;
} else {
urlContent = `File URL (instruction: do not update this file URL yourself): ${urlPrefix}/output/uploaded/${fileMetadata.name}\n`;
urlContent = `File URL (instruction: do not update this file URL yourself): ${urlPrefix}/output/uploaded/${file.name}\n`;
}
} else {
console.warn(
@@ -82,11 +77,11 @@ function getFileContent(file: DocumentFile): string {
defaultContent += urlContent;

// Include document IDs if it's available
if (fileMetadata.refs) {
defaultContent += `Document IDs: ${fileMetadata.refs}\n`;
if (file.refs) {
defaultContent += `Document IDs: ${file.refs}\n`;
}
// Include sandbox file paths
const sandboxFilePath = `/tmp/${fileMetadata.name}`;
const sandboxFilePath = `/tmp/${file.name}`;
defaultContent += `Sandbox file path (instruction: only use sandbox path for artifact or code interpreter tool): ${sandboxFilePath}\n`;

return defaultContent;
8 changes: 6 additions & 2 deletions templates/components/routers/python/sandbox.py
@@ -172,10 +172,14 @@ def _download_cell_results(cell_results: Optional[List]) -> List[Dict[str, str]]
data = result[ext]

if ext in ["png", "svg", "jpeg", "pdf"]:
file_path = os.path.join("output", "tools", f"{uuid.uuid4()}.{ext}")
base64_data = data
buffer = base64.b64decode(base64_data)
file_meta = save_file(content=buffer, file_path=file_path)
file_name = f"{uuid.uuid4()}.{ext}"
file_meta = save_file(
content=buffer,
file_name=file_name,
save_dir=os.path.join("output", "tools"),
)
output.append(
{
"type": ext,