Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions datastore/providers/postgres_datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,21 +112,32 @@ async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter):
Deletes rows in the table that match the filter.
"""

filters = "WHERE"
conditions = []
params = []
if filter.document_id:
filters += f" document_id = '{filter.document_id}' AND"
conditions.append("document_id = %s")
params.append(filter.document_id)
if filter.source:
filters += f" source = '{filter.source}' AND"
conditions.append("source = %s")
params.append(filter.source)
if filter.source_id:
filters += f" source_id = '{filter.source_id}' AND"
conditions.append("source_id = %s")
params.append(filter.source_id)
if filter.author:
filters += f" author = '{filter.author}' AND"
conditions.append("author = %s")
params.append(filter.author)
if filter.start_date:
filters += f" created_at >= '{filter.start_date}' AND"
conditions.append("created_at >= %s")
params.append(filter.start_date)
if filter.end_date:
filters += f" created_at <= '{filter.end_date}' AND"
filters = filters[:-4]
conditions.append("created_at <= %s")
params.append(filter.end_date)

if not conditions:
return

filters = "WHERE " + " AND ".join(conditions)

with self.client.cursor() as cur:
cur.execute(f"DELETE FROM {table} {filters}")
cur.execute(f"DELETE FROM {table} {filters}", params)
self.client.commit()
14 changes: 12 additions & 2 deletions scripts/process_zip/process_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,26 @@
DOCUMENT_UPSERT_BATCH_SIZE = 50


def _safe_extract(zip_file, target_dir):
    """Extract all members of *zip_file* into *target_dir*, rejecting path traversal.

    Every member name is resolved against the real path of *target_dir*
    before anything is written. If any member would land outside that
    directory, a ``ValueError`` is raised and nothing is extracted.

    Args:
        zip_file: An open archive exposing ``namelist()`` and ``extractall()``
            (e.g. ``zipfile.ZipFile``).
        target_dir: Directory to extract into.

    Raises:
        ValueError: If a member resolves to a path outside *target_dir*.
    """
    target_dir = os.path.realpath(target_dir)
    for member in zip_file.namelist():
        member_path = os.path.realpath(os.path.join(target_dir, member))
        # Compare whole path components rather than string prefixes: a prefix
        # test against ``target_dir + os.sep`` breaks when target_dir is a
        # filesystem root (it already ends with the separator).
        try:
            inside = os.path.commonpath([target_dir, member_path]) == target_dir
        except ValueError:
            # commonpath refuses paths that cannot be compared
            # (e.g. different drives on Windows) -- treat as outside.
            inside = False
        if not inside:
            raise ValueError(f"Attempted path traversal in zip: {member}")
    zip_file.extractall(target_dir)


async def process_file_dump(
filepath: str,
datastore: DataStore,
custom_metadata: dict,
screen_for_pii: bool,
extract_metadata: bool,
):
# create a ZipFile object and extract all the files into a directory named 'dump'
# create a ZipFile object and safely extract all the files into a directory named 'dump'
with zipfile.ZipFile(filepath) as zip_file:
zip_file.extractall("dump")
_safe_extract(zip_file, "dump")

documents = []
skipped_files = []
Expand Down
6 changes: 4 additions & 2 deletions server/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import secrets
from typing import Optional
import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile
Expand All @@ -21,11 +22,12 @@

bearer_scheme = HTTPBearer()
BEARER_TOKEN = os.environ.get("BEARER_TOKEN")
assert BEARER_TOKEN is not None
if not BEARER_TOKEN:
raise ValueError("BEARER_TOKEN environment variable is not set")


def validate_token(credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme)):
if credentials.scheme != "Bearer" or credentials.credentials != BEARER_TOKEN:
if credentials.scheme != "Bearer" or not secrets.compare_digest(credentials.credentials, BEARER_TOKEN):
raise HTTPException(status_code=401, detail="Invalid or missing token")
Comment on lines 29 to 31
Copy link

Copilot AI Mar 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

secrets.compare_digest will raise a TypeError if BEARER_TOKEN is None. Today that’s prevented via assert BEARER_TOKEN is not None, but asserts can be disabled (e.g., running Python with -O), which would turn a missing env var into a 500 instead of a deterministic startup failure or a 401. Prefer an explicit runtime check (raise a clear exception during startup / module import, or guard in validate_token) rather than relying on assert for required configuration.

Copilot uses AI. Check for mistakes.
return credentials

Expand Down
12 changes: 7 additions & 5 deletions services/file.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tempfile
from io import BufferedReader
from typing import Optional
from fastapi import UploadFile
Expand Down Expand Up @@ -98,11 +99,12 @@ async def extract_text_from_form_file(file: UploadFile):

file_stream = await file.read()

temp_file_path = "/tmp/temp_file"

# write the file to a temporary location
with open(temp_file_path, "wb") as f:
f.write(file_stream)
# write the file to a secure temporary location
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(file.filename or "")[1]
) as tmp:
tmp.write(file_stream)
temp_file_path = tmp.name

try:
extracted_text = extract_text_from_filepath(temp_file_path, mimetype)
Expand Down