Centralize validation, incidental touchups

gleachkr · gleachkr · commit 98ac7cd60e66 · 2025-10-10T13:45:02.000-04:00
diff --git a/src/pdf2sqlite/abstract.py b/src/pdf2sqlite/abstract.py
@@ -1,36 +1,49 @@
 import base64
 import litellm
+from .view import task_view
+from rich.markdown import Markdown
+from rich.panel import Panel
 import sys
 
-def abstract(title, pdf_bytes, model):
+def systemPrompt(title):
+    return (
+            " You are an AI document summarizer. "
+            "You will be given a short PDF. "
+            f"This PDF contains the first few pages of a document titled {title}. "
+            "Give a concise one-paragraph description of the overall topic and contents of the document."
+            )
 
-    if not litellm.utils.supports_pdf_input(model):
-        sys.exit(f"Aborting. The model supplied, `{model}` doesn't support PDF input!")
+def abstract(title, pdf_bytes, model, live, tasks):
 
     base64_string = base64.b64encode(pdf_bytes).decode("utf-8")
 
     response = litellm.completion(
+            stream = True,
             model = model,
             messages = [ { 
                "role" : "system",
-                  "content": f"""
-You are an AI document summarizer. You will be given a short PDF. This PDF contains the first few pages of a document titled {title}. Give a concise one-paragraph description of the overall topic and contents of the document.
-"""
+                  "content": systemPrompt(title)
              },
             {
                 "role": "user",
                 "content": [                    
                     {
-                        "type": "image_url",
-                        "image_url":  f"data:application/pdf;base64,{base64_string}"
+                        "type": "text",
+                        "text" : "Please summarize this page."
                     },
                     {
-                        "type": "text",
-                        "text" : "Please summarize this PDF."
+                        "type": "file",
+                        "file":  {
+                            "file_data": f"data:application/pdf;base64,{base64_string}"
+                        },
                     },
-
                 ],
             }])
 
-    return response.choices[0].message.content
+    description = ""
+    for chunk in response:
+        description = description + (chunk.choices[0].delta.content or "")
+        live.update(task_view(title, tasks + [Panel(Markdown(description))]))
+
+    return description
 
diff --git a/src/pdf2sqlite/describe_figure.py b/src/pdf2sqlite/describe_figure.py
@@ -1,6 +1,5 @@
 import base64
 import litellm
-import sys
 from .view import task_view
 from rich.markdown import Markdown
 from rich.panel import Panel
@@ -28,9 +27,6 @@ def describe(image_bytes, mimetype, model, live, title, tasks):
     # previous gists could supply additional context, but let's try it
     # context-free to start
 
-    if not litellm.utils.supports_vision(model=model):
-        sys.exit(f"Aborting. The model supplied, `{model}` doesn't support image inputs!")
-
     base64_string = base64.b64encode(image_bytes).decode("utf-8")
 
     response = litellm.completion(
diff --git a/src/pdf2sqlite/pdf2sqlite.py b/src/pdf2sqlite/pdf2sqlite.py
@@ -1,7 +1,6 @@
 import os
 import io
-import base64
-import sys
+from .validation import validate_args
 import sqlite3
 from sqlite3 import Connection, Cursor
 from PIL import Image
@@ -21,16 +20,17 @@
 from .describe_figure import describe
 from .view import task_view, fresh_view
 
-def generate_description(title : str, args : Namespace, reader : PdfReader):
+def generate_description(title : str, args : Namespace, reader : PdfReader, live: Live):
     new_pdf = PdfWriter(None)
     pages = reader.pages[:10]
     for i, page in enumerate(pages):
         new_pdf.insert_page(page, i)
     pdf_bytes = io.BytesIO()
     new_pdf.write(pdf_bytes)
     pdf_bytes = pdf_bytes.getvalue()
-    description = abstract(title, pdf_bytes, args.abstracter)
-    print(f"generated description of PDF: \"{description}\"")
+    tasks = ["Generating PDF Description"]
+    live.update(task_view(title, tasks))
+    description = abstract(title, pdf_bytes, args.abstracter, live, tasks)
     return description
 
 def insert_pdf_by_name(title : str, description : str | None, cursor : Cursor):
@@ -204,13 +204,13 @@ def insert_pdf(args : Namespace, the_pdf : str , live : Live, cursor : Cursor, d
 
     gists = [] # these are the page by page gists. We keep them around so that they can provide context for later gists
 
-    description = generate_description(title, args, reader) if args.abstracter else None
+    description = generate_description(title, args, reader, live) if args.abstracter else None
 
     pdf_id = insert_pdf_by_name(title, description, cursor)
 
     db.commit()
 
-    toc_and_sections = extract_toc_and_sections(reader)
+    toc_and_sections = extract_toc_and_sections(reader, live)
 
     if toc_and_sections['sections']:
         insert_sections(toc_and_sections['sections'], pdf_id, cursor)
@@ -232,18 +232,6 @@ def insert_pdf(args : Namespace, the_pdf : str , live : Live, cursor : Cursor, d
         insert_page(page, rich_tables, live, pdf_id, cursor, args, gists, title, description)
         db.commit()
 
-def validate_pdf(the_pdf : str):
-    with open(the_pdf, "rb") as pdf:
-        header = pdf.read(4)
-        if header != b'%PDF':
-            sys.exit(f"Aborting. The file {the_pdf} isn't a valid PDF!")
-
-def validate_database(the_db : str):
-    with open(the_db, "rb") as database:
-        #validate input
-        header = database.read(6)
-        if header != b'SQLite':
-            sys.exit(f"Aborting. The file {the_db} isn't a valid SQLite database!")
 
 def main():
     parser = argparse.ArgumentParser(
@@ -286,12 +274,7 @@ def nonnegative_int(value):
         # zero disables
         pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = args.decompression_limit
 
-    for pdf in args.pdfs:
-        validate_pdf(pdf)
-
-    #validate database
-    if os.path.exists(args.database):
-        validate_database(args.database)
+    validate_args(args)
 
     with Live(fresh_view(), refresh_per_second=4) as live:
         try:
diff --git a/src/pdf2sqlite/summarize.py b/src/pdf2sqlite/summarize.py
@@ -1,6 +1,5 @@
 import base64
 import litellm
-import sys
 from .view import task_view
 from rich.markdown import Markdown
 from rich.panel import Panel
@@ -10,38 +9,37 @@ def system_prompt(page_nu, title, description, gists):
     giststring = ""
     if gists:
         gist_base = page_nu - len(gists)
-        giststring = "Here are the summaries of preceeding pages that you have access to. Try not to repeat information from these"
+        giststring = ("Here are the summaries of preceeding pages that you have access to. "
+                      "Try not to repeat information from these")
         for index, gist in enumerate(gists):
             giststring += f"<gist page_nu={index + gist_base}>{gist}</gist>"
 
     descstring = ""
     if description:
-        descstring = (
-            "Here is a description of the document. "
-            "You MUST NOT repeat information that is already in this description. "
-            "Describe what is on your page, not overall document features."  
-            f"<description>{description}</description>"
-        )
+        descstring = ("Here is a description of the document. "
+                      "You MUST NOT repeat information that is already in this description. "
+                      "Describe what is on your page, not overall document features."  
+                      f"<description>{description}</description>")
 
-    return f"""
-You are an AI document summarizer. You will be given a one page PDF. This PDF is page {page_nu} of a document titled {title}. You must accurately and concisely report the contents of this PDF using a single sentence. Your summary needs to be searchable, so include any important keywords that describe the content. {descstring} {giststring}
-"""
+    return ("You are an AI document summarizer. "
+            "You will be given a one page PDF. "
+            f"This PDF is page {page_nu} of a document titled {title}. "
+            "You must accurately and concisely report the contents of this PDF using a single sentence. "
+            "Your summary needs to be searchable, so include any important keywords that describe the content. "
+            f"{descstring} {giststring}")
 
 def summarize(gists, description, page_nu, title, page_bytes, model, live, tasks):
     # previous gists could supply additional context, but let's try it
     # context-free to start
 
-    if not litellm.utils.supports_pdf_input(model):
-        sys.exit(f"Aborting. The model supplied, `{model}` doesn't support PDF input!")
-
     base64_string = base64.b64encode(page_bytes).decode("utf-8")
 
     response = litellm.completion(
             stream = True,
             model = model,
             messages = [ { 
                "role" : "system",
-                  "content": system_prompt(page_nu, title, description, gists)
+               "content": system_prompt(page_nu, title, description, gists)
              },
             {
                 "role": "user",
diff --git a/src/pdf2sqlite/validation.py b/src/pdf2sqlite/validation.py
@@ -0,0 +1,40 @@
+import sys
+import os
+from argparse import Namespace
+import litellm
+
+def validate_args(args: Namespace):
+    for pdf in args.pdfs:
+        validate_pdf(pdf)
+
+    if os.path.exists(args.database):
+        validate_database(args.database)
+
+    validate_llms(args)
+
+def validate_pdf(the_pdf : str):
+    with open(the_pdf, "rb") as pdf:
+        header = pdf.read(4)
+        if header != b'%PDF':
+            sys.exit(f"Aborting. The file {the_pdf} isn't a valid PDF!")
+
+def validate_database(the_db : str):
+    with open(the_db, "rb") as database:
+        #validate input
+        header = database.read(6)
+        if header != b'SQLite':
+            sys.exit(f"Aborting. The file {the_db} isn't a valid SQLite database!")
+
+def validate_llms(args : Namespace):
+
+    if (args.vision_model):
+        if not litellm.utils.supports_vision(args.vision_model):
+            sys.exit(f"Aborting. The vision model supplied, `{args.vision_model}` doesn't support image inputs!")
+
+    if (args.summarizer):
+        if not litellm.utils.supports_pdf_input(args.summarizer):
+            sys.exit(f"Aborting. The summarization model supplied, `{args.summarizer}` doesn't support PDF input!")
+
+    if (args.abstracter):
+        if not litellm.utils.supports_pdf_input(args.abstracter):
+            sys.exit(f"Aborting. The abstracter model supplied, `{args.abstracter}` doesn't support PDF input!")