Skip to content

Commit 98ac7cd

Browse files
committed
Centralize validation, incidental touchups
1 parent c6572d6 commit 98ac7cd

File tree

5 files changed

+86
-56
lines changed

5 files changed

+86
-56
lines changed

src/pdf2sqlite/abstract.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,49 @@
11
import base64
22
import litellm
3+
from .view import task_view
4+
from rich.markdown import Markdown
5+
from rich.panel import Panel
36
import sys
47

5-
def abstract(title, pdf_bytes, model):
8+
def systemPrompt(title):
    """Build the system prompt used when asking a model to abstract a document.

    Args:
        title: Title of the document; interpolated verbatim into the prompt.

    Returns:
        A single-line prompt string instructing the model to produce a
        one-paragraph description of the document.
    """
    # The leading space on the first sentence is kept exactly as it was so
    # the prompt sent to the model is unchanged.
    sentences = [
        " You are an AI document summarizer. ",
        "You will be given a short PDF. ",
        f"This PDF contains the first few pages of a document titled {title}. ",
        "Give a concise one-paragraph description of the overall topic and contents of the document.",
    ]
    return "".join(sentences)
615

7-
if not litellm.utils.supports_pdf_input(model):
8-
sys.exit(f"Aborting. The model supplied, `{model}` doesn't support PDF input!")
16+
def abstract(title, pdf_bytes, model, live, tasks):
    """Stream a one-paragraph description of a PDF excerpt from an LLM.

    The PDF is sent base64-encoded as a ``file`` content part, and the
    response is consumed as a stream so the partially generated text can be
    shown while it arrives.

    Args:
        title: Document title; used in the system prompt and passed to
            ``task_view``.
        pdf_bytes: Raw bytes of the PDF to describe.
        model: Model identifier handed to ``litellm.completion``.
        live: Object whose ``update`` method receives the refreshed view
            (presumably a ``rich.live.Live`` -- confirm against the caller).
        tasks: List of task entries already on display; the streaming panel
            is appended to a copy, so the caller's list is not mutated.

    Returns:
        The full description text accumulated from the stream.
    """
    base64_string = base64.b64encode(pdf_bytes).decode("utf-8")

    response = litellm.completion(
        stream = True,
        model = model,
        messages = [
            {
                "role": "system",
                "content": systemPrompt(title),
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        # The attachment is a multi-page excerpt, not a single
                        # page, so the request must match the system prompt.
                        "text": "Please summarize this PDF.",
                    },
                    {
                        "type": "file",
                        "file": {
                            "file_data": f"data:application/pdf;base64,{base64_string}"
                        },
                    },
                ],
            },
        ],
    )

    description = ""
    for chunk in response:
        # A chunk's delta content may be None; treat that as no new text.
        description = description + (chunk.choices[0].delta.content or "")
        live.update(task_view(title, tasks + [Panel(Markdown(description))]))

    return description
3649

src/pdf2sqlite/describe_figure.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import base64
22
import litellm
3-
import sys
43
from .view import task_view
54
from rich.markdown import Markdown
65
from rich.panel import Panel
@@ -28,9 +27,6 @@ def describe(image_bytes, mimetype, model, live, title, tasks):
2827
# previous gists could supply additional context, but let's try it
2928
# context-free to start
3029

31-
if not litellm.utils.supports_vision(model=model):
32-
sys.exit(f"Aborting. The model supplied, `{model}` doesn't support image inputs!")
33-
3430
base64_string = base64.b64encode(image_bytes).decode("utf-8")
3531

3632
response = litellm.completion(

src/pdf2sqlite/pdf2sqlite.py

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import os
22
import io
3-
import base64
4-
import sys
3+
from .validation import validate_args
54
import sqlite3
65
from sqlite3 import Connection, Cursor
76
from PIL import Image
@@ -21,16 +20,17 @@
2120
from .describe_figure import describe
2221
from .view import task_view, fresh_view
2322

24-
def generate_description(title : str, args : Namespace, reader : PdfReader):
23+
def generate_description(title : str, args : Namespace, reader : PdfReader, live: Live):
    """Describe a document from an excerpt of its opening pages.

    Copies up to the first ten pages of ``reader`` into a fresh in-memory
    PDF, announces the task on ``live``, and hands the excerpt to
    ``abstract`` using the model named by ``args.abstracter``.

    Returns:
        Whatever ``abstract`` returns for the excerpt.
    """
    excerpt = PdfWriter(None)
    for position, page in enumerate(reader.pages[:10]):
        excerpt.insert_page(page, position)

    # Serialize the excerpt to memory; nothing is written to disk.
    buffer = io.BytesIO()
    excerpt.write(buffer)

    steps = ["Generating PDF Description"]
    live.update(task_view(title, steps))
    return abstract(title, buffer.getvalue(), args.abstracter, live, steps)
3535

3636
def insert_pdf_by_name(title : str, description : str | None, cursor : Cursor):
@@ -204,13 +204,13 @@ def insert_pdf(args : Namespace, the_pdf : str , live : Live, cursor : Cursor, d
204204

205205
gists = [] # these are the page by page gists. We keep them around so that they can provide context for later gists
206206

207-
description = generate_description(title, args, reader) if args.abstracter else None
207+
description = generate_description(title, args, reader, live) if args.abstracter else None
208208

209209
pdf_id = insert_pdf_by_name(title, description, cursor)
210210

211211
db.commit()
212212

213-
toc_and_sections = extract_toc_and_sections(reader)
213+
toc_and_sections = extract_toc_and_sections(reader, live)
214214

215215
if toc_and_sections['sections']:
216216
insert_sections(toc_and_sections['sections'], pdf_id, cursor)
@@ -232,18 +232,6 @@ def insert_pdf(args : Namespace, the_pdf : str , live : Live, cursor : Cursor, d
232232
insert_page(page, rich_tables, live, pdf_id, cursor, args, gists, title, description)
233233
db.commit()
234234

235-
def validate_pdf(the_pdf : str):
236-
with open(the_pdf, "rb") as pdf:
237-
header = pdf.read(4)
238-
if header != b'%PDF':
239-
sys.exit(f"Aborting. The file {the_pdf} isn't a valid PDF!")
240-
241-
def validate_database(the_db : str):
242-
with open(the_db, "rb") as database:
243-
#validate input
244-
header = database.read(6)
245-
if header != b'SQLite':
246-
sys.exit(f"Aborting. The file {the_db} isn't a valid SQLite database!")
247235

248236
def main():
249237
parser = argparse.ArgumentParser(
@@ -286,12 +274,7 @@ def nonnegative_int(value):
286274
# zero disables
287275
pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = args.decompression_limit
288276

289-
for pdf in args.pdfs:
290-
validate_pdf(pdf)
291-
292-
#validate database
293-
if os.path.exists(args.database):
294-
validate_database(args.database)
277+
validate_args(args)
295278

296279
with Live(fresh_view(), refresh_per_second=4) as live:
297280
try:

src/pdf2sqlite/summarize.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import base64
22
import litellm
3-
import sys
43
from .view import task_view
54
from rich.markdown import Markdown
65
from rich.panel import Panel
@@ -10,38 +9,37 @@ def system_prompt(page_nu, title, description, gists):
109
giststring = ""
1110
if gists:
1211
gist_base = page_nu - len(gists)
13-
giststring = "Here are the summaries of preceeding pages that you have access to. Try not to repeat information from these"
12+
giststring = ("Here are the summaries of preceeding pages that you have access to. "
13+
"Try not to repeat information from these")
1414
for index, gist in enumerate(gists):
1515
giststring += f"<gist page_nu={index + gist_base}>{gist}</gist>"
1616

1717
descstring = ""
1818
if description:
19-
descstring = (
20-
"Here is a description of the document. "
21-
"You MUST NOT repeat information that is already in this description. "
22-
"Describe what is on your page, not overall document features."
23-
f"<description>{description}</description>"
24-
)
19+
descstring = ("Here is a description of the document. "
20+
"You MUST NOT repeat information that is already in this description. "
21+
"Describe what is on your page, not overall document features."
22+
f"<description>{description}</description>")
2523

26-
return f"""
27-
You are an AI document summarizer. You will be given a one page PDF. This PDF is page {page_nu} of a document titled {title}. You must accurately and concisely report the contents of this PDF using a single sentence. Your summary needs to be searchable, so include any important keywords that describe the content. {descstring} {giststring}
28-
"""
24+
return ("You are an AI document summarizer. "
25+
"You will be given a one page PDF. "
26+
f"This PDF is page {page_nu} of a document titled {title}. "
27+
"You must accurately and concisely report the contents of this PDF using a single sentence. "
28+
"Your summary needs to be searchable, so include any important keywords that describe the content. "
29+
f"{descstring} {giststring}")
2930

3031
def summarize(gists, description, page_nu, title, page_bytes, model, live, tasks):
3132
# previous gists could supply additional context, but let's try it
3233
# context-free to start
3334

34-
if not litellm.utils.supports_pdf_input(model):
35-
sys.exit(f"Aborting. The model supplied, `{model}` doesn't support PDF input!")
36-
3735
base64_string = base64.b64encode(page_bytes).decode("utf-8")
3836

3937
response = litellm.completion(
4038
stream = True,
4139
model = model,
4240
messages = [ {
4341
"role" : "system",
44-
"content": system_prompt(page_nu, title, description, gists)
42+
"content": system_prompt(page_nu, title, description, gists)
4543
},
4644
{
4745
"role": "user",

src/pdf2sqlite/validation.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import sys
2+
import os
3+
from argparse import Namespace
4+
import litellm
5+
6+
def validate_args(args: Namespace):
    """Run every start-up check on the parsed command-line arguments.

    Each input PDF is checked, then the database file (only when it already
    exists on disk), then the configured LLMs. The individual validators
    terminate the process via ``sys.exit`` on failure.
    """
    for candidate in args.pdfs:
        validate_pdf(candidate)

    database_path = args.database
    # A database path that does not exist yet is skipped, not rejected.
    if os.path.exists(database_path):
        validate_database(database_path)

    validate_llms(args)
14+
15+
def validate_pdf(the_pdf : str):
    """Abort the program unless ``the_pdf`` is a readable file starting with ``%PDF``.

    Args:
        the_pdf: Path of the file to check.

    Raises:
        SystemExit: If the file cannot be opened or read, or if its first
            four bytes are not the ``%PDF`` signature.
    """
    try:
        with open(the_pdf, "rb") as pdf:
            header = pdf.read(4)
    except OSError as err:
        # A missing or unreadable path gets the same clean abort as any
        # other validation failure, rather than a raw traceback.
        sys.exit(f"Aborting. The file {the_pdf} couldn't be read: {err}")

    if header != b'%PDF':
        sys.exit(f"Aborting. The file {the_pdf} isn't a valid PDF!")
20+
21+
def validate_database(the_db : str):
    """Abort the program unless ``the_db`` looks like a SQLite 3 database.

    A zero-length file is accepted, because SQLite opens such a file as an
    empty database. Any other file must begin with the full 16-byte SQLite 3
    header string.

    Args:
        the_db: Path of an existing database file to check.

    Raises:
        SystemExit: If the file cannot be opened or read, or if it is
            non-empty and does not start with the SQLite 3 header.
    """
    # Every SQLite 3 database file begins with these 16 bytes.
    sqlite_magic = b'SQLite format 3\x00'

    try:
        with open(the_db, "rb") as database:
            header = database.read(len(sqlite_magic))
    except OSError as err:
        # Covers e.g. a directory or an unreadable file at this path.
        sys.exit(f"Aborting. The file {the_db} couldn't be read: {err}")

    # An empty header means a zero-length file, which is allowed.
    if header and header != sqlite_magic:
        sys.exit(f"Aborting. The file {the_db} isn't a valid SQLite database!")
27+
28+
def validate_llms(args : Namespace):
    """Abort if any configured model lacks the input capability it needs.

    A model option that is unset (falsy) is skipped. The vision model is
    checked with ``litellm.utils.supports_vision``; the summarizer and the
    abstracter are checked with ``litellm.utils.supports_pdf_input``.

    Raises:
        SystemExit: On the first configured model that fails its check.
    """
    if args.vision_model and not litellm.utils.supports_vision(args.vision_model):
        sys.exit(f"Aborting. The vision model supplied, `{args.vision_model}` doesn't support image inputs!")

    if args.summarizer and not litellm.utils.supports_pdf_input(args.summarizer):
        sys.exit(f"Aborting. The summarization model supplied, `{args.summarizer}` doesn't support PDF input!")

    if args.abstracter and not litellm.utils.supports_pdf_input(args.abstracter):
        sys.exit(f"Aborting. The abstracter model supplied, `{args.abstracter}` doesn't support PDF input!")

0 commit comments

Comments
 (0)