43 changes: 39 additions & 4 deletions ingest/file_parser.py
@@ -15,6 +15,7 @@
from langchain_community.document_loaders import TextLoader
import fitz
from docx2pdf import convert
import pymupdf4llm
# local imports
import utils as ut

@@ -41,8 +42,9 @@ def parse_file(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    Tuple[List[Tuple[int, str]], Dict[str, str], List[Tuple[int, str]]]
        tuple of pages (list of tuples of page numbers and page texts), metadata (dictionary)
        and tables (list of tuples of page numbers and Markdown tables)
    """
    tables = []
    if file_path.endswith(".pdf"):
        raw_pages, metadata = self.parse_pymupdf(file_path)
        raw_pages, metadata, tables = self.parse_pymupdf(file_path)
    elif file_path.endswith(".txt") or file_path.endswith(".md"):
        raw_pages, metadata = self.parse_txt(file_path)
    elif file_path.endswith(".html"):
@@ -51,7 +53,7 @@
        raw_pages, metadata = self.parse_word(file_path)

    # return raw text from pages and metadata
    return raw_pages, metadata
    return raw_pages, metadata, tables

def get_metadata(self, file_path: str, doc_metadata: Dict[str, str]) -> Dict[str, str]:
"""
@@ -259,8 +261,41 @@ def parse_pymupdf(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
        ut.detect_language(pages[page_with_max_text][1])
    logger.info(f"The language detected for this document is {metadata['Language']}")
    metadata["last_change_time"] = os.stat(file_path).st_mtime
    tables = self.extract_tables(file_path)

    return pages, metadata
    return pages, metadata, tables

def extract_tables(self, file_path, show_progress=False):
    """
    First converts each page of the pdf file to Markdown, then extracts tables from the Markdown text.
    md_text_list contains the images, graphics, metadata, table positions and text of the pdf file in
    Markdown format, which might also be helpful for text parsing.

    Parameters
    ----------
    file_path : str
        path to the pdf file
    show_progress : bool, optional
        whether to show a progress bar during the Markdown conversion, by default False

    Returns
    -------
    List[Tuple[int, str]]
        list of tuples, each containing the page number and a table in Markdown format
    """
    md_text_list = pymupdf4llm.to_markdown(file_path, page_chunks=True, show_progress=show_progress)
    # Regular expression to detect Markdown tables: a header row, a separator row, then one or more data rows
    table_pattern = re.compile(r"(\|.*?\|\n\|[-|]+\|\n(?:\|.*?\|\n)+)")

    # Process each page to find Markdown tables
    table_pages = []
    for i, md_text in enumerate(md_text_list):
        text = md_text['text']
        tables = table_pattern.findall(text)
        if tables:
            table_pages.extend([(i, table) for table in tables])

    return table_pages
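For intuition, here is a minimal sketch of what the table pattern matches, run on an invented Markdown snippet (not from the repo):

import re

# Same pattern as above: header row, separator row, then one or more data rows
table_pattern = re.compile(r"(\|.*?\|\n\|[-|]+\|\n(?:\|.*?\|\n)+)")

# Invented sample page text: prose followed by a single Markdown table
sample = (
    "Quarterly results are shown below.\n"
    "| Quarter | Revenue |\n"
    "|---|---|\n"
    "| Q1 | 10 |\n"
    "| Q2 | 12 |\n"
)

print(table_pattern.findall(sample))  # one match containing the full table block

Note that the separator row must consist only of `-` and `|` characters; a separator written as `| :--- |` (with spaces or colons) would not match, so this relies on pymupdf4llm emitting compact separator rows.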

def parse_txt(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
"""
@@ -312,6 +347,6 @@ def parse_word(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
"""
# convert docx to pdf
path_to_pdf = self.convert_docx_to_pdf(file_path)
pages, metadata = self.parse_pymupdf(path_to_pdf)
pages, metadata, _ = self.parse_pymupdf(path_to_pdf)

return pages, metadata
55 changes: 50 additions & 5 deletions ingest/ingester.py
@@ -83,9 +83,49 @@ def clean_texts(self,

    return cleaned_texts

def table_to_docs(self, table_pages: List[Tuple[int, str]], metadata: Dict[str, str]):
    # Prepare splitter for table text.
    # The language argument is required by the splitter interface but not used here,
    # since tables are split on a fixed separator rather than on sentences.
    def split_creator(text: str, language=''):
        return text.split(separator)

    separator = "|\n"
    splitter_table = SplitterCreator(self.text_splitter_method,
                                     self.chunk_size,
                                     0).get_splitter()
    splitter_table.__setattr__('_tokenizer', split_creator)  # the language-related attribute is not used in this case

    if hasattr(splitter_table, "_separator"):  # we need to overload the separator attribute
        splitter_table.__setattr__('_separator', separator)  # for NLTKTextSplitter
    else:
        splitter_table.__setattr__('_separators', [separator])  # for RecursiveCharacterTextSplitter

    # Split the table text into chunks
    table_docs: List[docstore.Document] = []
    chunk_num = 0
    for page_num, table in table_pages:
        chunks = splitter_table.split_text(table)
        for chunk_text in chunks:  # NOTE: this duplicates the chunk loop in texts_to_docs and could be factored out
            metadata_combined = {
                "page_number": page_num,
                "chunk": chunk_num,
                "source": f"p{page_num}-{chunk_num}",
                "isTable": True,
                **metadata,
            }
            doc = docstore.Document(
                page_content=chunk_text,
                metadata=metadata_combined
            )
            table_docs.append(doc)
            chunk_num += 1
    return table_docs
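A rough sketch of what the overloaded splitter does to one table, assuming the RecursiveCharacterTextSplitter branch (the table text and chunk size here are invented):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Rows end in "|\n", so splitting on that separator keeps whole rows together
table_text = (
    "| Quarter | Revenue |\n"
    "|---|---|\n"
    "| Q1 | 10 |\n"
    "| Q2 | 12 |\n"
)

splitter = RecursiveCharacterTextSplitter(separators=["|\n"], chunk_size=40, chunk_overlap=0)
for chunk in splitter.split_text(table_text):
    print(repr(chunk))

With a chunk overlap of 0 (the third argument passed to SplitterCreator above), no row is duplicated across chunks.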

def texts_to_docs(self,
                  texts: List[Tuple[int, str]],
                  metadata: Dict[str, str]) -> List[docstore.Document]:
                  metadata: Dict[str, str],
                  tables: List[Tuple[int, str]]) -> List[docstore.Document]:
    """
    Split the texts into chunks and return them as Documents; any extracted tables are appended as table Documents.
    """
Expand Down Expand Up @@ -155,9 +195,12 @@ def texts_to_docs(self,
            chunk_num += 1
        prv_page_num = page_num

    if tables:
        docs.extend(self.table_to_docs(tables, metadata))

    return docs

def clean_texts_to_docs(self, raw_texts, metadata) -> List[docstore.Document]:
def clean_texts_to_docs(self, raw_texts, metadata, tables) -> List[docstore.Document]:
    """
    Combines the functions clean_texts and texts_to_docs
    """
@@ -170,7 +213,8 @@ def clean_texts_to_docs(self, raw_texts, metadata) -> List[docstore.Document]:
    # for cleaned_text in cleaned_texts:
    #     cleaned_chunks = self.split_text_into_chunks(cleaned_text, metadata)
    docs = self.texts_to_docs(texts=cleaned_texts,
                              metadata=metadata)
                              metadata=metadata,
                              tables=tables)
    return docs

def count_ada_tokens(self, raw_texts: List[Tuple[int, str]]) -> int:
@@ -273,9 +317,10 @@ def ingest(self) -> None:
        file_path = os.path.join(self.content_folder, file)
        # extract raw text pages and metadata according to file type
        logger.info(f"Parsing file {file}")
        raw_texts, metadata = file_parser.parse_file(file_path)
        raw_texts, metadata, tables = file_parser.parse_file(file_path)
        documents = self.clean_texts_to_docs(raw_texts=raw_texts,
                                             metadata=metadata)
                                             metadata=metadata,
                                             tables=tables)
        # count tokens
        tokens_document = self.count_ada_tokens(raw_texts)
        logger.info(f"Extracted {len(documents)} chunks (Tokens: {tokens_document}) from {file}")
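Taken together, a sketch of the updated ingest flow (the pdf name is invented, and `ingester` stands for an Ingester instance as used elsewhere in the repo):

from ingest.file_parser import FileParser

file_parser = FileParser()
# parse_file now returns a third element: tables as (page_number, markdown_table) tuples
raw_texts, metadata, tables = file_parser.parse_file("example.pdf")  # hypothetical file

# Table chunks are flagged with isTable=True in their metadata, so the querier can filter on them later
documents = ingester.clean_texts_to_docs(raw_texts=raw_texts, metadata=metadata, tables=tables)
table_docs = [doc for doc in documents if doc.metadata.get("isTable")]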
52 changes: 49 additions & 3 deletions query/querier.py
@@ -1,9 +1,12 @@
import os
from typing import Dict, Any
from typing import Dict, Any, List
from pydantic import Field
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.schema import AIMessage, HumanMessage
from langchain_core.prompts import PromptTemplate
from loguru import logger
# local imports
import settings
@@ -84,7 +87,42 @@ def make_chain(self,

    # get chain
    if self.chain_name == "conversationalretrievalchain":
        self.chain = ConversationalRetrievalChain.from_llm(
        # Create a custom class to have more control over the retrieval process
        class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
            extra_context: List[Any] = Field(default_factory=list)

            def _get_docs(
                self,
                question: str,
                inputs: Dict[str, Any],
                *,
                run_manager: CallbackManagerForChainRun,
            ) -> List[Document]:
                """Get docs."""
                docs = self.retriever.invoke(
                    question, config={"callbacks": run_manager.get_child()}
                )

                # if extra context (table docs) is provided, add the tables found on or next to the retrieved pages
                if isinstance(self.extra_context, list):
                    pages = list({num for doc in docs for num in [doc.metadata['page_number'] - 1,
                                                                  doc.metadata['page_number'],
                                                                  doc.metadata['page_number'] + 1]})
                    logger.info(f"Pages: {pages}")
                    tables = [table for table, _ in self.extra_context if
                              table.metadata.get('page_number') in pages]
                    if tables:  # guard the log: tables may be empty, so tables[0] would raise an IndexError
                        logger.info(f"First table added: {tables[0]}")
                    docs.extend(tables)

                return self._reduce_tokens_below_limit(docs)

        self.chain = CustomConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=retriever,
            chain_type=self.chain_type,
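For clarity, the page-window logic in `_get_docs` reduces to the following (the page numbers here are invented):

# Tables are pulled in if their page_number is within one page of any retrieved chunk
retrieved_pages = [2, 7]  # page_number metadata of the retrieved chunks (invented)
pages = {n for p in retrieved_pages for n in (p - 1, p, p + 1)}
print(sorted(pages))  # [1, 2, 3, 6, 7, 8]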
@@ -112,6 +150,14 @@ def ask_question(self, question: str) -> Dict[str, Any]:
logger.info(f"current question: {question}")
logger.info(f"current chat history: {self.chat_history}")

# Assuming `vectorstore` is an instance of Chroma
tables = self.vector_store.similarity_search_with_score(
query=question,
k=1000, # Number of results to return
filter={"isTable": True} # Filter to only return tables}
)
if self.llm_model == "gpt_4o": # token limit is higher for gpt_4o
self.chain.extra_context = tables
response = self.chain.invoke({"question": question, "chat_history": self.chat_history})
# if no chunk qualifies, overrule any answer generated by the LLM
if len(response["source_documents"]) == 0:
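For context, `similarity_search_with_score` returns (Document, score) pairs, which is why `_get_docs` unpacks them as `for table, _ in self.extra_context`. A sketch, assuming `vector_store` is a Chroma instance already populated by the ingester:

results = vector_store.similarity_search_with_score(
    query="revenue per quarter",  # invented query
    k=1000,
    filter={"isTable": True},
)
for doc, score in results:  # each result is a (Document, score) pair
    print(doc.metadata["page_number"], score)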
2 changes: 1 addition & 1 deletion summarize/summarizer.py
@@ -85,7 +85,7 @@ def summarize_file(self, file: str) -> None:
"""
# detect language first
file_parser = FileParser()
_, metadata = file_parser.parse_file(os.path.join(self.content_folder_path, file))
_, metadata, _ = file_parser.parse_file(os.path.join(self.content_folder_path, file))
language = ut.LANGUAGE_MAP.get(metadata['Language'], 'english')
# create splitter object
text_splitter = SplitterCreator(text_splitter_method=self.text_splitter_method,