43 changes: 39 additions & 4 deletions ingest/file_parser.py
@@ -15,6 +15,7 @@
from langchain_community.document_loaders import TextLoader
import fitz
from docx2pdf import convert
import pymupdf4llm
# local imports
import utils as ut

@@ -41,8 +42,9 @@ def parse_file(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    Tuple[List[Tuple[int, str]], Dict[str, str], List[Tuple[int, str]]]
        tuple of pages (list of tuples of page numbers and page texts), metadata (dictionary)
        and tables (list of tuples of page numbers and Markdown tables)
    """
    tables = []
    if file_path.endswith(".pdf"):
        raw_pages, metadata = self.parse_pymupdf(file_path)
        raw_pages, metadata, tables = self.parse_pymupdf(file_path)
    elif file_path.endswith(".txt") or file_path.endswith(".md"):
        raw_pages, metadata = self.parse_txt(file_path)
    elif file_path.endswith(".html"):
@@ -51,7 +53,7 @@
        raw_pages, metadata = self.parse_word(file_path)

    # return raw text from pages and metadata
    return raw_pages, metadata
    return raw_pages, metadata, tables

def get_metadata(self, file_path: str, doc_metadata: Dict[str, str]) -> Dict[str, str]:
"""
@@ -259,8 +261,41 @@ def parse_pymupdf(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
        ut.detect_language(pages[page_with_max_text][1])
    logger.info(f"The language detected for this document is {metadata['Language']}")
    metadata["last_change_time"] = os.stat(file_path).st_mtime
    tables = self.extract_tables(file_path)

    return pages, metadata
    return pages, metadata, tables

def extract_tables(self, file_path, show_progress=False):
    """
    First converts each page of the pdf file to Markdown, then extracts tables from the Markdown text.
    md_text_list contains the images, graphics, metadata, table positions and text of the pdf file in
    Markdown format, which might also be helpful for text parsing.

    Parameters
    ----------
    file_path : str
        path to the pdf file
    show_progress : bool, optional
        whether to show a progress bar during the Markdown conversion, by default False

    Returns
    -------
    List[Tuple[int, str]]
        list of tuples, each containing the page number and a table in Markdown format
    """
    md_text_list = pymupdf4llm.to_markdown(file_path, page_chunks=True, show_progress=show_progress)
    # Regular expression to detect Markdown tables: a header row, a separator row, then one or more data rows
    table_pattern = re.compile(r"(\|.*?\|\n\|[-|]+\|\n(?:\|.*?\|\n)+)")

    # Process each page to find Markdown tables
    table_pages = []
    for i, md_text in enumerate(md_text_list):
        text = md_text['text']
        tables = table_pattern.findall(text)
        if tables:
            table_pages.extend([(i, table) for table in tables])

    return table_pages
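For intuition, here is a minimal sketch of what the table pattern matches, run on an invented Markdown snippet (not from the repo):

import re

# Same pattern as above: header row, separator row, then one or more data rows
table_pattern = re.compile(r"(\|.*?\|\n\|[-|]+\|\n(?:\|.*?\|\n)+)")

# Invented sample page text: prose followed by a single Markdown table
sample = (
    "Quarterly results are shown below.\n"
    "| Quarter | Revenue |\n"
    "|---|---|\n"
    "| Q1 | 10 |\n"
    "| Q2 | 12 |\n"
)

print(table_pattern.findall(sample))  # one match containing the full table block

Note that the separator row must consist only of `-` and `|` characters; a separator written as `| :--- |` (with spaces or colons) would not match, so this relies on pymupdf4llm emitting compact separator rows.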

def parse_txt(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
"""
@@ -312,6 +347,6 @@ def parse_word(self, file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
"""
# convert docx to pdf
path_to_pdf = self.convert_docx_to_pdf(file_path)
pages, metadata = self.parse_pymupdf(path_to_pdf)
pages, metadata, _ = self.parse_pymupdf(path_to_pdf)

return pages, metadata
55 changes: 50 additions & 5 deletions ingest/ingester.py
@@ -83,9 +83,49 @@ def clean_texts(self,

    return cleaned_texts

def table_to_docs(self, table_pages: List[Tuple[int, str]], metadata: Dict[str, str]):
    # Prepare splitter for table text.
    # The language argument is required by the splitter interface but not used here,
    # since tables are split on a fixed separator rather than on sentences.
    def split_creator(text: str, language=''):
        return text.split(separator)

    separator = "|\n"
    splitter_table = SplitterCreator(self.text_splitter_method,
                                     self.chunk_size,
                                     0).get_splitter()
    splitter_table.__setattr__('_tokenizer', split_creator)  # the language-related attribute is not used in this case

    if hasattr(splitter_table, "_separator"):  # we need to overload the separator attribute
        splitter_table.__setattr__('_separator', separator)  # for NLTKTextSplitter
    else:
        splitter_table.__setattr__('_separators', [separator])  # for RecursiveCharacterTextSplitter

    # Split the table text into chunks
    table_docs: List[docstore.Document] = []
    chunk_num = 0
    for page_num, table in table_pages:
        chunks = splitter_table.split_text(table)
        for chunk_text in chunks:  # NOTE: this duplicates the chunk loop in texts_to_docs and could be factored out
            metadata_combined = {
                "page_number": page_num,
                "chunk": chunk_num,
                "source": f"p{page_num}-{chunk_num}",
                "isTable": True,
                **metadata,
            }
            doc = docstore.Document(
                page_content=chunk_text,
                metadata=metadata_combined
            )
            table_docs.append(doc)
            chunk_num += 1
    return table_docs
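A rough sketch of what the overloaded splitter does to one table, assuming the RecursiveCharacterTextSplitter branch (the table text and chunk size here are invented):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Rows end in "|\n", so splitting on that separator keeps whole rows together
table_text = (
    "| Quarter | Revenue |\n"
    "|---|---|\n"
    "| Q1 | 10 |\n"
    "| Q2 | 12 |\n"
)

splitter = RecursiveCharacterTextSplitter(separators=["|\n"], chunk_size=40, chunk_overlap=0)
for chunk in splitter.split_text(table_text):
    print(repr(chunk))

With a chunk overlap of 0 (the third argument passed to SplitterCreator above), no row is duplicated across chunks.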

def texts_to_docs(self,
                  texts: List[Tuple[int, str]],
                  metadata: Dict[str, str]) -> List[docstore.Document]:
                  metadata: Dict[str, str],
                  tables: List[Tuple[int, str]]) -> List[docstore.Document]:
    """
    Split the texts into chunks and return them as Documents; any extracted tables are appended as table Documents.
    """
Expand Down Expand Up @@ -155,9 +195,12 @@ def texts_to_docs(self,
            chunk_num += 1
        prv_page_num = page_num

    if tables:
        docs.extend(self.table_to_docs(tables, metadata))

    return docs

def clean_texts_to_docs(self, raw_texts, metadata) -> List[docstore.Document]:
def clean_texts_to_docs(self, raw_texts, metadata, tables) -> List[docstore.Document]:
    """
    Combines the functions clean_texts and texts_to_docs
    """
@@ -170,7 +213,8 @@ def clean_texts_to_docs(self, raw_texts, metadata) -> List[docstore.Document]:
    # for cleaned_text in cleaned_texts:
    #     cleaned_chunks = self.split_text_into_chunks(cleaned_text, metadata)
    docs = self.texts_to_docs(texts=cleaned_texts,
                              metadata=metadata)
                              metadata=metadata,
                              tables=tables)
    return docs

def count_ada_tokens(self, raw_texts: List[Tuple[int, str]]) -> int:
@@ -273,9 +317,10 @@ def ingest(self) -> None:
        file_path = os.path.join(self.content_folder, file)
        # extract raw text pages and metadata according to file type
        logger.info(f"Parsing file {file}")
        raw_texts, metadata = file_parser.parse_file(file_path)
        raw_texts, metadata, tables = file_parser.parse_file(file_path)
        documents = self.clean_texts_to_docs(raw_texts=raw_texts,
                                             metadata=metadata)
                                             metadata=metadata,
                                             tables=tables)
        # count tokens
        tokens_document = self.count_ada_tokens(raw_texts)
        logger.info(f"Extracted {len(documents)} chunks (Tokens: {tokens_document}) from {file}")
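Taken together, a sketch of the updated ingest flow (the pdf name is invented, and `ingester` stands for an Ingester instance as used elsewhere in the repo):

from ingest.file_parser import FileParser

file_parser = FileParser()
# parse_file now returns a third element: tables as (page_number, markdown_table) tuples
raw_texts, metadata, tables = file_parser.parse_file("example.pdf")  # hypothetical file

# Table chunks are flagged with isTable=True in their metadata, so the querier can filter on them later
documents = ingester.clean_texts_to_docs(raw_texts=raw_texts, metadata=metadata, tables=tables)
table_docs = [doc for doc in documents if doc.metadata.get("isTable")]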
52 changes: 49 additions & 3 deletions query/querier.py
@@ -1,9 +1,12 @@
import os
from typing import Dict, Any
from typing import Dict, Any, List
from pydantic import Field
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.schema import AIMessage, HumanMessage
from langchain_core.prompts import PromptTemplate
from loguru import logger
# local imports
import settings
@@ -84,7 +87,42 @@ def make_chain(self,

    # get chain
    if self.chain_name == "conversationalretrievalchain":
        self.chain = ConversationalRetrievalChain.from_llm(
        # Create a custom class to have more control over the retrieval process
        class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
            extra_context: List[Any] = Field(default_factory=list)

            def _get_docs(
                self,
                question: str,
                inputs: Dict[str, Any],
                *,
                run_manager: CallbackManagerForChainRun,
            ) -> List[Document]:
                """Get docs."""
                docs = self.retriever.invoke(
                    question, config={"callbacks": run_manager.get_child()}
                )

                # if extra context (table docs) is provided, add the tables found on or next to the retrieved pages
                if isinstance(self.extra_context, list):
                    pages = list({num for doc in docs for num in [doc.metadata['page_number'] - 1,
                                                                  doc.metadata['page_number'],
                                                                  doc.metadata['page_number'] + 1]})
                    logger.info(f"Pages: {pages}")
                    tables = [table for table, _ in self.extra_context if
                              table.metadata.get('page_number') in pages]
                    if tables:  # guard the log: tables may be empty, so tables[0] would raise an IndexError
                        logger.info(f"First table added: {tables[0]}")
                    docs.extend(tables)

                return self._reduce_tokens_below_limit(docs)

        self.chain = CustomConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=retriever,
            chain_type=self.chain_type,
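For clarity, the page-window logic in `_get_docs` reduces to the following (the page numbers here are invented):

# Tables are pulled in if their page_number is within one page of any retrieved chunk
retrieved_pages = [2, 7]  # page_number metadata of the retrieved chunks (invented)
pages = {n for p in retrieved_pages for n in (p - 1, p, p + 1)}
print(sorted(pages))  # [1, 2, 3, 6, 7, 8]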
@@ -112,6 +150,14 @@ def ask_question(self, question: str) -> Dict[str, Any]:
logger.info(f"current question: {question}")
logger.info(f"current chat history: {self.chat_history}")

# Assuming `vectorstore` is an instance of Chroma
tables = self.vector_store.similarity_search_with_score(
query=question,
k=1000, # Number of results to return
filter={"isTable": True} # Filter to only return tables}
)
if self.llm_model == "gpt_4o": # token limit is higher for gpt_4o
self.chain.extra_context = tables
response = self.chain.invoke({"question": question, "chat_history": self.chat_history})
# if no chunk qualifies, overrule any answer generated by the LLM
if len(response["source_documents"]) == 0:
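For context, `similarity_search_with_score` returns (Document, score) pairs, which is why `_get_docs` unpacks them as `for table, _ in self.extra_context`. A sketch, assuming `vector_store` is a Chroma instance already populated by the ingester:

results = vector_store.similarity_search_with_score(
    query="revenue per quarter",  # invented query
    k=1000,
    filter={"isTable": True},
)
for doc, score in results:  # each result is a (Document, score) pair
    print(doc.metadata["page_number"], score)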
2 changes: 1 addition & 1 deletion summarize/summarizer.py
@@ -85,7 +85,7 @@ def summarize_file(self, file: str) -> None:
"""
# detect language first
file_parser = FileParser()
_, metadata = file_parser.parse_file(os.path.join(self.content_folder_path, file))
_, metadata, _ = file_parser.parse_file(os.path.join(self.content_folder_path, file))
language = ut.LANGUAGE_MAP.get(metadata['Language'], 'english')
# create splitter object
text_splitter = SplitterCreator(text_splitter_method=self.text_splitter_method,