Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check if pdf was uploaded and processed #43

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 106 additions & 60 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import os
from typing import List, Dict

import openai
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
Expand All @@ -7,77 +11,126 @@
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from typing.io import IO

from htmlTemplates import css, bot_template, user_template
from langchain.chains.router import MultiRetrievalQAChain
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    :param pdf_docs: iterable of file-like PDF objects (e.g. the value
        returned by Streamlit's file_uploader).
    :return: one string containing the raw text of all pages, in order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() can return None for pages without a
            # text layer (e.g. scanned images); concatenating None raised
            # TypeError. Fall back to the empty string.
            text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    :param text: raw document text.
    :return: list of chunk strings (~1000 chars each, 200-char overlap).
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed *text_chunks* with OpenAI embeddings and index them in FAISS.

    :param text_chunks: list of text chunks to embed.
    :return: FAISS vector store built over the chunks.
    """
    # Alternative (local) embeddings:
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedder = OpenAIEmbeddings()
    store = FAISS.from_texts(texts=text_chunks, embedding=embedder)
    return store


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain backed by *vectorstore*.

    :param vectorstore: FAISS store whose retriever feeds the chain.
    :return: ConversationalRetrievalChain with buffered chat memory.
    """
    # Alternative LLM backend:
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
    chat_llm = ChatOpenAI()

    buffer = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=vectorstore.as_retriever(),
        memory=buffer,
    )
class PDFProcessor:
    """Turns uploaded PDF files into FAISS retrievers for the router chain."""

    @staticmethod
    def create_vectorstore(text: str) -> FAISS:
        """
        Split *text* into chunks, embed them, and return a FAISS retriever.

        :param text: str: raw text of one document.
        :return: FAISS retriever over the embedded chunks.
        :raises ValueError: if splitting, embedding or indexing fails.
        """
        try:
            text_splitter = CharacterTextSplitter(
                separator="\n",
                chunk_size=1000,  # Consider making these values configurable
                chunk_overlap=200,
                length_function=len
            )
            # BUG FIX: the splitter was previously created but never used, and
            # the raw string was handed to FAISS.from_texts (which expects a
            # list of texts, so it indexed per-character). Split first, then
            # index the chunks.
            chunks = text_splitter.split_text(text)

            embeddings = OpenAIEmbeddings()
            # embeddings = HuggingFaceInstructEmbeddings(model_name="model_name")  # adjust as per requirement

            retriever = FAISS.from_texts(chunks, embeddings).as_retriever()
            return retriever

        except Exception as e:
            # Surface a single, typed failure to the caller; chain the cause.
            raise ValueError(f"Failed to create vector store: {str(e)}") from e

    @staticmethod
    def get_retriever_list(pdf_docs) -> List[Dict]:
        """
        Processes a list of PDF documents and returns a list of retriever
        information dictionaries for MultiRetrievalQAChain.

        :param pdf_docs: list of file-like PDF objects.
        :return: list of dicts with "name", "description" and "retriever"
            keys. Documents that fail to process are skipped; an empty list
            is returned if every document failed.
        """
        retriever_infos: List[Dict] = []

        for index, pdf in enumerate(pdf_docs):
            try:
                pdf_reader = PdfReader(pdf)
                # extract_text() may return None for image-only pages.
                text = "".join(
                    (page.extract_text() or "") for page in pdf_reader.pages)

                retriever = PDFProcessor.create_vectorstore(text)

                # BUG FIX: every retriever previously got the same hard-coded
                # name/description, so the router could not distinguish
                # documents. Derive a unique name per uploaded file.
                doc_name = getattr(pdf, "name", f"document_{index}")
                retriever_infos.append({
                    "name": doc_name,
                    "description": f"content of the uploaded PDF '{doc_name}'",
                    "retriever": retriever
                })
            except Exception as e:
                # Best-effort: log and skip the failing document, keep the rest.
                print(f"Error occurred while processing PDF {index}: {e}")

        # BUG FIX: an exception previously made this function fall through and
        # return None, crashing callers that iterate the result. Always return
        # the (possibly partial) list.
        return retriever_infos


class ConversationManager:
    """Builds the multi-retriever QA chain used by the Streamlit app."""

    @staticmethod
    def get_conversation_chain(retriever_infos: List[Dict]) -> MultiRetrievalQAChain:
        """Create a router chain that picks the right retriever per question.

        :param retriever_infos: dicts with "name", "description", "retriever".
        :return: configured MultiRetrievalQAChain.
        """
        llm = ChatOpenAI(model_name="gpt-3.5-turbo")
        # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

        # BUG FIX: a ConversationBufferMemory was constructed here but never
        # passed to the chain (the memory= kwarg below is commented out), so
        # it was dead code. Re-create it here if/when memory support is wired
        # into MultiRetrievalQAChain.
        conversation_chain = MultiRetrievalQAChain.from_retrievers(
            llm=llm,
            retriever_infos=retriever_infos,
            verbose=True,
            # memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True),
            # return_source_documents=True
        )
        return conversation_chain


def handle_userinput(user_question):
    """Answer *user_question* via the session's chain and render the chat.

    NOTE(review): this span is a diff artifact that interleaved the old and
    new bodies; this is the reconstructed post-change version.

    :param user_question: the question typed into the Streamlit text input.
    """
    # Guard: the chain only exists after PDFs were uploaded and processed.
    if st.session_state.conversation is None:
        st.warning("Please upload and process the PDFs first.")
        return

    response = st.session_state.conversation.run(user_question)
    st.session_state.chat_history.append((user_question, response))

    # Render newest exchange first.
    for user_question, response in reversed(st.session_state.chat_history):
        st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)
        st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)


def main():
    """Streamlit entry point: page setup, session state, chat and upload UI."""
    load_dotenv(override=True)
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    # BUG FIX: indexing chat_history[-1] on an empty history raised
    # IndexError on the very first question; guard the empty case. The
    # comparison itself avoids re-answering the same question on rerun.
    if user_question and (
        not st.session_state.chat_history
        or user_question != st.session_state.chat_history[-1][0]
    ):
        handle_userinput(user_question)

    with st.sidebar:
        # NOTE(review): the uploader widget lines are collapsed in this diff
        # view; pdf_docs is presumably produced by st.file_uploader(...) here
        # -- confirm against the full file.
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Build one retriever per uploaded PDF.
                retriever_list = PDFProcessor.get_retriever_list(pdf_docs)
                # create conversation chain
                st.session_state.conversation = ConversationManager.get_conversation_chain(
                    retriever_list)
Expand Down
Binary file added avatars/aisupport.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added avatars/user.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 15 additions & 4 deletions htmlTemplates.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
import base64


def get_base64_encoded_image(image_path):
    """Return the base64-encoded contents of *image_path* as UTF-8 text."""
    with open(image_path, "rb") as img_file:
        raw_bytes = img_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')


css = '''
<style>
.chat-message {
Expand Down Expand Up @@ -25,20 +31,25 @@
}
'''

# Bot chat bubble. The avatar is inlined as a base64 data URI so the app does
# not depend on an external image host. (This span was a diff artifact that
# kept both the old hosted <img> line and the new one; reconstructed here.)
bot_avatar = "avatars/aisupport.png"
encoded_image_bot = get_base64_encoded_image(bot_avatar)
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="{img_source}" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''.replace("{img_source}", "data:image/png;base64," + encoded_image_bot)


# User chat bubble, mirroring bot_template: avatar inlined as a base64 data
# URI. (This span was a diff artifact that kept both the old hosted <img>
# line and the new one; reconstructed here.)
user_avatar = "avatars/user.png"
encoded_image_user = get_base64_encoded_image(user_avatar)
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="{img_source}">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''.replace("{img_source}", "data:image/png;base64," + encoded_image_user)