Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check if pdf was uploaded and processed #43

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 106 additions & 60 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import os
from typing import List, Dict

import openai
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
Expand All @@ -7,77 +11,126 @@
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from typing.io import IO

from htmlTemplates import css, bot_template, user_template
from langchain.chains.router import MultiRetrievalQAChain
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    :param pdf_docs: iterable of file-like PDF objects (e.g. the value
        returned by Streamlit's file_uploader).
    :return: one string containing the raw text of all pages, in order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() can return None for pages without a
            # text layer (e.g. scanned images); concatenating None raised
            # TypeError. Fall back to the empty string.
            text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    :param text: raw document text.
    :return: list of chunk strings (~1000 chars each, 200-char overlap).
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed *text_chunks* with OpenAI embeddings and index them in FAISS.

    :param text_chunks: list of text chunks to embed.
    :return: FAISS vector store built over the chunks.
    """
    # Alternative (local) embeddings:
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedder = OpenAIEmbeddings()
    store = FAISS.from_texts(texts=text_chunks, embedding=embedder)
    return store


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain backed by *vectorstore*.

    :param vectorstore: FAISS store whose retriever feeds the chain.
    :return: ConversationalRetrievalChain with buffered chat memory.
    """
    # Alternative LLM backend:
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
    chat_llm = ChatOpenAI()

    buffer = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=vectorstore.as_retriever(),
        memory=buffer,
    )
class PDFProcessor:
    """Turns uploaded PDF files into FAISS retrievers for the router chain."""

    @staticmethod
    def create_vectorstore(text: str) -> FAISS:
        """
        Split *text* into chunks, embed them, and return a FAISS retriever.

        :param text: str: raw text of one document.
        :return: FAISS retriever over the embedded chunks.
        :raises ValueError: if splitting, embedding or indexing fails.
        """
        try:
            text_splitter = CharacterTextSplitter(
                separator="\n",
                chunk_size=1000,  # Consider making these values configurable
                chunk_overlap=200,
                length_function=len
            )
            # BUG FIX: the splitter was previously created but never used, and
            # the raw string was handed to FAISS.from_texts (which expects a
            # list of texts, so it indexed per-character). Split first, then
            # index the chunks.
            chunks = text_splitter.split_text(text)

            embeddings = OpenAIEmbeddings()
            # embeddings = HuggingFaceInstructEmbeddings(model_name="model_name")  # adjust as per requirement

            retriever = FAISS.from_texts(chunks, embeddings).as_retriever()
            return retriever

        except Exception as e:
            # Surface a single, typed failure to the caller; chain the cause.
            raise ValueError(f"Failed to create vector store: {str(e)}") from e

    @staticmethod
    def get_retriever_list(pdf_docs) -> List[Dict]:
        """
        Processes a list of PDF documents and returns a list of retriever
        information dictionaries for MultiRetrievalQAChain.

        :param pdf_docs: list of file-like PDF objects.
        :return: list of dicts with "name", "description" and "retriever"
            keys. Documents that fail to process are skipped; an empty list
            is returned if every document failed.
        """
        retriever_infos: List[Dict] = []

        for index, pdf in enumerate(pdf_docs):
            try:
                pdf_reader = PdfReader(pdf)
                # extract_text() may return None for image-only pages.
                text = "".join(
                    (page.extract_text() or "") for page in pdf_reader.pages)

                retriever = PDFProcessor.create_vectorstore(text)

                # BUG FIX: every retriever previously got the same hard-coded
                # name/description, so the router could not distinguish
                # documents. Derive a unique name per uploaded file.
                doc_name = getattr(pdf, "name", f"document_{index}")
                retriever_infos.append({
                    "name": doc_name,
                    "description": f"content of the uploaded PDF '{doc_name}'",
                    "retriever": retriever
                })
            except Exception as e:
                # Best-effort: log and skip the failing document, keep the rest.
                print(f"Error occurred while processing PDF {index}: {e}")

        # BUG FIX: an exception previously made this function fall through and
        # return None, crashing callers that iterate the result. Always return
        # the (possibly partial) list.
        return retriever_infos


class ConversationManager:
    """Builds the multi-retriever QA chain used by the Streamlit app."""

    @staticmethod
    def get_conversation_chain(retriever_infos: List[Dict]) -> MultiRetrievalQAChain:
        """Create a router chain that picks the right retriever per question.

        :param retriever_infos: dicts with "name", "description", "retriever".
        :return: configured MultiRetrievalQAChain.
        """
        llm = ChatOpenAI(model_name="gpt-3.5-turbo")
        # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

        # BUG FIX: a ConversationBufferMemory was constructed here but never
        # passed to the chain (the memory= kwarg below is commented out), so
        # it was dead code. Re-create it here if/when memory support is wired
        # into MultiRetrievalQAChain.
        conversation_chain = MultiRetrievalQAChain.from_retrievers(
            llm=llm,
            retriever_infos=retriever_infos,
            verbose=True,
            # memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True),
            # return_source_documents=True
        )
        return conversation_chain


def handle_userinput(user_question):
    """Answer *user_question* via the session's chain and render the chat.

    NOTE(review): this span is a diff artifact that interleaved the old and
    new bodies; this is the reconstructed post-change version.

    :param user_question: the question typed into the Streamlit text input.
    """
    # Guard: the chain only exists after PDFs were uploaded and processed.
    if st.session_state.conversation is None:
        st.warning("Please upload and process the PDFs first.")
        return

    response = st.session_state.conversation.run(user_question)
    st.session_state.chat_history.append((user_question, response))

    # Render newest exchange first.
    for user_question, response in reversed(st.session_state.chat_history):
        st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)
        st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)


def main():
    """Streamlit entry point: page setup, session state, chat and upload UI."""
    load_dotenv(override=True)
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    # BUG FIX: indexing chat_history[-1] on an empty history raised
    # IndexError on the very first question; guard the empty case. The
    # comparison itself avoids re-answering the same question on rerun.
    if user_question and (
        not st.session_state.chat_history
        or user_question != st.session_state.chat_history[-1][0]
    ):
        handle_userinput(user_question)

    with st.sidebar:
        # NOTE(review): the uploader widget lines are collapsed in this diff
        # view; pdf_docs is presumably produced by st.file_uploader(...) here
        # -- confirm against the full file.
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Build one retriever per uploaded PDF.
                retriever_list = PDFProcessor.get_retriever_list(pdf_docs)
                # create conversation chain
                st.session_state.conversation = ConversationManager.get_conversation_chain(
                    retriever_list)
Expand Down
Binary file added avatars/aisupport.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added avatars/user.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 15 additions & 4 deletions htmlTemplates.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
import base64


def get_base64_encoded_image(image_path):
    """Return the base64-encoded contents of *image_path* as UTF-8 text."""
    with open(image_path, "rb") as img_file:
        raw_bytes = img_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')


css = '''
<style>
.chat-message {
Expand Down Expand Up @@ -25,20 +31,25 @@
}
'''

# Bot chat bubble. The avatar is inlined as a base64 data URI so the app does
# not depend on an external image host. (This span was a diff artifact that
# kept both the old hosted <img> line and the new one; reconstructed here.)
bot_avatar = "avatars/aisupport.png"
encoded_image_bot = get_base64_encoded_image(bot_avatar)
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="{img_source}" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''.replace("{img_source}", "data:image/png;base64," + encoded_image_bot)


# User chat bubble, mirroring bot_template: avatar inlined as a base64 data
# URI. (This span was a diff artifact that kept both the old hosted <img>
# line and the new one; reconstructed here.)
user_avatar = "avatars/user.png"
encoded_image_user = get_base64_encoded_image(user_avatar)
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="{img_source}">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''.replace("{img_source}", "data:image/png;base64," + encoded_image_user)