From 31d60a2d28cc5df34701d3ec73a3267601673283 Mon Sep 17 00:00:00 2001
From: Rukmini-Sunki
Date: Tue, 18 Nov 2025 15:02:43 +0530
Subject: [PATCH] Add StudyMate application with PDF processing

Implement StudyMate application for academic assistance
---
 study_mate.py | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100644 study_mate.py

diff --git a/study_mate.py b/study_mate.py
new file mode 100644
index 0000000..2bcb69a
--- /dev/null
+++ b/study_mate.py
@@ -0,0 +1,404 @@
+import streamlit as st
+import fitz  # PyMuPDF
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from ibm_watsonx_ai.foundation_models import Model
+from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+import os
+import re
+from typing import Dict, List, Optional
+
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
+
+class Config:
+    """Configuration for the StudyMate application."""
+    CHUNK_SIZE = 500       # Characters per chunk
+    CHUNK_OVERLAP = 100    # Overlap between consecutive chunks
+    TOP_K_RESULTS = 3      # Number of chunks to retrieve per query
+    EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # SentenceTransformer model
+
+    # IBM watsonx.ai configuration; prefer environment variables over
+    # hard-coded credentials.
+    WATSONX_URL = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")
+    WATSONX_PROJECT_ID = os.getenv("WATSONX_PROJECT_ID", "your_project_id_here")
+    WATSONX_API_KEY = os.getenv("WATSONX_API_KEY", "your_api_key_here")
+    MODEL_ID = "mistralai/mixtral-8x7b-instruct-v01"
+
+# ============================================================================
+# PDF PROCESSING
+# ============================================================================
+
+class PDFProcessor:
+    """Handles PDF text extraction and preprocessing."""
+
+    @staticmethod
+    def extract_text_from_pdf(pdf_file) -> str:
+        """Extract text from an uploaded PDF file using PyMuPDF."""
+        try:
+            pdf_bytes = pdf_file.read()
+            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
+
+            full_text = ""
+            for page_num in range(pdf_document.page_count):
+                page = pdf_document[page_num]
+                full_text += page.get_text()
+
+            pdf_document.close()
+            return full_text
+        except Exception as e:
+            st.error(f"Error extracting text from {pdf_file.name}: {str(e)}")
+            return ""
+
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Clean and normalize extracted text."""
+        # Collapse runs of whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove special characters but keep common punctuation
+        text = re.sub(r'[^\w\s.,!?;:\-\(\)\"\']+', '', text)
+        return text.strip()
+
+    @staticmethod
+    def chunk_text(text: str, chunk_size: Optional[int] = None,
+                   overlap: Optional[int] = None) -> List[str]:
+        """Split text into overlapping chunks.
+
+        Defaults are resolved at call time so that changes made to Config
+        via the sidebar sliders actually take effect (a signature default
+        of Config.CHUNK_SIZE would be frozen when the function is defined).
+        """
+        chunk_size = Config.CHUNK_SIZE if chunk_size is None else chunk_size
+        overlap = Config.CHUNK_OVERLAP if overlap is None else overlap
+
+        chunks = []
+        start = 0
+        text_length = len(text)
+
+        while start < text_length:
+            end = start + chunk_size
+            chunk = text[start:end]
+
+            # Prefer to break at a sentence boundary
+            if end < text_length:
+                last_period = chunk.rfind('.')
+                last_newline = chunk.rfind('\n')
+                break_point = max(last_period, last_newline)
+
+                if break_point > chunk_size * 0.5:  # Only break if reasonably far in
+                    chunk = chunk[:break_point + 1]
+                    end = start + break_point + 1
+
+            chunks.append(chunk.strip())
+            start = end - overlap
+
+        return [c for c in chunks if len(c) > 50]  # Drop very short chunks
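+# A quick sanity check for the chunker (illustrative sketch; the input string
+# below is invented, not part of the app):
+#
+#   >>> chunks = PDFProcessor.chunk_text("The cell divides. " * 60,
+#   ...                                  chunk_size=200, overlap=40)
+#   >>> all(len(c) <= 200 for c in chunks)
+#   True
+#
+# Consecutive chunks share roughly `overlap` characters, so a sentence cut at
+# one chunk boundary still appears whole in the next chunk.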
"""Manages embeddings and semantic search using FAISS""" + + def __init__(self): + self.embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL) + self.index = None + self.chunks = [] + self.metadata = [] + + def create_embeddings(self, chunks: List[str], source_names: List[str]): + """Create embeddings for text chunks and build FAISS index""" + self.chunks = chunks + self.metadata = [{"source": name} for name in source_names] + + with st.spinner("Creating embeddings..."): + embeddings = self.embedding_model.encode(chunks, show_progress_bar=False) + embeddings = np.array(embeddings).astype('float32') + + # Normalize embeddings for cosine similarity + faiss.normalize_L2(embeddings) + + # Create FAISS index + dimension = embeddings.shape[1] + self.index = faiss.IndexFlatIP(dimension) # Inner product (cosine similarity) + self.index.add(embeddings) + + return len(chunks) + + def search(self, query: str, top_k: int = Config.TOP_K_RESULTS) -> List[Dict]: + """Search for most relevant chunks using semantic similarity""" + if self.index is None: + return [] + + query_embedding = self.embedding_model.encode([query]) + query_embedding = np.array(query_embedding).astype('float32') + faiss.normalize_L2(query_embedding) + + distances, indices = self.index.search(query_embedding, top_k) + + results = [] + for idx, distance in zip(indices[0], distances[0]): + if idx < len(self.chunks): + results.append({ + "text": self.chunks[idx], + "score": float(distance), + "source": self.metadata[idx]["source"] + }) + + return results + +# ============================================================================ +# LLM INTEGRATION +# ============================================================================ + +class WatsonxLLM: + """IBM Watsonx AI integration for answer generation""" + + def __init__(self): + self.model = None + self._initialize_model() + + def _initialize_model(self): + """Initialize IBM Watsonx model""" + try: + # Generation parameters + parameters = { + GenParams.DECODING_METHOD: "greedy", + GenParams.MAX_NEW_TOKENS: 500, + GenParams.MIN_NEW_TOKENS: 50, + GenParams.TEMPERATURE: 0.7, + GenParams.TOP_K: 50, + GenParams.TOP_P: 0.9 + } + + self.model = Model( + model_id=Config.MODEL_ID, + params=parameters, + credentials={ + "url": Config.WATSONX_URL, + "apikey": Config.WATSONX_API_KEY + }, + project_id=Config.WATSONX_PROJECT_ID + ) + except Exception as e: + st.error(f"Failed to initialize Watsonx model: {str(e)}") + st.info("Please ensure your IBM Watsonx credentials are correctly configured.") + + def generate_answer(self, question: str, context_chunks: List[Dict]) -> str: + """Generate answer using retrieved context""" + if not self.model: + return "⚠️ LLM not initialized. Please check your IBM Watsonx credentials." + + # Prepare context from retrieved chunks + context = "\n\n".join([ + f"[Source: {chunk['source']}]\n{chunk['text']}" + for chunk in context_chunks + ]) + + # Create prompt + prompt = f"""You are an academic assistant helping students understand their study materials. Based on the provided context, answer the question accurately and concisely. + +Context: +{context} + +Question: {question} + +Answer: Provide a clear, accurate answer based solely on the context provided. If the context doesn't contain enough information, say so. 
+# ============================================================================
+# LLM INTEGRATION
+# ============================================================================
+
+class WatsonxLLM:
+    """IBM watsonx.ai integration for answer generation."""
+
+    def __init__(self):
+        self.model = None
+        self._initialize_model()
+
+    def _initialize_model(self):
+        """Initialize the IBM watsonx.ai model."""
+        try:
+            # Generation parameters. TEMPERATURE, TOP_K, and TOP_P only take
+            # effect when DECODING_METHOD is "sample"; they are ignored under
+            # greedy decoding.
+            parameters = {
+                GenParams.DECODING_METHOD: "greedy",
+                GenParams.MAX_NEW_TOKENS: 500,
+                GenParams.MIN_NEW_TOKENS: 50,
+                GenParams.TEMPERATURE: 0.7,
+                GenParams.TOP_K: 50,
+                GenParams.TOP_P: 0.9
+            }
+
+            self.model = Model(
+                model_id=Config.MODEL_ID,
+                params=parameters,
+                credentials={
+                    "url": Config.WATSONX_URL,
+                    "apikey": Config.WATSONX_API_KEY
+                },
+                project_id=Config.WATSONX_PROJECT_ID
+            )
+        except Exception as e:
+            st.error(f"Failed to initialize Watsonx model: {str(e)}")
+            st.info("Please ensure your IBM Watsonx credentials are correctly configured.")
+
+    def generate_answer(self, question: str, context_chunks: List[Dict]) -> str:
+        """Generate an answer grounded in the retrieved context."""
+        if not self.model:
+            return "⚠️ LLM not initialized. Please check your IBM Watsonx credentials."
+
+        # Assemble the context from the retrieved chunks
+        context = "\n\n".join([
+            f"[Source: {chunk['source']}]\n{chunk['text']}"
+            for chunk in context_chunks
+        ])
+
+        # Build the prompt
+        prompt = f"""You are an academic assistant helping students understand their study materials. Based on the provided context, answer the question accurately and concisely.
+
+Context:
+{context}
+
+Question: {question}
+
+Answer: Provide a clear, accurate answer based solely on the context provided. If the context doesn't contain enough information, say so. Include relevant details and cite the source when appropriate."""
+
+        try:
+            response = self.model.generate_text(prompt=prompt)
+            return response
+        except Exception as e:
+            return f"⚠️ Error generating answer: {str(e)}"
+
+# ============================================================================
+# STREAMLIT APPLICATION
+# ============================================================================
+
+def initialize_session_state():
+    """Initialize Streamlit session state variables."""
+    if 'vector_store' not in st.session_state:
+        st.session_state.vector_store = VectorStore()
+    if 'llm' not in st.session_state:
+        st.session_state.llm = WatsonxLLM()
+    if 'documents_processed' not in st.session_state:
+        st.session_state.documents_processed = False
+    if 'chat_history' not in st.session_state:
+        st.session_state.chat_history = []
+
+def main():
+    # Page configuration
+    st.set_page_config(
+        page_title="StudyMate - AI Academic Assistant",
+        page_icon="📚",
+        layout="wide"
+    )
+
+    # Initialize session state
+    initialize_session_state()
+
+    # Header
+    st.title("📚 StudyMate - AI Academic Assistant")
+    st.markdown("""
+    Upload your study materials (PDFs) and ask questions in natural language.
+    StudyMate will provide accurate, context-based answers from your documents.
+    """)
+
+    # Sidebar: document upload and configuration
+    with st.sidebar:
+        st.header("📁 Document Management")
+
+        uploaded_files = st.file_uploader(
+            "Upload PDF documents",
+            type=['pdf'],
+            accept_multiple_files=True,
+            help="Upload one or more PDF files containing your study materials"
+        )
+
+        if uploaded_files:
+            st.success(f"✅ {len(uploaded_files)} file(s) uploaded")
+
+            if st.button("🔄 Process Documents", type="primary"):
+                process_documents(uploaded_files)
+
+        st.divider()
+
+        # Configuration options (applied on the next "Process Documents" run)
+        with st.expander("⚙️ Advanced Settings"):
+            chunk_size = st.slider("Chunk Size", 300, 1000, Config.CHUNK_SIZE)
+            top_k = st.slider("Results to Retrieve", 1, 10, Config.TOP_K_RESULTS)
+            Config.CHUNK_SIZE = chunk_size
+            Config.TOP_K_RESULTS = top_k
+
+        st.divider()
+
+        # Statistics
+        if st.session_state.documents_processed:
+            st.subheader("📊 Statistics")
+            st.metric("Documents", len(uploaded_files) if uploaded_files else 0)
+            st.metric("Text Chunks", len(st.session_state.vector_store.chunks))
+
+        st.divider()
+
+        # API configuration help
+        with st.expander("🔑 IBM Watsonx Setup"):
+            st.markdown("""
+            **Configure your credentials via environment variables:**
+            1. Set `WATSONX_API_KEY`
+            2. Set `WATSONX_PROJECT_ID`
+            3. Verify `WATSONX_URL`
+
+            [Get IBM Watsonx Credentials →](https://www.ibm.com/watsonx)
+            """)
+
+    # Main content area
+    if not st.session_state.documents_processed:
+        st.info("👆 Upload and process PDF documents to get started!")
+
+        # Example questions
+        st.subheader("💡 Example Questions You Can Ask:")
+        st.markdown("""
+        - What are the main concepts covered in chapter 3?
+        - Explain the theory of relativity based on my notes
+        - Summarize the key findings from the research paper
+        - What are the differences between X and Y?
+        - Define [specific term] from the textbook
+        """)
+    else:
+        # Question input
+        st.subheader("💬 Ask a Question")
+
+        col1, col2 = st.columns([4, 1])
+        with col1:
+            question = st.text_input(
+                "Enter your question:",
+                placeholder="e.g., What are the main themes in Chapter 2?",
+                label_visibility="collapsed"
+            )
+        with col2:
+            ask_button = st.button("🔍 Ask", type="primary", use_container_width=True)
+
+        # Process the question
+        if ask_button and question:
+            process_question(question)
+
+        # Display chat history
+        if st.session_state.chat_history:
+            st.divider()
+            st.subheader("📜 Conversation History")
+
+            for idx, chat in enumerate(reversed(st.session_state.chat_history)):
+                with st.expander(f"Q: {chat['question'][:80]}...", expanded=(idx == 0)):
+                    st.markdown(f"**Question:** {chat['question']}")
+                    st.markdown(f"**Answer:** {chat['answer']}")
+
+                    if chat.get('sources'):
+                        st.markdown("**📚 Sources:**")
+                        for source in chat['sources']:
+                            st.caption(f"- {source['source']} (Relevance: {source['score']:.2%})")
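+# End-to-end flow recap: upload → extract (PyMuPDF) → clean → chunk with
+# overlap → embed (SentenceTransformers) → index (FAISS) → retrieve top-k →
+# prompt watsonx.ai with the retrieved context. The two helpers below
+# implement the indexing and question-answering halves of that flow.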
+def process_documents(uploaded_files):
+    """Extract, clean, chunk, and index the uploaded PDF documents."""
+    processor = PDFProcessor()
+    all_chunks = []
+    all_sources = []
+
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    for idx, pdf_file in enumerate(uploaded_files):
+        status_text.text(f"Processing {pdf_file.name}...")
+
+        # Extract text
+        text = processor.extract_text_from_pdf(pdf_file)
+
+        if text:
+            # Clean and chunk the text
+            cleaned_text = processor.clean_text(text)
+            chunks = processor.chunk_text(cleaned_text)
+
+            all_chunks.extend(chunks)
+            all_sources.extend([pdf_file.name] * len(chunks))
+
+        progress_bar.progress((idx + 1) / len(uploaded_files))
+
+    # Create embeddings and build the index
+    if all_chunks:
+        num_chunks = st.session_state.vector_store.create_embeddings(all_chunks, all_sources)
+        st.session_state.documents_processed = True
+
+        status_text.empty()
+        progress_bar.empty()
+        st.success(f"✅ Successfully processed {len(uploaded_files)} document(s) into {num_chunks} chunks!")
+    else:
+        status_text.empty()
+        progress_bar.empty()
+        st.error("No text could be extracted from the uploaded documents.")
+
+def process_question(question: str):
+    """Retrieve relevant chunks for the question and generate an answer."""
+    with st.spinner("Searching documents and generating answer..."):
+        # Retrieve the most relevant chunks
+        results = st.session_state.vector_store.search(question, Config.TOP_K_RESULTS)
+
+        if not results:
+            st.warning("No relevant information found in the documents.")
+            return
+
+        # Generate the answer
+        answer = st.session_state.llm.generate_answer(question, results)
+
+        # Store in chat history
+        st.session_state.chat_history.append({
+            "question": question,
+            "answer": answer,
+            "sources": results
+        })
+
+        # Display the answer
+        st.markdown("### 💡 Answer")
+        st.markdown(answer)
+
+        # Display the retrieved sources
+        st.markdown("### 📚 Retrieved Sources")
+        for idx, result in enumerate(results, 1):
+            with st.expander(f"Source {idx}: {result['source']} (Relevance: {result['score']:.2%})"):
+                st.text(result['text'][:500] + "..." if len(result['text']) > 500 else result['text'])
+
+if __name__ == "__main__":
+    main()
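+
+# To run locally (dependency names are the usual PyPI packages; pin versions
+# as needed for your environment):
+#
+#   pip install streamlit pymupdf faiss-cpu numpy sentence-transformers ibm-watsonx-ai
+#   export WATSONX_API_KEY=... WATSONX_PROJECT_ID=...
+#   streamlit run study_mate.py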