diff --git a/streamlit_app.py b/streamlit_app.py index 68e7df0f..84069dee 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -1,1156 +1,1162 @@ -import os -import io -import re -import tempfile -import uuid -from datetime import datetime, timedelta -from typing import Optional, List, Dict - -import streamlit as st -from fpdf import FPDF - -# Import project modules -from paperscope.main import fetch_and_summarize, query_db -from paperscope.pdf_parser import extract_text_from_pdf -from paperscope.vector_store import build_index, search_similar - -# Optional: history storage API -try: - from paperscope.storage import get_history, clear_history, delete_entry, save_history_entry - STORAGE_AVAILABLE = True -except ImportError: - STORAGE_AVAILABLE = False - def get_history(): - return [] - def clear_history(): - pass - def delete_entry(entry_id): - return False - def save_history_entry(entry): - pass - -# Check for demo mode -DEMO_MODE = os.getenv("DEMO_MODE", "").lower() in ("1", "true", "yes") -if DEMO_MODE: - from paperscope.summarizer_demo import summarize -else: - from paperscope.summarizer import summarize - -# Application configuration -st.set_page_config(page_title="PaperScope", page_icon="📄", layout="wide") - -# --------------------------------------------------------------------- -# Utility helpers -# --------------------------------------------------------------------- - -def safe_filename(filename: str) -> str: - """Sanitize a filename for safe filesystem/download use.""" - if not filename: - return "file" - filename = filename.split("/")[-1].split("\\")[-1] - filename = re.sub(r"[^\w\-_.]", "_", filename) - return filename[:200] - - -def generate_pdf_from_text(title: str, metadata: dict, body_text: str, annotations: str = ""): - """Generate a Unicode-safe PDF summary report using fpdf2.""" - pdf = FPDF() - pdf.add_page() - - font_path = os.path.join(os.path.dirname(__file__), "paperscope", "DejaVuSans.ttf") - if os.path.exists(font_path): - pdf.add_font("DejaVu", "", font_path, uni=True) - pdf.set_font("DejaVu", "", 16) - else: - pdf.set_font("Helvetica", "", 16) - - pdf.cell(0, 10, txt=title[:100], ln=True, align="C") - pdf.ln(10) - - if os.path.exists(font_path): - pdf.set_font("DejaVu", "", 12) - else: - pdf.set_font("Helvetica", "", 12) - - if metadata: - for key, value in metadata.items(): - pdf.multi_cell(0, 8, f"{key}: {str(value)[:100]}") - pdf.ln(6) - - pdf.multi_cell(0, 8, body_text) - pdf.ln(10) - - if annotations: - if os.path.exists(font_path): - pdf.set_font("DejaVu", "B", 13) - else: - pdf.set_font("Helvetica", "B", 13) - pdf.cell(0, 10, "Annotations:", ln=True) - if os.path.exists(font_path): - pdf.set_font("DejaVu", "", 12) - else: - pdf.set_font("Helvetica", "", 12) - pdf.multi_cell(0, 8, annotations) - - try: - pdf_bytes = pdf.output(dest="S") - if isinstance(pdf_bytes, str): - pdf_bytes = pdf_bytes.encode("latin-1") - except Exception: - pdf_bytes = pdf.output(dest="S").encode("utf-8") - - buffer = io.BytesIO(pdf_bytes) - buffer.seek(0) - return buffer - - -def safe_write_temp_file(uploaded_file, filename="temp.pdf") -> str: - """Save uploaded file to a safe temporary path and return path.""" - tmp_dir = tempfile.gettempdir() - path = os.path.join(tmp_dir, f"{uuid.uuid4().hex}_{filename}") - with open(path, "wb") as f: - f.write(uploaded_file.read()) - return path - - -def validate_summary(s: Optional[str]) -> bool: - return bool(s and s.strip()) - - -def parse_iso(iso_str: str) -> datetime: - """Parse ISO timestamp robustly, returning epoch if fails.""" - if 
not iso_str: - return datetime.fromtimestamp(0) - try: - return datetime.fromisoformat(iso_str) - except Exception: - try: - return datetime.strptime(iso_str, "%Y-%m-%d %H:%M:%S") - except Exception: - return datetime.fromtimestamp(0) - - -def to_date(iso_str: str) -> Optional[datetime.date]: - try: - return parse_iso(iso_str).date() - except Exception: - return None - - -# --------------------------------------------------------------------- -# Custom CSS & Styles (Modern Dark Theme) -# --------------------------------------------------------------------- -st.markdown(""" - -""", unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Sidebar navigation -# --------------------------------------------------------------------- -with st.sidebar: - st.markdown('', unsafe_allow_html=True) - - option = st.radio( - label="Choose a section", - options=[ - "Search arXiv Papers", - "Query Stored Summaries", - "Upload & Summarize PDF", - "Semantic Search (FAISS)", - "History" - ], - key="main_menu", - label_visibility="collapsed" - ) - - if DEMO_MODE: - st.markdown('
Demo Mode
', unsafe_allow_html=True) - st.caption("Results are approximate") - - if st.button("Load Demo Dataset"): - try: - from paperscope.demo_data import load_demo_data - ok, msg = load_demo_data(build_index=False) - if ok: - st.success(msg) - else: - st.error("Failed to load demo database") - except ImportError: - st.error("Demo data module not found") - -# --------------------------------------------------------------------- -# Page header -# --------------------------------------------------------------------- -st.markdown(""" -
-PaperScope
-Your personal assistant for academic research using LLMs
-""", unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Section: Search arXiv Papers -# --------------------------------------------------------------------- -if option == "Search arXiv Papers": - st.markdown('
', unsafe_allow_html=True) - st.markdown('
Search arXiv Papers
', unsafe_allow_html=True) - - keyword_input = st.text_input( - "Enter Keywords or Paper URL", - placeholder="e.g., reinforcement learning for robots OR https://arxiv.org/abs/2301.12345" - ) - st.markdown('
You can enter keywords to search arXiv, or paste a direct paper URL (arXiv or PDF link)
', unsafe_allow_html=True) - - if st.button("Fetch & Summarize"): - if DEMO_MODE: - st.info("This feature is disabled in Demo Mode. Use demo dataset or upload a PDF instead.") - elif not keyword_input or keyword_input.strip() == "": - st.warning("Please enter a keyword or URL to search") - else: - try: - from paperscope.url_handler import is_url - - if is_url(keyword_input): - with st.spinner("Fetching paper from URL and summarizing..."): - data = fetch_and_summarize(keyword_input) - - if data: - st.success("Paper fetched and summarized successfully") - for idx, item in enumerate(data[-5:]): - with st.expander(f"{item.get('title', 'Untitled Paper')}"): - st.markdown("**Summary:**") - st.write(item.get("summary", "No summary available")) - - # Download buttons - summary_text = item.get('summary', '') - md_bytes = f"# {item.get('title', 'Summary')}\n\n{summary_text}".encode("utf-8") - txt_bytes = summary_text.encode("utf-8") - - metadata = { - "source": item.get('source', 'arXiv'), - "paper_id": item.get('id', 'N/A') - } - pdf_buf = generate_pdf_from_text( - title=item.get('title', 'PaperScope Summary'), - metadata=metadata, - body_text=summary_text, - annotations=item.get('annotations', '') - ) - - dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) - with dl_col1: - st.download_button( - label="Download TXT", - data=txt_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", - mime="text/plain", - key=f"download_txt_arxiv_{idx}" - ) - with dl_col2: - st.download_button( - label="Download MD", - data=md_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.md", - mime="text/markdown", - key=f"download_md_arxiv_{idx}" - ) - with dl_col3: - st.download_button( - label="Download PDF", - data=pdf_buf, - file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", - mime="application/pdf", - key=f"download_pdf_arxiv_{idx}" - ) - else: - st.error("Failed to fetch paper from URL. 
Please check the URL and try again") - st.info("Supported formats: arXiv URLs (abs or pdf) and direct PDF links") - - else: - with st.spinner("Fetching and summarizing..."): - data = fetch_and_summarize(keyword_input) - - if data: - st.success(f"Found and processed {len(data)} paper(s)") - for idx, item in enumerate(data[-10:][::-1]): - with st.expander(f"{item.get('title', 'Untitled Paper')}"): - st.markdown("**Summary:**") - st.write(item.get("summary", "No summary available")) - - # Download buttons - summary_text = item.get('summary', '') - md_bytes = f"# {item.get('title', 'Summary')}\n\n{summary_text}".encode("utf-8") - txt_bytes = summary_text.encode("utf-8") - - metadata = { - "source": item.get('source', 'arXiv'), - "paper_id": item.get('id', 'N/A') - } - pdf_buf = generate_pdf_from_text( - title=item.get('title', 'PaperScope Summary'), - metadata=metadata, - body_text=summary_text, - annotations=item.get('annotations', '') - ) - - dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) - with dl_col1: - st.download_button( - label="Download TXT", - data=txt_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", - mime="text/plain", - key=f"download_txt_arxiv_{idx}" - ) - with dl_col2: - st.download_button( - label="Download MD", - data=md_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.md", - mime="text/markdown", - key=f"download_md_arxiv_{idx}" - ) - with dl_col3: - st.download_button( - label="Download PDF", - data=pdf_buf, - file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", - mime="application/pdf", - key=f"download_pdf_arxiv_{idx}" - ) - else: - st.warning("No results found") - - except Exception as e: - st.error(f"Error processing: {str(e)}") - st.info("Supported formats: arXiv URLs (abs or pdf) and direct PDF links") - - st.markdown('
', unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Section: Query Stored Summaries -# --------------------------------------------------------------------- -elif option == "Query Stored Summaries": - st.markdown('
', unsafe_allow_html=True) - st.markdown('
Query Stored Summaries
', unsafe_allow_html=True) - - query_input = st.text_input( - "Search stored summaries", - placeholder="e.g., contrastive learning" - ) - - if st.button("Run Keyword Search"): - if not query_input or query_input.strip() == "": - st.warning("Please enter a search query") - else: - with st.spinner("Searching database..."): - try: - results = query_db(query_input) - - if not results: - st.info("No matching summaries found") - else: - st.success(f"Found {len(results)} results") - for idx, item in enumerate(results): - with st.expander(f"{item.get('title', 'Untitled')}", expanded=(idx == 0)): - st.markdown("**Summary:**") - st.write(item.get('summary', 'No summary available')) - - # Download buttons - download_col1, download_col2, download_col3 = st.columns([1, 1, 1]) - summary_text = item.get('summary', '') - md_bytes = f"# {item.get('title', '')}\n\n{summary_text}".encode("utf-8") - txt_bytes = summary_text.encode("utf-8") - pdf_buf = generate_pdf_from_text( - title=item.get('title', 'Summary'), - metadata={"id": item.get('id', '')}, - body_text=summary_text, - annotations=item.get('annotations', '') - ) - - with download_col1: - st.download_button( - label="TXT", - data=txt_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", - mime="text/plain", - key=f"query_txt_{idx}" - ) - with download_col2: - st.download_button( - label="MD", - data=md_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.md", - mime="text/markdown", - key=f"query_md_{idx}" - ) - with download_col3: - st.download_button( - label="PDF", - data=pdf_buf, - file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", - mime="application/pdf", - key=f"query_pdf_{idx}" - ) - - except Exception as e: - st.error(f"Search failed: {str(e)}") - - st.markdown('
', unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Section: Upload & Summarize PDF -# --------------------------------------------------------------------- -elif option == "Upload & Summarize PDF": - st.markdown('
', unsafe_allow_html=True) - st.markdown('
Upload & Summarize PDF
', unsafe_allow_html=True) - - uploaded_file = st.file_uploader("Upload a PDF research paper", type=["pdf"]) - - if uploaded_file: - try: - tmp_path = safe_write_temp_file(uploaded_file, filename="uploaded.pdf") - except Exception as e: - st.error(f"Error saving uploaded file: {e}") - st.stop() - - with st.spinner("Extracting text from PDF..."): - try: - extracted_text = extract_text_from_pdf(tmp_path) - except Exception as e: - st.error(f"Extraction failed: {e}") - st.info("Make sure PDF contains selectable text (not scanned images) or use OCR-enabled parsing") - st.stop() - - if not extracted_text or len(extracted_text.strip()) == 0: - st.error("No text extracted from PDF. The file may be scanned or encrypted") - st.stop() - - with st.spinner("Generating AI-powered summary..."): - try: - summary_text = summarize(extracted_text) - except Exception as e: - st.error(f"Summarization failed: {e}") - st.stop() - - if not validate_summary(summary_text): - st.error("Generated an empty summary. Try again or adjust summarizer settings") - st.stop() - - st.subheader("Generated Summary") - st.success("Summary generated successfully") - st.write(summary_text) - - # Save to history - if STORAGE_AVAILABLE: - try: - history_entry = { - "id": f"local-{uuid.uuid4().hex[:8]}", - "title": uploaded_file.name, - "abstract": "", - "summary": summary_text, - "timestamp": datetime.now().isoformat(), - "source": "upload", - "annotations": "" - } - save_history_entry(history_entry) - except Exception: - pass - - # Download options - st.markdown("---") - st.markdown("### Download Your Summary") - - col_txt, col_md, col_pdf = st.columns(3) - txt_bytes = summary_text.encode("utf-8") - md_bytes = f"# {uploaded_file.name}\n\n{summary_text}".encode("utf-8") - - metadata = { - "Original Filename": uploaded_file.name, - "Generated On": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "Source": "Uploaded PDF" - } - pdf_buffer = generate_pdf_from_text( - title=uploaded_file.name, - metadata=metadata, - body_text=summary_text, - annotations="" - ) - - with col_txt: - st.download_button( - label="Download TXT", - data=txt_bytes, - file_name=f"{safe_filename(uploaded_file.name)}_summary.txt", - mime="text/plain", - key="download_upload_txt" - ) - with col_md: - st.download_button( - label="Download MD", - data=md_bytes, - file_name=f"{safe_filename(uploaded_file.name)}_summary.md", - mime="text/markdown", - key="download_upload_md" - ) - with col_pdf: - st.download_button( - label="Download PDF", - data=pdf_buffer, - file_name=f"{safe_filename(uploaded_file.name)}_summary.pdf", - mime="application/pdf", - key="download_upload_pdf" - ) - - st.markdown('
', unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Section: Semantic Search (FAISS) -# --------------------------------------------------------------------- -elif option == "Semantic Search (FAISS)": - st.markdown('
', unsafe_allow_html=True) - st.markdown('
Semantic Search (FAISS)
', unsafe_allow_html=True) - - if st.button("Rebuild Index"): - if DEMO_MODE: - st.info("Index building is simulated in Demo Mode. Load the demo instead") - else: - with st.spinner("Rebuilding semantic vector index..."): - try: - build_index() - st.success("Index rebuilt successfully") - except Exception as e: - st.error(f"Failed to rebuild index: {e}") - - semantic_query = st.text_input( - "Enter a semantic query", - placeholder="e.g., visual prompt tuning in robotics" - ) - - if st.button("Search with FAISS"): - if not semantic_query or semantic_query.strip() == "": - st.warning("Please enter a semantic query") - else: - with st.spinner("Running semantic search..."): - try: - results = search_similar(semantic_query) - - if not results: - st.info("No similar results found") - else: - st.success(f"Found {len(results)} similar items") - for idx, item in enumerate(results): - with st.expander(f"{item.get('title', 'Untitled')}", expanded=(idx == 0)): - st.markdown("**Summary:**") - st.write(item.get('summary', 'No summary available')) - - download_col1, download_col2, download_col3 = st.columns([1, 1, 1]) - summary_text = item.get('summary', '') - md_bytes = f"# {item.get('title', '')}\n\n{summary_text}".encode("utf-8") - txt_bytes = summary_text.encode("utf-8") - pdf_buf = generate_pdf_from_text( - title=item.get('title', 'Summary'), - metadata={"id": item.get('id', '')}, - body_text=summary_text, - annotations=item.get('annotations', '') - ) - - with download_col1: - st.download_button( - label="TXT", - data=txt_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", - mime="text/plain", - key=f"faiss_txt_{idx}" - ) - with download_col2: - st.download_button( - label="MD", - data=md_bytes, - file_name=f"{safe_filename(item.get('title', 'summary'))}.md", - mime="text/markdown", - key=f"faiss_md_{idx}" - ) - with download_col3: - st.download_button( - label="PDF", - data=pdf_buf, - file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", - mime="application/pdf", - key=f"faiss_pdf_{idx}" - ) - - except Exception as e: - st.error(f"Semantic search failed: {e}") - - st.markdown('
', unsafe_allow_html=True) - -# --------------------------------------------------------------------- -# Section: History -# --------------------------------------------------------------------- -elif option == "History": - st.markdown('
', unsafe_allow_html=True) - st.markdown('
Summary History
', unsafe_allow_html=True) - st.markdown('
View, filter, and download previously processed papers
', unsafe_allow_html=True) - - try: - history = get_history() or [] - except Exception: - history = [] - - if history: - metric_col1, metric_col2, metric_col3 = st.columns(3) - with metric_col1: - st.metric("Total Papers", len(history)) - with metric_col2: - today = datetime.now().date() - today_count = sum( - 1 for item in history - if to_date(item.get("timestamp")) == today - ) - st.metric("Added Today", today_count) - with metric_col3: - week_ago = datetime.now() - timedelta(days=7) - week_count = sum( - 1 for item in history - if parse_iso(item.get("timestamp")) > week_ago - ) - st.metric("Last 7 Days", week_count) - st.markdown("---") - - col_filter, col_sort, col_action = st.columns([3, 2, 1]) - with col_filter: - search_filter = st.text_input("Filter by title or keyword", placeholder="Type to filter...") - with col_sort: - sort_order = st.selectbox("Sort by", ["Newest First", "Oldest First", "Title A-Z"]) - with col_action: - if st.button("Clear All", type="secondary"): - if st.session_state.get("confirm_clear", False): - try: - clear_history() - st.success("History cleared!") - st.session_state.confirm_clear = False - st.rerun() - except Exception as e: - st.error(f"Failed to clear history: {e}") - else: - st.session_state.confirm_clear = True - st.warning("Click again to confirm") - - st.markdown("---") - - if not history: - st.info("No papers in history yet. Start by searching or uploading papers!") - else: - filtered = history - if search_filter and search_filter.strip(): - q = search_filter.lower() - filtered = [ - item for item in filtered - if q in (item.get('title', '').lower() + - item.get('abstract', '').lower() + - item.get('summary', '').lower()) - ] - - if sort_order == "Newest First": - filtered = sorted(filtered, key=lambda x: x.get('timestamp', ''), reverse=True) - elif sort_order == "Oldest First": - filtered = sorted(filtered, key=lambda x: x.get('timestamp', '')) - elif sort_order == "Title A-Z": - filtered = sorted(filtered, key=lambda x: x.get('title', '').lower()) - - st.caption(f"Showing {len(filtered)} paper(s)") - - for idx, item in enumerate(filtered): - with st.expander(f"{item.get('title', 'Untitled Paper')}", expanded=(idx == 0)): - top_col1, top_col2, top_col3 = st.columns([4, 1, 1]) - - with top_col1: - timestamp = item.get('timestamp', 'Unknown date') - if timestamp and timestamp != "Unknown date": - try: - ts = datetime.fromisoformat(timestamp) - timestamp = ts.strftime("%B %d, %Y at %I:%M %p") - except ValueError: - pass - st.caption(f"Added: {timestamp}") - st.caption(f"ID: `{item.get('id', 'N/A')}`") - - with top_col2: - summary_text = f"Title: {item.get('title', 'N/A')}\n\n" - summary_text += f"Abstract:\n{item.get('abstract', 'N/A')}\n\n" - summary_text += f"Summary:\n{item.get('summary', 'N/A')}\n\n" - txt_bytes = summary_text.encode("utf-8") - - st.download_button( - label="Download", - data=txt_bytes, - file_name=f"{safe_filename(item.get('id', 'paper'))}_summary.txt", - mime="text/plain", - key=f"history_download_{idx}", - help="Download summary as TXT" - ) - - with top_col3: - if st.button("Delete", key=f"delete_{idx}", help="Delete this paper", type="secondary"): - try: - if delete_entry(item.get('id')): - st.success("Paper deleted!") - st.rerun() - else: - st.error("Failed to delete paper") - except Exception as e: - st.error(f"Error deleting: {e}") - - st.markdown("---") - st.markdown("**Abstract:**") - st.markdown(f"> {item.get('abstract', 'No abstract available')}") - st.markdown("**Summary:**") - st.write(item.get('summary', 'No 
summary available')) - - paper_id = item.get('id', '') - if paper_id and ('arxiv' in paper_id.lower() or '/' in paper_id): - arxiv_id = paper_id.split('/')[-1] if '/' in paper_id else paper_id - st.markdown(f"[View on arXiv](https://arxiv.org/abs/{arxiv_id})") - - st.markdown("---") - st.markdown("**Additional Formats:**") - dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) - - md_content = f"# {item.get('title', '')}\n\n**Abstract:**\n\n{item.get('abstract','')}\n\n**Summary:**\n\n{item.get('summary','')}" - md_bytes = md_content.encode("utf-8") - - pdf_buf = generate_pdf_from_text( - title=item.get('title', 'History Entry'), - metadata={ - "id": item.get('id', ''), - "added_on": item.get('timestamp', '') - }, - body_text=item.get('summary', ''), - annotations=item.get('annotations', '') - ) - - with dl_col1: - st.download_button( - label="TXT", - data=txt_bytes, - file_name=f"{safe_filename(item.get('title', 'paper'))}.txt", - mime="text/plain", - key=f"history_txt_{idx}" - ) - with dl_col2: - st.download_button( - label="MD", - data=md_bytes, - file_name=f"{safe_filename(item.get('title', 'paper'))}.md", - mime="text/markdown", - key=f"history_md_{idx}" - ) - with dl_col3: - st.download_button( - label="PDF", - data=pdf_buf, - file_name=f"{safe_filename(item.get('title', 'paper'))}.pdf", - mime="application/pdf", - key=f"history_pdf_{idx}" - ) - - st.markdown('
', unsafe_allow_html=True) +import os +import io +import re +import tempfile +import uuid +from datetime import datetime, timedelta +from typing import Optional, List, Dict + +import streamlit as st +from fpdf import FPDF + +# Import project modules +from paperscope.main import fetch_and_summarize, query_db +from paperscope.pdf_parser import extract_text_from_pdf +from paperscope.vector_store import build_index, search_similar + +# Optional: history storage API +try: + from paperscope.storage import get_history, clear_history, delete_entry, save_history_entry + STORAGE_AVAILABLE = True +except ImportError: + STORAGE_AVAILABLE = False + def get_history(): + return [] + def clear_history(): + pass + def delete_entry(entry_id): + return False + def save_history_entry(entry): + pass + +# Check for demo mode +DEMO_MODE = os.getenv("DEMO_MODE", "").lower() in ("1", "true", "yes") +if DEMO_MODE: + from paperscope.summarizer_demo import summarize +else: + from paperscope.summarizer import summarize + +# Application configuration +st.set_page_config(page_title="PaperScope", page_icon="📄", layout="wide") + +# --------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------- + +def safe_filename(filename: str) -> str: + """Sanitize a filename for safe filesystem/download use.""" + if not filename: + return "file" + filename = filename.split("/")[-1].split("\\")[-1] + filename = re.sub(r"[^\w\-_.]", "_", filename) + return filename[:200] + +def truncate_text(text: str, max_length: int = 100) -> str: + """Truncate text to max_length characters, adding ellipsis if needed.""" + if not text or len(text) <= max_length: + return text or "" + return text[:max_length - 3] + "..." 
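+
+# Illustrative behavior of truncate_text (example comments only, not part of
+# the app flow); the expected values assume the implementation directly above:
+#   truncate_text("abc", 10)     -> "abc"         (within the limit, unchanged)
+#   truncate_text("", 10)        -> ""            (falsy input normalized to "")
+#   truncate_text("a" * 20, 10)  -> "aaaaaaa..."  (7 chars + "...", 10 total)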
+ +def generate_pdf_from_text(title: str, metadata: dict, body_text: str, annotations: str = ""): + """Generate a Unicode-safe PDF summary report using fpdf2.""" + pdf = FPDF() + pdf.add_page() + + font_path = os.path.join(os.path.dirname(__file__), "paperscope", "DejaVuSans.ttf") + if os.path.exists(font_path): + pdf.add_font("DejaVu", "", font_path) + pdf.set_font("DejaVu", "", 16) + else: + pdf.set_font("Helvetica", "", 16) + + pdf.cell(0, 10, text=title[:100], new_x="LMARGIN", new_y="NEXT", align="C") + pdf.ln(10) + + if os.path.exists(font_path): + pdf.set_font("DejaVu", "", 12) + else: + pdf.set_font("Helvetica", "", 12) + + if metadata: + for key, value in metadata.items(): + value_str = truncate_text(str(value), max_length=150) + pdf.multi_cell(0, 8, f"{key}: {value_str}", new_x="LMARGIN", new_y="NEXT") + pdf.ln(6) + + pdf.multi_cell(0, 8, body_text) + pdf.ln(10) + + if annotations: + if os.path.exists(font_path): + pdf.set_font("DejaVu", "B", 13) + else: + pdf.set_font("Helvetica", "B", 13) + pdf.cell(0, 10, text="Annotations:", new_x="LMARGIN", new_y="NEXT") + if os.path.exists(font_path): + pdf.set_font("DejaVu", "", 12) + else: + pdf.set_font("Helvetica", "", 12) + pdf.multi_cell(0, 8, annotations) + + try: + pdf_bytes = pdf.output() + if isinstance(pdf_bytes, str): + pdf_bytes = pdf_bytes.encode("latin-1") + except Exception: + pdf_bytes = pdf.output().encode("utf-8") + + buffer = io.BytesIO(pdf_bytes) + buffer.seek(0) + return buffer + + +def safe_write_temp_file(uploaded_file, filename="temp.pdf") -> str: + """Save uploaded file to a safe temporary path and return path.""" + tmp_dir = tempfile.gettempdir() + path = os.path.join(tmp_dir, f"{uuid.uuid4().hex}_{filename}") + with open(path, "wb") as f: + f.write(uploaded_file.read()) + return path + + +def validate_summary(s: Optional[str]) -> bool: + return bool(s and s.strip()) + + +def parse_iso(iso_str: str) -> datetime: + """Parse ISO timestamp robustly, returning epoch if fails.""" + if not iso_str: + return datetime.fromtimestamp(0) + try: + return datetime.fromisoformat(iso_str) + except Exception: + try: + return datetime.strptime(iso_str, "%Y-%m-%d %H:%M:%S") + except Exception: + return datetime.fromtimestamp(0) + + +def to_date(iso_str: str) -> Optional[datetime.date]: + try: + return parse_iso(iso_str).date() + except Exception: + return None + + +# --------------------------------------------------------------------- +# Custom CSS & Styles (Modern Dark Theme) +# --------------------------------------------------------------------- +st.markdown(""" + +""", unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Sidebar navigation +# --------------------------------------------------------------------- +with st.sidebar: + st.markdown('', unsafe_allow_html=True) + + option = st.radio( + label="Choose a section", + options=[ + "Search arXiv Papers", + "Query Stored Summaries", + "Upload & Summarize PDF", + "Semantic Search (FAISS)", + "History" + ], + key="main_menu", + label_visibility="collapsed" + ) + + if DEMO_MODE: + st.markdown('
Demo Mode
', unsafe_allow_html=True) + st.caption("Results are approximate") + + if st.button("Load Demo Dataset"): + try: + from paperscope.demo_data import load_demo_data + ok, msg = load_demo_data(build_index=False) + if ok: + st.success(msg) + else: + st.error("Failed to load demo database") + except ImportError: + st.error("Demo data module not found") + +# --------------------------------------------------------------------- +# Page header +# --------------------------------------------------------------------- +st.markdown(""" +
+PaperScope
+Your personal assistant for academic research using LLMs
+""", unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Section: Search arXiv Papers +# --------------------------------------------------------------------- +if option == "Search arXiv Papers": + st.markdown('
', unsafe_allow_html=True) + st.markdown('
Search arXiv Papers
', unsafe_allow_html=True) + + keyword_input = st.text_input( + "Enter Keywords or Paper URL", + placeholder="e.g., reinforcement learning for robots OR https://arxiv.org/abs/2301.12345" + ) + st.markdown('
You can enter keywords to search arXiv, or paste a direct paper URL (arXiv or PDF link)
', unsafe_allow_html=True) + + if st.button("Fetch & Summarize"): + if DEMO_MODE: + st.info("This feature is disabled in Demo Mode. Use demo dataset or upload a PDF instead.") + elif not keyword_input or keyword_input.strip() == "": + st.warning("Please enter a keyword or URL to search") + else: + try: + from paperscope.url_handler import is_url + + if is_url(keyword_input): + with st.spinner("Fetching paper from URL and summarizing..."): + data = fetch_and_summarize(keyword_input) + + if data: + st.success("Paper fetched and summarized successfully") + for idx, item in enumerate(data[-5:]): + with st.expander(f"{item.get('title', 'Untitled Paper')}"): + st.markdown("**Summary:**") + st.write(item.get("summary", "No summary available")) + + # Download buttons + summary_text = item.get('summary', '') + md_bytes = f"# {item.get('title', 'Summary')}\n\n{summary_text}".encode("utf-8") + txt_bytes = summary_text.encode("utf-8") + + metadata = { + "source": item.get('source', 'arXiv'), + "paper_id": item.get('id', 'N/A') + } + pdf_buf = generate_pdf_from_text( + title=item.get('title', 'PaperScope Summary'), + metadata=metadata, + body_text=summary_text, + annotations=item.get('annotations', '') + ) + + dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) + with dl_col1: + st.download_button( + label="Download TXT", + data=txt_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", + mime="text/plain", + key=f"download_txt_arxiv_{idx}" + ) + with dl_col2: + st.download_button( + label="Download MD", + data=md_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.md", + mime="text/markdown", + key=f"download_md_arxiv_{idx}" + ) + with dl_col3: + st.download_button( + label="Download PDF", + data=pdf_buf, + file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", + mime="application/pdf", + key=f"download_pdf_arxiv_{idx}" + ) + else: + st.error("Failed to fetch paper from URL. 
Please check the URL and try again") + st.info("Supported formats: arXiv URLs (abs or pdf) and direct PDF links") + + else: + with st.spinner("Fetching and summarizing..."): + data = fetch_and_summarize(keyword_input) + + if data: + st.success(f"Found and processed {len(data)} paper(s)") + for idx, item in enumerate(data[-10:][::-1]): + with st.expander(f"{item.get('title', 'Untitled Paper')}"): + st.markdown("**Summary:**") + st.write(item.get("summary", "No summary available")) + + # Download buttons + summary_text = item.get('summary', '') + md_bytes = f"# {item.get('title', 'Summary')}\n\n{summary_text}".encode("utf-8") + txt_bytes = summary_text.encode("utf-8") + + metadata = { + "source": item.get('source', 'arXiv'), + "paper_id": item.get('id', 'N/A') + } + pdf_buf = generate_pdf_from_text( + title=item.get('title', 'PaperScope Summary'), + metadata=metadata, + body_text=summary_text, + annotations=item.get('annotations', '') + ) + + dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) + with dl_col1: + st.download_button( + label="Download TXT", + data=txt_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", + mime="text/plain", + key=f"download_txt_arxiv_{idx}" + ) + with dl_col2: + st.download_button( + label="Download MD", + data=md_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.md", + mime="text/markdown", + key=f"download_md_arxiv_{idx}" + ) + with dl_col3: + st.download_button( + label="Download PDF", + data=pdf_buf, + file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", + mime="application/pdf", + key=f"download_pdf_arxiv_{idx}" + ) + else: + st.warning("No results found") + + except Exception as e: + st.error(f"Error processing: {str(e)}") + st.info("Supported formats: arXiv URLs (abs or pdf) and direct PDF links") + + st.markdown('
', unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Section: Query Stored Summaries +# --------------------------------------------------------------------- +elif option == "Query Stored Summaries": + st.markdown('
', unsafe_allow_html=True) + st.markdown('
Query Stored Summaries
', unsafe_allow_html=True) + + query_input = st.text_input( + "Search stored summaries", + placeholder="e.g., contrastive learning" + ) + + if st.button("Run Keyword Search"): + if not query_input or query_input.strip() == "": + st.warning("Please enter a search query") + else: + with st.spinner("Searching database..."): + try: + results = query_db(query_input) + + if not results: + st.info("No matching summaries found") + else: + st.success(f"Found {len(results)} results") + for idx, item in enumerate(results): + with st.expander(f"{item.get('title', 'Untitled')}", expanded=(idx == 0)): + st.markdown("**Summary:**") + st.write(item.get('summary', 'No summary available')) + + # Download buttons + download_col1, download_col2, download_col3 = st.columns([1, 1, 1]) + summary_text = item.get('summary', '') + md_bytes = f"# {item.get('title', '')}\n\n{summary_text}".encode("utf-8") + txt_bytes = summary_text.encode("utf-8") + pdf_buf = generate_pdf_from_text( + title=item.get('title', 'Summary'), + metadata={"id": item.get('id', '')}, + body_text=summary_text, + annotations=item.get('annotations', '') + ) + + with download_col1: + st.download_button( + label="TXT", + data=txt_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", + mime="text/plain", + key=f"query_txt_{idx}" + ) + with download_col2: + st.download_button( + label="MD", + data=md_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.md", + mime="text/markdown", + key=f"query_md_{idx}" + ) + with download_col3: + st.download_button( + label="PDF", + data=pdf_buf, + file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", + mime="application/pdf", + key=f"query_pdf_{idx}" + ) + + except Exception as e: + st.error(f"Search failed: {str(e)}") + + st.markdown('
', unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Section: Upload & Summarize PDF +# --------------------------------------------------------------------- +elif option == "Upload & Summarize PDF": + st.markdown('
', unsafe_allow_html=True) + st.markdown('
Upload & Summarize PDF
', unsafe_allow_html=True) + + uploaded_file = st.file_uploader("Upload a PDF research paper", type=["pdf"]) + + if uploaded_file: + try: + tmp_path = safe_write_temp_file(uploaded_file, filename="uploaded.pdf") + except Exception as e: + st.error(f"Error saving uploaded file: {e}") + st.stop() + + with st.spinner("Extracting text from PDF..."): + try: + extracted_text = extract_text_from_pdf(tmp_path) + except Exception as e: + st.error(f"Extraction failed: {e}") + st.info("Make sure PDF contains selectable text (not scanned images) or use OCR-enabled parsing") + st.stop() + + if not extracted_text or len(extracted_text.strip()) == 0: + st.error("No text extracted from PDF. The file may be scanned or encrypted") + st.stop() + + with st.spinner("Generating AI-powered summary..."): + try: + summary_text = summarize(extracted_text) + except Exception as e: + st.error(f"Summarization failed: {e}") + st.stop() + + if not validate_summary(summary_text): + st.error("Generated an empty summary. Try again or adjust summarizer settings") + st.stop() + + st.subheader("Generated Summary") + st.success("Summary generated successfully") + st.write(summary_text) + + # Save to history + if STORAGE_AVAILABLE: + try: + history_entry = { + "id": f"local-{uuid.uuid4().hex[:8]}", + "title": uploaded_file.name, + "abstract": "", + "summary": summary_text, + "timestamp": datetime.now().isoformat(), + "source": "upload", + "annotations": "" + } + save_history_entry(history_entry) + except Exception: + pass + + # Download options + st.markdown("---") + st.markdown("### Download Your Summary") + + col_txt, col_md, col_pdf = st.columns(3) + txt_bytes = summary_text.encode("utf-8") + md_bytes = f"# {uploaded_file.name}\n\n{summary_text}".encode("utf-8") + + metadata = { + "Original Filename": uploaded_file.name, + "Generated On": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "Source": "Uploaded PDF" + } + pdf_buffer = generate_pdf_from_text( + title=uploaded_file.name, + metadata=metadata, + body_text=summary_text, + annotations="" + ) + + with col_txt: + st.download_button( + label="Download TXT", + data=txt_bytes, + file_name=f"{safe_filename(uploaded_file.name)}_summary.txt", + mime="text/plain", + key="download_upload_txt" + ) + with col_md: + st.download_button( + label="Download MD", + data=md_bytes, + file_name=f"{safe_filename(uploaded_file.name)}_summary.md", + mime="text/markdown", + key="download_upload_md" + ) + with col_pdf: + st.download_button( + label="Download PDF", + data=pdf_buffer, + file_name=f"{safe_filename(uploaded_file.name)}_summary.pdf", + mime="application/pdf", + key="download_upload_pdf" + ) + + st.markdown('
', unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Section: Semantic Search (FAISS) +# --------------------------------------------------------------------- +elif option == "Semantic Search (FAISS)": + st.markdown('
', unsafe_allow_html=True) + st.markdown('
Semantic Search (FAISS)
', unsafe_allow_html=True) + + if st.button("Rebuild Index"): + if DEMO_MODE: + st.info("Index building is simulated in Demo Mode. Load the demo instead") + else: + with st.spinner("Rebuilding semantic vector index..."): + try: + build_index() + st.success("Index rebuilt successfully") + except Exception as e: + st.error(f"Failed to rebuild index: {e}") + + semantic_query = st.text_input( + "Enter a semantic query", + placeholder="e.g., visual prompt tuning in robotics" + ) + + if st.button("Search with FAISS"): + if not semantic_query or semantic_query.strip() == "": + st.warning("Please enter a semantic query") + else: + with st.spinner("Running semantic search..."): + try: + results = search_similar(semantic_query) + + if not results: + st.info("No similar results found") + else: + st.success(f"Found {len(results)} similar items") + for idx, item in enumerate(results): + with st.expander(f"{item.get('title', 'Untitled')}", expanded=(idx == 0)): + st.markdown("**Summary:**") + st.write(item.get('summary', 'No summary available')) + + download_col1, download_col2, download_col3 = st.columns([1, 1, 1]) + summary_text = item.get('summary', '') + md_bytes = f"# {item.get('title', '')}\n\n{summary_text}".encode("utf-8") + txt_bytes = summary_text.encode("utf-8") + pdf_buf = generate_pdf_from_text( + title=item.get('title', 'Summary'), + metadata={"id": item.get('id', '')}, + body_text=summary_text, + annotations=item.get('annotations', '') + ) + + with download_col1: + st.download_button( + label="TXT", + data=txt_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.txt", + mime="text/plain", + key=f"faiss_txt_{idx}" + ) + with download_col2: + st.download_button( + label="MD", + data=md_bytes, + file_name=f"{safe_filename(item.get('title', 'summary'))}.md", + mime="text/markdown", + key=f"faiss_md_{idx}" + ) + with download_col3: + st.download_button( + label="PDF", + data=pdf_buf, + file_name=f"{safe_filename(item.get('title', 'summary'))}.pdf", + mime="application/pdf", + key=f"faiss_pdf_{idx}" + ) + + except Exception as e: + st.error(f"Semantic search failed: {e}") + + st.markdown('
', unsafe_allow_html=True) + +# --------------------------------------------------------------------- +# Section: History +# --------------------------------------------------------------------- +elif option == "History": + st.markdown('
', unsafe_allow_html=True) + st.markdown('
Summary History
', unsafe_allow_html=True) + st.markdown('
View, filter, and download previously processed papers
', unsafe_allow_html=True) + + try: + history = get_history() or [] + except Exception: + history = [] + + if history: + metric_col1, metric_col2, metric_col3 = st.columns(3) + with metric_col1: + st.metric("Total Papers", len(history)) + with metric_col2: + today = datetime.now().date() + today_count = sum( + 1 for item in history + if to_date(item.get("timestamp")) == today + ) + st.metric("Added Today", today_count) + with metric_col3: + week_ago = datetime.now() - timedelta(days=7) + week_count = sum( + 1 for item in history + if parse_iso(item.get("timestamp")) > week_ago + ) + st.metric("Last 7 Days", week_count) + st.markdown("---") + + col_filter, col_sort, col_action = st.columns([3, 2, 1]) + with col_filter: + search_filter = st.text_input("Filter by title or keyword", placeholder="Type to filter...") + with col_sort: + sort_order = st.selectbox("Sort by", ["Newest First", "Oldest First", "Title A-Z"]) + with col_action: + if st.button("Clear All", type="secondary"): + if st.session_state.get("confirm_clear", False): + try: + clear_history() + st.success("History cleared!") + st.session_state.confirm_clear = False + st.rerun() + except Exception as e: + st.error(f"Failed to clear history: {e}") + else: + st.session_state.confirm_clear = True + st.warning("Click again to confirm") + + st.markdown("---") + + if not history: + st.info("No papers in history yet. Start by searching or uploading papers!") + else: + filtered = history + if search_filter and search_filter.strip(): + q = search_filter.lower() + filtered = [ + item for item in filtered + if q in (item.get('title', '').lower() + + item.get('abstract', '').lower() + + item.get('summary', '').lower()) + ] + + if sort_order == "Newest First": + filtered = sorted(filtered, key=lambda x: x.get('timestamp', ''), reverse=True) + elif sort_order == "Oldest First": + filtered = sorted(filtered, key=lambda x: x.get('timestamp', '')) + elif sort_order == "Title A-Z": + filtered = sorted(filtered, key=lambda x: x.get('title', '').lower()) + + st.caption(f"Showing {len(filtered)} paper(s)") + + for idx, item in enumerate(filtered): + with st.expander(f"{item.get('title', 'Untitled Paper')}", expanded=(idx == 0)): + top_col1, top_col2, top_col3 = st.columns([4, 1, 1]) + + with top_col1: + timestamp = item.get('timestamp', 'Unknown date') + if timestamp and timestamp != "Unknown date": + try: + ts = datetime.fromisoformat(timestamp) + timestamp = ts.strftime("%B %d, %Y at %I:%M %p") + except ValueError: + pass + st.caption(f"Added: {timestamp}") + st.caption(f"ID: `{item.get('id', 'N/A')}`") + + with top_col2: + summary_text = f"Title: {item.get('title', 'N/A')}\n\n" + summary_text += f"Abstract:\n{item.get('abstract', 'N/A')}\n\n" + summary_text += f"Summary:\n{item.get('summary', 'N/A')}\n\n" + txt_bytes = summary_text.encode("utf-8") + + st.download_button( + label="Download", + data=txt_bytes, + file_name=f"{safe_filename(item.get('id', 'paper'))}_summary.txt", + mime="text/plain", + key=f"history_download_{idx}", + help="Download summary as TXT" + ) + + with top_col3: + if st.button("Delete", key=f"delete_{idx}", help="Delete this paper", type="secondary"): + try: + if delete_entry(item.get('id')): + st.success("Paper deleted!") + st.rerun() + else: + st.error("Failed to delete paper") + except Exception as e: + st.error(f"Error deleting: {e}") + + st.markdown("---") + st.markdown("**Abstract:**") + st.markdown(f"> {item.get('abstract', 'No abstract available')}") + st.markdown("**Summary:**") + st.write(item.get('summary', 'No 
summary available')) + + paper_id = item.get('id', '') + if paper_id and ('arxiv' in paper_id.lower() or '/' in paper_id): + arxiv_id = paper_id.split('/')[-1] if '/' in paper_id else paper_id + st.markdown(f"[View on arXiv](https://arxiv.org/abs/{arxiv_id})") + + st.markdown("---") + st.markdown("**Additional Formats:**") + dl_col1, dl_col2, dl_col3 = st.columns([1, 1, 1]) + + md_content = f"# {item.get('title', '')}\n\n**Abstract:**\n\n{item.get('abstract','')}\n\n**Summary:**\n\n{item.get('summary','')}" + md_bytes = md_content.encode("utf-8") + + pdf_buf = generate_pdf_from_text( + title=item.get('title', 'History Entry'), + metadata={ + "id": item.get('id', ''), + "added_on": item.get('timestamp', '') + }, + body_text=item.get('summary', ''), + annotations=item.get('annotations', '') + ) + + with dl_col1: + st.download_button( + label="TXT", + data=txt_bytes, + file_name=f"{safe_filename(item.get('title', 'paper'))}.txt", + mime="text/plain", + key=f"history_txt_{idx}" + ) + with dl_col2: + st.download_button( + label="MD", + data=md_bytes, + file_name=f"{safe_filename(item.get('title', 'paper'))}.md", + mime="text/markdown", + key=f"history_md_{idx}" + ) + with dl_col3: + st.download_button( + label="PDF", + data=pdf_buf, + file_name=f"{safe_filename(item.get('title', 'paper'))}.pdf", + mime="application/pdf", + key=f"history_pdf_{idx}" + ) + + st.markdown('
', unsafe_allow_html=True)
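
For context, the patch above migrates the app's fpdf2 calls off deprecated keyword arguments: txt= becomes text=, ln=True becomes new_x="LMARGIN", new_y="NEXT", add_font(..., uni=True) drops uni, and output(dest="S") becomes output(). A minimal standalone sketch of the new-style calls, assuming fpdf2 >= 2.7; the demo.pdf output path is hypothetical:

    from fpdf import FPDF

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", "", 16)

    # fpdf2 >= 2.7: text= replaces the deprecated txt=, and the string values
    # for new_x/new_y are coerced to the XPos/YPos enums, replacing the old
    # ln=True line-break flag.
    pdf.cell(0, 10, text="PaperScope Summary", new_x="LMARGIN", new_y="NEXT", align="C")

    pdf.set_font("Helvetica", "", 12)
    pdf.multi_cell(0, 8, "Body text goes here.")

    # output() with no arguments returns the document as a bytearray,
    # replacing the old output(dest="S") string API.
    with open("demo.pdf", "wb") as fh:
        fh.write(bytes(pdf.output()))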