ResearchNest/pdf_question_extractor.py at main · Devlinx-s/ResearchNest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
import fitz  # PyMuPDF
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime

@dataclass
class ExtractedQuestion:
    """One numbered question pulled out of a PDF page.

    Only ``marks``, ``has_formula``, ``has_diagram`` and ``metadata`` have
    defaults; the first five fields must be supplied by the caller.
    """

    # Identification and content.
    question_number: str   # digits captured from the "N." prefix, kept as text
    question_text: str     # question body with continuation lines joined by spaces
    page_number: int       # page the question was found on (extractor passes 1-based)
    section: str           # section title active when the question was read ("" if none)
    question_type: str     # label produced by the extractor's type heuristic

    # Optional details.
    marks: int = 1                              # falls back to 1 when no "(N marks)" is present
    has_formula: bool = False                   # True when formula indicators were spotted
    has_diagram: bool = False                   # True when diagram-related words were spotted
    metadata: Optional[Dict[str, Any]] = None   # free-form extras; not set by code in this file

class PDFQuestionExtractor:
    """Extract numbered questions from a PDF using PyMuPDF text extraction.

    The extractor walks every page, splits the page text into non-empty
    lines, and groups them into questions. A question starts on a line that
    begins with ``<digits>.``; the following lines belong to it until the
    next question start or section header. Section headers of the form
    ``Section X: <title>`` are tracked line by line, so every question is
    tagged with the section that was active where the question begins.

    The instance owns the opened document. Call :meth:`close` (or use the
    instance as a context manager) to release it deterministically;
    ``__del__`` only acts as a fallback.
    """

    # "<digits>." at the start of a line, NOT followed by another digit.
    # The same pattern is used both to start a question and to end the
    # previous one, so "3." alone on a line and "3.What" both start
    # question 3, while a wrapped decimal such as "2.5 kg" does not.
    _QUESTION_START_RE = re.compile(r'^(\d+)\.(?!\d)\s*(.*)')
    _SECTION_RE = re.compile(r'Section [A-Z]: ([^\n]+)')
    _MARKS_RE = re.compile(r'\((\d+)\s*marks?\)', re.IGNORECASE)
    _CHOICE_RE = re.compile(r'\b(a|b|c|d|e)\)')
    _DRAWING_RE = re.compile(r'draw|diagram|label')
    _DIAGRAM_MARKER_RE = re.compile(r'diagram|figure|draw|label', re.IGNORECASE)
    _FORMULA_INDICATORS = ('=', '^', '_', '\\frac', '\\sqrt', '\\sum')

    def __init__(self, pdf_path: str):
        """Open the PDF at ``pdf_path`` and start with no section and no questions."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self.current_section = ""
        self.questions: List[ExtractedQuestion] = []

    def __enter__(self):
        """Return the extractor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the document when the ``with`` block ends; never suppresses errors."""
        self.close()

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        # ``doc`` is missing when ``__init__`` failed before opening the file.
        doc = getattr(self, 'doc', None)
        # Skip documents that are already closed so a second call is a no-op.
        if doc is None or getattr(doc, 'is_closed', False):
            return
        doc.close()

    def extract_questions(self) -> "List[ExtractedQuestion]":
        """Extract all questions from the PDF.

        Results are rebuilt from scratch on every call, so calling this
        method twice does not duplicate questions.

        Returns:
            The list stored in ``self.questions``, in document order.
        """
        print(f"Extracting questions from: {os.path.basename(self.pdf_path)}")

        # Start from a clean state; rebinding (instead of clearing) leaves a
        # list returned by an earlier call untouched.
        self.questions = []
        self.current_section = ""

        for page_number, page in enumerate(self.doc, start=1):
            self._extract_questions_from_page(page.get_text(), page_number)

        return self.questions

    def _update_section(self, text: str) -> None:
        """Set ``current_section`` from the first ``Section X: <title>`` in ``text``.

        Leaves ``current_section`` unchanged when ``text`` has no such header.
        """
        section_match = self._SECTION_RE.search(text)
        if section_match:
            self.current_section = section_match.group(1).strip()

    def _extract_questions_from_page(self, text: str, page_num: int) -> None:
        """Append every question found in one page's text to ``self.questions``.

        Args:
            text: Plain text of the page.
            page_num: Page number stored on each extracted question.
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        for number, body, section in self._group_question_lines(lines):
            self.questions.append(ExtractedQuestion(
                question_number=number,
                question_text=body,
                page_number=page_num,
                section=section,
                question_type=self._determine_question_type(body),
                marks=self._extract_marks(body),
                has_formula=self._contains_formula(body),
                has_diagram=self._contains_diagram_marker(body),
            ))

    def _group_question_lines(self, lines: List[str]) -> List[tuple]:
        """Group stripped, non-empty lines into questions.

        Section headers met along the way update ``current_section`` and
        end the question in progress, so header text never leaks into a
        question and each question carries the section in force where it
        starts. Lines that precede the first question, or that follow a
        header, are ignored until the next question start.

        Args:
            lines: Stripped, non-empty lines of a page in reading order.

        Returns:
            ``(question_number, question_text, section)`` tuples for every
            question whose joined text is not empty.
        """
        collected = []   # [number, text parts, section] per question start
        current = None   # the question currently receiving continuation lines

        for line in lines:
            start = self._QUESTION_START_RE.match(line)
            if start:
                current = [start.group(1), [start.group(2)], self.current_section]
                collected.append(current)
            elif self._SECTION_RE.match(line):
                # Only a header at the start of a line counts; a mid-sentence
                # mention of a section must stay part of the question text.
                self._update_section(line)
                current = None
            elif current is not None:
                current[1].append(line)

        grouped = []
        for number, parts, section in collected:
            body = ' '.join(parts).strip()
            if body:  # a bare number with no text is not a question
                grouped.append((number, body, section))
        return grouped

    def _determine_question_type(self, text: str) -> str:
        """Classify a question by keywords; the first matching rule wins.

        Rules are checked in this order: option markers such as ``a)``,
        drawing words, explain/describe, calculate/solve, and finally the
        "Short Answer" fallback.
        """
        text_lower = text.lower()

        if self._CHOICE_RE.search(text_lower):
            return "Multiple Choice"
        if self._DRAWING_RE.search(text_lower):
            return "Diagram-based"
        if 'explain' in text_lower or 'describe' in text_lower:
            return "Long Answer"
        if 'calculate' in text_lower or 'solve' in text_lower:
            return "Problem Solving"
        return "Short Answer"

    def _extract_marks(self, text: str) -> int:
        """Return the number in the first ``(N mark[s])`` of ``text``, or 1 if absent."""
        marks_match = self._MARKS_RE.search(text)
        if marks_match:
            return int(marks_match.group(1))
        return 1  # Default marks

    def _contains_formula(self, text: str) -> bool:
        """Return True when ``text`` contains any of the formula indicator strings."""
        return any(indicator in text for indicator in self._FORMULA_INDICATORS)

    def _contains_diagram_marker(self, text: str) -> bool:
        """Return True when ``text`` mentions diagram, figure, draw or label (any case)."""
        return bool(self._DIAGRAM_MARKER_RE.search(text))

    def __del__(self):
        """Fallback cleanup: close the document if it is still open."""
        self.close()

# Helper function to integrate with existing code
def extract_questions_from_pdf(pdf_path: str) -> List[Dict[str, Any]]:
    """Run the extractor on ``pdf_path`` and return plain dictionaries.

    Each dictionary carries the question's number, text, page, section,
    type, marks and the formula/diagram flags, in that key order. The
    ``metadata`` field of the extracted question is not exported.
    """
    exported_fields = (
        'question_number',
        'question_text',
        'page_number',
        'section',
        'question_type',
        'marks',
        'has_formula',
        'has_diagram',
    )

    extractor = PDFQuestionExtractor(pdf_path)

    # Dictionaries keep this helper compatible with code that expects
    # plain mappings rather than ExtractedQuestion objects.
    records = []
    for item in extractor.extract_questions():
        records.append({name: getattr(item, name) for name in exported_fields})
    return records