diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md index 8f30ddc..f72151e 100644 --- a/docs/readme/indexer-skills.md +++ b/docs/readme/indexer-skills.md @@ -27,6 +27,19 @@ This document describes all available skills that can be used in the indexer pip 3. An `embedding` to generate embeddings from the chunks. 4. A `vector-store` to store the embeddings. +4. You have FAQ documents exported from Confluence (`.docx` files) and want to extract Q&A pairs for vectorization? You'll typically need: + + 1. An `exporter` (Scroll Word) or `file-scanner` to get the `.docx` files. + 2. A `confluence-faq-splitter` to extract Q&A pairs directly from the `.docx` headings. + 3. An `embedding` to generate embeddings from the Q&A chunks. + 4. A `vector-store` to store the embeddings. + +5. You have enriched Q&A JSON output from a Teams FAQ pipeline and want to index it? You'll typically need: + + 1. A `teams-qna-loader` to load the enriched Q&A pairs from the JSON file. + 2. An `embedding` to generate embeddings from the Q&A content. + 3. A `vector-store` to store the embeddings. + # Available Skills @@ -103,7 +116,7 @@ Supported file extensions:
Web loaders -Load data from web. +Load data from web or structured files. ### Jira Loader Loads data from Jira issues @@ -119,6 +132,18 @@ Loads data from Jira issues - JSTAD-XYZ - JIRA-1234 ``` + +### Teams Q&A Loader +Loads enriched Q&A pairs from a JSON file produced by the FAQ enrichment pipeline. Each Q&A pair becomes a single document with one chunk. The skill prefers rephrased questions/answers when available, falling back to originals. + +```yaml +- skill: &TeamsQnALoader + type: loader + name: teams-qna-loader + params: + file_path: data/processed_output/enriched_qna.json # Required: path to enriched Q&A JSON file + tag: teams-faq # Optional: tag for chunks (default: "enriched-qna") +```
@@ -151,6 +176,29 @@ Splits text by grouping semantically equivalent chunks together. A bit more adva api_version: your-api-version deployment_name: your-deployment-name ``` + +### Confluence FAQ Splitter +Extracts Q&A pairs directly from FAQ `.docx` files exported from Confluence. Each heading that contains a `?` or starts with a problem/question pattern (e.g. "How do I", "I cannot") is treated as a question, and the body content below it becomes the answer. Each Q&A pair is produced as a single atomic chunk. No `file-reader` is needed — this skill reads `.docx` files directly via `python-docx`. + +All parameters are optional with sensible defaults. + +```yaml +- skill: &ConfluenceFAQSplitter + type: splitter + name: confluence-faq-splitter + params: + min_heading_level: 2 # Minimum heading level for questions (default: 2) + max_heading_level: 6 # Maximum heading level for questions (default: 6) + skip_headings: # Heading titles to skip (default: ['summary']) + - summary + skip_patterns: # Text patterns to skip in answer content (default: ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ']) + - CONFIDENTIAL + question_patterns: # Prefixes that indicate a question (default: ['i am ', 'i cannot ', 'how do i ', 'what is ', ...]) + - "how do i " + - "i cannot " + stop_sections: # Regex patterns for sections that end Q&A extraction (default: ['related articles', 'see also']) + - "^\\s*related\\s*articles?\\s*$" +```
Embedding diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml index 47a500a..a177b6c 100644 --- a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml +++ b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml @@ -105,6 +105,37 @@ definitions: type: integer required: False min: 0 + # ConfluenceFAQSplitter params + min_heading_level: + type: integer + required: False + min: 1 + max: 9 + max_heading_level: + type: integer + required: False + min: 1 + max: 9 + skip_patterns: + type: list + required: False + schema: + type: string + skip_headings: + type: list + required: False + schema: + type: string + question_patterns: + type: list + required: False + schema: + type: string + stop_sections: + type: list + required: False + schema: + type: string mode: type: string required: False @@ -162,6 +193,9 @@ definitions: path: type: string required: False + file_path: + type: string + required: False embedding_model: type: dict schema: diff --git a/src/docs2vecs/subcommands/indexer/skills/__init__.py b/src/docs2vecs/subcommands/indexer/skills/__init__.py index 2859e59..7cbd07a 100644 --- a/src/docs2vecs/subcommands/indexer/skills/__init__.py +++ b/src/docs2vecs/subcommands/indexer/skills/__init__.py @@ -13,6 +13,8 @@ from .llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill from .local_document_parser import LocalDocumentParser from .faiss_vector_store_skill import FaissVectorStoreSkill +from .teams_qna_loader_skill import TeamsQnALoaderSkill +from .confluence_faq_splitter_skill import ConfluenceFAQSplitter __all__ = [ @@ -31,4 +33,6 @@ "LlamaFastembedEmbeddingSkill", "LocalDocumentParser", "FaissVectorStoreSkill", + "TeamsQnALoaderSkill", + "ConfluenceFAQSplitter", ] diff --git a/src/docs2vecs/subcommands/indexer/skills/confluence_faq_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/confluence_faq_splitter_skill.py new file mode 100644 index 
class ConfluenceFAQSplitter(IndexerSkill):
    """Split Confluence FAQ ``.docx`` exports into one chunk per Q&A pair.

    The skill opens each input document's ``.docx`` file directly with
    python-docx and walks its body in document order. A heading whose level is
    inside the configured range and that either contains ``?`` or starts with
    one of the configured prefixes opens a new question; every following
    paragraph/table is appended to that question's answer until the next
    question, a "stop" heading (e.g. *Related articles*) or a table of
    contents is reached.

    Features:
    - Ignores Table of Contents entries and the body of skipped sections
      (e.g. *Summary*)
    - Collects hyperlinks found in the answer and lists the non-redundant
      ones in a trailing "References" section
    - Preserves table content (including nested tables) in answers
    - Emits each Q&A pair as a single atomic chunk

    Configuration parameters (all optional):
    - min_heading_level: Minimum heading level for questions (default: 2)
    - max_heading_level: Maximum heading level for questions (default: 6)
    - skip_patterns: Answer lines that are dropped when they match one of
      these strings exactly, case-insensitively
      (default: ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ'])
    - skip_headings: Heading titles that are never questions and whose body
      is ignored (default: ['summary'])
    - question_patterns: Lower-cased prefixes that mark a heading as a
      question/problem statement (default: ['i am ', 'i cannot ', ...])
    - stop_sections: Regex patterns (matched case-insensitively) for headings
      that close the current Q&A pair (default: related articles,
      related resources, see also)

    NOTE(review): ``self._config`` and ``self.logger`` are not defined in this
    class; they are presumably provided by ``IndexerSkill`` - confirm.
    """

    # Default configuration values
    DEFAULT_MIN_HEADING_LEVEL = 2
    DEFAULT_MAX_HEADING_LEVEL = 6
    DEFAULT_SKIP_PATTERNS = ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ']
    DEFAULT_SKIP_HEADINGS = ['summary']
    DEFAULT_QUESTION_PATTERNS = [
        'i am ', 'i cannot ', "i can't ", 'i see ',
        'i have ', 'i need ', 'my ', 'when i ',
        'how do i ', 'how can i ', 'what is ', 'what are ',
        'why does ', 'why is ', 'where is ', 'where can '
    ]
    DEFAULT_STOP_SECTIONS = [
        r'^\s*related\s*articles?\s*$',
        r'^\s*related\s*resources?\s*$',
        r'^\s*see\s*also\s*$'
    ]

    # Normalized, lower-cased paragraph texts that open a table of contents.
    _TOC_TITLES = ('table of contents', 'contents')
    # Paragraph style names such as "Heading 2" / "Heading2".
    _HEADING_STYLE_RE = re.compile(r'Heading\s*(\d+)$')

    def __init__(self, config: dict, global_config: Config):
        """Read the optional parameters and pre-compile the regexes used while parsing.

        Args:
            config: Skill configuration, forwarded to ``IndexerSkill``.
            global_config: Global indexer configuration, forwarded to ``IndexerSkill``.
        """
        super().__init__(config, global_config)

        # Load configurable parameters with defaults
        self.min_heading_level = self._config.get('min_heading_level', self.DEFAULT_MIN_HEADING_LEVEL)
        self.max_heading_level = self._config.get('max_heading_level', self.DEFAULT_MAX_HEADING_LEVEL)
        # Copy so the instance never aliases (and can never mutate) the class-level default list.
        self.skip_patterns = list(self._config.get('skip_patterns', self.DEFAULT_SKIP_PATTERNS))
        self.skip_headings = [h.lower() for h in self._config.get('skip_headings', self.DEFAULT_SKIP_HEADINGS)]
        self.question_patterns = [p.lower() for p in self._config.get('question_patterns', self.DEFAULT_QUESTION_PATTERNS)]

        # Compile stop section regexes
        stop_sections = self._config.get('stop_sections', self.DEFAULT_STOP_SECTIONS)
        self.related_res = [re.compile(p, re.I) for p in stop_sections]

        # "Some title ..... 12" style TOC entries
        self.dot_leader_re = re.compile(r"\.{2,}\s*\d{1,4}\s*$")
        # Trailing page number preceded by whitespace
        self.page_number_trail_re = re.compile(r"\s\d{1,4}\s*$")
        # Regex to match markdown-style links: [Link](URL)
        self.markdown_link_re = re.compile(r'\[Link\]\([^\)]+\)')

        self.logger.debug(f"ConfluenceFAQSplitter config: heading_levels={self.min_heading_level}-{self.max_heading_level}, "
                          f"skip_patterns={len(self.skip_patterns)}, question_patterns={len(self.question_patterns)}")

    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
        """Add one chunk per extracted Q&A pair to every ``.docx`` document in ``input``.

        Documents whose filename does not end in ``.docx`` are skipped with a
        warning. A failure while processing one document is logged and does not
        stop the remaining documents (deliberate best-effort).

        Args:
            input: Documents whose ``filename`` points at the file to parse.

        Returns:
            The same ``input`` list (documents are modified in place), or an
            empty list when no input was given.
        """
        self.logger.info("Running ConfluenceFAQSplitter...")

        if not input:
            self.logger.error("No documents provided in input")
            return []

        for doc in input:
            self.logger.debug(f"Processing document: {doc.filename}")

            # Only DOCX files can be parsed by this skill
            if not str(doc.filename).lower().endswith('.docx'):
                self.logger.warning(f"Skipping non-DOCX file: {doc.filename}")
                continue

            try:
                qa_pairs = self._extract_qa_from_docx(doc.filename)
                self.logger.info(f"Extracted {len(qa_pairs)} Q&A pairs from {doc.filename}")

                for idx, qa_data in enumerate(qa_pairs, 1):
                    question = qa_data['question']
                    answer = qa_data['answer']

                    if not question.strip() or not answer.strip():
                        self.logger.debug(f"Skipping Q&A pair {idx} - missing question or answer")
                        continue

                    # Question, answer and references form a single chunk
                    combined_text = self._format_chunk_text(question, answer, qa_data.get('links', []))

                    chunk = Chunk()
                    # The id is derived from the chunk content itself
                    chunk.document_id = hashlib.sha256(combined_text.encode()).hexdigest()
                    chunk.document_name = Path(doc.filename).name
                    chunk.tag = doc.tag
                    chunk.content = combined_text  # Full Q&A for retrieval
                    chunk.chunk_id = f"{chunk.document_id}_{idx}"
                    chunk.source_link = doc.source_url or ""

                    doc.add_chunk(chunk)

                self.logger.debug(f"Split {doc.filename} into {len(doc.chunks)} Q&A chunks")

            except Exception as e:
                # Per-document boundary: log with traceback and keep going.
                self.logger.error(f"Error processing {doc.filename}: {e}", exc_info=True)
                continue

        return input

    def _format_chunk_text(self, question: str, answer: str, links: List[Dict[str, str]]) -> str:
        """Render a Q&A pair, plus its non-redundant links, as the chunk text."""
        # Links whose visible text is just the URL add nothing to the References section.
        meaningful_links = [
            link for link in links
            if not self._is_link_text_redundant(link['text'], link['url'])
        ]

        links_text = ""
        if meaningful_links:
            links_list = [f"- {link['text']}: {link['url']}" for link in meaningful_links]
            links_text = "\n\nReferences (hyperlinks from the answer):\n" + "\n".join(links_list)

        return f"Q: {question}\n\nA: {answer}{links_text}"

    def _extract_qa_from_docx(self, docx_path: str) -> List[dict]:
        """Extract Q&A pairs from a DOCX file.

        Args:
            docx_path: Path of the ``.docx`` file to open with python-docx.

        Returns:
            List of dicts with the keys ``'question'`` (str), ``'answer'`` (str)
            and ``'links'`` (list of ``{'text', 'url'}`` dicts), in document order.
        """
        doc = DocxDocument(docx_path)
        qa: List[dict] = []

        current_q = None
        current_ans: List[str] = []
        current_links: List[Dict[str, str]] = []
        in_toc = False
        in_summary = False
        # Hoisted out of the loop: banner lines are compared case-insensitively.
        skip_upper = {pat.upper() for pat in self.skip_patterns}

        def flush() -> None:
            """Store the open Q&A pair (if any) and reset the accumulators."""
            nonlocal current_q, current_ans, current_links
            if current_q is not None:
                qa.append({
                    "question": current_q,
                    "answer": "\n".join(current_ans).strip(),
                    "links": current_links,
                })
            current_q, current_ans, current_links = None, [], []

        for blk in self._iter_block_items(doc):
            is_par = isinstance(blk, Paragraph)
            raw_text = self._block_text(blk)
            text = self._norm(raw_text)
            lvl = self._heading_level(blk) if is_par else None
            title = text.lower() if is_par else ""

            # ---------- TOC detection ----------
            if is_par and title in self._TOC_TITLES:
                in_toc = True
                # Finalize any open Q before entering TOC
                flush()
                continue

            # End TOC at next major heading (H1 or H2)
            if in_toc and is_par and lvl is not None and lvl <= 2:
                in_toc = False

            # Skip TOC-styled paragraphs and dotted leader lines
            if in_toc and is_par:
                if self._style_name(blk).lower().startswith('toc') or self._is_toc_line(text):
                    continue

            # ---------- Skip-heading regions (e.g. Summary) ----------
            if is_par and lvl is not None and title in self.skip_headings:
                in_summary = True
                continue

            # Any other heading ends the skipped region
            if in_summary and is_par and lvl is not None:
                in_summary = False

            # ---------- Stop at Related articles ----------
            if current_q is not None and self._is_related_heading(blk):
                flush()
                continue

            # ---------- New question ----------
            if not in_toc and not in_summary and self._is_question_block(blk):
                flush()
                current_q = text
                continue

            # ---------- Accumulate answer ----------
            # Content below a skipped heading must not leak into the answer of
            # the question that preceded it, so nothing is accumulated there.
            if current_q is None or in_summary:
                continue
            # Skip banners/boilerplate using configurable patterns
            if text.upper() in skip_upper:
                continue
            if text:
                # Remove markdown-style links [Link](URL)
                cleaned_text = self._remove_markdown_links(raw_text)
                if cleaned_text:  # Only add if there's content left after cleaning
                    current_ans.append(cleaned_text)
                current_links.extend(self._extract_hyperlinks_from_block(blk))

        # Finalize last Q
        flush()

        return qa

    # ---------- Helper methods ----------

    def _iter_block_items(self, parent):
        """Yield paragraphs and tables in document order."""
        body = parent.element.body
        for child in body.iterchildren():
            if child.tag == qn('w:p'):
                yield Paragraph(child, parent)
            elif child.tag == qn('w:tbl'):
                yield Table(child, parent)

    def _iter_unique_cells(self, table):
        """Yield each distinct cell of ``table`` once, in row order.

        python-docx reports a merged cell once per grid position it spans, so
        iterating ``row.cells`` naively would repeat its text and links.
        """
        seen = set()
        for row in table.rows:
            for cell in row.cells:
                tc = cell._tc  # underlying <w:tc> element identifies the cell
                if tc in seen:
                    continue
                seen.add(tc)
                yield cell

    def _extract_table_text(self, table) -> str:
        """Recursively extract text from a table, including nested tables."""
        parts = []
        for cell in self._iter_unique_cells(table):
            parts.extend(p.text or '' for p in cell.paragraphs)
            for nested_table in cell.tables:
                nested_text = self._extract_table_text(nested_table)
                if nested_text:
                    parts.append(nested_text)

        # Strip every entry and drop leading / consecutive blank entries.
        out = []
        for t in (x.strip() for x in parts):
            if t == '' and (not out or out[-1] == ''):
                continue
            out.append(t)
        return "\n".join(out).strip()

    def _block_text(self, block) -> str:
        """Get text from a paragraph or table ('' for any other block type)."""
        if isinstance(block, Paragraph):
            return block.text or ''
        if isinstance(block, Table):
            return self._extract_table_text(block)
        return ''

    def _style_name(self, par: Paragraph) -> str:
        """Return the paragraph's style name, or '' when it cannot be read."""
        try:
            return (par.style.name or '').strip()
        except Exception:
            return ''

    def _heading_level(self, par: Paragraph) -> Optional[int]:
        """Return N for a paragraph styled "Heading N", otherwise None."""
        if not isinstance(par, Paragraph):
            return None
        m = self._HEADING_STYLE_RE.match(self._style_name(par))
        return int(m.group(1)) if m else None

    def _norm(self, s: str) -> str:
        """Normalize text: non-breaking spaces become spaces, whitespace runs collapse.

        Every run of whitespace - including a single tab or soft line break -
        becomes one space, so headings compare reliably against the configured
        prefixes and titles.
        """
        s = (s or '').replace('\u00A0', ' ')
        return re.sub(r'\s+', ' ', s.strip())

    def _remove_markdown_links(self, s: str) -> str:
        """Remove markdown-style links in the form [Link](URL) from text."""
        return self.markdown_link_re.sub('', s).strip()

    def _is_link_text_redundant(self, text: str, url: str) -> bool:
        """Check whether a hyperlink's visible text is just its URL.

        Both values are compared after lower-casing and removing surrounding
        whitespace, accidental trailing punctuation, the ``http(s)://`` scheme,
        a leading ``www.`` and a trailing ``/``. Example of a link that is
        filtered out of the References section:

            - https://example.com: https://example.com/

        Returns:
            True if the link text is redundant and should be excluded.
        """
        def canonical(value: str) -> str:
            value = (value or '').strip().lower()
            # Strip common trailing characters that might be added accidentally
            value = value.rstrip(').,;: ')
            value = re.sub(r'^https?://', '', value)
            value = re.sub(r'^www\.', '', value)
            return value.rstrip('/')

        return canonical(text) == canonical(url)

    def _is_related_heading(self, block) -> bool:
        """Return True when the block's text matches one of the stop-section regexes."""
        text = self._norm(self._block_text(block))
        return any(rx.match(text) for rx in self.related_res)

    def _is_toc_line(self, text: str) -> bool:
        """Return True for lines that look like TOC entries (dot leaders / page numbers)."""
        t = self._norm(text)
        if not t:
            return False
        return bool(self.dot_leader_re.search(t) or
                    (self.page_number_trail_re.search(t) and t.count('.') >= 3))

    def _is_question_block(self, block) -> bool:
        """Decide whether a block is a question heading.

        A block qualifies when it is a non-empty paragraph that is not listed
        in ``skip_headings``, has a heading level inside
        ``[min_heading_level, max_heading_level]`` and either contains '?' or
        starts with one of the configured question prefixes.
        """
        if not isinstance(block, Paragraph):
            return False
        t = self._norm(block.text)
        if not t:
            return False

        t_lower = t.lower()
        # Skip headings that match skip_headings list (e.g., 'Summary')
        if t_lower in self.skip_headings:
            return False

        lvl = self._heading_level(block)
        if lvl is None or lvl < self.min_heading_level or lvl > self.max_heading_level:
            return False

        # Accept if it has a question mark
        if '?' in t:
            return True

        # Accept configurable problem/question statement patterns
        return any(t_lower.startswith(pat) for pat in self.question_patterns)

    def _extract_hyperlinks_from_paragraph(self, paragraph) -> List[Dict[str, str]]:
        """Extract hyperlinks from a paragraph as ``{'text', 'url'}`` dicts."""
        links: List[Dict[str, str]] = []
        if not isinstance(paragraph, Paragraph):
            return links

        try:
            rels = paragraph.part.rels
        except Exception:
            # Without the relationship table no external target can be resolved.
            return links

        for hyperlink in paragraph._element.xpath('.//w:hyperlink'):
            text = ''.join(t_el.text for t_el in hyperlink.xpath('.//w:t') if t_el.text)
            if not text:
                continue

            # Try to get external URL via relationship ID
            r_id = hyperlink.get(qn('r:id'))
            if r_id and r_id in rels:
                url = rels[r_id].target_ref
                if url:
                    links.append({'text': text, 'url': url})
                    continue

            # Try to get internal anchor (bookmark link)
            # Skip scroll-bookmark internal links (e.g., #scroll-bookmark-17)
            anchor = hyperlink.get(qn('w:anchor'))
            if anchor and not anchor.startswith('scroll-bookmark'):
                links.append({'text': text, 'url': f'#{anchor}'})

        return links

    def _extract_hyperlinks_from_table(self, table) -> List[Dict[str, str]]:
        """Recursively extract hyperlinks from a table, including nested tables."""
        links: List[Dict[str, str]] = []
        for cell in self._iter_unique_cells(table):
            for para in cell.paragraphs:
                links.extend(self._extract_hyperlinks_from_paragraph(para))
            for nested_table in cell.tables:
                links.extend(self._extract_hyperlinks_from_table(nested_table))
        return links

    def _extract_hyperlinks_from_block(self, block) -> List[Dict[str, str]]:
        """Extract hyperlinks from a block (paragraph or table).

        Returns:
            List of dicts with 'text' and 'url' keys; empty for other block types.
        """
        if isinstance(block, Paragraph):
            return self._extract_hyperlinks_from_paragraph(block)
        if isinstance(block, Table):
            return self._extract_hyperlinks_from_table(block)
        return []
class TeamsQnALoaderSkill(IndexerSkill):
    """A skill that loads enriched Q&A pairs from the FAQ pipeline JSON output.

    The JSON file must be an array of Q&A objects. The fields read by this
    skill are:
    - thread_id: Identifier used (after sanitizing) as the document id;
      ``qna_<index>`` is used when it is missing
    - rephrased_question / question: Question text, rephrased preferred
    - rephrased_answer / answers: Answer text, rephrased preferred; otherwise
      the first entry of ``answers`` flagged ``is_expert``, else the first entry
      (each entry's text is read from its ``answer`` key)
    - topic: Used for the chunk's document name (default: "General")
    - source_link: Link stored on the chunk (default: "")

    Each usable Q&A pair becomes one ``Document`` holding exactly one chunk.

    Configuration parameters:
    - file_path (str): Path to the enriched Q&A JSON file (required)
    - tag (str): Tag assigned to every chunk (default: "enriched-qna")

    NOTE(review): ``self._config`` and ``self.logger`` are not defined in this
    class; they are presumably provided by ``IndexerSkill`` - confirm.
    """

    def __init__(self, skill_config: dict, global_config: Config) -> None:
        """Resolve the configured JSON path and the chunk tag.

        Raises:
            KeyError: If ``file_path`` is missing from the skill configuration.
        """
        super().__init__(skill_config, global_config)
        self._file_path = Path(self._config["file_path"]).expanduser().resolve()
        self.tag = self._config.get("tag", "enriched-qna")

    def run(self, documents: Optional[List[Document]] = None) -> List[Document]:
        """Load enriched Q&A pairs from the JSON file and create Documents with chunks.

        Args:
            documents: Not used by this skill (loader skill).

        Returns:
            One Document per usable Q&A pair, each with a single chunk. Entries
            that are not JSON objects, or that lack a non-blank question or
            answer, are skipped.

        Raises:
            FileNotFoundError: If the configured file does not exist.
            ValueError: If the top-level JSON value is not an array.
        """
        self.logger.info(f"Running TeamsQnALoaderSkill on {self._file_path}...")

        if not self._file_path.exists():
            raise FileNotFoundError(f"Enriched Q&A JSON file not found: {self._file_path}")

        # Load JSON file
        with self._file_path.open('r', encoding='utf-8') as f:
            qna_list = json.load(f)

        # Validate the shape first so that e.g. `{}` or `null` is reported as
        # malformed input instead of being mistaken for an empty result set.
        if not isinstance(qna_list, list):
            raise ValueError(f"Expected JSON array of Q&A objects, got {type(qna_list).__name__}")

        if not qna_list:
            self.logger.warning(f"No Q&A pairs found in JSON file: {self._file_path}")
            return []

        result = []

        # Process each enriched Q&A pair
        for idx, qna in enumerate(qna_list):
            if not isinstance(qna, dict):
                self.logger.warning(f"Skipping Q&A entry {idx} - expected an object, got {type(qna).__name__}")
                continue

            # Prefer the rephrased texts; fall back to the originals when they
            # are missing, null or blank.
            question = self._first_text(qna.get("rephrased_question"), qna.get("question"))
            answer = self._first_text(qna.get("rephrased_answer"), self._get_best_answer(qna))

            # Skip if no meaningful content
            if not question or not answer:
                self.logger.debug(f"Skipping Q&A pair {idx} - missing question or answer")
                continue

            # Build content with both question and answer
            topic = qna.get("topic") or "General"
            content = f"Q: {question}\n\nA: {answer}"

            # Generate document ID from thread_id or index
            thread_id = qna.get("thread_id") or f"qna_{idx}"
            document_id = self._sanitize_id(thread_id)

            # Use source_link from the Q&A pair if available
            source_url = self._first_text(qna.get("source_link")).strip()

            # Create a Document object
            doc = Document(filename=str(self._file_path))

            # Create a Chunk object from the Q&A pair
            chunk = Chunk()
            chunk.document_id = document_id
            chunk.document_name = f"{topic} - FAQ"
            chunk.tag = self.tag
            chunk.content = content
            chunk.chunk_id = f"{document_id}_chunk_0"
            chunk.source_link = source_url

            # Add chunk to document
            doc.add_chunk(chunk)
            result.append(doc)

            self.logger.debug(f"Loaded Q&A: {document_id} | Topic: {topic}")

        self.logger.info(f"Successfully loaded {len(result)} enriched Q&A pairs from JSON")

        return result

    @staticmethod
    def _first_text(*candidates: object) -> str:
        """Return the first candidate that is a non-blank string, unchanged, else ''."""
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                return candidate
        return ""

    def _get_best_answer(self, qna: dict) -> str:
        """Get the best answer from the answers array, preferring expert answers.

        Returns '' when ``answers`` is missing, empty, not a list, contains no
        objects, or the selected entry has no string ``answer``.
        """
        answers = qna.get("answers") or []
        if not isinstance(answers, list):
            return ""

        candidates = [a for a in answers if isinstance(a, dict)]
        if not candidates:
            return ""

        # Prefer expert answers, otherwise fall back to the first answer
        expert_answers = [a for a in candidates if a.get("is_expert", False)]
        chosen = expert_answers[0] if expert_answers else candidates[0]

        answer = chosen.get("answer")
        return answer if isinstance(answer, str) else ""

    def _sanitize_id(self, thread_id: object) -> str:
        """Sanitize thread_id to be a valid document ID.

        Every character outside ``[a-zA-Z0-9_-]`` is replaced with ``_`` and the
        result is truncated to 128 characters.
        """
        # Local import: this module does not import `re` at file level.
        import re
        sanitized = re.sub(r'[^a-zA-Z0-9_-]', '_', str(thread_id))
        return sanitized[:128]  # Limit length