diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md
index 8f30ddc..f72151e 100644
--- a/docs/readme/indexer-skills.md
+++ b/docs/readme/indexer-skills.md
@@ -27,6 +27,19 @@ This document describes all available skills that can be used in the indexer pip
3. An `embedding` to generate embeddings from the chunks.
4. A `vector-store` to store the embeddings.
+4. You have FAQ documents exported from Confluence (`.docx` files) and want to extract Q&A pairs for vectorization? You'll typically need:
+
+ 1. An `exporter` (Scroll Word) or `file-scanner` to get the `.docx` files.
+ 2. A `confluence-faq-splitter` to extract Q&A pairs directly from the `.docx` headings.
+ 3. An `embedding` to generate embeddings from the Q&A chunks.
+ 4. A `vector-store` to store the embeddings.
+
+5. You have enriched Q&A JSON output from a Teams FAQ pipeline and want to index it? You'll typically need:
+
+ 1. A `teams-qna-loader` to load the enriched Q&A pairs from the JSON file.
+ 2. An `embedding` to generate embeddings from the Q&A content.
+ 3. A `vector-store` to store the embeddings.
+
# Available Skills
@@ -103,7 +116,7 @@ Supported file extensions:
Web loaders
-Load data from web.
+Load data from web or structured files.
### Jira Loader
Loads data from Jira issues
@@ -119,6 +132,18 @@ Loads data from Jira issues
- JSTAD-XYZ
- JIRA-1234
```
+
+### Teams Q&A Loader
+Loads enriched Q&A pairs from a JSON file produced by the FAQ enrichment pipeline. Each Q&A pair becomes a single document with one chunk. The skill prefers rephrased questions/answers when available, falling back to originals.
+
+```yaml
+- skill: &TeamsQnALoader
+ type: loader
+ name: teams-qna-loader
+ params:
+ file_path: data/processed_output/enriched_qna.json # Required: path to enriched Q&A JSON file
+ tag: teams-faq # Optional: tag for chunks (default: "enriched-qna")
+```
@@ -151,6 +176,29 @@ Splits text by grouping semantically equivalent chunks together. A bit more adva
api_version: your-api-version
deployment_name: your-deployment-name
```
+
+### Confluence FAQ Splitter
+Extracts Q&A pairs directly from FAQ `.docx` files exported from Confluence. Each heading that contains a `?` or starts with a problem/question pattern (e.g. "How do I", "I cannot") is treated as a question, and the body content below it becomes the answer. Each Q&A pair is produced as a single atomic chunk. No `file-reader` is needed — this skill reads `.docx` files directly via `python-docx`.
+
+All parameters are optional with sensible defaults.
+
+```yaml
+- skill: &ConfluenceFAQSplitter
+ type: splitter
+ name: confluence-faq-splitter
+ params:
+ min_heading_level: 2 # Minimum heading level for questions (default: 2)
+ max_heading_level: 6 # Maximum heading level for questions (default: 6)
+ skip_headings: # Heading titles to skip (default: ['summary'])
+ - summary
+ skip_patterns: # Text patterns to skip in answer content (default: ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ'])
+ - CONFIDENTIAL
+ question_patterns: # Prefixes that indicate a question (default: ['i am ', 'i cannot ', 'how do i ', 'what is ', ...])
+ - "how do i "
+ - "i cannot "
+ stop_sections: # Regex patterns (matched case-insensitively) for sections that end Q&A extraction (default: patterns for 'related articles', 'related resources' and 'see also')
+ - "^\\s*related\\s*articles?\\s*$"
+```
Embedding
diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
index 47a500a..a177b6c 100644
--- a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
+++ b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -105,6 +105,37 @@ definitions:
type: integer
required: False
min: 0
+ # ConfluenceFAQSplitter params
+ min_heading_level:
+ type: integer
+ required: False
+ min: 1
+ max: 9
+ max_heading_level:
+ type: integer
+ required: False
+ min: 1
+ max: 9
+ skip_patterns:
+ type: list
+ required: False
+ schema:
+ type: string
+ skip_headings:
+ type: list
+ required: False
+ schema:
+ type: string
+ question_patterns:
+ type: list
+ required: False
+ schema:
+ type: string
+ stop_sections:
+ type: list
+ required: False
+ schema:
+ type: string
mode:
type: string
required: False
@@ -162,6 +193,9 @@ definitions:
path:
type: string
required: False
+ file_path:
+ type: string
+ required: False
embedding_model:
type: dict
schema:
diff --git a/src/docs2vecs/subcommands/indexer/skills/__init__.py b/src/docs2vecs/subcommands/indexer/skills/__init__.py
index 2859e59..7cbd07a 100644
--- a/src/docs2vecs/subcommands/indexer/skills/__init__.py
+++ b/src/docs2vecs/subcommands/indexer/skills/__init__.py
@@ -13,6 +13,8 @@
from .llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from .local_document_parser import LocalDocumentParser
from .faiss_vector_store_skill import FaissVectorStoreSkill
+from .teams_qna_loader_skill import TeamsQnALoaderSkill
+from .confluence_faq_splitter_skill import ConfluenceFAQSplitter
__all__ = [
@@ -31,4 +33,6 @@
"LlamaFastembedEmbeddingSkill",
"LocalDocumentParser",
"FaissVectorStoreSkill",
+ "TeamsQnALoaderSkill",
+ "ConfluenceFAQSplitter",
]
diff --git a/src/docs2vecs/subcommands/indexer/skills/confluence_faq_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/confluence_faq_splitter_skill.py
new file mode 100644
index 0000000..a1623d9
--- /dev/null
+++ b/src/docs2vecs/subcommands/indexer/skills/confluence_faq_splitter_skill.py
@@ -0,0 +1,437 @@
+import hashlib
+import re
+from pathlib import Path
+from typing import List, Optional, Dict
+
+from docx import Document as DocxDocument
+from docx.text.paragraph import Paragraph
+from docx.table import Table
+from docx.oxml.ns import qn
+
+from docs2vecs.subcommands.indexer.config.config import Config
+from docs2vecs.subcommands.indexer.document.chunk import Chunk
+from docs2vecs.subcommands.indexer.document.document import Document
+from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
+
+
+class ConfluenceFAQSplitter(IndexerSkill):
+ """
+ Splitter skill that extracts Q&A pairs from FAQ-style DOCX files.
+
+ Features:
+ - Reads each input document's .docx file directly and extracts Q&A pairs
+ - Skips Table of Contents entries and never treats skip_headings titles as questions
+ - Treats headings in the configured level range as questions when they contain '?'
+ or start with one of the question_patterns prefixes
+ - Extracts hyperlinks from answers
+ - Closes the current answer at a stop section such as 'Related articles'
+ - Preserves table content in answers
+ - Each Q&A pair becomes a single atomic chunk for optimal RAG retrieval
+
+ Configuration parameters (all optional with sensible defaults):
+ - min_heading_level: Minimum heading level for questions (default: 2)
+ - max_heading_level: Maximum heading level for questions (default: 6)
+ - skip_patterns: List of text patterns to skip in answer content (default: ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ'])
+ - skip_headings: List of heading titles to skip as questions (default: ['summary'])
+ - question_patterns: List of prefixes that indicate a question/problem statement (default: ['i am ', 'i cannot ', ...])
+ - stop_sections: List of regex patterns for sections that end Q&A extraction
+ (default: patterns matching 'related articles', 'related resources' and 'see also')
+ """
+
+ # Default configuration values
+ DEFAULT_MIN_HEADING_LEVEL = 2
+ DEFAULT_MAX_HEADING_LEVEL = 6
+ DEFAULT_SKIP_PATTERNS = ['CONFIDENTIAL', 'Search the FAQ', 'Search Artifactory FAQ']
+ DEFAULT_SKIP_HEADINGS = ['summary']
+ # Prefixes are compared against the lower-cased heading text, hence lower case here.
+ DEFAULT_QUESTION_PATTERNS = [
+ 'i am ', 'i cannot ', "i can't ", 'i see ',
+ 'i have ', 'i need ', 'my ', 'when i ',
+ 'how do i ', 'how can i ', 'what is ', 'what are ',
+ 'why does ', 'why is ', 'where is ', 'where can '
+ ]
+ # Regex sources; __init__ compiles them with re.I.
+ DEFAULT_STOP_SECTIONS = [
+ r'^\s*related\s*articles?\s*$',
+ r'^\s*related\s*resources?\s*$',
+ r'^\s*see\s*also\s*$'
+ ]
+
+ def __init__(self, config: dict, global_config: Config):
+ super().__init__(config, global_config)
+
+ # Load configurable parameters with defaults
+ self.min_heading_level = self._config.get('min_heading_level', self.DEFAULT_MIN_HEADING_LEVEL)
+ self.max_heading_level = self._config.get('max_heading_level', self.DEFAULT_MAX_HEADING_LEVEL)
+ self.skip_patterns = self._config.get('skip_patterns', self.DEFAULT_SKIP_PATTERNS)
+ self.skip_headings = [h.lower() for h in self._config.get('skip_headings', self.DEFAULT_SKIP_HEADINGS)]
+ self.question_patterns = [p.lower() for p in self._config.get('question_patterns', self.DEFAULT_QUESTION_PATTERNS)]
+
+ # Compile stop section regexes
+ stop_sections = self._config.get('stop_sections', self.DEFAULT_STOP_SECTIONS)
+ self.related_res = [re.compile(p, re.I) for p in stop_sections]
+
+ self.dot_leader_re = re.compile(r"\.{2,}\s*\d{1,4}\s*$")
+ self.page_number_trail_re = re.compile(r"\s\d{1,4}\s*$")
+ # Regex to match markdown-style links: [Link](URL)
+ self.markdown_link_re = re.compile(r'\[Link\]\([^\)]+\)')
+
+ self.logger.debug(f"ConfluenceFAQSplitter config: heading_levels={self.min_heading_level}-{self.max_heading_level}, "
+ f"skip_patterns={len(self.skip_patterns)}, question_patterns={len(self.question_patterns)}")
+
+ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
+ self.logger.info("Running ConfluenceFAQSplitter...")
+
+ if not input:
+ self.logger.error("No documents provided in input")
+ return []
+
+ for doc in input:
+ self.logger.debug(f"Processing document: {doc.filename}")
+
+ # Check if file is a DOCX
+ filename_str = str(doc.filename)
+ if not filename_str.lower().endswith('.docx'):
+ self.logger.warning(f"Skipping non-DOCX file: {doc.filename}")
+ continue
+
+ try:
+ qa_pairs = self._extract_qa_from_docx(doc.filename)
+ self.logger.info(f"Extracted {len(qa_pairs)} Q&A pairs from {doc.filename}")
+
+ for idx, qa_data in enumerate(qa_pairs, 1):
+ question = qa_data['question']
+ answer = qa_data['answer']
+
+ if not question.strip() or not answer.strip():
+ self.logger.debug(f"Skipping Q&A pair {idx} - missing question or answer")
+ continue
+
+ links = qa_data.get('links', [])
+
+ # Filter out links where the text is just the URL itself (redundant)
+ # Only include links with meaningful descriptive text in the References section
+ meaningful_links = [
+ link for link in links
+ if not self._is_link_text_redundant(link['text'], link['url'])
+ ]
+
+ # Format links for inclusion in content
+ links_text = ""
+ if meaningful_links:
+ links_list = [f"- {link['text']}: {link['url']}" for link in meaningful_links]
+ links_text = f"\n\nReferences (hyperlinks from the answer):\n" + "\n".join(links_list)
+
+ # Combine question and answer into a single chunk
+ combined_text = f"Q: {question}\n\nA: {answer}{links_text}"
+
+ chunk = Chunk()
+ chunk.document_id = hashlib.sha256(combined_text.encode()).hexdigest()
+ chunk.document_name = Path(doc.filename).name
+ chunk.tag = doc.tag
+ chunk.content = combined_text # Full Q&A for retrieval
+ chunk.chunk_id = f"{chunk.document_id}_{idx}"
+ chunk.source_link = doc.source_url or ""
+
+ doc.add_chunk(chunk)
+
+ self.logger.debug(f"Split {doc.filename} into {len(doc.chunks)} Q&A chunks")
+
+ except Exception as e:
+ self.logger.error(f"Error processing {doc.filename}: {e}", exc_info=True)
+ continue
+
+ return input
+
+ def _extract_qa_from_docx(self, docx_path: str) -> List[Dict[str, any]]:
+ """
+ Extract Q&A pairs from a DOCX file with sophisticated parsing.
+
+ Returns:
+ List of dicts with 'question', 'answer', 'links' keys
+ """
+ doc = DocxDocument(docx_path)
+ qa: List[Dict[str, any]] = []
+
+ current_q = None
+ current_ans: List[str] = []
+ current_links: List[Dict[str, str]] = []
+ in_toc = False
+ in_summary = False
+
+ for blk in self._iter_block_items(doc):
+ text = self._norm(self._block_text(blk))
+ lvl = self._heading_level(blk) if isinstance(blk, Paragraph) else None
+ title = self._norm(blk.text).lower() if isinstance(blk, Paragraph) else ""
+
+ # ---------- TOC detection ----------
+ if isinstance(blk, Paragraph) and title in ('table of contents', 'contents'):
+ in_toc = True
+ # Finalize any open Q before entering TOC
+ if current_q is not None:
+ qa.append({
+ "question": current_q,
+ "answer": "\n".join(current_ans).strip() if current_ans else "",
+ "links": current_links,
+ })
+ current_q, current_ans, current_links = None, [], []
+ continue
+
+ # End TOC at next major heading (H1 or H2)
+ if in_toc and isinstance(blk, Paragraph) and lvl is not None and lvl <= 2 and title not in ('table of contents', 'contents'):
+ in_toc = False
+
+ # Skip TOC-styled paragraphs and dotted leader lines
+ if in_toc and isinstance(blk, Paragraph):
+ if self._style_name(blk).lower().startswith('toc') or self._is_toc_line(text):
+ continue
+
+ # ---------- Skip-heading regions (e.g. Summary, Overview) ----------
+ if isinstance(blk, Paragraph) and lvl is not None and title in self.skip_headings:
+ in_summary = True
+ continue
+
+ if in_summary and isinstance(blk, Paragraph) and lvl is not None and title not in self.skip_headings:
+ in_summary = False
+
+ # ---------- Stop at Related articles ----------
+ if current_q is not None and self._is_related_heading(blk):
+ qa.append({
+ "question": current_q,
+ "answer": "\n".join(current_ans).strip() if current_ans else "",
+ "links": current_links,
+ })
+ current_q, current_ans, current_links = None, [], []
+ continue
+
+ # ---------- New question ----------
+ if not in_toc and not in_summary and self._is_question_block(blk):
+ # Flush previous
+ if current_q is not None:
+ qa.append({
+ "question": current_q,
+ "answer": "\n".join(current_ans).strip() if current_ans else "",
+ "links": current_links,
+ })
+ current_q = self._norm(blk.text)
+ current_ans = []
+ current_links = []
+ continue
+
+ # ---------- Accumulate answer ----------
+ if current_q is not None:
+ # Skip banners/boilerplate using configurable patterns
+ if any(text.upper() == pat.upper() for pat in self.skip_patterns):
+ continue
+ if text:
+ block_text = self._block_text(blk)
+ # Remove markdown-style links [Link](URL)
+ cleaned_text = self._remove_markdown_links(block_text)
+ if cleaned_text: # Only add if there's content left after cleaning
+ current_ans.append(cleaned_text)
+ # Extract hyperlinks from block
+ links = self._extract_hyperlinks_from_block(blk)
+ current_links.extend(links)
+
+ # Finalize last Q
+ if current_q is not None:
+ qa.append({
+ "question": current_q,
+ "answer": "\n".join(current_ans).strip() if current_ans else "",
+ "links": current_links,
+ })
+
+ return qa
+
+ # ---------- Helper methods ----------
+
+ def _iter_block_items(self, parent):
+ """Yield paragraphs and tables in document order."""
+ body = parent.element.body
+ for child in body.iterchildren():
+ if child.tag == qn('w:p'):
+ yield Paragraph(child, parent)
+ elif child.tag == qn('w:tbl'):
+ yield Table(child, parent)
+
+ def _extract_table_text(self, table) -> str:
+ """Recursively extract text from a table, including nested tables."""
+ parts = []
+ for row in table.rows:
+ for cell in row.cells:
+ for p in cell.paragraphs:
+ parts.append(p.text or '')
+ for nested_table in cell.tables:
+ nested_text = self._extract_table_text(nested_table)
+ if nested_text:
+ parts.append(nested_text)
+
+ out = []
+ for t in (x.strip() for x in parts):
+ if t == '' and (not out or out[-1] == ''):
+ continue
+ out.append(t)
+ return "\n".join(out).strip()
+
+ def _block_text(self, block) -> str:
+ """Get text from a paragraph or table."""
+ if isinstance(block, Paragraph):
+ return block.text or ''
+ if isinstance(block, Table):
+ return self._extract_table_text(block)
+ return ''
+
+ def _style_name(self, par: Paragraph) -> str:
+ try:
+ return (par.style.name or '').strip()
+ except Exception:
+ return ''
+
+ def _heading_level(self, par: Paragraph):
+ if not isinstance(par, Paragraph):
+ return None
+ m = re.match(r'Heading\s*(\d+)$', self._style_name(par))
+ return int(m.group(1)) if m else None
+
+ def _norm(self, s: str) -> str:
+ """Normalize text by replacing non-breaking spaces and collapsing whitespace."""
+ s = (s or '').replace('\u00A0', ' ')
+ s = re.sub(r'\s{2,}', ' ', s.strip())
+ return s
+
+ def _remove_markdown_links(self, s: str) -> str:
+ """Remove markdown-style links in the form [Link](URL) from text."""
+ return self.markdown_link_re.sub('', s).strip()
+
+ def _is_link_text_redundant(self, text: str, url: str) -> bool:
+ """
+ Check if hyperlink text is redundant (i.e., it's just the URL itself or very similar).
+ References:
+ - https://example.com: https://example.com ← Useless, filtered out
+
+ Returns True if the link text is redundant and should be excluded from the References section.
+ This keeps the answer clean while preserving meaningful link descriptions.
+ """
+ # Normalize both for comparison
+ text_normalized = text.strip().lower()
+ url_normalized = url.strip().lower()
+
+ # Strip common trailing characters that might be added accidentally
+ # (parentheses, periods, commas, etc.)
+ text_cleaned = text_normalized.rstrip(').,;: ')
+ url_cleaned = url_normalized.rstrip(').,;: ')
+
+ # Remove common URL prefixes for comparison
+ url_without_protocol = re.sub(r'^https?://', '', url_cleaned)
+ url_without_www = re.sub(r'^www\.', '', url_without_protocol)
+ text_without_protocol = re.sub(r'^https?://', '', text_cleaned)
+
+ # Check if text is the same as URL (with or without protocol)
+ if text_cleaned == url_cleaned:
+ return True
+ if text_without_protocol == url_without_protocol:
+ return True
+ if text_cleaned == url_without_www:
+ return True
+
+ return False
+
+ def _is_related_heading(self, block) -> bool:
+ text = self._norm(self._block_text(block))
+ return any(rx.match(text) for rx in self.related_res)
+
+ def _is_toc_line(self, text: str) -> bool:
+ t = self._norm(text)
+ if not t:
+ return False
+ return bool(self.dot_leader_re.search(t) or
+ (self.page_number_trail_re.search(t) and t.count('.') >= 3))
+
+ def _is_question_block(self, block) -> bool:
+ """
+ Identifies Q&A headings as either:
+ 1. Containing '?' (for questions)
+ 2. Starting with configurable problem/question patterns
+
+ Uses configurable heading level range and skip_headings list.
+ """
+ if not isinstance(block, Paragraph):
+ return False
+ t = self._norm(block.text)
+ if not t:
+ return False
+
+ # Skip headings that match skip_headings list (e.g., 'Summary')
+ if t.lower() in self.skip_headings:
+ return False
+
+ lvl = self._heading_level(block)
+ if lvl is None or lvl < self.min_heading_level or lvl > self.max_heading_level:
+ return False
+
+ # Accept if it has a question mark
+ if '?' in t:
+ return True
+
+ # Accept configurable problem/question statement patterns
+ t_lower = t.lower()
+ return any(t_lower.startswith(pat) for pat in self.question_patterns)
+
+ def _extract_hyperlinks_from_paragraph(self, paragraph) -> List[Dict[str, str]]:
+ """Extract hyperlinks from a paragraph."""
+ links = []
+ if not isinstance(paragraph, Paragraph):
+ return links
+
+ try:
+ part = paragraph.part
+ rels = part.rels
+ except Exception:
+ return links
+
+ for hyperlink in paragraph._element.xpath('.//w:hyperlink'):
+ text_parts = []
+ for run in hyperlink.xpath('.//w:t'):
+ if run.text:
+ text_parts.append(run.text)
+ text = ''.join(text_parts)
+
+ if not text:
+ continue
+
+ # Try to get external URL via relationship ID
+ r_id = hyperlink.get(qn('r:id'))
+ if r_id and r_id in rels:
+ url = rels[r_id].target_ref
+ if url:
+ links.append({'text': text, 'url': url})
+ continue
+
+ # Try to get internal anchor (bookmark link)
+ # Skip scroll-bookmark internal links (e.g., #scroll-bookmark-17)
+ anchor = hyperlink.get(qn('w:anchor'))
+ if anchor and not anchor.startswith('scroll-bookmark'):
+ links.append({'text': text, 'url': f'#{anchor}'})
+
+ return links
+
+ def _extract_hyperlinks_from_table(self, table) -> List[Dict[str, str]]:
+ """Recursively extract hyperlinks from a table, including nested tables."""
+ links = []
+ for row in table.rows:
+ for cell in row.cells:
+ for para in cell.paragraphs:
+ links.extend(self._extract_hyperlinks_from_paragraph(para))
+ for nested_table in cell.tables:
+ links.extend(self._extract_hyperlinks_from_table(nested_table))
+ return links
+
+ def _extract_hyperlinks_from_block(self, block) -> List[Dict[str, str]]:
+ """
+ Extract hyperlinks from a block (paragraph or table).
+ Returns list of dicts with 'text' and 'url' keys.
+ """
+ links = []
+
+ if isinstance(block, Paragraph):
+ links.extend(self._extract_hyperlinks_from_paragraph(block))
+ elif isinstance(block, Table):
+ links.extend(self._extract_hyperlinks_from_table(block))
+
+ return links
diff --git a/src/docs2vecs/subcommands/indexer/skills/factory.py b/src/docs2vecs/subcommands/indexer/skills/factory.py
index c0e331b..3747519 100644
--- a/src/docs2vecs/subcommands/indexer/skills/factory.py
+++ b/src/docs2vecs/subcommands/indexer/skills/factory.py
@@ -16,6 +16,8 @@
from docs2vecs.subcommands.indexer.skills import SemanticSplitter
from docs2vecs.subcommands.indexer.skills import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills import FaissVectorStoreSkill
+from docs2vecs.subcommands.indexer.skills.confluence_faq_splitter_skill import ConfluenceFAQSplitter
+from docs2vecs.subcommands.indexer.skills.teams_qna_loader_skill import TeamsQnALoaderSkill
class SkillType(StrEnum):
@@ -51,6 +53,7 @@ class AvailableSkillName(StrEnum):
# splitters
SEMANTIC_SPLITTER = "semantic-splitter"
RECURSIVE_CHARACTER_SPLITTER = "recursive-character-splitter"
+ CONFLUENCE_FAQ_SPLITTER = "confluence-faq-splitter"
# embeddings
AZ_ADA002_EMBEDDING = "azure-ada002-embedding"
@@ -58,6 +61,7 @@ class AvailableSkillName(StrEnum):
# web loaders
JIRA_LOADER = "jira-loader"
+ TEAMS_QNA_LOADER = "teams-qna-loader"
AVAILABLE_SKILLS = {
@@ -82,8 +86,12 @@ class AvailableSkillName(StrEnum):
SkillType.SPLITTER: {
AvailableSkillName.SEMANTIC_SPLITTER: SemanticSplitter,
AvailableSkillName.RECURSIVE_CHARACTER_SPLITTER: RecursiveCharacterTextSplitter,
+ AvailableSkillName.CONFLUENCE_FAQ_SPLITTER: ConfluenceFAQSplitter,
+ },
+ SkillType.LOADER: {
+ AvailableSkillName.JIRA_LOADER: JiraLoaderSkill,
+ AvailableSkillName.TEAMS_QNA_LOADER: TeamsQnALoaderSkill,
},
- SkillType.LOADER: {AvailableSkillName.JIRA_LOADER: JiraLoaderSkill},
}
diff --git a/src/docs2vecs/subcommands/indexer/skills/teams_qna_loader_skill.py b/src/docs2vecs/subcommands/indexer/skills/teams_qna_loader_skill.py
new file mode 100644
index 0000000..b522506
--- /dev/null
+++ b/src/docs2vecs/subcommands/indexer/skills/teams_qna_loader_skill.py
@@ -0,0 +1,125 @@
+import json
+from pathlib import Path
+from typing import List
+from typing import Optional
+
+from docs2vecs.subcommands.indexer.config.config import Config
+from docs2vecs.subcommands.indexer.document import Chunk
+from docs2vecs.subcommands.indexer.document.document import Document
+from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
+
+
+class TeamsQnALoaderSkill(IndexerSkill):
+ """A skill that loads enriched Q&A pairs from the FAQ pipeline JSON output.
+
+ The JSON file should be an array of enriched Q&A objects. Fields read by this loader:
+ - thread_id: Identifier of the conversation thread (sanitized into the document ID;
+ falls back to the entry index when missing)
+ - question: Original question text (fallback when rephrased_question is empty)
+ - rephrased_question: AI-polished question (preferred for the chunk content)
+ - rephrased_answer: AI-summarized answer (preferred for the chunk content)
+ - answers: Array of original answers, each with 'answer' and optional 'is_expert'
+ (fallback when rephrased_answer is empty; expert answers are preferred)
+ - topic: Clustered topic category (used in the chunk's document name, default 'General')
+ - source_link: Link stored as the chunk's source link (optional)
+
+ Other fields the pipeline may emit and this loader does not read:
+ - key_phrases: Extracted key phrases
+ - question_sender: Original question author
+ - timestamp: Message timestamp
+
+ Configuration parameters:
+ - file_path (str): Path to the enriched Q&A JSON file
+ - tag (str, optional): Tag assigned to every chunk (default: "enriched-qna")
+ """
+
+ def __init__(self, skill_config: dict, global_config: Config) -> None:
+ super().__init__(skill_config, global_config)
+ self._file_path = Path(self._config["file_path"]).expanduser().resolve()
+ self.tag = self._config.get("tag", "enriched-qna")
+
+ def run(self, documents: Optional[List[Document]]) -> List[Document]:
+ """Load enriched Q&A pairs from JSON file and create Document objects with chunks.
+
+ Args:
+ documents: Not used by this skill (loader skill)
+
+ Returns:
+ List of Documents with chunks populated from enriched Q&A JSON
+ """
+ self.logger.info(f"Running TeamsQnALoaderSkill on {self._file_path}...")
+
+ if not self._file_path.exists():
+ raise FileNotFoundError(f"Enriched Q&A JSON file not found: {self._file_path}")
+
+ # Load JSON file
+ with self._file_path.open('r', encoding='utf-8') as f:
+ qna_list = json.load(f)
+
+ if not qna_list:
+ self.logger.warning(f"No Q&A pairs found in JSON file: {self._file_path}")
+ return []
+
+ if not isinstance(qna_list, list):
+ raise ValueError(f"Expected JSON array of Q&A objects, got {type(qna_list).__name__}")
+
+ result = []
+
+ # Process each enriched Q&A pair
+ for idx, qna in enumerate(qna_list):
+ # Extract rephrased question and answer, falling back to originals
+ question = qna.get("rephrased_question") or qna.get("question", "")
+ answer = qna.get("rephrased_answer") or self._get_best_answer(qna)
+
+ # Skip if no meaningful content
+ if not question.strip() or not answer.strip():
+ self.logger.debug(f"Skipping Q&A pair {idx} - missing question or answer")
+ continue
+
+ # Build content with both question and answer
+ topic = qna.get("topic", "General")
+ content = f"Q: {question}\n\nA: {answer}"
+
+ # Generate document ID from thread_id or index
+ thread_id = qna.get("thread_id") or f"qna_{idx}"
+ document_id = self._sanitize_id(thread_id)
+
+ # Use source_link from the Q&A pair (Teams message deep link) if available
+ source_url = qna.get("source_link", "").strip()
+
+ # Create a Document object
+ doc = Document(filename=str(self._file_path))
+
+ # Create a Chunk object from the Q&A pair
+ chunk = Chunk()
+ chunk.document_id = document_id
+ chunk.document_name = f"{topic} - FAQ"
+ chunk.tag = self.tag
+ chunk.content = content
+ chunk.chunk_id = f"{document_id}_chunk_0"
+ chunk.source_link = source_url
+
+ # Add chunk to document
+ doc.add_chunk(chunk)
+ result.append(doc)
+
+ self.logger.debug(f"Loaded Q&A: {document_id} | Topic: {topic}")
+
+ self.logger.info(f"Successfully loaded {len(result)} enriched Q&A pairs from JSON")
+
+ return result
+
+ def _get_best_answer(self, qna: dict) -> str:
+ """Get the best answer from the answers array, preferring expert answers."""
+ answers = qna.get("answers", [])
+ if not answers:
+ return ""
+
+ # Prefer expert answers
+ expert_answers = [a for a in answers if a.get("is_expert", False)]
+ if expert_answers:
+ return expert_answers[0].get("answer", "")
+
+ # Fall back to first answer
+ return answers[0].get("answer", "")
+
+ def _sanitize_id(self, thread_id: str) -> str:
+ """Sanitize thread_id to be a valid document ID."""
+ # Remove any characters that might cause issues in Azure Search
+ import re
+ sanitized = re.sub(r'[^a-zA-Z0-9_-]', '_', str(thread_id))
+ return sanitized[:128] # Limit length