Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 19 additions & 9 deletions src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,23 @@ class AzureAda002EmbeddingSkill(IndexerSkill):
def __init__(self, config: dict, global_config: Config):
    """Initialize the skill by forwarding both configs to the base initializer.

    Args:
        config: Skill-level settings, passed through unchanged.
        global_config: Shared ``Config`` object, passed through unchanged.
    """
    super().__init__(config, global_config)

def az_ada002_embeddings(
    self, content: str, chunk_id: Optional[str] = None
) -> Optional[List[float]]:
    """Request an embedding for ``content`` from an Azure OpenAI deployment.

    The client is built from the ``deployment_name``, ``api_key``,
    ``endpoint`` and ``api_version`` entries of ``self._config``.

    Args:
        content: Text to embed.
        chunk_id: Identifier used only in log messages, so a failure can be
            traced back to the chunk that caused it. Optional.

    Returns:
        The value returned by ``get_query_embedding``, or ``None`` when the
        request raised. Failures are logged rather than propagated, so
        callers must be prepared to handle ``None``.
    """
    self.logger.debug(
        f"Requesting embedding for chunk_id={chunk_id}, content_length={len(content)}"
    )
    embed_model = AzureOpenAIEmbedding(
        deployment_name=self._config["deployment_name"],
        api_key=self._config["api_key"],
        azure_endpoint=self._config["endpoint"],
        api_version=self._config["api_version"],
    )
    # Only the remote call sits inside the try, so configuration errors
    # (e.g. a missing key above) are not mistaken for embedding failures.
    try:
        embedding = embed_model.get_query_embedding(content)
    except Exception as e:
        # Best-effort by design: one failing chunk must not abort the run.
        # logger.exception keeps the traceback, which str(e) alone drops.
        self.logger.exception(f"Embedding failed for chunk_id={chunk_id}: {e}")
        return None

    self.logger.debug(f"Received embedding for chunk_id={chunk_id}")
    return embedding
Comment thread
idrisfl marked this conversation as resolved.
Outdated

def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]]:
self.logger.info(
Expand All @@ -36,10 +45,11 @@ def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]
self.logger.debug(f"Processing document: {doc.filename}")
for chunk in doc.chunks:
self.logger.debug(f"Creating embedding for chunk: {chunk.chunk_id}")
chunk.embedding = (
""
if not chunk.content
else self.az_ada002_embeddings(chunk.content)
)

return input
if not chunk.content:
chunk.embedding = ""
else:
chunk.embedding = self.az_ada002_embeddings(
chunk.content, chunk_id=chunk.chunk_id
)

Comment thread
idrisfl marked this conversation as resolved.
Outdated
return input
28 changes: 27 additions & 1 deletion src/docs2vecs/subcommands/indexer/skills/default_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class DefaultFileReader(FileLoaderSkill):
- .doc, .docx: Word documents using UnstructuredWordDocumentLoader
- .ppt, .pptx: PowerPoint files using UnstructuredPowerPointLoader
- .xls, .xlsx: Excel files using UnstructuredExcelLoader
- .yml, .yaml: YAML files using PyYAML (multiple documents supported)
"""

def __init__(self, skill_config: dict, global_config: Config) -> None:
Expand All @@ -38,7 +39,32 @@ def __init__(self, skill_config: dict, global_config: Config) -> None:
".pptx": self._load_powerpoint,
".xls": self._load_excel,
".xlsx": self._load_excel,
".yml": self._load_yaml,
".yaml": self._load_yaml,
}

def _load_yaml(self, file_path: Path) -> List[Document]:
    """Load a YAML file that may hold several documents separated by ``---``.

    Each non-empty YAML document is parsed with ``yaml.safe_load_all`` and
    re-serialized with ``yaml.safe_dump``; the resulting text becomes one
    ``Document``. Every ``Document`` uses the file path as both ``filename``
    and ``source_url``. YAML documents that parse to ``None`` (empty
    documents) are skipped.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        One ``Document`` per non-empty YAML document, in file order.

    Raises:
        ImportError: If the ``yaml`` module is not installed.

    Parsing errors raised by PyYAML are not caught here.
    """
    try:
        import yaml
    except ImportError as err:
        raise ImportError("yaml module is required to read YAML files.") from err

    source = str(file_path)
    # Binary mode lets PyYAML detect the stream encoding itself (UTF-8, or
    # UTF-16 via BOM) instead of depending on the platform's locale default.
    with file_path.open("rb") as fp:
        # safe_load_all is lazy, so the list must be built while fp is open.
        return [
            Document(
                filename=source,
                source_url=source,
                text=yaml.safe_dump(content),
            )
            for content in yaml.safe_load_all(fp)
            if content is not None
        ]

def run(self, documents: Optional[List[Document]]) -> List[Document]:
"""Process input documents by reading their content based on file extension.
Expand Down Expand Up @@ -118,4 +144,4 @@ def _load_excel(self, file_path: Path) -> List[Document]:
"""Load Excel files using UnstructuredExcelLoader."""
loader = UnstructuredExcelLoader(str(file_path))
docs = loader.load()
return [Document(filename=str(file_path), source_url=str(file_path), text=doc.page_content) for doc in docs]
return [Document(filename=str(file_path), source_url=str(file_path), text=doc.page_content) for doc in docs]