diff --git a/src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py b/src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py
index ee56cab..2cf9961 100644
--- a/src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py
+++ b/src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py
@@ -11,14 +11,21 @@ class AzureAda002EmbeddingSkill(IndexerSkill):
     def __init__(self, config: dict, global_config: Config):
         super().__init__(config, global_config)
 
-    def az_ada002_embeddings(self, content: str):
+    def az_ada002_embeddings(self, content: str, chunk_id=None):
+        self.logger.debug(
+            f"Requesting embedding for chunk_id={chunk_id}, content_length={len(content)} chars"
+        )
         embed_model = AzureOpenAIEmbedding(
             deployment_name=self._config["deployment_name"],
             api_key=self._config["api_key"],
             azure_endpoint=self._config["endpoint"],
             api_version=self._config["api_version"],
         )
-        return embed_model.get_query_embedding(content)
+        embedding = embed_model.get_query_embedding(content)
+        self.logger.debug(
+            f"Successfully received embedding for chunk_id={chunk_id}, embedding_dim={len(embedding) if embedding else 0}"
+        )
+        return embedding
 
     def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]]:
         self.logger.info(
@@ -36,10 +43,8 @@ def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]
             self.logger.debug(f"Processing document: {doc.filename}")
             for chunk in doc.chunks:
                 self.logger.debug(f"Creating embedding for chunk: {chunk.chunk_id}")
-                chunk.embedding = (
-                    ""
-                    if not chunk.content
-                    else self.az_ada002_embeddings(chunk.content)
+                chunk.embedding = "" if not chunk.content else self.az_ada002_embeddings(
+                    chunk.content, chunk_id=chunk.chunk_id
                 )
 
-        return input
+        return input
\ No newline at end of file
diff --git a/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py b/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py
index 660b930..bc6b791 100644
--- a/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py
+++ b/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py
@@ -24,6 +24,7 @@ class DefaultFileReader(FileLoaderSkill):
     - .doc, .docx: Word documents using UnstructuredWordDocumentLoader
     - .ppt, .pptx: PowerPoint files using UnstructuredPowerPointLoader
     - .xls, .xlsx: Excel files using UnstructuredExcelLoader
+    - .yml, .yaml: YAML files using PyYAML (multiple documents supported)
     """
 
     def __init__(self, skill_config: dict, global_config: Config) -> None:
@@ -38,7 +39,32 @@ def __init__(self, skill_config: dict, global_config: Config) -> None:
             ".pptx": self._load_powerpoint,
             ".xls": self._load_excel,
             ".xlsx": self._load_excel,
+            ".yml": self._load_yaml,
+            ".yaml": self._load_yaml,
         }
+
+    def _load_yaml(self, file_path: Path) -> List[Document]:
+        """Load YAML files that may contain multiple documents separated by ---."""
+        try:
+            import yaml
+        except ImportError as err:
+            raise ImportError("yaml module is required to read YAML files.") from err
+
+        documents = []
+        with file_path.open() as fp:
+            for i, content in enumerate(yaml.safe_load_all(fp)):
+                if content is not None:
+                    yaml_text = yaml.safe_dump(content)
+                    doc_name = f"{file_path.stem}_doc_{i}{file_path.suffix}"
+                    documents.append(
+                        Document(
+                            filename=str(file_path),
+                            source_url=str(file_path),
+                            text=yaml_text
+                        )
+                    )
+
+        return documents
 
     def run(self, documents: Optional[List[Document]]) -> List[Document]:
         """Process input documents by reading their content based on file extension.
@@ -118,4 +144,4 @@ def _load_excel(self, file_path: Path) -> List[Document]:
         """Load Excel files using UnstructuredExcelLoader."""
         loader = UnstructuredExcelLoader(str(file_path))
         docs = loader.load()
-        return [Document(filename=str(file_path), source_url=str(file_path), text=doc.page_content) for doc in docs]
+        return [Document(filename=str(file_path), source_url=str(file_path), text=doc.page_content) for doc in docs]
\ No newline at end of file