From b2877ba1136fc47dec2aa2b53601c59ae46bc554 Mon Sep 17 00:00:00 2001
From: asopitech
Date: Sat, 31 May 2025 11:12:44 +0900
Subject: [PATCH 1/5] Add .tool-versions file to specify Python and Node.js
 versions

---
 .tool-versions | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .tool-versions

diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000000..63689e4046
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1,2 @@
+python 3.11.11
+nodejs 24.1.0

From 7f8a043d10b0e36268b88ca4666b89a3b1f47e89 Mon Sep 17 00:00:00 2001
From: asopitech <122283370+asopitech@users.noreply.github.com>
Date: Fri, 6 Jun 2025 00:53:47 +0000
Subject: [PATCH 2/5] Bedrock LLM: register dedicated classes and factory
 entries for the Anthropic/Nova chat and TextEmbeddingV2 models only (dev)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 graphrag/language_model/factory.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/graphrag/language_model/factory.py b/graphrag/language_model/factory.py
index 8b4a5fb3b9..7131ffef8b 100644
--- a/graphrag/language_model/factory.py
+++ b/graphrag/language_model/factory.py
@@ -14,6 +14,12 @@
     OpenAIChatFNLLM,
     OpenAIEmbeddingFNLLM,
 )
+from graphrag.language_model.providers.bedrock_models import (
+    BedrockChatLLM,
+    BedrockEmbeddingLLM,
+    BedrockAnthropicChatLLM,
+    BedrockNovaChatLLM,
+)
 
 
 class ModelFactory:
@@ -105,6 +111,15 @@ def is_supported_model(cls, model_type: str) -> bool:
 ModelFactory.register_chat(
     ModelType.OpenAIChat, lambda **kwargs: OpenAIChatFNLLM(**kwargs)
 )
+ModelFactory.register_chat(
+    ModelType.BedrockChat, lambda **kwargs: BedrockChatLLM(**kwargs)
+)
+ModelFactory.register_chat(
+    ModelType.BedrockAnthropicChat, lambda **kwargs: BedrockAnthropicChatLLM(**kwargs)
+)
+ModelFactory.register_chat(
+    ModelType.BedrockNovaChat, lambda **kwargs: BedrockNovaChatLLM(**kwargs)
+)
 ModelFactory.register_embedding(
     ModelType.AzureOpenAIEmbedding, lambda **kwargs: AzureOpenAIEmbeddingFNLLM(**kwargs)
 )
@@ -112,3 +127,6 @@ def is_supported_model(cls, model_type: str) -> bool:
 ModelFactory.register_embedding(
     ModelType.OpenAIEmbedding, lambda **kwargs: OpenAIEmbeddingFNLLM(**kwargs)
 )
+ModelFactory.register_embedding(
+    ModelType.BedrockTextEmbeddingV2, lambda **kwargs: BedrockEmbeddingLLM(**kwargs)
+)
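
With these registrations in place, the factory can be queried for Bedrock support. A small sketch, runnable once the next patch supplies the enum values and the provider module; it assumes `is_supported_model` consults both the chat and embedding registries:

```python
from graphrag.config.enums import ModelType
from graphrag.language_model.factory import ModelFactory

# After the registrations above, the factory should report each Bedrock
# model type as supported (ModelType is a str Enum, so it passes directly).
for model_type in (
    ModelType.BedrockChat,
    ModelType.BedrockAnthropicChat,
    ModelType.BedrockNovaChat,
    ModelType.BedrockTextEmbeddingV2,
):
    print(model_type.value, ModelFactory.is_supported_model(model_type))
```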

From 597b1e77f69056bf55ef2f1634948cff4256b784 Mon Sep 17 00:00:00 2001
From: asopitech <122283370+asopitech@users.noreply.github.com>
Date: Fri, 6 Jun 2025 00:56:58 +0000
Subject: [PATCH 3/5] Bedrock LLM: dedicated classes and factory registration
 for Anthropic/Nova chat and TextEmbeddingV2; tidy up enums (dev)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 graphrag/config/enums.py        |   4 +
 .../providers/bedrock_models.py | 157 ++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 graphrag/language_model/providers/bedrock_models.py

diff --git a/graphrag/config/enums.py b/graphrag/config/enums.py
index f3efdbd246..10a7327750 100644
--- a/graphrag/config/enums.py
+++ b/graphrag/config/enums.py
@@ -93,10 +93,14 @@ class ModelType(str, Enum):
     # Embeddings
     OpenAIEmbedding = "openai_embedding"
     AzureOpenAIEmbedding = "azure_openai_embedding"
+    BedrockTextEmbeddingV2 = "bedrock_text_embedding_v2"
 
     # Chat Completion
     OpenAIChat = "openai_chat"
     AzureOpenAIChat = "azure_openai_chat"
+    BedrockChat = "bedrock_chat"
+    BedrockAnthropicChat = "bedrock_anthropic_chat"
+    BedrockNovaChat = "bedrock_nova_chat"
 
     # Debug
     MockChat = "mock_chat"

diff --git a/graphrag/language_model/providers/bedrock_models.py b/graphrag/language_model/providers/bedrock_models.py
new file mode 100644
index 0000000000..a85865206b
--- /dev/null
+++ b/graphrag/language_model/providers/bedrock_models.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2025 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""AWS Bedrock LLM provider definitions."""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from graphrag.config.models.language_model_config import LanguageModelConfig
+    from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
+    from graphrag.cache.pipeline_cache import PipelineCache
+    from graphrag.language_model.response.base import ModelResponse
+
+import os
+import boto3
+
+
+class BedrockAnthropicChatLLM:
+    """AWS Bedrock Anthropic Claude Chat Model provider."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        config: LanguageModelConfig,
+        callbacks: WorkflowCallbacks | None = None,
+        cache: PipelineCache | None = None,
+    ) -> None:
+        self.config = config
+        self.model_id = config.model
+        self.endpoint_url = config.api_base or os.environ.get("BEDROCK_ENDPOINT")
+        self.region = os.environ.get("AWS_REGION", "us-east-1")
+        self.client = boto3.client(
+            "bedrock-runtime",
+            region_name=self.region,
+            endpoint_url=self.endpoint_url,
+        )
+
+    async def achat(self, prompt: str, history: list | None = None, **kwargs) -> Any:
+        import json
+        messages = []
+        if history:
+            for h in history:
+                if isinstance(h, dict) and "role" in h and "content" in h:
+                    messages.append({"role": h["role"], "content": h["content"]})
+        messages.append({"role": "user", "content": prompt})
+        body = json.dumps({  # NOTE: Claude on Bedrock also expects "anthropic_version"
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 4096),
+            "temperature": kwargs.get("temperature", 0.7),
+            "top_p": kwargs.get("top_p", 0.9),
+        })
+        response = self.client.invoke_model(
+            body=body,
+            modelId=self.model_id,
+            accept="application/json",
+            contentType="application/json",
+        )
+        result = response["body"].read().decode()
+        result_json = json.loads(result)
+        return result_json.get("content")  # a list of content blocks, not plain text
+
+
+class BedrockNovaChatLLM:
+    """AWS Bedrock Amazon Nova Chat Model provider."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        config: LanguageModelConfig,
+        callbacks: WorkflowCallbacks | None = None,
+        cache: PipelineCache | None = None,
+    ) -> None:
+        self.config = config
+        self.model_id = config.model
+        self.endpoint_url = config.api_base or os.environ.get("BEDROCK_ENDPOINT")
+        self.region = os.environ.get("AWS_REGION", "us-east-1")
+        self.client = boto3.client(
+            "bedrock-runtime",
+            region_name=self.region,
+            endpoint_url=self.endpoint_url,
+        )
+
+    async def achat(self, prompt: str, history: list | None = None, **kwargs) -> Any:
+        import json
+        body = json.dumps({  # NOTE: history is ignored; verify the Nova request schema
+            "inputText": prompt,
+            "maxGeneratedTokens": kwargs.get("max_tokens", 4096),
+            "temperature": kwargs.get("temperature", 0.7),
+            "topP": kwargs.get("top_p", 0.9),
+        })
+        response = self.client.invoke_model(
+            body=body,
+            modelId=self.model_id,
+            accept="application/json",
+            contentType="application/json",
+        )
+        result = response["body"].read().decode()
+        result_json = json.loads(result)
+        return result_json.get("results", [{}])[0].get("outputText")
+
+
+# BedrockChatLLM is kept as the generic fallback (raises for unsupported model IDs).
+class BedrockChatLLM:
+    """Generic AWS Bedrock Chat Model provider (raises for unsupported model IDs)."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        config: LanguageModelConfig,
+        callbacks: WorkflowCallbacks | None = None,
+        cache: PipelineCache | None = None,
+    ) -> None:
+        self.config = config
+        self.model_id = config.model
+        self.endpoint_url = config.api_base or os.environ.get("BEDROCK_ENDPOINT")
+        self.region = os.environ.get("AWS_REGION", "us-east-1")
+        self.client = boto3.client(
+            "bedrock-runtime",
+            region_name=self.region,
+            endpoint_url=self.endpoint_url,
+        )
+
+    async def achat(self, prompt: str, history: list | None = None, **kwargs) -> Any:
+        raise ValueError(f"BedrockChatLLM: unsupported or invalid model ID: {self.model_id}")
+
+
+class BedrockEmbeddingLLM:
+    """AWS Bedrock Embedding Model provider."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        config: LanguageModelConfig,
+        callbacks: WorkflowCallbacks | None = None,
+        cache: PipelineCache | None = None,
+    ) -> None:
+        self.config = config
+        self.model_id = config.model
+        self.endpoint_url = config.api_base or os.environ.get("BEDROCK_ENDPOINT")
+        self.region = os.environ.get("AWS_REGION", "us-east-1")
+        self.client = boto3.client(
+            "bedrock-runtime",
+            region_name=self.region,
+            endpoint_url=self.endpoint_url,
+        )
+
+    async def aembed_batch(self, text_list: list[str], **kwargs) -> list[list[float]]:
+        import json
+        # NOTE: Simplified; Titan Text Embeddings V2 takes one input text per request,
+        # and the "embedding" response key may differ for other embedding models.
+        embeddings: list[list[float]] = []
+        for text in text_list:
+            body = json.dumps({"inputText": text})
+            response = self.client.invoke_model(
+                body=body,
+                modelId=self.model_id,
+                accept="application/json",
+                contentType="application/json",
+            )
+            embeddings.append(json.loads(response["body"].read())["embedding"])
+        return embeddings
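
For reference, a standalone boto3 sketch of the Claude-on-Bedrock call that `BedrockAnthropicChatLLM.achat` wraps. The model ID is only an example and must be enabled in the target account; `anthropic_version` is required by Bedrock's Anthropic Messages API:

```python
import json

import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")
body = json.dumps({
    "anthropic_version": "bedrock-2023-05-31",  # required for Claude on Bedrock
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "Name one use of a knowledge graph."}],
})
response = client.invoke_model(
    body=body,
    modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",  # example model ID
    accept="application/json",
    contentType="application/json",
)
result = json.loads(response["body"].read())
print(result["content"][0]["text"])  # Claude returns a list of content blocks
```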

From d3017d20a93b06d5b8b8485940785f4ba3a37cf7 Mon Sep 17 00:00:00 2001
From: asopitech
Date: Sat, 7 Jun 2025 17:12:59 +0900
Subject: [PATCH 4/5] Add GitHub Actions workflow for Claude Assistant
 integration

---
 .github/workflows/claude.yaml | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 .github/workflows/claude.yaml

diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml
new file mode 100644
index 0000000000..24fcefb30f
--- /dev/null
+++ b/.github/workflows/claude.yaml
@@ -0,0 +1,38 @@
+name: Claude Assistant
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+
+jobs:
+  claude-code-action:
+    if: >
+      github.event.issue.user.login == 'asopitech' &&
+      contains(github.event.comment.body, '@claude')
+    runs-on: ubuntu-latest
+    environment: CLAUDE_CODE_ACTION
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v3
+
+      # For AWS Bedrock with OIDC
+      - name: Configure AWS Credentials (OIDC)
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          aws-region: us-west-2
+          role-duration-seconds: 7200
+          role-session-name: GitHubActions${{ github.run_id }}
+          mask-aws-account-id: true
+
+      - name: Claude Code Review
+        uses: anthropics/claude-code-base-action@beta
+        with:
+          use_bedrock: "true"  # AWS credentials come from the OIDC step above
+          model: ${{ env.ANTHROPIC_MODEL }}  # expected from the CLAUDE_CODE_ACTION environment
+          small-fast-model: ${{ env.ANTHROPIC_SMALL_FAST_MODEL }}
+          disable-prompt-caching: ${{ env.DISABLE_PROMPT_CACHING }}
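
Before depending on this workflow, the OIDC role's Bedrock access can be checked from a shell with the same role assumed. A hedged sanity-check sketch in the workflow's region:

```python
import boto3

# Confirm which principal is active (should be the assumed OIDC role).
print(boto3.client("sts").get_caller_identity()["Arn"])

# Listing foundation models verifies basic Bedrock permissions in the region.
bedrock = boto3.client("bedrock", region_name="us-west-2")
for summary in bedrock.list_foundation_models()["modelSummaries"][:5]:
    print(summary["modelId"])
```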

From db96b834989fa99bdd0bf3635a53e4c45d96ebe3 Mon Sep 17 00:00:00 2001
From: asopitech
Date: Sat, 7 Jun 2025 17:17:42 +0900
Subject: [PATCH 5/5] Add CLAUDE.md documentation for project overview,
 development commands, and architecture

---
 CLAUDE.md | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000000..5c1ed68544
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,145 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+GraphRAG is a graph-based retrieval-augmented generation (RAG) system that extracts meaningful, structured data from unstructured text using LLMs. This fork includes AWS Bedrock support with specialized model classes for Anthropic Claude and Amazon Nova models.
+
+## Development Commands
+
+### Setup and Dependencies
+```bash
+# Install dependencies
+poetry install
+
+# Start Azurite (for Azure storage emulation in tests)
+./scripts/start-azurite.sh
+```
+
+### Core CLI Operations
+```bash
+# Initialize new GraphRAG project
+poetry run poe init --root /path/to/project
+
+# Build knowledge graph index from source documents
+poetry run poe index --root /path/to/project --config settings.yaml
+
+# Update existing index incrementally
+poetry run poe update --root /path/to/project
+
+# Query the knowledge graph
+poetry run poe query --method global --query "Your question here"
+poetry run poe query --method local --query "Your question here"
+poetry run poe query --method drift --query "Your question here"
+
+# Tune prompts for your domain
+poetry run poe prompt_tune --root /path/to/project --domain "your domain"
+```
+
+### Testing and Quality
+```bash
+# Run all tests with coverage
+poetry run poe test
+
+# Run specific test suites
+poetry run poe test_unit         # Unit tests
+poetry run poe test_integration  # Integration tests
+poetry run poe test_smoke        # End-to-end pipeline tests
+poetry run poe test_verbs        # Workflow operation tests
+poetry run poe test_notebook     # Example notebook validation
+
+# Run single test by pattern
+poetry run poe test_only "test_pattern_here"
+
+# Code quality checks and fixes
+poetry run poe check       # Format, lint, type-check
+poetry run poe format      # Format code
+poetry run poe fix         # Auto-fix issues
+poetry run poe fix_unsafe  # Include potentially unsafe fixes
+
+# Coverage reporting
+poetry run poe coverage_report
+```
+
+### Documentation
+```bash
+# Serve docs locally
+poetry run poe serve_docs
+
+# Build documentation
+poetry run poe build_docs
+```
+
+## Architecture Overview
+
+### Core Pipeline Flow
+1. **Input Processing** → **Entity Extraction** → **Graph Construction** → **Community Detection** → **Summarization** → **Embedding Generation** → **Output Storage**
+
+### Key Modules
+- **`graphrag/api/`** - High-level API interfaces for indexing and querying
+- **`graphrag/cli/`** - Command-line interface built with Typer
+- **`graphrag/config/`** - Pydantic-based configuration system with YAML support
+- **`graphrag/data_model/`** - Core entities: Entity, Relationship, Community, TextUnit, Covariate
+- **`graphrag/index/`** - Indexing pipeline with workflow-based operations
+- **`graphrag/query/`** - Query engine supporting Local, Global, Drift, and Basic search
+- **`graphrag/language_model/`** - LLM provider abstractions with factory pattern
+- **`graphrag/vector_stores/`** - Vector store implementations (LanceDB, Azure AI Search, CosmosDB)
+- **`graphrag/storage/`** - Storage backends (file, blob, memory, CosmosDB)
+
+### AWS Bedrock Integration
+This fork includes specialized Bedrock model classes in `graphrag/language_model/providers/bedrock_models.py` (a usage sketch follows the list):
+- **`BedrockAnthropicChatLLM`** - For Anthropic Claude models
+- **`BedrockNovaChatLLM`** - For Amazon Nova models
+- **`BedrockEmbeddingLLM`** - For embedding models
+- **`BedrockChatLLM`** - Generic fallback (throws exceptions for unsupported models)
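+
+A minimal smoke-test sketch for the Anthropic provider. It substitutes a `SimpleNamespace` for `LanguageModelConfig`, since the provider constructor only reads `model` and `api_base`, and assumes AWS credentials are available in the environment; the model ID is an example:
+
+```python
+import asyncio
+from types import SimpleNamespace
+
+from graphrag.language_model.providers.bedrock_models import BedrockAnthropicChatLLM
+
+# Stand-in for LanguageModelConfig: only `.model` and `.api_base` are read.
+config = SimpleNamespace(
+    model="anthropic.claude-3-5-sonnet-20240620-v1:0",  # example model ID
+    api_base=None,
+)
+llm = BedrockAnthropicChatLLM(name="bedrock_chat", config=config)
+print(asyncio.run(llm.achat("What does GraphRAG extract from documents?")))
+```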
+
+### Search Methods
+- **Local Search** - Entity-focused queries using local graph context
+- **Global Search** - Community-focused queries using community reports
+- **Drift Search** - Advanced iterative search with dynamic context
+- **Basic Search** - Simple text-based search
+
+### Configuration System
+- YAML-based configuration with environment variable support
+- Pydantic models for type-safe configuration validation
+- Factory pattern for pluggable components (LLMs, storage, vector stores)
+- Supports multiple LLM providers: OpenAI, Azure OpenAI, AWS Bedrock
+
+### Data Flow Architecture
+The system uses a workflow-based pipeline where each operation transforms data through standardized interfaces. Key data structures flow through the pipeline:
+- **Documents** → **TextUnits** (chunks) → **Entities/Relationships** → **Communities** → **Reports**
+- Vector embeddings are generated for entities, text units, and community reports
+- Final outputs include Parquet files, GraphML, and vector store indices
+
+## Working with This Codebase
+
+### Common Development Patterns
+- Use the factory pattern for component registration (`factory.py` files; see the sketch after this list)
+- Follow Pydantic models for configuration (`graphrag/config/models/`)
+- Leverage async/await patterns throughout LLM operations
+- Use workflow callbacks for progress reporting and error handling
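+
+The registration side of that factory pattern, sketched against the `ModelFactory.register_chat` API used in `graphrag/language_model/factory.py` (the provider class here is illustrative only):
+
+```python
+from graphrag.language_model.factory import ModelFactory
+
+class MyChatLLM:
+    """Illustrative provider following the constructor/achat protocol."""
+
+    def __init__(self, **kwargs) -> None:
+        self.config = kwargs.get("config")
+
+    async def achat(self, prompt: str, history: list | None = None, **kwargs):
+        return f"echo: {prompt}"
+
+# Model type strings mirror the ModelType enum values, e.g. "bedrock_chat".
+ModelFactory.register_chat("my_chat", lambda **kwargs: MyChatLLM(**kwargs))
+```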
+
+### Key Configuration Files
+- **`settings.yaml`** - Main configuration file (generated by `init` command)
+- **`pyproject.toml`** - Poetry dependencies and poe task definitions
+- **`.env`** - Environment variables for API keys and settings
+
+### Testing Strategy
+- Unit tests focus on individual components and utilities
+- Integration tests verify cross-component functionality
+- Smoke tests validate end-to-end pipeline execution
+- Verb tests ensure workflow operations work correctly
+- Notebook tests validate example usage patterns
+
+### Version Management
+Uses semversioner for semantic versioning. When making changes:
+```bash
+poetry run semversioner add-change -t patch -d "Description of changes"
+```
+
+### Important Notes
+- Always run `poetry run poe check` before committing
+- The indexing process can be expensive - start with small datasets
+- Configuration format may change between versions - use `init --force` after updates
+- Prompt tuning is recommended for optimal results with your specific domain
\ No newline at end of file