From 19d5af5cc4a41afd51db259d8e41b55b7fb1e5ce Mon Sep 17 00:00:00 2001
From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com>
Date: Tue, 22 Jul 2025 20:18:38 +0000
Subject: [PATCH 1/6] Add context usage percentage to working memory endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add context_usage_percentage field to WorkingMemoryResponse model
- Add _calculate_context_usage_percentage() helper function
- Update GET /v1/working-memory/{session_id} to return percentage
- Update PUT /v1/working-memory/{session_id} to return percentage based on final state (after potential summarization)
- Percentage calculated as (current_tokens / token_threshold) * 100 where token_threshold = context_window * 0.7
- Returns None when there are no messages or no model info is provided; otherwise a value from 0 to 100 (capped at 100)

Resolves #37

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Andrew Brookins <abrookins@users.noreply.github.com>
---
 agent_memory_server/api.py     | 61 ++++++++++++++++++++++++++++++++--
 agent_memory_server/models.py  |  5 +++
 tests/test_full_integration.py | 18 +++++-----
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py
index b5c8a47..b9c799c 100644
--- a/agent_memory_server/api.py
+++ b/agent_memory_server/api.py
@@ -63,6 +63,41 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
     return total_tokens
 
 
+def _calculate_context_usage_percentage(
+    messages: list[MemoryMessage],
+    model_name: ModelNameLiteral | None,
+    context_window_max: int | None,
+) -> float | None:
+    """
+    Calculate the percentage of context window used before auto-summarization triggers.
+
+    Args:
+        messages: List of messages to calculate token count for
+        model_name: The client's LLM model name for context window determination
+        context_window_max: Direct specification of context window max tokens
+
+    Returns:
+        Percentage (0-100) of context used, or None if no model info provided
+    """
+    if not messages or (not model_name and not context_window_max):
+        return None
+
+    # Calculate current token usage
+    current_tokens = _calculate_messages_token_count(messages)
+
+    # Get effective token limit for the client's model
+    max_tokens = _get_effective_token_limit(model_name, context_window_max)
+
+    # Use the same threshold as _summarize_working_memory (70% of context window)
+    token_threshold = int(max_tokens * 0.7)
+
+    # Calculate percentage of threshold used
+    percentage = (current_tokens / token_threshold) * 100.0
+
+    # Cap at 100% for display purposes
+    return min(percentage, 100.0)
+
+
 async def _summarize_working_memory(
     memory: WorkingMemory,
     model_name: ModelNameLiteral | None = None,
@@ -269,7 +304,18 @@ async def get_working_memory(
 
     logger.debug(f"Working mem: {working_mem}")
 
-    return working_mem
+    # Calculate context usage percentage
+    context_usage_percentage = _calculate_context_usage_percentage(
+        messages=working_mem.messages,
+        model_name=model_name,
+        context_window_max=context_window_max,
+    )
+
+    # Return WorkingMemoryResponse with percentage
+    return WorkingMemoryResponse(
+        **working_mem.model_dump(),
+        context_usage_percentage=context_usage_percentage,
+    )
 
 
 @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
@@ -348,7 +394,18 @@ async def put_working_memory(
             namespace=updated_memory.namespace,
         )
 
-    return updated_memory
+    # Calculate context usage percentage based on the final state (after potential summarization)
+    context_usage_percentage = _calculate_context_usage_percentage(
+        messages=updated_memory.messages,
+        model_name=model_name,
+        context_window_max=context_window_max,
+    )
+
+    # Return WorkingMemoryResponse with percentage
+    return WorkingMemoryResponse(
+        **updated_memory.model_dump(),
+        context_usage_percentage=context_usage_percentage,
+    )
 
 
 @router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)
diff --git a/agent_memory_server/models.py b/agent_memory_server/models.py
index 7fda47c..87d9af5 100644
--- a/agent_memory_server/models.py
+++ b/agent_memory_server/models.py
@@ -222,6 +222,11 @@ class WorkingMemory(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
 
+    context_usage_percentage: float | None = Field(
+        default=None,
+        description="Percentage of context window used before auto-summarization triggers (0-100)",
+    )
+
 
 class WorkingMemoryRequest(BaseModel):
     """Request parameters for working memory operations"""
diff --git a/tests/test_full_integration.py b/tests/test_full_integration.py
index 1c0761a..1c68228 100644
--- a/tests/test_full_integration.py
+++ b/tests/test_full_integration.py
@@ -773,9 +773,9 @@ async def test_memory_prompt_with_long_term_search(
             )
             for msg in messages
         )
-        assert (
-            relevant_context_found
-        ), f"No relevant memory context found in messages: {messages}"
+        assert relevant_context_found, (
+            f"No relevant memory context found in messages: {messages}"
+        )
 
         # Cleanup
         await client.delete_long_term_memories([m.id for m in test_memories])
@@ -1079,9 +1079,9 @@ async def test_full_workflow_integration(
             )
             print(f"No topic filter search results: {no_topic_search}")
 
-        assert (
-            len(search_results["memories"]) > 0
-        ), f"No memories found in search results: {search_results}"
+        assert len(search_results["memories"]) > 0, (
+            f"No memories found in search results: {search_results}"
+        )
 
         # 6. Test tool integration with a realistic scenario
         tool_call = {
@@ -1126,9 +1126,9 @@ async def test_full_workflow_integration(
             m for m in long_term_memories.memories if m.id.startswith(memory_id_prefix)
         ]
 
-        assert (
-            len(our_memories) == 0
-        ), f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
+        assert len(our_memories) == 0, (
+            f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
+        )
 
 
 @pytest.mark.integration

From a1a778af3acad2a1c7b999ad2d36ed2a85e4a7ba Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Thu, 24 Jul 2025 16:33:25 -0700
Subject: [PATCH 2/6] Fix code formatting in test_full_integration.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 tests/test_full_integration.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/test_full_integration.py b/tests/test_full_integration.py
index 1c68228..1c0761a 100644
--- a/tests/test_full_integration.py
+++ b/tests/test_full_integration.py
@@ -773,9 +773,9 @@ async def test_memory_prompt_with_long_term_search(
             )
             for msg in messages
         )
-        assert relevant_context_found, (
-            f"No relevant memory context found in messages: {messages}"
-        )
+        assert (
+            relevant_context_found
+        ), f"No relevant memory context found in messages: {messages}"
 
         # Cleanup
         await client.delete_long_term_memories([m.id for m in test_memories])
@@ -1079,9 +1079,9 @@ async def test_full_workflow_integration(
             )
             print(f"No topic filter search results: {no_topic_search}")
 
-        assert len(search_results["memories"]) > 0, (
-            f"No memories found in search results: {search_results}"
-        )
+        assert (
+            len(search_results["memories"]) > 0
+        ), f"No memories found in search results: {search_results}"
 
         # 6. Test tool integration with a realistic scenario
         tool_call = {
@@ -1126,9 +1126,9 @@ async def test_full_workflow_integration(
             m for m in long_term_memories.memories if m.id.startswith(memory_id_prefix)
         ]
 
-        assert len(our_memories) == 0, (
-            f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
-        )
+        assert (
+            len(our_memories) == 0
+        ), f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
 
 
 @pytest.mark.integration

From b5c55d195898dfbd8a25a1eb1a9b6aa48ffecf1b Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Fri, 25 Jul 2025 10:37:17 -0700
Subject: [PATCH 3/6] Fix duplicate context_usage_percentage parameter in API
 responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolve TypeError by properly handling the context_usage_percentage field
in WorkingMemoryResponse creation to avoid duplicate keyword arguments.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 agent_memory_server/api.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py
index b9c799c..2718b4b 100644
--- a/agent_memory_server/api.py
+++ b/agent_memory_server/api.py
@@ -312,10 +312,9 @@ async def get_working_memory(
     )
 
     # Return WorkingMemoryResponse with percentage
-    return WorkingMemoryResponse(
-        **working_mem.model_dump(),
-        context_usage_percentage=context_usage_percentage,
-    )
+    working_mem_data = working_mem.model_dump()
+    working_mem_data["context_usage_percentage"] = context_usage_percentage
+    return WorkingMemoryResponse(**working_mem_data)
 
 
 @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
@@ -402,10 +401,9 @@ async def put_working_memory(
     )
 
     # Return WorkingMemoryResponse with percentage
-    return WorkingMemoryResponse(
-        **updated_memory.model_dump(),
-        context_usage_percentage=context_usage_percentage,
-    )
+    updated_memory_data = updated_memory.model_dump()
+    updated_memory_data["context_usage_percentage"] = context_usage_percentage
+    return WorkingMemoryResponse(**updated_memory_data)
 
 
 @router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)

From 8043c185e3d666e06db62782bbfaa97e6597fbe9 Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Fri, 25 Jul 2025 10:37:50 -0700
Subject: [PATCH 4/6] Add context_usage_percentage field and tests to SDK
 client
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add context_usage_percentage field to WorkingMemoryResponse model
- Add comprehensive test suite for the new field covering:
  - Field creation and default values
  - Serialization behavior
  - Validation of different percentage values
  - Dictionary-to-model conversion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../agent_memory_client/models.py             |   5 +-
 agent-memory-client/tests/test_client.py      | 110 ++++++++++++++++++
 docs/memory-types.md                          |   5 +-
 dump.rdb                                      | Bin 0 -> 88 bytes
 4 files changed, 115 insertions(+), 5 deletions(-)
 create mode 100644 dump.rdb

diff --git a/agent-memory-client/agent_memory_client/models.py b/agent-memory-client/agent_memory_client/models.py
index bc731c9..12206ac 100644
--- a/agent-memory-client/agent_memory_client/models.py
+++ b/agent-memory-client/agent_memory_client/models.py
@@ -215,7 +215,10 @@ class SessionListResponse(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response from working memory operations"""
 
-    pass
+    context_usage_percentage: float | None = Field(
+        default=None,
+        description="Percentage of context window used before auto-summarization triggers (0-100)",
+    )
 
 
 class MemoryRecordResult(MemoryRecord):
diff --git a/agent-memory-client/tests/test_client.py b/agent-memory-client/tests/test_client.py
index e49f615..2b13817 100644
--- a/agent-memory-client/tests/test_client.py
+++ b/agent-memory-client/tests/test_client.py
@@ -653,3 +653,113 @@ def test_validation_with_none_values(self, enhanced_test_client):
 
         # Should not raise
         enhanced_test_client.validate_memory_record(memory)
+
+
+class TestContextUsagePercentage:
+    """Tests for context usage percentage functionality."""
+
+    @pytest.mark.asyncio
+    async def test_working_memory_response_with_context_percentage(
+        self, enhanced_test_client
+    ):
+        """Test that WorkingMemoryResponse properly handles context_usage_percentage field."""
+        session_id = "test-session"
+
+        # Test with context percentage set
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_usage_percentage=45.5,
+        )
+
+        assert working_memory_response.context_usage_percentage == 45.5
+        assert working_memory_response.session_id == session_id
+
+        # Test with None context percentage (default)
+        working_memory_response_none = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+        )
+
+        assert working_memory_response_none.context_usage_percentage is None
+
+    @pytest.mark.asyncio
+    async def test_context_percentage_serialization(self, enhanced_test_client):
+        """Test that context_usage_percentage is properly serialized."""
+        session_id = "test-session"
+
+        # Create response with context percentage
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_usage_percentage=75.0,
+        )
+
+        # Test model_dump includes the field
+        dumped = working_memory_response.model_dump()
+        assert "context_usage_percentage" in dumped
+        assert dumped["context_usage_percentage"] == 75.0
+
+        # Test JSON serialization
+        json_data = working_memory_response.model_dump_json()
+        assert "context_usage_percentage" in json_data
+        assert "75.0" in json_data
+
+    @pytest.mark.asyncio
+    async def test_context_percentage_validation(self, enhanced_test_client):
+        """Test that context_usage_percentage accepts valid values."""
+        session_id = "test-session"
+
+        # Test valid percentages
+        valid_percentages = [0.0, 25.5, 50.0, 99.9, 100.0, None]
+
+        for percentage in valid_percentages:
+            working_memory_response = WorkingMemoryResponse(
+                session_id=session_id,
+                messages=[],
+                memories=[],
+                data={},
+                context=None,
+                user_id=None,
+                context_usage_percentage=percentage,
+            )
+            assert working_memory_response.context_usage_percentage == percentage
+
+    def test_working_memory_response_from_dict_with_context_percentage(self):
+        """Test that WorkingMemoryResponse can be created from dict with context_usage_percentage."""
+        session_id = "test-session"
+
+        # Test creating WorkingMemoryResponse from dict (simulating API response parsing)
+        response_dict = {
+            "session_id": session_id,
+            "messages": [],
+            "memories": [],
+            "data": {},
+            "context": None,
+            "user_id": None,
+            "context_usage_percentage": 33.3,
+            "tokens": 0,
+            "namespace": None,
+            "ttl_seconds": None,
+            "last_accessed": "2024-01-01T00:00:00Z",
+        }
+
+        # This simulates what happens when the API client parses the JSON response
+        result = WorkingMemoryResponse(**response_dict)
+
+        # Verify the context_usage_percentage is included
+        assert isinstance(result, WorkingMemoryResponse)
+        assert result.context_usage_percentage == 33.3
+        assert result.session_id == session_id
diff --git a/docs/memory-types.md b/docs/memory-types.md
index c1cf549..02bf30d 100644
--- a/docs/memory-types.md
+++ b/docs/memory-types.md
@@ -202,11 +202,8 @@ Long-term memory supports three types of memories:
 # Create long-term memories
 POST /v1/long-term-memory/
 
-# Search long-term memories only
+# Search long-term memories
 POST /v1/long-term-memory/search
-
-# Search across all memory types
-POST /v1/memory/search
 ```
 
 ### Search Capabilities
diff --git a/dump.rdb b/dump.rdb
new file mode 100644
index 0000000000000000000000000000000000000000..f6dbd0e92f437877d750f2544c3cbbafd42ad305
GIT binary patch
literal 88
zcmWG?b@2=~FfcUw#aWb^l3A=<mRiJWp=Y3H@QVu~kd#?ce8AxsYjR0uZt9^|N18Hz
pag-LPrs(FT<{oOO<z@KAk(i&Ro0M3bdVt~od-WGDC*AAp2LQ>cB}M=M

literal 0
HcmV?d00001


From ab196dff291c6695b1d39ba9264e1bc4c4995c72 Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Fri, 25 Jul 2025 11:58:37 -0700
Subject: [PATCH 5/6] Make summarization threshold configurable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review comments by making the 0.7 threshold configurable instead
of hardcoded. Added summarization_threshold setting that can be configured
via environment variable or config file.

- Added summarization_threshold to Settings (default: 0.7)
- Updated both _calculate_context_usage_percentage and _summarize_working_memory
  to use settings.summarization_threshold
- Improved maintainability and consistency between functions
- Allows users to customize when summarization is triggered

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 agent_memory_server/api.py    | 8 ++++----
 agent_memory_server/config.py | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py
index 2718b4b..271b34c 100644
--- a/agent_memory_server/api.py
+++ b/agent_memory_server/api.py
@@ -88,8 +88,8 @@ def _calculate_context_usage_percentage(
     # Get effective token limit for the client's model
     max_tokens = _get_effective_token_limit(model_name, context_window_max)
 
-    # Use the same threshold as _summarize_working_memory (70% of context window)
-    token_threshold = int(max_tokens * 0.7)
+    # Use the same threshold as _summarize_working_memory (reserves space for new content)
+    token_threshold = int(max_tokens * settings.summarization_threshold)
 
     # Calculate percentage of threshold used
     percentage = (current_tokens / token_threshold) * 100.0
@@ -123,8 +123,8 @@ async def _summarize_working_memory(
     max_tokens = _get_effective_token_limit(model_name, context_window_max)
 
     # Reserve space for new messages, function calls, and response generation
-    # Use 70% of context window to leave room for new content
-    token_threshold = int(max_tokens * 0.7)
+    # Use configurable threshold to leave room for new content
+    token_threshold = int(max_tokens * settings.summarization_threshold)
 
     if current_tokens <= token_threshold:
         return memory
diff --git a/agent_memory_server/config.py b/agent_memory_server/config.py
index 8fdde4d..73acbb2 100644
--- a/agent_memory_server/config.py
+++ b/agent_memory_server/config.py
@@ -119,6 +119,9 @@ class Settings(BaseSettings):
 
     # Working memory settings
     window_size: int = 20  # Default number of recent messages to return
+    summarization_threshold: float = (
+        0.7  # Fraction of context window that triggers summarization
+    )
 
     # Other Application settings
     log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"

From 2acd27bc92161595f094e61077f17c89d6eab48d Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Fri, 25 Jul 2025 14:30:17 -0700
Subject: [PATCH 6/6] Implement dual context percentage fields for working
 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add context_percentage_total_used field showing actual context window usage (0-100%)
- Add context_percentage_until_summarization field showing how much of the summarization threshold has been consumed (0-100%, reaching 100% when auto-summarization triggers)
- Update API calculation function to return both values as tuple
- Update server and SDK models with new fields
- Update comprehensive test coverage for both fields
- Remove old single context_usage_percentage field
- Maintain configurable summarization threshold (default 70%)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../agent_memory_client/models.py             |  8 ++-
 agent-memory-client/tests/test_client.py      | 65 ++++++++++++-------
 agent_memory_server/api.py                    | 64 +++++++++++-------
 agent_memory_server/models.py                 |  8 ++-
 4 files changed, 92 insertions(+), 53 deletions(-)

diff --git a/agent-memory-client/agent_memory_client/models.py b/agent-memory-client/agent_memory_client/models.py
index 12206ac..a77b1ea 100644
--- a/agent-memory-client/agent_memory_client/models.py
+++ b/agent-memory-client/agent_memory_client/models.py
@@ -215,9 +215,13 @@ class SessionListResponse(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response from working memory operations"""
 
-    context_usage_percentage: float | None = Field(
+    context_percentage_total_used: float | None = Field(
         default=None,
-        description="Percentage of context window used before auto-summarization triggers (0-100)",
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
     )
 
 
diff --git a/agent-memory-client/tests/test_client.py b/agent-memory-client/tests/test_client.py
index 2b13817..a77619f 100644
--- a/agent-memory-client/tests/test_client.py
+++ b/agent-memory-client/tests/test_client.py
@@ -659,13 +659,13 @@ class TestContextUsagePercentage:
     """Tests for context usage percentage functionality."""
 
     @pytest.mark.asyncio
-    async def test_working_memory_response_with_context_percentage(
+    async def test_working_memory_response_with_context_percentages(
         self, enhanced_test_client
     ):
-        """Test that WorkingMemoryResponse properly handles context_usage_percentage field."""
+        """Test that WorkingMemoryResponse properly handles both context percentage fields."""
         session_id = "test-session"
 
-        # Test with context percentage set
+        # Test with both context percentages set
         working_memory_response = WorkingMemoryResponse(
             session_id=session_id,
             messages=[],
@@ -673,13 +673,15 @@ async def test_working_memory_response_with_context_percentage(
             data={},
             context=None,
             user_id=None,
-            context_usage_percentage=45.5,
+            context_percentage_total_used=45.5,
+            context_percentage_until_summarization=65.0,
         )
 
-        assert working_memory_response.context_usage_percentage == 45.5
+        assert working_memory_response.context_percentage_total_used == 45.5
+        assert working_memory_response.context_percentage_until_summarization == 65.0
         assert working_memory_response.session_id == session_id
 
-        # Test with None context percentage (default)
+        # Test with None context percentages (default)
         working_memory_response_none = WorkingMemoryResponse(
             session_id=session_id,
             messages=[],
@@ -689,14 +691,17 @@ async def test_working_memory_response_with_context_percentage(
             user_id=None,
         )
 
-        assert working_memory_response_none.context_usage_percentage is None
+        assert working_memory_response_none.context_percentage_total_used is None
+        assert (
+            working_memory_response_none.context_percentage_until_summarization is None
+        )
 
     @pytest.mark.asyncio
-    async def test_context_percentage_serialization(self, enhanced_test_client):
-        """Test that context_usage_percentage is properly serialized."""
+    async def test_context_percentages_serialization(self, enhanced_test_client):
+        """Test that both context percentage fields are properly serialized."""
         session_id = "test-session"
 
-        # Create response with context percentage
+        # Create response with both context percentages
         working_memory_response = WorkingMemoryResponse(
             session_id=session_id,
             messages=[],
@@ -704,22 +709,27 @@ async def test_context_percentage_serialization(self, enhanced_test_client):
             data={},
             context=None,
             user_id=None,
-            context_usage_percentage=75.0,
+            context_percentage_total_used=75.0,
+            context_percentage_until_summarization=85.5,
         )
 
-        # Test model_dump includes the field
+        # Test model_dump includes both fields
         dumped = working_memory_response.model_dump()
-        assert "context_usage_percentage" in dumped
-        assert dumped["context_usage_percentage"] == 75.0
+        assert "context_percentage_total_used" in dumped
+        assert "context_percentage_until_summarization" in dumped
+        assert dumped["context_percentage_total_used"] == 75.0
+        assert dumped["context_percentage_until_summarization"] == 85.5
 
         # Test JSON serialization
         json_data = working_memory_response.model_dump_json()
-        assert "context_usage_percentage" in json_data
+        assert "context_percentage_total_used" in json_data
+        assert "context_percentage_until_summarization" in json_data
         assert "75.0" in json_data
+        assert "85.5" in json_data
 
     @pytest.mark.asyncio
-    async def test_context_percentage_validation(self, enhanced_test_client):
-        """Test that context_usage_percentage accepts valid values."""
+    async def test_context_percentages_validation(self, enhanced_test_client):
+        """Test that both context percentage fields accept valid values."""
         session_id = "test-session"
 
         # Test valid percentages
@@ -733,12 +743,17 @@ async def test_context_percentage_validation(self, enhanced_test_client):
                 data={},
                 context=None,
                 user_id=None,
-                context_usage_percentage=percentage,
+                context_percentage_total_used=percentage,
+                context_percentage_until_summarization=percentage,
+            )
+            assert working_memory_response.context_percentage_total_used == percentage
+            assert (
+                working_memory_response.context_percentage_until_summarization
+                == percentage
             )
-            assert working_memory_response.context_usage_percentage == percentage
 
-    def test_working_memory_response_from_dict_with_context_percentage(self):
-        """Test that WorkingMemoryResponse can be created from dict with context_usage_percentage."""
+    def test_working_memory_response_from_dict_with_context_percentages(self):
+        """Test that WorkingMemoryResponse can be created from dict with both context percentage fields."""
         session_id = "test-session"
 
         # Test creating WorkingMemoryResponse from dict (simulating API response parsing)
@@ -749,7 +764,8 @@ def test_working_memory_response_from_dict_with_context_percentage(self):
             "data": {},
             "context": None,
             "user_id": None,
-            "context_usage_percentage": 33.3,
+            "context_percentage_total_used": 33.3,
+            "context_percentage_until_summarization": 47.5,
             "tokens": 0,
             "namespace": None,
             "ttl_seconds": None,
@@ -759,7 +775,8 @@ def test_working_memory_response_from_dict_with_context_percentage(self):
         # This simulates what happens when the API client parses the JSON response
         result = WorkingMemoryResponse(**response_dict)
 
-        # Verify the context_usage_percentage is included
+        # Verify both context percentage fields are included
         assert isinstance(result, WorkingMemoryResponse)
-        assert result.context_usage_percentage == 33.3
+        assert result.context_percentage_total_used == 33.3
+        assert result.context_percentage_until_summarization == 47.5
         assert result.session_id == session_id
diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py
index 271b34c..578795d 100644
--- a/agent_memory_server/api.py
+++ b/agent_memory_server/api.py
@@ -63,13 +63,13 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
     return total_tokens
 
 
-def _calculate_context_usage_percentage(
+def _calculate_context_usage_percentages(
     messages: list[MemoryMessage],
     model_name: ModelNameLiteral | None,
     context_window_max: int | None,
-) -> float | None:
+) -> tuple[float | None, float | None]:
     """
-    Calculate the percentage of context window used before auto-summarization triggers.
+    Calculate context usage percentages for total usage and until summarization triggers.
 
     Args:
         messages: List of messages to calculate token count for
@@ -77,10 +77,13 @@ def _calculate_context_usage_percentage(
         context_window_max: Direct specification of context window max tokens
 
     Returns:
-        Percentage (0-100) of context used, or None if no model info provided
+        Tuple of (total_percentage, until_summarization_percentage)
+        - total_percentage: Percentage (0-100) of total context window used
+        - until_summarization_percentage: Percentage (0-100) of the summarization threshold used
+        Both values are None if there are no messages or no model info is provided
     """
     if not messages or (not model_name and not context_window_max):
-        return None
+        return None, None
 
     # Calculate current token usage
     current_tokens = _calculate_messages_token_count(messages)
@@ -88,14 +91,15 @@ def _calculate_context_usage_percentage(
     # Get effective token limit for the client's model
     max_tokens = _get_effective_token_limit(model_name, context_window_max)
 
-    # Use the same threshold as _summarize_working_memory (reserves space for new content)
-    token_threshold = int(max_tokens * settings.summarization_threshold)
+    # Calculate percentage of total context window used
+    total_percentage = (current_tokens / max_tokens) * 100.0
 
-    # Calculate percentage of threshold used
-    percentage = (current_tokens / token_threshold) * 100.0
+    # Calculate percentage until summarization threshold
+    token_threshold = int(max_tokens * settings.summarization_threshold)
+    until_summarization_percentage = (current_tokens / token_threshold) * 100.0
 
-    # Cap at 100% for display purposes
-    return min(percentage, 100.0)
+    # Cap both at 100% for display purposes
+    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)
 
 
 async def _summarize_working_memory(
@@ -304,16 +308,21 @@ async def get_working_memory(
 
     logger.debug(f"Working mem: {working_mem}")
 
-    # Calculate context usage percentage
-    context_usage_percentage = _calculate_context_usage_percentage(
-        messages=working_mem.messages,
-        model_name=model_name,
-        context_window_max=context_window_max,
+    # Calculate context usage percentages
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=working_mem.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
     )
 
-    # Return WorkingMemoryResponse with percentage
+    # Return WorkingMemoryResponse with both percentage values
     working_mem_data = working_mem.model_dump()
-    working_mem_data["context_usage_percentage"] = context_usage_percentage
+    working_mem_data["context_percentage_total_used"] = total_percentage
+    working_mem_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
     return WorkingMemoryResponse(**working_mem_data)
 
 
@@ -393,16 +402,21 @@ async def put_working_memory(
             namespace=updated_memory.namespace,
         )
 
-    # Calculate context usage percentage based on the final state (after potential summarization)
-    context_usage_percentage = _calculate_context_usage_percentage(
-        messages=updated_memory.messages,
-        model_name=model_name,
-        context_window_max=context_window_max,
+    # Calculate context usage percentages based on the final state (after potential summarization)
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=updated_memory.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
     )
 
-    # Return WorkingMemoryResponse with percentage
+    # Return WorkingMemoryResponse with both percentage values
     updated_memory_data = updated_memory.model_dump()
-    updated_memory_data["context_usage_percentage"] = context_usage_percentage
+    updated_memory_data["context_percentage_total_used"] = total_percentage
+    updated_memory_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
     return WorkingMemoryResponse(**updated_memory_data)
 
 
diff --git a/agent_memory_server/models.py b/agent_memory_server/models.py
index 87d9af5..204dfdf 100644
--- a/agent_memory_server/models.py
+++ b/agent_memory_server/models.py
@@ -222,9 +222,13 @@ class WorkingMemory(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
 
-    context_usage_percentage: float | None = Field(
+    context_percentage_total_used: float | None = Field(
         default=None,
-        description="Percentage of context window used before auto-summarization triggers (0-100)",
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
     )