From 13b662ccd6742258e35af7c9296ff3204d38c146 Mon Sep 17 00:00:00 2001 From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:40:35 -0500 Subject: [PATCH 1/3] chore: summarize_when_long with no tool call response --- dreadnode/agent/hooks/summarize.py | 95 +++++++++++++++++++++++------- tests/test_preserve_tool_pairs.py | 85 ++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 20 deletions(-) create mode 100644 tests/test_preserve_tool_pairs.py diff --git a/dreadnode/agent/hooks/summarize.py b/dreadnode/agent/hooks/summarize.py index 8999d259..e46a6f13 100644 --- a/dreadnode/agent/hooks/summarize.py +++ b/dreadnode/agent/hooks/summarize.py @@ -44,12 +44,56 @@ def _get_last_input_tokens(event: AgentEvent) -> int: return last_generation_event.usage.input_tokens if last_generation_event.usage else 0 +def _find_tool_aware_boundary( + messages: list[rg.Message], + min_messages_to_keep: int, +) -> int: + """ + Find the best summarization boundary while preserving tool call/response pairs. + + This prevents breaking tool messages that would cause API errors with strict models + (OpenAI, Anthropic) that require every tool_call_id to have a matching response. 
+ + Args: + messages: List of messages to analyze (excluding system message) + min_messages_to_keep: Minimum messages that must be kept after boundary + + Returns: + Index where to split (messages[:idx] summarized, messages[idx:] kept) + Returns 0 if no valid boundary found + """ + # Build tool_call_id -> assistant message index mapping + tool_call_map: dict[str, int] = {} + for i, msg in enumerate(messages): + if msg.role == "assistant" and hasattr(msg, "tool_calls"): + for tc in getattr(msg, "tool_calls", None) or []: + if hasattr(tc, "id"): + tool_call_map[tc.id] = i + + # Walk backward from desired split point to find first valid boundary + for boundary in range(len(messages) - min_messages_to_keep, -1, -1): + # Check if this boundary would orphan any tool responses + has_orphan = False + for msg in messages[boundary:]: + if msg.role == "tool" and hasattr(msg, "tool_call_id"): + call_idx = tool_call_map.get(msg.tool_call_id) + if call_idx is not None and call_idx < boundary: + has_orphan = True + break + + if not has_orphan: + return boundary + + return 0 # No valid boundary found + + @component def summarize_when_long( model: str | rg.Generator | None = None, max_tokens: int = 100_000, min_messages_to_keep: int = 5, guidance: str = "", + preserve_tool_pairs: bool = True, ) -> "Hook": """ Creates a hook to manage the agent's context window by summarizing the conversation history. @@ -66,6 +110,9 @@ def summarize_when_long( (default is None, meaning no proactive summarization). min_messages_to_keep: The minimum number of messages to retain after summarization (default is 5). guidance: Additional guidance for the summarization process (default is ""). + preserve_tool_pairs: If True, ensures tool call/response pairs stay together to avoid breaking + strict API requirements (OpenAI, Anthropic). Defaults to True. Set to False to use legacy + behavior that may break tool pairs but allows more aggressive summarization. 
""" if min_messages_to_keep < 2: @@ -91,6 +138,10 @@ async def summarize_when_long( # noqa: PLR0912 guidance, help="Additional guidance for the summarization process", ), + preserve_tool_pairs: bool = Config( + preserve_tool_pairs, + help="Preserve tool call/response pairs to avoid breaking strict API requirements", + ), ) -> Reaction | None: should_summarize = False @@ -123,26 +174,30 @@ async def summarize_when_long( # noqa: PLR0912 messages.pop(0) if messages and messages[0].role == "system" else None ) - # Find the best point to summarize by walking the message list once. - # A boundary is valid after a simple assistant message or a finished tool block. - best_summarize_boundary = 0 - for i, message in enumerate(messages): - # If the remaining messages are less than or equal to our minimum, we can't slice any further. - if len(messages) - i <= min_messages_to_keep: - break - - # Condition 1: The message is an assistant response without tool calls. - is_simple_assistant = message.role == "assistant" and not getattr( - message, "tool_calls", None - ) - - # Condition 2: The message is the last in a block of tool responses. - is_last_tool_in_block = message.role == "tool" and ( - i + 1 == len(messages) or messages[i + 1].role != "tool" - ) - - if is_simple_assistant or is_last_tool_in_block: - best_summarize_boundary = i + 1 + # Find the best point to summarize + if preserve_tool_pairs: + # Use tool-aware boundary finding to prevent breaking tool call/response pairs + best_summarize_boundary = _find_tool_aware_boundary(messages, min_messages_to_keep) + else: + # Legacy behavior: walk the message list once looking for simple boundaries + best_summarize_boundary = 0 + for i, message in enumerate(messages): + # If the remaining messages are less than or equal to our minimum, we can't slice any further. + if len(messages) - i <= min_messages_to_keep: + break + + # Condition 1: The message is an assistant response without tool calls. 
+ is_simple_assistant = message.role == "assistant" and not getattr( + message, "tool_calls", None + ) + + # Condition 2: The message is the last in a block of tool responses. + is_last_tool_in_block = message.role == "tool" and ( + i + 1 == len(messages) or messages[i + 1].role != "tool" + ) + + if is_simple_assistant or is_last_tool_in_block: + best_summarize_boundary = i + 1 if best_summarize_boundary == 0: return None # No valid slice point was found. diff --git a/tests/test_preserve_tool_pairs.py b/tests/test_preserve_tool_pairs.py new file mode 100644 index 00000000..b91951e9 --- /dev/null +++ b/tests/test_preserve_tool_pairs.py @@ -0,0 +1,85 @@ +"""Tests for preserve_tool_pairs functionality in summarize_when_long hook.""" + +import rigging as rg +from dreadnode.agent.hooks.summarize import _find_tool_aware_boundary + + +class ToolCall: + """Minimal tool call representation for testing.""" + def __init__(self, call_id: str): + self.id = call_id + + +class ToolMessage(rg.Message): + """Tool response message for testing.""" + def __init__(self, call_id: str, content: str): + super().__init__("tool", content) + self.tool_call_id = call_id + + +def test_preserves_tool_pairs(): + """Tool call and response stay together when split.""" + messages = [ + rg.Message("user", "Hello"), + rg.Message("assistant", "Let me check", tool_calls=[ToolCall("call_1")]), + ToolMessage("call_1", "Result"), + rg.Message("assistant", "Done"), + rg.Message("user", "Thanks"), + ] + + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) + + # Should keep tool pair together by moving boundary earlier + assert boundary <= 1, "Boundary should preserve tool call/response pair" + + +def test_no_tools(): + """Works correctly without any tool messages.""" + messages = [ + rg.Message("user", "Hello"), + rg.Message("assistant", "Hi"), + rg.Message("user", "How are you"), + rg.Message("assistant", "Good"), + ] + + boundary = _find_tool_aware_boundary(messages, 
min_messages_to_keep=2) + assert boundary == 2, "Should split at natural boundary" + + +def test_multiple_tool_pairs(): + """Handles multiple tool call/response pairs correctly.""" + messages = [ + rg.Message("user", "Do A and B"), + rg.Message("assistant", "Running A", tool_calls=[ToolCall("a")]), + ToolMessage("a", "A done"), + rg.Message("assistant", "Running B", tool_calls=[ToolCall("b")]), + ToolMessage("b", "B done"), + rg.Message("user", "Thanks"), + ] + + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) + + # Should not split between any tool pairs + kept = messages[boundary:] + assert len(kept) >= 2, "Should keep minimum messages" + + +def test_no_valid_boundary(): + """Returns 0 when entire conversation is tool chain.""" + messages = [ + rg.Message("assistant", "Start", tool_calls=[ToolCall("1")]), + ToolMessage("1", "Result 1"), + rg.Message("assistant", "Continue", tool_calls=[ToolCall("2")]), + ToolMessage("2", "Result 2"), + ] + + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) + assert boundary == 0, "Should keep everything when no valid split exists" + + +if __name__ == "__main__": + test_preserves_tool_pairs() + test_no_tools() + test_multiple_tool_pairs() + test_no_valid_boundary() + print("All tests passed") From 424e3dcc02d683abd3761540ead2ff6bc2e13e84 Mon Sep 17 00:00:00 2001 From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:53:34 -0500 Subject: [PATCH 2/3] fix: ci checks with dictionary --- tests/test_preserve_tool_pairs.py | 53 +++++++++++++++++-------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/tests/test_preserve_tool_pairs.py b/tests/test_preserve_tool_pairs.py index b91951e9..c8e2a1ad 100644 --- a/tests/test_preserve_tool_pairs.py +++ b/tests/test_preserve_tool_pairs.py @@ -4,25 +4,16 @@ from dreadnode.agent.hooks.summarize import _find_tool_aware_boundary -class ToolCall: - """Minimal tool call 
representation for testing.""" - def __init__(self, call_id: str): - self.id = call_id - - -class ToolMessage(rg.Message): - """Tool response message for testing.""" - def __init__(self, call_id: str, content: str): - super().__init__("tool", content) - self.tool_call_id = call_id - - def test_preserves_tool_pairs(): """Tool call and response stay together when split.""" messages = [ rg.Message("user", "Hello"), - rg.Message("assistant", "Let me check", tool_calls=[ToolCall("call_1")]), - ToolMessage("call_1", "Result"), + rg.Message( + "assistant", + "Let me check", + tool_calls=[{"id": "call_1", "type": "function", "function": {"name": "check", "arguments": "{}"}}], + ), + rg.Message("tool", "Result", tool_call_id="call_1"), rg.Message("assistant", "Done"), rg.Message("user", "Thanks"), ] @@ -50,10 +41,18 @@ def test_multiple_tool_pairs(): """Handles multiple tool call/response pairs correctly.""" messages = [ rg.Message("user", "Do A and B"), - rg.Message("assistant", "Running A", tool_calls=[ToolCall("a")]), - ToolMessage("a", "A done"), - rg.Message("assistant", "Running B", tool_calls=[ToolCall("b")]), - ToolMessage("b", "B done"), + rg.Message( + "assistant", + "Running A", + tool_calls=[{"id": "a", "type": "function", "function": {"name": "run_a", "arguments": "{}"}}], + ), + rg.Message("tool", "A done", tool_call_id="a"), + rg.Message( + "assistant", + "Running B", + tool_calls=[{"id": "b", "type": "function", "function": {"name": "run_b", "arguments": "{}"}}], + ), + rg.Message("tool", "B done", tool_call_id="b"), rg.Message("user", "Thanks"), ] @@ -67,10 +66,18 @@ def test_multiple_tool_pairs(): def test_no_valid_boundary(): """Returns 0 when entire conversation is tool chain.""" messages = [ - rg.Message("assistant", "Start", tool_calls=[ToolCall("1")]), - ToolMessage("1", "Result 1"), - rg.Message("assistant", "Continue", tool_calls=[ToolCall("2")]), - ToolMessage("2", "Result 2"), + rg.Message( + "assistant", + "Start", + tool_calls=[{"id": "1", 
"type": "function", "function": {"name": "start", "arguments": "{}"}}], + ), + rg.Message("tool", "Result 1", tool_call_id="1"), + rg.Message( + "assistant", + "Continue", + tool_calls=[{"id": "2", "type": "function", "function": {"name": "continue", "arguments": "{}"}}], + ), + rg.Message("tool", "Result 2", tool_call_id="2"), ] boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) From 63cc2dbd06213889a8a5f09d235912a97fb31286 Mon Sep 17 00:00:00 2001 From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 21 Jan 2026 12:12:59 -0500 Subject: [PATCH 3/3] fix: linting and typechecks --- dreadnode/agent/hooks/summarize.py | 11 ++++-- tests/test_preserve_tool_pairs.py | 60 +++++++++++++++++++++--------- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/dreadnode/agent/hooks/summarize.py b/dreadnode/agent/hooks/summarize.py index e46a6f13..538261cc 100644 --- a/dreadnode/agent/hooks/summarize.py +++ b/dreadnode/agent/hooks/summarize.py @@ -76,10 +76,12 @@ def _find_tool_aware_boundary( has_orphan = False for msg in messages[boundary:]: if msg.role == "tool" and hasattr(msg, "tool_call_id"): - call_idx = tool_call_map.get(msg.tool_call_id) - if call_idx is not None and call_idx < boundary: - has_orphan = True - break + tool_call_id = getattr(msg, "tool_call_id", None) + if tool_call_id is not None: + call_idx = tool_call_map.get(tool_call_id) + if call_idx is not None and call_idx < boundary: + has_orphan = True + break if not has_orphan: return boundary @@ -93,6 +95,7 @@ def summarize_when_long( max_tokens: int = 100_000, min_messages_to_keep: int = 5, guidance: str = "", + *, preserve_tool_pairs: bool = True, ) -> "Hook": """ diff --git a/tests/test_preserve_tool_pairs.py b/tests/test_preserve_tool_pairs.py index c8e2a1ad..9ba23c39 100644 --- a/tests/test_preserve_tool_pairs.py +++ b/tests/test_preserve_tool_pairs.py @@ -1,27 +1,41 @@ """Tests for preserve_tool_pairs functionality in 
summarize_when_long hook.""" import rigging as rg + from dreadnode.agent.hooks.summarize import _find_tool_aware_boundary def test_preserves_tool_pairs(): - """Tool call and response stay together when split.""" + """Tool call and response stay together when boundary would split them.""" messages = [ rg.Message("user", "Hello"), rg.Message( "assistant", "Let me check", - tool_calls=[{"id": "call_1", "type": "function", "function": {"name": "check", "arguments": "{}"}}], + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": {"name": "check", "arguments": "{}"}, + } + ], ), rg.Message("tool", "Result", tool_call_id="call_1"), rg.Message("assistant", "Done"), rg.Message("user", "Thanks"), ] - boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) + # With min=3, naive boundary would be at index 2, keeping [2,3,4] + # But that would orphan the tool response at index 2 (call at index 1) + # So boundary should move back to index 1 to keep the pair together + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=3) + assert boundary == 1, f"Expected boundary 1 to keep tool pair together, got {boundary}" - # Should keep tool pair together by moving boundary earlier - assert boundary <= 1, "Boundary should preserve tool call/response pair" + # Verify the kept messages include the complete tool pair + kept = messages[boundary:] + assert len(kept) == 4 + assert kept[0].role == "assistant" + assert kept[1].role == "tool" def test_no_tools(): @@ -44,44 +58,55 @@ def test_multiple_tool_pairs(): rg.Message( "assistant", "Running A", - tool_calls=[{"id": "a", "type": "function", "function": {"name": "run_a", "arguments": "{}"}}], + tool_calls=[ + {"id": "a", "type": "function", "function": {"name": "run_a", "arguments": "{}"}} + ], ), rg.Message("tool", "A done", tool_call_id="a"), rg.Message( "assistant", "Running B", - tool_calls=[{"id": "b", "type": "function", "function": {"name": "run_b", "arguments": "{}"}}], + tool_calls=[ + 
{"id": "b", "type": "function", "function": {"name": "run_b", "arguments": "{}"}} + ], ), rg.Message("tool", "B done", tool_call_id="b"), rg.Message("user", "Thanks"), ] - boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) - - # Should not split between any tool pairs - kept = messages[boundary:] - assert len(kept) >= 2, "Should keep minimum messages" + # With min=3, the naive boundary at index 3 keeps [3,4,5] + # Index 4 (tool response "b") references the call at index 3, which is also kept + # So the naive boundary is already valid and stays at index 3 (no walk-back needed) + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=3) + assert boundary == 3, f"Expected boundary 3 to preserve second tool pair, got {boundary}" def test_no_valid_boundary(): - """Returns 0 when entire conversation is tool chain.""" + """Returns 0 when min_messages would force splitting all tool pairs.""" messages = [ rg.Message( "assistant", "Start", - tool_calls=[{"id": "1", "type": "function", "function": {"name": "start", "arguments": "{}"}}], + tool_calls=[ + {"id": "1", "type": "function", "function": {"name": "start", "arguments": "{}"}} + ], ), rg.Message("tool", "Result 1", tool_call_id="1"), rg.Message( "assistant", "Continue", - tool_calls=[{"id": "2", "type": "function", "function": {"name": "continue", "arguments": "{}"}}], + tool_calls=[ + {"id": "2", "type": "function", "function": {"name": "continue", "arguments": "{}"}} + ], ), rg.Message("tool", "Result 2", tool_call_id="2"), ] - boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=2) - assert boundary == 0, "Should keep everything when no valid split exists" + # With min=3, we'd need to keep last 3 messages + # Any boundary would orphan at least one tool response + # So should return 0 to keep everything + boundary = _find_tool_aware_boundary(messages, min_messages_to_keep=3) + assert boundary == 0, f"Should keep everything when no valid split exists, got {boundary}" if __name__ == "__main__": @@ -89,4 
+114,3 @@ def test_no_valid_boundary(): test_no_tools() test_multiple_tool_pairs() test_no_valid_boundary() - print("All tests passed")