diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py
index 909bb9f..a4605e3 100644
--- a/packages/sdk/server-ai/src/ldai/client.py
+++ b/packages/sdk/server-ai/src/ldai/client.py
@@ -42,7 +42,7 @@ def completion_config(
         """
         self._client.track('$ld:ai:config:function:single', context, key, 1)
 
-        model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
+        model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate(
             key, context, default_value.to_dict(), variables
         )
@@ -96,18 +96,31 @@ def judge_config(
         """
         self._client.track('$ld:ai:judge:function:single', context, key, 1)
 
-        model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
+        model, provider, messages, instructions, tracker, enabled, judge_configuration, variation = self.__evaluate(
            key, context, default_value.to_dict(), variables
         )
 
-        # Extract evaluation_metric_keys from the variation
-        variation = self._client.variation(key, context, default_value.to_dict())
-        evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or [])
+        def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
+            """
+            Extract evaluation_metric_key with backward compatibility.
+
+            Priority: 1) evaluationMetricKey from variation, 2) first from evaluationMetricKeys in variation
+            """
+            if evaluation_metric_key := variation.get('evaluationMetricKey'):
+                return evaluation_metric_key
+
+            variation_keys = variation.get('evaluationMetricKeys')
+            if isinstance(variation_keys, list) and variation_keys:
+                return variation_keys[0]
+
+            return None
+
+        evaluation_metric_key = _extract_evaluation_metric_key(variation)
 
         config = AIJudgeConfig(
             key=key,
             enabled=bool(enabled),
-            evaluation_metric_keys=evaluation_metric_keys,
+            evaluation_metric_key=evaluation_metric_key,
             model=model,
             messages=messages,
             provider=provider,
@@ -144,7 +157,7 @@ async def create_judge(
                 enabled=True,
                 model=ModelConfig("gpt-4"),
                 provider=ProviderConfig("openai"),
-                evaluation_metric_keys=['$ld:ai:judge:relevance'],
+                evaluation_metric_key='$ld:ai:judge:relevance',
                 messages=[LDMessage(role='system', content='You are a relevance judge.')]
             ),
             variables={'metric': "relevance"}
         )
@@ -160,15 +173,12 @@ async def create_judge(
         self._client.track('$ld:ai:judge:function:createJudge', context, key, 1)
 
         try:
-            # Warn if reserved variables are provided
             if variables:
                 if 'message_history' in variables:
-                    # Note: Python doesn't have a logger on the client, but we could add one
-                    pass  # Would log warning if logger available
+                    pass
                 if 'response_to_evaluate' in variables:
-                    pass  # Would log warning if logger available
+                    pass
 
-            # Overwrite reserved variables to ensure they remain as placeholders for judge evaluation
             extended_variables = dict(variables) if variables else {}
             extended_variables['message_history'] = '{{message_history}}'
             extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
@@ -176,17 +186,14 @@ async def create_judge(
             judge_config = self.judge_config(key, context, default_value, extended_variables)
 
             if not judge_config.enabled or not judge_config.tracker:
-                # Would log info if logger available
                 return None
 
-            # Create AI provider for the judge
             provider = await AIProviderFactory.create(judge_config, default_ai_provider)
             if not provider:
                 return None
 
             return Judge(judge_config, judge_config.tracker, provider)
         except Exception as error:
-            # Would log error if logger available
             return None
 
     async def _initialize_judges(
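# Illustrative sketch (assumed sample dicts, not part of the diff): the key-resolution
# order that judge_config above now applies. A variation's 'evaluationMetricKey' wins;
# otherwise the first entry of the deprecated 'evaluationMetricKeys' list is used;
# otherwise the key is None. The helper below mirrors _extract_evaluation_metric_key.
from typing import Any, Dict, Optional


def resolve_metric_key(variation: Dict[str, Any]) -> Optional[str]:
    if key := variation.get('evaluationMetricKey'):
        return key
    keys = variation.get('evaluationMetricKeys')
    if isinstance(keys, list) and keys:
        return keys[0]
    return None


assert resolve_metric_key({'evaluationMetricKey': '$ld:ai:judge:relevance'}) == '$ld:ai:judge:relevance'
assert resolve_metric_key({'evaluationMetricKeys': ['$ld:ai:judge:a', '$ld:ai:judge:b']}) == '$ld:ai:judge:a'
assert resolve_metric_key({}) is None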
@@ -279,7 +286,6 @@ async def create_chat(
         config = self.completion_config(key, context, default_value, variables)
 
         if not config.enabled or not config.tracker:
-            # Would log info if logger available
             return None
 
         provider = await AIProviderFactory.create(config, default_ai_provider)
@@ -333,7 +339,6 @@ def agent_config(
         :param variables: Variables for interpolation.
         :return: Configured AIAgentConfig instance.
         """
-        # Track single agent usage
         self._client.track(
             "$ld:ai:agent:function:single",
             context,
@@ -399,7 +404,6 @@ def agent_configs(
         :param context: The context to evaluate the agent configurations in.
         :return: Dictionary mapping agent keys to their AIAgentConfig configurations.
         """
-        # Track multiple agents usage
         agent_count = len(agent_configs)
         self._client.track(
             "$ld:ai:agent:function:multiple",
@@ -538,7 +542,7 @@ def __evaluate(
         variables: Optional[Dict[str, Any]] = None,
     ) -> Tuple[
         Optional[ModelConfig], Optional[ProviderConfig], Optional[List[LDMessage]],
-        Optional[str], LDAIConfigTracker, bool, Optional[Any]
+        Optional[str], LDAIConfigTracker, bool, Optional[Any], Dict[str, Any]
     ]:
         """
         Internal method to evaluate a configuration and extract components.
@@ -547,7 +551,7 @@ def __evaluate(
         :param context: The evaluation context.
         :param default_dict: Default configuration as dictionary.
         :param variables: Variables for interpolation.
-        :return: Tuple of (model, provider, messages, instructions, tracker, enabled).
+        :return: Tuple of (model, provider, messages, instructions, tracker, enabled, judge_configuration, variation).
         """
         variation = self._client.variation(key, context, default_dict)
@@ -556,7 +560,6 @@ def __evaluate(
         all_variables.update(variables)
         all_variables['ldctx'] = context.to_dict()
 
-        # Extract messages
         messages = None
         if 'messages' in variation and isinstance(variation['messages'], list) and all(
             isinstance(entry, dict) for entry in variation['messages']
@@ -571,18 +574,15 @@ def __evaluate(
                 for entry in variation['messages']
             ]
 
-        # Extract instructions
         instructions = None
         if 'instructions' in variation and isinstance(variation['instructions'], str):
             instructions = self.__interpolate_template(variation['instructions'], all_variables)
 
-        # Extract provider config
         provider_config = None
         if 'provider' in variation and isinstance(variation['provider'], dict):
             provider = variation['provider']
             provider_config = ProviderConfig(provider.get('name', ''))
 
-        # Extract model config
         model = None
         if 'model' in variation and isinstance(variation['model'], dict):
             parameters = variation['model'].get('parameters', None)
@@ -593,7 +593,6 @@ def __evaluate(
                 custom=custom
             )
 
-        # Create tracker
         tracker = LDAIConfigTracker(
             self._client,
             variation.get('_ldMeta', {}).get('variationKey', ''),
@@ -606,7 +605,6 @@ def __evaluate(
 
         enabled = variation.get('_ldMeta', {}).get('enabled', False)
 
-        # Extract judge configuration
         judge_configuration = None
         if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict):
             judge_config = variation['judgeConfiguration']
@@ -622,7 +620,7 @@ def __evaluate(
             if judges:
                 judge_configuration = JudgeConfiguration(judges=judges)
 
-        return model, provider_config, messages, instructions, tracker, enabled, judge_configuration
+        return model, provider_config, messages, instructions, tracker, enabled, judge_configuration, variation
 
     def __evaluate_agent(
         self,
@@ -640,7 +638,7 @@ def __evaluate_agent(
         :param variables: Variables for interpolation.
         :return: Configured AIAgentConfig instance.
         """
-        model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
+        model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate(
            key, context, default_value.to_dict(), variables
        )
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index eb4ef4b..0ca402a 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -9,8 +9,7 @@
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import AIJudgeConfig, LDMessage
 from ldai.providers.ai_provider import AIProvider
-from ldai.providers.types import (ChatResponse, EvalScore, JudgeResponse,
-                                  StructuredResponse)
+from ldai.providers.types import ChatResponse, EvalScore, JudgeResponse
 from ldai.tracker import LDAIConfigTracker
@@ -38,9 +37,7 @@ def __init__(
         self._ai_config = ai_config
         self._ai_config_tracker = ai_config_tracker
         self._ai_provider = ai_provider
-        self._evaluation_response_structure = EvaluationSchemaBuilder.build(
-            ai_config.evaluation_metric_keys
-        )
+        self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
 
     async def evaluate(
         self,
@@ -57,9 +54,9 @@ async def evaluate(
         :return: Evaluation results or None if not sampled
         """
         try:
-            if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0:
+            if not self._ai_config.evaluation_metric_key:
                 log.warn(
-                    'Judge configuration is missing required evaluationMetricKeys'
+                    'Judge configuration is missing required evaluationMetricKey'
                 )
                 return None
@@ -72,8 +69,8 @@ async def evaluate(
                 return None
 
             messages = self._construct_evaluation_messages(input_text, output_text)
+            assert self._evaluation_response_structure is not None
 
-            # Track metrics of the structured model invocation
             response = await self._ai_config_tracker.track_metrics_of(
                 lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure),
                 lambda result: result.metrics,
             )
@@ -83,8 +80,8 @@ async def evaluate(
             evals = self._parse_evaluation_response(response.data)
 
-            if len(evals) != len(self._ai_config.evaluation_metric_keys):
-                log.warn('Judge evaluation did not return all evaluations')
+            if self._ai_config.evaluation_metric_key not in evals:
+                log.warn('Judge evaluation did not return the expected evaluation')
                 success = False
 
             return JudgeResponse(
@@ -191,30 +188,34 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor
         evaluations = data['evaluations']
 
-        for metric_key in self._ai_config.evaluation_metric_keys:
-            evaluation = evaluations.get(metric_key)
+        metric_key = self._ai_config.evaluation_metric_key
+        if not metric_key:
+            log.warn('Evaluation metric key is missing')
+            return results
 
-            if not evaluation or not isinstance(evaluation, dict):
-                log.warn(f'Missing evaluation for metric key: {metric_key}')
-                continue
+        evaluation = evaluations.get(metric_key)
 
-            score = evaluation.get('score')
-            reasoning = evaluation.get('reasoning')
+        if not evaluation or not isinstance(evaluation, dict):
+            log.warn(f'Missing evaluation for metric key: {metric_key}')
+            return results
 
-            if not isinstance(score, (int, float)) or score < 0 or score > 1:
-                log.warn(
-                    f'Invalid score evaluated for {metric_key}: {score}. '
-                    'Score must be a number between 0 and 1 inclusive'
-                )
-                continue
+        score = evaluation.get('score')
+        reasoning = evaluation.get('reasoning')
 
-            if not isinstance(reasoning, str):
-                log.warn(
-                    f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
-                    'Reasoning must be a string'
-                )
-                continue
+        if not isinstance(score, (int, float)) or score < 0 or score > 1:
+            log.warn(
+                f'Invalid score evaluated for {metric_key}: {score}. '
+                'Score must be a number between 0 and 1 inclusive'
+            )
+            return results
+
+        if not isinstance(reasoning, str):
+            log.warn(
+                f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
+                'Reasoning must be a string'
+            )
+            return results
 
-            results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
+        results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
 
         return results
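# Illustrative sketch (assumed payload, not captured from a real provider): the response
# shape the single-key _parse_evaluation_response above accepts — one entry under
# 'evaluations' keyed by the configured evaluation metric key, with a score in [0, 1]
# and a string reasoning.
payload = {
    'evaluations': {
        '$ld:ai:judge:relevance': {
            'score': 0.85,
            'reasoning': 'The answer addresses the question directly.',
        }
    }
}

entry = payload['evaluations']['$ld:ai:judge:relevance']
assert isinstance(entry['score'], (int, float)) and 0 <= entry['score'] <= 1
assert isinstance(entry['reasoning'], str)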
diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py
index 29b885d..c69e0af 100644
--- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py
+++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py
@@ -1,6 +1,6 @@
 """Internal class for building dynamic evaluation response schemas."""
 
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 
 class EvaluationSchemaBuilder:
@@ -10,26 +10,29 @@ class EvaluationSchemaBuilder:
     """
 
     @staticmethod
-    def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
+    def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
         """
-        Build an evaluation response schema from evaluation metric keys.
+        Build an evaluation response schema from evaluation metric key.
 
-        :param evaluation_metric_keys: List of evaluation metric keys
-        :return: Schema dictionary for structured output
+        :param evaluation_metric_key: Evaluation metric key, or None if not available
+        :return: Schema dictionary for structured output, or None if evaluation_metric_key is None
         """
+        if not evaluation_metric_key:
+            return None
+
         return {
             'title': 'EvaluationResponse',
-            'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics",
+            'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
             'type': 'object',
             'properties': {
                 'evaluations': {
                     'type': 'object',
                     'description': (
                         f"Object containing evaluation results for "
-                        f"{', '.join(evaluation_metric_keys)} metrics"
+                        f"{evaluation_metric_key} metric"
                    ),
-                    'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys),
-                    'required': evaluation_metric_keys,
+                    'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
+                    'required': [evaluation_metric_key],
                     'additionalProperties': False,
                 },
             },
@@ -38,17 +41,16 @@ def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
 
     @staticmethod
-    def _build_key_properties(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
+    def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
         """
-        Build properties for each evaluation metric key.
+        Build properties for a single evaluation metric key.
 
-        :param evaluation_metric_keys: List of evaluation metric keys
-        :return: Dictionary of properties for each key
+        :param evaluation_metric_key: Evaluation metric key
+        :return: Dictionary of properties for the key
         """
-        result: Dict[str, Any] = {}
-        for key in evaluation_metric_keys:
-            result[key] = EvaluationSchemaBuilder._build_key_schema(key)
-        return result
+        return {
+            evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
+        }
 
     @staticmethod
     def _build_key_schema(key: str) -> Dict[str, Any]:
diff --git a/packages/sdk/server-ai/src/ldai/models.py b/packages/sdk/server-ai/src/ldai/models.py
index 5c6e511..c3e33bc 100644
--- a/packages/sdk/server-ai/src/ldai/models.py
+++ b/packages/sdk/server-ai/src/ldai/models.py
@@ -285,7 +285,9 @@ class AIJudgeConfigDefault(AIConfigDefault):
     Default Judge-specific AI Config with required evaluation metric key.
     """
     messages: Optional[List[LDMessage]] = None
+    # Deprecated: evaluation_metric_key is used instead
     evaluation_metric_keys: Optional[List[str]] = None
+    evaluation_metric_key: Optional[str] = None
 
     def to_dict(self) -> dict:
         """
@@ -293,7 +295,9 @@ def to_dict(self) -> dict:
         """
         result = self._base_to_dict()
         result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
-        if self.evaluation_metric_keys is not None:
+        result['evaluationMetricKey'] = self.evaluation_metric_key
+        # Include deprecated evaluationMetricKeys for backward compatibility
+        if self.evaluation_metric_keys:
             result['evaluationMetricKeys'] = self.evaluation_metric_keys
         return result
@@ -303,16 +307,18 @@ class AIJudgeConfig(AIConfig):
     """
     Judge-specific AI Config with required evaluation metric key.
     """
+    # Deprecated: evaluation_metric_key is used instead
     evaluation_metric_keys: List[str] = field(default_factory=list)
     messages: Optional[List[LDMessage]] = None
+    evaluation_metric_key: Optional[str] = None
 
     def to_dict(self) -> dict:
         """
         Render the given judge config as a dictionary object.
         """
         result = self._base_to_dict()
-        result['evaluationMetricKeys'] = self.evaluation_metric_keys
         result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
+        result['evaluationMetricKey'] = self.evaluation_metric_key
         return result
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
new file mode 100644
index 0000000..407e38e
--- /dev/null
+++ b/packages/sdk/server-ai/tests/test_judge.py
@@ -0,0 +1,625 @@
+"""Tests for Judge functionality."""
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+from ldclient import Config, Context, LDClient
+from ldclient.integrations.test_data import TestData
+
+from ldai.judge import Judge
+from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
+from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig
+from ldai.providers.types import EvalScore, JudgeResponse, LDAIMetrics, StructuredResponse
+from ldai.tracker import LDAIConfigTracker
+
+
+@pytest.fixture
+def td() -> TestData:
+    td = TestData.data_source()
+    td.update(
+        td.flag('judge-config')
+        .variations(
+            {
+                'model': {'name': 'gpt-4', 'parameters': {'temperature': 0.3}},
+                'provider': {'name': 'openai'},
+                'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                'evaluationMetricKey': '$ld:ai:judge:relevance',
+                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+            }
+        )
+        .variation_for_all(0)
+    )
+    return td
+
+
+@pytest.fixture
+def client(td: TestData) -> LDClient:
+    config = Config('sdk-key', update_processor_class=td, send_events=False)
+    return LDClient(config=config)
+
+
+@pytest.fixture
+def mock_ai_provider():
+    """Create a mock AI provider."""
+    provider = MagicMock()
+    provider.invoke_structured_model = AsyncMock()
+    return provider
+
+
+@pytest.fixture
+def context() -> Context:
+    return Context.create('user-key')
+
+
+@pytest.fixture
+def tracker(client: LDClient, context: Context) -> LDAIConfigTracker:
+    return LDAIConfigTracker(
+        client, 'judge-v1', 'judge-config', 1, 'gpt-4', 'openai', context
+    )
+
+
+@pytest.fixture
+def judge_config_with_key() -> AIJudgeConfig:
+    """Create a judge config with evaluation_metric_key."""
+    return AIJudgeConfig(
+        key='judge-config',
+        enabled=True,
+        evaluation_metric_key='$ld:ai:judge:relevance',
+        messages=[LDMessage(role='system', content='You are a judge.')],
+        model=ModelConfig('gpt-4'),
+        provider=ProviderConfig('openai'),
+    )
+
+
+@pytest.fixture
+def judge_config_without_key() -> AIJudgeConfig:
+    """Create a judge config without evaluation_metric_key."""
+    return AIJudgeConfig(
+        key='judge-config',
+        enabled=True,
+        evaluation_metric_key=None,
+        messages=[LDMessage(role='system', content='You are a judge.')],
+        model=ModelConfig('gpt-4'),
+        provider=ProviderConfig('openai'),
+    )
+
+
+@pytest.fixture
+def judge_config_without_messages() -> AIJudgeConfig:
+    """Create a judge config without messages."""
+    return AIJudgeConfig(
+        key='judge-config',
+        enabled=True,
+        evaluation_metric_key='$ld:ai:judge:relevance',
+        messages=None,
+        model=ModelConfig('gpt-4'),
+        provider=ProviderConfig('openai'),
+    )
+
+
+class TestJudgeInitialization:
+    """Tests for Judge initialization."""
+
+    def test_judge_initializes_with_evaluation_metric_key(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Judge should initialize successfully with evaluation_metric_key."""
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        assert judge._ai_config == judge_config_with_key
+        assert judge._evaluation_response_structure is not None
+        assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
+        assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required']
+
+    def test_judge_initializes_without_evaluation_metric_key(
+        self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Judge should initialize but have None for evaluation_response_structure."""
+        judge = Judge(judge_config_without_key, tracker, mock_ai_provider)
+
+        assert judge._ai_config == judge_config_without_key
+        assert judge._evaluation_response_structure is None
+
+
+class TestJudgeEvaluate:
+    """Tests for Judge.evaluate() method."""
+
+    @pytest.mark.asyncio
+    async def test_evaluate_returns_none_when_evaluation_metric_key_missing(
+        self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should return None when evaluation_metric_key is missing."""
+        judge = Judge(judge_config_without_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input text", "output text")
+
+        assert result is None
+        mock_ai_provider.invoke_structured_model.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_evaluate_returns_none_when_messages_missing(
+        self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should return None when messages are missing."""
+        judge = Judge(judge_config_without_messages, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input text", "output text")
+
+        assert result is None
+        mock_ai_provider.invoke_structured_model.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_evaluate_success_with_valid_response(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should return JudgeResponse with valid evaluation."""
+        mock_response = StructuredResponse(
+            data={
+                'evaluations': {
+                    '$ld:ai:judge:relevance': {
+                        'score': 0.85,
+                        'reasoning': 'The response is highly relevant to the input.'
+                    }
+                }
+            },
+            raw_response='{"evaluations": {...}}',
+            metrics=LDAIMetrics(success=True)
+        )
+
+        mock_ai_provider.invoke_structured_model.return_value = mock_response
+        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("What is AI?", "AI is artificial intelligence.")
+
+        assert result is not None
+        assert isinstance(result, JudgeResponse)
+        assert result.success is True
+        assert '$ld:ai:judge:relevance' in result.evals
+        assert result.evals['$ld:ai:judge:relevance'].score == 0.85
+        assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()
+
+    @pytest.mark.asyncio
+    async def test_evaluate_handles_missing_evaluation_in_response(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should handle missing evaluation in response."""
+        mock_response = StructuredResponse(
+            data={
+                'evaluations': {
+                    'wrong-key': {
+                        'score': 0.5,
+                        'reasoning': 'Some reasoning'
+                    }
+                }
+            },
+            raw_response='{"evaluations": {...}}',
+            metrics=LDAIMetrics(success=True)
+        )
+
+        mock_ai_provider.invoke_structured_model.return_value = mock_response
+        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input", "output")
+
+        assert result is not None
+        assert result.success is False
+        assert len(result.evals) == 0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_handles_invalid_score(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should handle invalid score values."""
+        mock_response = StructuredResponse(
+            data={
+                'evaluations': {
+                    '$ld:ai:judge:relevance': {
+                        'score': 1.5,
+                        'reasoning': 'Some reasoning'
+                    }
+                }
+            },
+            raw_response='{"evaluations": {...}}',
+            metrics=LDAIMetrics(success=True)
+        )
+
+        mock_ai_provider.invoke_structured_model.return_value = mock_response
+        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input", "output")
+
+        assert result is not None
+        assert result.success is False
+        assert len(result.evals) == 0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_handles_missing_reasoning(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should handle missing reasoning."""
+        mock_response = StructuredResponse(
+            data={
+                'evaluations': {
+                    '$ld:ai:judge:relevance': {
+                        'score': 0.8,
+                    }
+                }
+            },
+            raw_response='{"evaluations": {...}}',
+            metrics=LDAIMetrics(success=True)
+        )
+
+        mock_ai_provider.invoke_structured_model.return_value = mock_response
+        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input", "output")
+
+        assert result is not None
+        assert result.success is False
+        assert len(result.evals) == 0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_handles_exception(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should handle exceptions gracefully."""
+        mock_ai_provider.invoke_structured_model.side_effect = Exception("Provider error")
+        tracker.track_metrics_of = AsyncMock(side_effect=Exception("Provider error"))
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input", "output")
+
+        assert result is not None
+        assert isinstance(result, JudgeResponse)
+        assert result.success is False
+        assert result.error is not None
+        assert len(result.evals) == 0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_respects_sampling_rate(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """Evaluate should respect sampling rate."""
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        result = await judge.evaluate("input", "output", sampling_rate=0.0)
+
+        assert result is None
+        mock_ai_provider.invoke_structured_model.assert_not_called()
+
+
+class TestJudgeEvaluateMessages:
+    """Tests for Judge.evaluate_messages() method."""
+
+    @pytest.mark.asyncio
+    async def test_evaluate_messages_calls_evaluate(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
+    ):
+        """evaluate_messages should call evaluate with constructed input/output."""
+        from ldai.providers.types import ChatResponse
+
+        mock_response = StructuredResponse(
+            data={
+                'evaluations': {
+                    '$ld:ai:judge:relevance': {
+                        'score': 0.9,
+                        'reasoning': 'Very relevant'
+                    }
+                }
+            },
+            raw_response='{"evaluations": {...}}',
+            metrics=LDAIMetrics(success=True)
+        )
+
+        mock_ai_provider.invoke_structured_model.return_value = mock_response
+        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+
+        judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
+
+        messages = [
+            LDMessage(role='user', content='Question 1'),
+            LDMessage(role='assistant', content='Answer 1'),
+        ]
+        chat_response = ChatResponse(
+            message=LDMessage(role='assistant', content='Answer 2'),
+            metrics=LDAIMetrics(success=True)
+        )
+
+        result = await judge.evaluate_messages(messages, chat_response)
+
+        assert result is not None
+        assert result.success is True
+        assert tracker.track_metrics_of.called
+
+
+class TestEvaluationSchemaBuilder:
+    """Tests for EvaluationSchemaBuilder."""
+
+    def test_build_creates_correct_schema(self):
+        """Schema builder should create correct schema structure."""
+        schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance')
+
+        assert schema['title'] == 'EvaluationResponse'
+        assert schema['type'] == 'object'
+        assert 'evaluations' in schema['properties']
+        assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['required']
+        assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties']
+
+        metric_schema = schema['properties']['evaluations']['properties']['$ld:ai:judge:relevance']
+        assert metric_schema['type'] == 'object'
+        assert 'score' in metric_schema['properties']
+        assert 'reasoning' in metric_schema['properties']
+        assert metric_schema['properties']['score']['type'] == 'number'
+        assert metric_schema['properties']['score']['minimum'] == 0
+        assert metric_schema['properties']['score']['maximum'] == 1
+
+    def test_build_key_properties_creates_single_key(self):
+        """_build_key_properties should create properties for a single key."""
+        properties = EvaluationSchemaBuilder._build_key_properties('$ld:ai:judge:relevance')
+
+        assert '$ld:ai:judge:relevance' in properties
+        assert len(properties) == 1
+        assert properties['$ld:ai:judge:relevance']['type'] == 'object'
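# Illustrative standalone sketch complementing the schema tests above (key value assumed):
# EvaluationSchemaBuilder.build returns a schema requiring exactly the configured key
# under 'evaluations', and returns None when no key is configured.
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder

schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance')
assert schema is not None
assert schema['properties']['evaluations']['required'] == ['$ld:ai:judge:relevance']
assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties']
assert EvaluationSchemaBuilder.build(None) is None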
+
+
+class TestJudgeConfigSerialization:
+    """Tests for AIJudgeConfig serialization."""
+
+    def test_to_dict_includes_evaluation_metric_key(self):
+        """to_dict should include evaluationMetricKey."""
+        config = AIJudgeConfig(
+            key='test-judge',
+            enabled=True,
+            evaluation_metric_key='$ld:ai:judge:relevance',
+            messages=[LDMessage(role='system', content='You are a judge.')],
+        )
+
+        result = config.to_dict()
+
+        assert result['evaluationMetricKey'] == '$ld:ai:judge:relevance'
+        assert 'evaluationMetricKeys' not in result
+
+    def test_to_dict_handles_none_evaluation_metric_key(self):
+        """to_dict should handle None evaluation_metric_key."""
+        config = AIJudgeConfig(
+            key='test-judge',
+            enabled=True,
+            evaluation_metric_key=None,
+            messages=[LDMessage(role='system', content='You are a judge.')],
+        )
+
+        result = config.to_dict()
+
+        assert result['evaluationMetricKey'] is None
+
+    def test_judge_config_default_to_dict(self):
+        """AIJudgeConfigDefault.to_dict should work correctly."""
+        config = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key='$ld:ai:judge:relevance',
+            messages=[LDMessage(role='system', content='You are a judge.')],
+        )
+
+        result = config.to_dict()
+
+        assert result['evaluationMetricKey'] == '$ld:ai:judge:relevance'
+        assert 'evaluationMetricKeys' not in result
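# Illustrative standalone sketch of the serialization behaviour exercised above (values
# assumed): both judge config classes emit 'evaluationMetricKey', and AIJudgeConfigDefault
# keeps the deprecated 'evaluationMetricKeys' entry only when that field is actually set.
from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage

default = AIJudgeConfigDefault(
    enabled=True,
    evaluation_metric_key='$ld:ai:judge:relevance',
    messages=[LDMessage(role='system', content='You are a judge.')],
)
assert default.to_dict()['evaluationMetricKey'] == '$ld:ai:judge:relevance'
assert 'evaluationMetricKeys' not in default.to_dict()

config = AIJudgeConfig(
    key='my-judge',
    enabled=True,
    evaluation_metric_key='$ld:ai:judge:relevance',
    messages=[LDMessage(role='system', content='You are a judge.')],
)
assert config.to_dict()['evaluationMetricKey'] == '$ld:ai:judge:relevance'
assert 'evaluationMetricKeys' not in config.to_dict()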
+
+
+class TestClientJudgeConfig:
+    """Tests for LDAIClient.judge_config() method."""
+
+    def test_judge_config_extracts_evaluation_metric_key(
+        self, client: LDClient, context: Context
+    ):
+        """judge_config should extract evaluationMetricKey from variation."""
+        from ldai import LDAIClient
+
+        ldai_client = LDAIClient(client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key='$ld:ai:judge:relevance',
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        config = ldai_client.judge_config('judge-config', context, default_value)
+
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:relevance'
+        assert config.enabled is True
+        assert config.messages is not None
+        assert len(config.messages) > 0
+
+    def test_judge_config_uses_default_when_flag_does_not_exist(
+        self, client: LDClient, context: Context
+    ):
+        """judge_config should use default evaluation_metric_key when flag does not exist."""
+        from ldai import LDAIClient
+        from ldclient import Config, LDClient
+        from ldclient.integrations.test_data import TestData
+
+        td = TestData.data_source()
+
+        test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
+        ldai_client = LDAIClient(test_client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key='$ld:ai:judge:default',
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        config = ldai_client.judge_config('judge-no-key', context, default_value)
+
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:default'
+
+    def test_judge_config_uses_first_evaluation_metric_keys_from_variation(
+        self, context: Context
+    ):
+        """judge_config should use first value from evaluationMetricKeys when evaluationMetricKey is None."""
+        from ldai import LDAIClient
+        from ldclient import Config, LDClient
+        from ldclient.integrations.test_data import TestData
+
+        td = TestData.data_source()
+        td.update(
+            td.flag('judge-with-keys')
+            .variations(
+                {
+                    'model': {'name': 'gpt-4'},
+                    'provider': {'name': 'openai'},
+                    'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                    'evaluationMetricKeys': ['$ld:ai:judge:relevance', '$ld:ai:judge:quality'],
+                    '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+                }
+            )
+            .variation_for_all(0)
+        )
+
+        test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
+        ldai_client = LDAIClient(test_client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key=None,
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        config = ldai_client.judge_config('judge-with-keys', context, default_value)
+
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:relevance'
+
+    def test_judge_config_uses_first_evaluation_metric_keys_from_default(
+        self, context: Context
+    ):
+        """judge_config should use first value from default evaluation_metric_keys when flag does not exist."""
+        from ldai import LDAIClient
+        from ldclient import Config, LDClient
+        from ldclient.integrations.test_data import TestData
+
+        td = TestData.data_source()
+
+        test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
+        ldai_client = LDAIClient(test_client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key=None,
+            evaluation_metric_keys=['$ld:ai:judge:default-key', '$ld:ai:judge:secondary'],
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        config = ldai_client.judge_config('judge-fallback-keys', context, default_value)
+
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:default-key'
+
+    def test_judge_config_prefers_evaluation_metric_key_over_keys(
+        self, context: Context
+    ):
+        """judge_config should prefer evaluationMetricKey over evaluationMetricKeys when both are present."""
+        from ldai import LDAIClient
+        from ldclient import Config, LDClient
+        from ldclient.integrations.test_data import TestData
+
+        td = TestData.data_source()
+        td.update(
+            td.flag('judge-both-present')
+            .variations(
+                {
+                    'model': {'name': 'gpt-4'},
+                    'provider': {'name': 'openai'},
+                    'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                    'evaluationMetricKey': '$ld:ai:judge:preferred',
+                    'evaluationMetricKeys': ['$ld:ai:judge:relevance', '$ld:ai:judge:quality'],
+                    '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+                }
+            )
+            .variation_for_all(0)
+        )
+
+        test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
+        ldai_client = LDAIClient(test_client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key=None,
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        config = ldai_client.judge_config('judge-both-present', context, default_value)
+
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:preferred'
+
+    def test_judge_config_uses_same_variation_for_consistency(
+        self, context: Context
+    ):
+        """judge_config should use the same variation from __evaluate to avoid race conditions."""
+        from ldai import LDAIClient
+        from ldclient import Config, LDClient
+        from ldclient.integrations.test_data import TestData
+        from unittest.mock import patch
+
+        td = TestData.data_source()
+        td.update(
+            td.flag('judge-consistency-test')
+            .variations(
+                {
+                    'model': {'name': 'gpt-4'},
+                    'provider': {'name': 'openai'},
+                    'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                    'evaluationMetricKey': '$ld:ai:judge:from-flag',
+                    '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+                }
+            )
+            .variation_for_all(0)
+        )
+
+        test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
+        ldai_client = LDAIClient(test_client)
+
+        default_value = AIJudgeConfigDefault(
+            enabled=True,
+            evaluation_metric_key='$ld:ai:judge:from-default',
+            messages=[LDMessage(role='system', content='You are a judge.')],
+            model=ModelConfig('gpt-4'),
+            provider=ProviderConfig('openai'),
+        )
+
+        variation_calls = []
+        original_variation = test_client.variation
+
+        def tracked_variation(key, context, default):
+            result = original_variation(key, context, default)
+            variation_calls.append((key, result.get('evaluationMetricKey')))
+            return result
+
+        with patch.object(test_client, 'variation', side_effect=tracked_variation):
+            config = ldai_client.judge_config('judge-consistency-test', context, default_value)
+
+        assert len(variation_calls) == 1, f"Expected 1 variation call, got {len(variation_calls)}"
+        assert config is not None
+        assert config.evaluation_metric_key == '$ld:ai:judge:from-flag'
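# Illustrative end-to-end sketch in the style of the tests above, with all wiring
# (TestData-backed client, mocked provider and tracker call) assumed purely for
# illustration: a Judge built from a single-key config returns one EvalScore keyed
# by that metric.
import asyncio
from unittest.mock import AsyncMock, MagicMock

from ldclient import Config, Context, LDClient
from ldclient.integrations.test_data import TestData

from ldai.judge import Judge
from ldai.models import AIJudgeConfig, LDMessage, ModelConfig, ProviderConfig
from ldai.providers.types import LDAIMetrics, StructuredResponse
from ldai.tracker import LDAIConfigTracker


async def main() -> None:
    td = TestData.data_source()
    client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False))
    context = Context.create('user-key')
    tracker = LDAIConfigTracker(client, 'judge-v1', 'judge-config', 1, 'gpt-4', 'openai', context)

    config = AIJudgeConfig(
        key='judge-config',
        enabled=True,
        evaluation_metric_key='$ld:ai:judge:relevance',
        messages=[LDMessage(role='system', content='You are a judge.')],
        model=ModelConfig('gpt-4'),
        provider=ProviderConfig('openai'),
    )

    mock_response = StructuredResponse(
        data={'evaluations': {'$ld:ai:judge:relevance': {'score': 0.9, 'reasoning': 'On topic.'}}},
        raw_response='{}',
        metrics=LDAIMetrics(success=True),
    )
    provider = MagicMock()
    provider.invoke_structured_model = AsyncMock(return_value=mock_response)
    tracker.track_metrics_of = AsyncMock(return_value=mock_response)

    judge = Judge(config, tracker, provider)
    result = await judge.evaluate('What is AI?', 'AI is artificial intelligence.')
    if result is not None:
        score = result.evals['$ld:ai:judge:relevance']
        print(score.score, score.reasoning)


asyncio.run(main())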