From 8d01693699e5c782813994cbd9ffe79aa7586923 Mon Sep 17 00:00:00 2001 From: Kevin Freeman Date: Wed, 21 Jan 2026 10:00:11 -0500 Subject: [PATCH 1/5] Add support for custom judges via evaluation metric key --- packages/sdk/server-ai/src/ldai/client.py | 48 +- .../sdk/server-ai/src/ldai/judge/__init__.py | 52 +- .../ldai/judge/evaluation_schema_builder.py | 36 +- packages/sdk/server-ai/src/ldai/models.py | 9 +- packages/sdk/server-ai/tests/test_judge.py | 597 ++++++++++++++++++ 5 files changed, 674 insertions(+), 68 deletions(-) create mode 100644 packages/sdk/server-ai/tests/test_judge.py diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 47465ef..674371f 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -98,14 +98,35 @@ def judge_config( key, context, default_value.to_dict(), variables ) - # Extract evaluation_metric_keys from the variation variation = self._client.variation(key, context, default_value.to_dict()) - evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or []) + def _extract_evaluation_metric_key(variation: Dict[str, Any], default_value: AIJudgeConfigDefault) -> Optional[str]: + """ + Extract evaluation_metric_key with backward compatibility. + + Priority: 1) evaluationMetricKey from variation, 2) evaluation_metric_key from default, + 3) first from evaluationMetricKeys in variation, 4) first from evaluation_metric_keys in default + """ + if evaluation_metric_key := variation.get('evaluationMetricKey'): + return evaluation_metric_key + + if default_value.evaluation_metric_key: + return default_value.evaluation_metric_key + + variation_keys = variation.get('evaluationMetricKeys') + if isinstance(variation_keys, list) and variation_keys: + return variation_keys[0] + + if default_value.evaluation_metric_keys: + return default_value.evaluation_metric_keys[0] + + return None + + evaluation_metric_key = _extract_evaluation_metric_key(variation, default_value) config = AIJudgeConfig( key=key, enabled=bool(enabled), - evaluation_metric_keys=evaluation_metric_keys, + evaluation_metric_key=evaluation_metric_key, model=model, messages=messages, provider=provider, @@ -142,7 +163,7 @@ async def create_judge( enabled=True, model=ModelConfig("gpt-4"), provider=ProviderConfig("openai"), - evaluation_metric_keys=['$ld:ai:judge:relevance'], + evaluation_metric_key='$ld:ai:judge:relevance', messages=[LDMessage(role='system', content='You are a relevance judge.')] ), variables={'metric': "relevance"} @@ -158,15 +179,12 @@ async def create_judge( self._client.track('$ld:ai:judge:function:createJudge', context, key, 1) try: - # Warn if reserved variables are provided if variables: if 'message_history' in variables: - # Note: Python doesn't have a logger on the client, but we could add one - pass # Would log warning if logger available + pass if 'response_to_evaluate' in variables: - pass # Would log warning if logger available + pass - # Overwrite reserved variables to ensure they remain as placeholders for judge evaluation extended_variables = dict(variables) if variables else {} extended_variables['message_history'] = '{{message_history}}' extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}' @@ -174,17 +192,14 @@ async def create_judge( judge_config = self.judge_config(key, context, default_value, extended_variables) if not judge_config.enabled or not judge_config.tracker: - # Would log info if logger available return None 
- # Create AI provider for the judge provider = await AIProviderFactory.create(judge_config, default_ai_provider) if not provider: return None return Judge(judge_config, judge_config.tracker, provider) except Exception as error: - # Would log error if logger available return None async def _initialize_judges( @@ -277,7 +292,6 @@ async def create_chat( config = self.completion_config(key, context, default_value, variables) if not config.enabled or not config.tracker: - # Would log info if logger available return None provider = await AIProviderFactory.create(config, default_ai_provider) @@ -331,7 +345,6 @@ def agent_config( :param variables: Variables for interpolation. :return: Configured AIAgentConfig instance. """ - # Track single agent usage self._client.track( "$ld:ai:agent:function:single", context, @@ -397,7 +410,6 @@ def agent_configs( :param context: The context to evaluate the agent configurations in. :return: Dictionary mapping agent keys to their AIAgentConfig configurations. """ - # Track multiple agents usage agent_count = len(agent_configs) self._client.track( "$ld:ai:agent:function:multiple", @@ -461,7 +473,6 @@ def __evaluate( all_variables.update(variables) all_variables['ldctx'] = context.to_dict() - # Extract messages messages = None if 'messages' in variation and isinstance(variation['messages'], list) and all( isinstance(entry, dict) for entry in variation['messages'] @@ -476,18 +487,15 @@ def __evaluate( for entry in variation['messages'] ] - # Extract instructions instructions = None if 'instructions' in variation and isinstance(variation['instructions'], str): instructions = self.__interpolate_template(variation['instructions'], all_variables) - # Extract provider config provider_config = None if 'provider' in variation and isinstance(variation['provider'], dict): provider = variation['provider'] provider_config = ProviderConfig(provider.get('name', '')) - # Extract model config model = None if 'model' in variation and isinstance(variation['model'], dict): parameters = variation['model'].get('parameters', None) @@ -498,7 +506,6 @@ def __evaluate( custom=custom ) - # Create tracker tracker = LDAIConfigTracker( self._client, variation.get('_ldMeta', {}).get('variationKey', ''), @@ -511,7 +518,6 @@ def __evaluate( enabled = variation.get('_ldMeta', {}).get('enabled', False) - # Extract judge configuration judge_configuration = None if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict): judge_config = variation['judgeConfiguration'] diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index eb4ef4b..3733894 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -38,9 +38,7 @@ def __init__( self._ai_config = ai_config self._ai_config_tracker = ai_config_tracker self._ai_provider = ai_provider - self._evaluation_response_structure = EvaluationSchemaBuilder.build( - ai_config.evaluation_metric_keys - ) + self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key) async def evaluate( self, @@ -57,9 +55,9 @@ async def evaluate( :return: Evaluation results or None if not sampled """ try: - if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0: + if not self._ai_config.evaluation_metric_key: log.warn( - 'Judge configuration is missing required evaluationMetricKeys' + 'Judge configuration is missing required evaluationMetricKey' ) 
return None @@ -83,8 +81,8 @@ async def evaluate( evals = self._parse_evaluation_response(response.data) - if len(evals) != len(self._ai_config.evaluation_metric_keys): - log.warn('Judge evaluation did not return all evaluations') + if self._ai_config.evaluation_metric_key not in evals: + log.warn('Judge evaluation did not return the expected evaluation') success = False return JudgeResponse( @@ -191,30 +189,30 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor evaluations = data['evaluations'] - for metric_key in self._ai_config.evaluation_metric_keys: - evaluation = evaluations.get(metric_key) + metric_key = self._ai_config.evaluation_metric_key + evaluation = evaluations.get(metric_key) - if not evaluation or not isinstance(evaluation, dict): - log.warn(f'Missing evaluation for metric key: {metric_key}') - continue + if not evaluation or not isinstance(evaluation, dict): + log.warn(f'Missing evaluation for metric key: {metric_key}') + return results - score = evaluation.get('score') - reasoning = evaluation.get('reasoning') + score = evaluation.get('score') + reasoning = evaluation.get('reasoning') - if not isinstance(score, (int, float)) or score < 0 or score > 1: - log.warn( - f'Invalid score evaluated for {metric_key}: {score}. ' - 'Score must be a number between 0 and 1 inclusive' - ) - continue + if not isinstance(score, (int, float)) or score < 0 or score > 1: + log.warn( + f'Invalid score evaluated for {metric_key}: {score}. ' + 'Score must be a number between 0 and 1 inclusive' + ) + return results - if not isinstance(reasoning, str): - log.warn( - f'Invalid reasoning evaluated for {metric_key}: {reasoning}. ' - 'Reasoning must be a string' - ) - continue + if not isinstance(reasoning, str): + log.warn( + f'Invalid reasoning evaluated for {metric_key}: {reasoning}. ' + 'Reasoning must be a string' + ) + return results - results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) + results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) return results diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index 29b885d..5a8019c 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -1,6 +1,6 @@ """Internal class for building dynamic evaluation response schemas.""" -from typing import Any, Dict +from typing import Any, Dict, Optional class EvaluationSchemaBuilder: @@ -10,26 +10,29 @@ class EvaluationSchemaBuilder: """ @staticmethod - def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]: + def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]: """ - Build an evaluation response schema from evaluation metric keys. + Build an evaluation response schema from evaluation metric key. 
- :param evaluation_metric_keys: List of evaluation metric keys - :return: Schema dictionary for structured output + :param evaluation_metric_key: Evaluation metric key, or None if not available + :return: Schema dictionary for structured output, or None if evaluation_metric_key is None """ + if evaluation_metric_key is None: + return None + return { 'title': 'EvaluationResponse', - 'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics", + 'description': f"Response containing evaluation results for {evaluation_metric_key} metric", 'type': 'object', 'properties': { 'evaluations': { 'type': 'object', 'description': ( f"Object containing evaluation results for " - f"{', '.join(evaluation_metric_keys)} metrics" + f"{evaluation_metric_key} metric" ), - 'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys), - 'required': evaluation_metric_keys, + 'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key), + 'required': [evaluation_metric_key], 'additionalProperties': False, }, }, @@ -38,17 +41,16 @@ def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]: } @staticmethod - def _build_key_properties(evaluation_metric_keys: list[str]) -> Dict[str, Any]: + def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]: """ - Build properties for each evaluation metric key. + Build properties for a single evaluation metric key. - :param evaluation_metric_keys: List of evaluation metric keys - :return: Dictionary of properties for each key + :param evaluation_metric_key: Evaluation metric key + :return: Dictionary of properties for the key """ - result: Dict[str, Any] = {} - for key in evaluation_metric_keys: - result[key] = EvaluationSchemaBuilder._build_key_schema(key) - return result + return { + evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key) + } @staticmethod def _build_key_schema(key: str) -> Dict[str, Any]: diff --git a/packages/sdk/server-ai/src/ldai/models.py b/packages/sdk/server-ai/src/ldai/models.py index 988d97d..4b6e7e7 100644 --- a/packages/sdk/server-ai/src/ldai/models.py +++ b/packages/sdk/server-ai/src/ldai/models.py @@ -285,7 +285,9 @@ class AIJudgeConfigDefault(AIConfigDefault): Default Judge-specific AI Config with required evaluation metric key. """ messages: Optional[List[LDMessage]] = None + # Deprecated: evaluation_metric_key is used instead evaluation_metric_keys: Optional[List[str]] = None + evaluation_metric_key: Optional[str] = None def to_dict(self) -> dict: """ @@ -293,8 +295,7 @@ def to_dict(self) -> dict: """ result = self._base_to_dict() result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None - if self.evaluation_metric_keys is not None: - result['evaluationMetricKeys'] = self.evaluation_metric_keys + result['evaluationMetricKey'] = self.evaluation_metric_key return result @@ -303,16 +304,18 @@ class AIJudgeConfig(AIConfig): """ Judge-specific AI Config with required evaluation metric key. """ + # Deprecated: evaluation_metric_key is used instead evaluation_metric_keys: List[str] = field(default_factory=list) messages: Optional[List[LDMessage]] = None + evaluation_metric_key: Optional[str] = None def to_dict(self) -> dict: """ Render the given judge config as a dictionary object. 
""" result = self._base_to_dict() - result['evaluationMetricKeys'] = self.evaluation_metric_keys result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None + result['evaluationMetricKey'] = self.evaluation_metric_key return result diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py new file mode 100644 index 0000000..35ee6c8 --- /dev/null +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -0,0 +1,597 @@ +"""Tests for Judge functionality.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest +from ldclient import Config, Context, LDClient +from ldclient.integrations.test_data import TestData + +from ldai.judge import Judge +from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder +from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig +from ldai.providers.types import EvalScore, JudgeResponse, LDAIMetrics, StructuredResponse +from ldai.tracker import LDAIConfigTracker + + +@pytest.fixture +def td() -> TestData: + td = TestData.data_source() + td.update( + td.flag('judge-config') + .variations( + { + 'model': {'name': 'gpt-4', 'parameters': {'temperature': 0.3}}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 'content': 'You are a judge.'}], + 'evaluationMetricKey': '$ld:ai:judge:relevance', + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + return td + + +@pytest.fixture +def client(td: TestData) -> LDClient: + config = Config('sdk-key', update_processor_class=td, send_events=False) + return LDClient(config=config) + + +@pytest.fixture +def mock_ai_provider(): + """Create a mock AI provider.""" + provider = MagicMock() + provider.invoke_structured_model = AsyncMock() + return provider + + +@pytest.fixture +def context() -> Context: + return Context.create('user-key') + + +@pytest.fixture +def tracker(client: LDClient, context: Context) -> LDAIConfigTracker: + return LDAIConfigTracker( + client, 'judge-v1', 'judge-config', 1, 'gpt-4', 'openai', context + ) + + +@pytest.fixture +def judge_config_with_key() -> AIJudgeConfig: + """Create a judge config with evaluation_metric_key.""" + return AIJudgeConfig( + key='judge-config', + enabled=True, + evaluation_metric_key='$ld:ai:judge:relevance', + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + +@pytest.fixture +def judge_config_without_key() -> AIJudgeConfig: + """Create a judge config without evaluation_metric_key.""" + return AIJudgeConfig( + key='judge-config', + enabled=True, + evaluation_metric_key=None, + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + +@pytest.fixture +def judge_config_without_messages() -> AIJudgeConfig: + """Create a judge config without messages.""" + return AIJudgeConfig( + key='judge-config', + enabled=True, + evaluation_metric_key='$ld:ai:judge:relevance', + messages=None, + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + +class TestJudgeInitialization: + """Tests for Judge initialization.""" + + def test_judge_initializes_with_evaluation_metric_key( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Judge should initialize successfully with evaluation_metric_key.""" + judge = Judge(judge_config_with_key, tracker, 
mock_ai_provider) + + assert judge._ai_config == judge_config_with_key + assert judge._evaluation_response_structure is not None + assert judge._evaluation_response_structure['title'] == 'EvaluationResponse' + assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required'] + + def test_judge_initializes_without_evaluation_metric_key( + self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Judge should initialize but have None for evaluation_response_structure.""" + judge = Judge(judge_config_without_key, tracker, mock_ai_provider) + + assert judge._ai_config == judge_config_without_key + assert judge._evaluation_response_structure is None + + +class TestJudgeEvaluate: + """Tests for Judge.evaluate() method.""" + + @pytest.mark.asyncio + async def test_evaluate_returns_none_when_evaluation_metric_key_missing( + self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should return None when evaluation_metric_key is missing.""" + judge = Judge(judge_config_without_key, tracker, mock_ai_provider) + + result = await judge.evaluate("input text", "output text") + + assert result is None + mock_ai_provider.invoke_structured_model.assert_not_called() + + @pytest.mark.asyncio + async def test_evaluate_returns_none_when_messages_missing( + self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should return None when messages are missing.""" + judge = Judge(judge_config_without_messages, tracker, mock_ai_provider) + + result = await judge.evaluate("input text", "output text") + + assert result is None + mock_ai_provider.invoke_structured_model.assert_not_called() + + @pytest.mark.asyncio + async def test_evaluate_success_with_valid_response( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should return JudgeResponse with valid evaluation.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + '$ld:ai:judge:relevance': { + 'score': 0.85, + 'reasoning': 'The response is highly relevant to the input.' 
+ } + } + }, + raw_response='{"evaluations": {...}}', + metrics=LDAIMetrics(success=True) + ) + + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + result = await judge.evaluate("What is AI?", "AI is artificial intelligence.") + + assert result is not None + assert isinstance(result, JudgeResponse) + assert result.success is True + assert '$ld:ai:judge:relevance' in result.evals + assert result.evals['$ld:ai:judge:relevance'].score == 0.85 + assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + + @pytest.mark.asyncio + async def test_evaluate_handles_missing_evaluation_in_response( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should handle missing evaluation in response.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + 'wrong-key': { + 'score': 0.5, + 'reasoning': 'Some reasoning' + } + } + }, + raw_response='{"evaluations": {...}}', + metrics=LDAIMetrics(success=True) + ) + + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + result = await judge.evaluate("input", "output") + + assert result is not None + assert result.success is False + assert len(result.evals) == 0 + + @pytest.mark.asyncio + async def test_evaluate_handles_invalid_score( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should handle invalid score values.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + '$ld:ai:judge:relevance': { + 'score': 1.5, + 'reasoning': 'Some reasoning' + } + } + }, + raw_response='{"evaluations": {...}}', + metrics=LDAIMetrics(success=True) + ) + + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + result = await judge.evaluate("input", "output") + + assert result is not None + assert result.success is False + assert len(result.evals) == 0 + + @pytest.mark.asyncio + async def test_evaluate_handles_missing_reasoning( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should handle missing reasoning.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + '$ld:ai:judge:relevance': { + 'score': 0.8, + } + } + }, + raw_response='{"evaluations": {...}}', + metrics=LDAIMetrics(success=True) + ) + + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + result = await judge.evaluate("input", "output") + + assert result is not None + assert result.success is False + assert len(result.evals) == 0 + + @pytest.mark.asyncio + async def test_evaluate_handles_exception( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should handle exceptions gracefully.""" + mock_ai_provider.invoke_structured_model.side_effect = Exception("Provider error") + tracker.track_metrics_of = AsyncMock(side_effect=Exception("Provider error")) + + judge = Judge(judge_config_with_key, tracker, 
mock_ai_provider) + + result = await judge.evaluate("input", "output") + + assert result is not None + assert isinstance(result, JudgeResponse) + assert result.success is False + assert result.error is not None + assert len(result.evals) == 0 + + @pytest.mark.asyncio + async def test_evaluate_respects_sampling_rate( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should respect sampling rate.""" + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + result = await judge.evaluate("input", "output", sampling_rate=0.0) + + assert result is None + mock_ai_provider.invoke_structured_model.assert_not_called() + + +class TestJudgeEvaluateMessages: + """Tests for Judge.evaluate_messages() method.""" + + @pytest.mark.asyncio + async def test_evaluate_messages_calls_evaluate( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """evaluate_messages should call evaluate with constructed input/output.""" + from ldai.providers.types import ChatResponse + + mock_response = StructuredResponse( + data={ + 'evaluations': { + '$ld:ai:judge:relevance': { + 'score': 0.9, + 'reasoning': 'Very relevant' + } + } + }, + raw_response='{"evaluations": {...}}', + metrics=LDAIMetrics(success=True) + ) + + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + + messages = [ + LDMessage(role='user', content='Question 1'), + LDMessage(role='assistant', content='Answer 1'), + ] + chat_response = ChatResponse( + message=LDMessage(role='assistant', content='Answer 2'), + metrics=LDAIMetrics(success=True) + ) + + result = await judge.evaluate_messages(messages, chat_response) + + assert result is not None + assert result.success is True + assert tracker.track_metrics_of.called + + +class TestEvaluationSchemaBuilder: + """Tests for EvaluationSchemaBuilder.""" + + def test_build_creates_correct_schema(self): + """Schema builder should create correct schema structure.""" + schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance') + + assert schema['title'] == 'EvaluationResponse' + assert schema['type'] == 'object' + assert 'evaluations' in schema['properties'] + assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['required'] + assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties'] + + metric_schema = schema['properties']['evaluations']['properties']['$ld:ai:judge:relevance'] + assert metric_schema['type'] == 'object' + assert 'score' in metric_schema['properties'] + assert 'reasoning' in metric_schema['properties'] + assert metric_schema['properties']['score']['type'] == 'number' + assert metric_schema['properties']['score']['minimum'] == 0 + assert metric_schema['properties']['score']['maximum'] == 1 + + def test_build_key_properties_creates_single_key(self): + """_build_key_properties should create properties for a single key.""" + properties = EvaluationSchemaBuilder._build_key_properties('$ld:ai:judge:relevance') + + assert '$ld:ai:judge:relevance' in properties + assert len(properties) == 1 + assert properties['$ld:ai:judge:relevance']['type'] == 'object' + + +class TestJudgeConfigSerialization: + """Tests for AIJudgeConfig serialization.""" + + def test_to_dict_includes_evaluation_metric_key(self): + """to_dict should include evaluationMetricKey.""" + config = AIJudgeConfig( + key='test-judge', 
+ enabled=True, + evaluation_metric_key='$ld:ai:judge:relevance', + messages=[LDMessage(role='system', content='You are a judge.')], + ) + + result = config.to_dict() + + assert result['evaluationMetricKey'] == '$ld:ai:judge:relevance' + assert 'evaluationMetricKeys' not in result + + def test_to_dict_handles_none_evaluation_metric_key(self): + """to_dict should handle None evaluation_metric_key.""" + config = AIJudgeConfig( + key='test-judge', + enabled=True, + evaluation_metric_key=None, + messages=[LDMessage(role='system', content='You are a judge.')], + ) + + result = config.to_dict() + + assert result['evaluationMetricKey'] is None + + def test_judge_config_default_to_dict(self): + """AIJudgeConfigDefault.to_dict should work correctly.""" + config = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key='$ld:ai:judge:relevance', + messages=[LDMessage(role='system', content='You are a judge.')], + ) + + result = config.to_dict() + + assert result['evaluationMetricKey'] == '$ld:ai:judge:relevance' + assert 'evaluationMetricKeys' not in result + + +class TestClientJudgeConfig: + """Tests for LDAIClient.judge_config() method.""" + + def test_judge_config_extracts_evaluation_metric_key( + self, client: LDClient, context: Context + ): + """judge_config should extract evaluationMetricKey from variation.""" + from ldai import LDAIClient + + ldai_client = LDAIClient(client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key='$ld:ai:judge:relevance', + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + config = ldai_client.judge_config('judge-config', context, default_value) + + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:relevance' + assert config.enabled is True + assert config.messages is not None + assert len(config.messages) > 0 + + def test_judge_config_uses_default_when_key_missing( + self, client: LDClient, context: Context + ): + """judge_config should use default evaluation_metric_key when not in variation.""" + from ldai import LDAIClient + + td = TestData.data_source() + td.update( + td.flag('judge-no-key') + .variations( + { + 'model': {'name': 'gpt-4'}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 'content': 'You are a judge.'}], + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + + test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) + ldai_client = LDAIClient(test_client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key='$ld:ai:judge:default', + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + config = ldai_client.judge_config('judge-no-key', context, default_value) + + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:default' + + def test_judge_config_uses_first_evaluation_metric_keys_from_variation( + self, context: Context + ): + """judge_config should use first value from evaluationMetricKeys when evaluationMetricKey is None.""" + from ldai import LDAIClient + from ldclient import Config, LDClient + from ldclient.integrations.test_data import TestData + + td = TestData.data_source() + td.update( + td.flag('judge-with-keys') + .variations( + { + 'model': {'name': 'gpt-4'}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 
'content': 'You are a judge.'}], + 'evaluationMetricKeys': ['$ld:ai:judge:relevance', '$ld:ai:judge:quality'], + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + + test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) + ldai_client = LDAIClient(test_client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key=None, + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + config = ldai_client.judge_config('judge-with-keys', context, default_value) + + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:relevance' + + def test_judge_config_uses_first_evaluation_metric_keys_from_default( + self, context: Context + ): + """judge_config should use first value from default evaluation_metric_keys when both variation and default evaluation_metric_key are None.""" + from ldai import LDAIClient + from ldclient import Config, LDClient + from ldclient.integrations.test_data import TestData + + td = TestData.data_source() + td.update( + td.flag('judge-fallback-keys') + .variations( + { + 'model': {'name': 'gpt-4'}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 'content': 'You are a judge.'}], + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + + test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) + ldai_client = LDAIClient(test_client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key=None, + evaluation_metric_keys=['$ld:ai:judge:default-key', '$ld:ai:judge:secondary'], + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + config = ldai_client.judge_config('judge-fallback-keys', context, default_value) + + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:default-key' + + def test_judge_config_prefers_evaluation_metric_key_over_keys( + self, context: Context + ): + """judge_config should prefer evaluationMetricKey over evaluationMetricKeys when both are present.""" + from ldai import LDAIClient + from ldclient import Config, LDClient + from ldclient.integrations.test_data import TestData + + td = TestData.data_source() + td.update( + td.flag('judge-both-present') + .variations( + { + 'model': {'name': 'gpt-4'}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 'content': 'You are a judge.'}], + 'evaluationMetricKey': '$ld:ai:judge:preferred', + 'evaluationMetricKeys': ['$ld:ai:judge:relevance', '$ld:ai:judge:quality'], + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + + test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) + ldai_client = LDAIClient(test_client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key=None, + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + config = ldai_client.judge_config('judge-both-present', context, default_value) + + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:preferred' From 350f884e890a2d430e36af95e4b8e4fc0fd54ea0 Mon Sep 17 00:00:00 2001 From: Kevin Freeman Date: Wed, 21 Jan 2026 10:12:21 -0500 
Subject: [PATCH 2/5] fixed linter issues --- packages/sdk/server-ai/src/ldai/judge/__init__.py | 9 ++++++--- .../src/ldai/judge/evaluation_schema_builder.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 3733894..0ca402a 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -9,8 +9,7 @@ from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, LDMessage from ldai.providers.ai_provider import AIProvider -from ldai.providers.types import (ChatResponse, EvalScore, JudgeResponse, - StructuredResponse) +from ldai.providers.types import ChatResponse, EvalScore, JudgeResponse from ldai.tracker import LDAIConfigTracker @@ -70,8 +69,8 @@ async def evaluate( return None messages = self._construct_evaluation_messages(input_text, output_text) + assert self._evaluation_response_structure is not None - # Track metrics of the structured model invocation response = await self._ai_config_tracker.track_metrics_of( lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure), lambda result: result.metrics, @@ -190,6 +189,10 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor evaluations = data['evaluations'] metric_key = self._ai_config.evaluation_metric_key + if not metric_key: + log.warn('Evaluation metric key is missing') + return results + evaluation = evaluations.get(metric_key) if not evaluation or not isinstance(evaluation, dict): diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index 5a8019c..671a94d 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -17,7 +17,7 @@ def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]: :param evaluation_metric_key: Evaluation metric key, or None if not available :return: Schema dictionary for structured output, or None if evaluation_metric_key is None """ - if evaluation_metric_key is None: + if not evaluation_metric_key: return None return { From 00a265e01cab67435c511ca5bb7c89ac597254b3 Mon Sep 17 00:00:00 2001 From: Kevin Freeman Date: Wed, 21 Jan 2026 10:18:24 -0500 Subject: [PATCH 3/5] Linting --- packages/sdk/server-ai/src/ldai/client.py | 17 ++++++++++------- .../src/ldai/judge/evaluation_schema_builder.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 674371f..ed2b0bf 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -99,28 +99,31 @@ def judge_config( ) variation = self._client.variation(key, context, default_value.to_dict()) - def _extract_evaluation_metric_key(variation: Dict[str, Any], default_value: AIJudgeConfigDefault) -> Optional[str]: + + def _extract_evaluation_metric_key( + variation: Dict[str, Any], default_value: AIJudgeConfigDefault + ) -> Optional[str]: """ Extract evaluation_metric_key with backward compatibility. 
- + Priority: 1) evaluationMetricKey from variation, 2) evaluation_metric_key from default, 3) first from evaluationMetricKeys in variation, 4) first from evaluation_metric_keys in default """ if evaluation_metric_key := variation.get('evaluationMetricKey'): return evaluation_metric_key - + if default_value.evaluation_metric_key: return default_value.evaluation_metric_key - + variation_keys = variation.get('evaluationMetricKeys') if isinstance(variation_keys, list) and variation_keys: return variation_keys[0] - + if default_value.evaluation_metric_keys: return default_value.evaluation_metric_keys[0] - + return None - + evaluation_metric_key = _extract_evaluation_metric_key(variation, default_value) config = AIJudgeConfig( diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index 671a94d..c69e0af 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -19,7 +19,7 @@ def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]: """ if not evaluation_metric_key: return None - + return { 'title': 'EvaluationResponse', 'description': f"Response containing evaluation results for {evaluation_metric_key} metric", From d277b49891ecf3977d0e0021fb39c2d47b4aff4a Mon Sep 17 00:00:00 2001 From: Kevin Freeman Date: Wed, 21 Jan 2026 12:27:43 -0500 Subject: [PATCH 4/5] Addressed PR feedback; fixed race condition --- packages/sdk/server-ai/src/ldai/client.py | 24 +++++------ packages/sdk/server-ai/tests/test_judge.py | 50 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index ed2b0bf..af6f50d 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -40,7 +40,7 @@ def completion_config( """ self._client.track('$ld:ai:config:function:single', context, key, 1) - model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( + model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate( key, context, default_value.to_dict(), variables ) @@ -94,31 +94,29 @@ def judge_config( """ self._client.track('$ld:ai:judge:function:single', context, key, 1) - model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( + model, provider, messages, instructions, tracker, enabled, judge_configuration, variation = self.__evaluate( key, context, default_value.to_dict(), variables ) - variation = self._client.variation(key, context, default_value.to_dict()) - def _extract_evaluation_metric_key( variation: Dict[str, Any], default_value: AIJudgeConfigDefault ) -> Optional[str]: """ Extract evaluation_metric_key with backward compatibility. 
- Priority: 1) evaluationMetricKey from variation, 2) evaluation_metric_key from default, - 3) first from evaluationMetricKeys in variation, 4) first from evaluation_metric_keys in default + Priority: 1) evaluationMetricKey from variation, 2) evaluationMetricKeys from variation, + 3) evaluation_metric_key from default, 4) evaluation_metric_keys from default """ if evaluation_metric_key := variation.get('evaluationMetricKey'): return evaluation_metric_key - if default_value.evaluation_metric_key: - return default_value.evaluation_metric_key - variation_keys = variation.get('evaluationMetricKeys') if isinstance(variation_keys, list) and variation_keys: return variation_keys[0] + if default_value.evaluation_metric_key: + return default_value.evaluation_metric_key + if default_value.evaluation_metric_keys: return default_value.evaluation_metric_keys[0] @@ -458,7 +456,7 @@ def __evaluate( variables: Optional[Dict[str, Any]] = None, ) -> Tuple[ Optional[ModelConfig], Optional[ProviderConfig], Optional[List[LDMessage]], - Optional[str], LDAIConfigTracker, bool, Optional[Any] + Optional[str], LDAIConfigTracker, bool, Optional[Any], Dict[str, Any] ]: """ Internal method to evaluate a configuration and extract components. @@ -467,7 +465,7 @@ def __evaluate( :param context: The evaluation context. :param default_dict: Default configuration as dictionary. :param variables: Variables for interpolation. - :return: Tuple of (model, provider, messages, instructions, tracker, enabled). + :return: Tuple of (model, provider, messages, instructions, tracker, enabled, judge_configuration, variation). """ variation = self._client.variation(key, context, default_dict) @@ -536,7 +534,7 @@ def __evaluate( if judges: judge_configuration = JudgeConfiguration(judges=judges) - return model, provider_config, messages, instructions, tracker, enabled, judge_configuration + return model, provider_config, messages, instructions, tracker, enabled, judge_configuration, variation def __evaluate_agent( self, @@ -554,7 +552,7 @@ def __evaluate_agent( :param variables: Variables for interpolation. :return: Configured AIAgentConfig instance. 
""" - model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( + model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate( key, context, default_value.to_dict(), variables ) diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index 35ee6c8..afef060 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -595,3 +595,53 @@ def test_judge_config_prefers_evaluation_metric_key_over_keys( assert config is not None assert config.evaluation_metric_key == '$ld:ai:judge:preferred' + + def test_judge_config_uses_same_variation_for_consistency( + self, context: Context + ): + """judge_config should use the same variation from __evaluate to avoid race conditions.""" + from ldai import LDAIClient + from ldclient import Config, LDClient + from ldclient.integrations.test_data import TestData + from unittest.mock import patch + + td = TestData.data_source() + td.update( + td.flag('judge-consistency-test') + .variations( + { + 'model': {'name': 'gpt-4'}, + 'provider': {'name': 'openai'}, + 'messages': [{'role': 'system', 'content': 'You are a judge.'}], + 'evaluationMetricKey': '$ld:ai:judge:from-flag', + '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, + } + ) + .variation_for_all(0) + ) + + test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) + ldai_client = LDAIClient(test_client) + + default_value = AIJudgeConfigDefault( + enabled=True, + evaluation_metric_key='$ld:ai:judge:from-default', + messages=[LDMessage(role='system', content='You are a judge.')], + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + ) + + variation_calls = [] + original_variation = test_client.variation + + def tracked_variation(key, context, default): + result = original_variation(key, context, default) + variation_calls.append((key, result.get('evaluationMetricKey'))) + return result + + with patch.object(test_client, 'variation', side_effect=tracked_variation): + config = ldai_client.judge_config('judge-consistency-test', context, default_value) + + assert len(variation_calls) == 1, f"Expected 1 variation call, got {len(variation_calls)}" + assert config is not None + assert config.evaluation_metric_key == '$ld:ai:judge:from-flag' From d67d0abf2cb374b90f212f20b8e3fa8d64daed1f Mon Sep 17 00:00:00 2001 From: Kevin Freeman Date: Wed, 21 Jan 2026 12:37:01 -0500 Subject: [PATCH 5/5] modified default behaviour --- packages/sdk/server-ai/src/ldai/client.py | 15 ++-------- packages/sdk/server-ai/src/ldai/models.py | 3 ++ packages/sdk/server-ai/tests/test_judge.py | 32 ++++------------------ 3 files changed, 11 insertions(+), 39 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index af6f50d..842dd00 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -98,14 +98,11 @@ def judge_config( key, context, default_value.to_dict(), variables ) - def _extract_evaluation_metric_key( - variation: Dict[str, Any], default_value: AIJudgeConfigDefault - ) -> Optional[str]: + def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]: """ Extract evaluation_metric_key with backward compatibility. 
- Priority: 1) evaluationMetricKey from variation, 2) evaluationMetricKeys from variation, - 3) evaluation_metric_key from default, 4) evaluation_metric_keys from default + Priority: 1) evaluationMetricKey from variation, 2) first from evaluationMetricKeys in variation """ if evaluation_metric_key := variation.get('evaluationMetricKey'): return evaluation_metric_key @@ -114,15 +111,9 @@ def _extract_evaluation_metric_key( if isinstance(variation_keys, list) and variation_keys: return variation_keys[0] - if default_value.evaluation_metric_key: - return default_value.evaluation_metric_key - - if default_value.evaluation_metric_keys: - return default_value.evaluation_metric_keys[0] - return None - evaluation_metric_key = _extract_evaluation_metric_key(variation, default_value) + evaluation_metric_key = _extract_evaluation_metric_key(variation) config = AIJudgeConfig( key=key, diff --git a/packages/sdk/server-ai/src/ldai/models.py b/packages/sdk/server-ai/src/ldai/models.py index 4b6e7e7..19d5efc 100644 --- a/packages/sdk/server-ai/src/ldai/models.py +++ b/packages/sdk/server-ai/src/ldai/models.py @@ -296,6 +296,9 @@ def to_dict(self) -> dict: result = self._base_to_dict() result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None result['evaluationMetricKey'] = self.evaluation_metric_key + # Include deprecated evaluationMetricKeys for backward compatibility + if self.evaluation_metric_keys: + result['evaluationMetricKeys'] = self.evaluation_metric_keys return result diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index afef060..407e38e 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -442,25 +442,15 @@ def test_judge_config_extracts_evaluation_metric_key( assert config.messages is not None assert len(config.messages) > 0 - def test_judge_config_uses_default_when_key_missing( + def test_judge_config_uses_default_when_flag_does_not_exist( self, client: LDClient, context: Context ): - """judge_config should use default evaluation_metric_key when not in variation.""" + """judge_config should use default evaluation_metric_key when flag does not exist.""" from ldai import LDAIClient + from ldclient import Config, LDClient + from ldclient.integrations.test_data import TestData td = TestData.data_source() - td.update( - td.flag('judge-no-key') - .variations( - { - 'model': {'name': 'gpt-4'}, - 'provider': {'name': 'openai'}, - 'messages': [{'role': 'system', 'content': 'You are a judge.'}], - '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, - } - ) - .variation_for_all(0) - ) test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) ldai_client = LDAIClient(test_client) @@ -520,24 +510,12 @@ def test_judge_config_uses_first_evaluation_metric_keys_from_variation( def test_judge_config_uses_first_evaluation_metric_keys_from_default( self, context: Context ): - """judge_config should use first value from default evaluation_metric_keys when both variation and default evaluation_metric_key are None.""" + """judge_config should use first value from default evaluation_metric_keys when flag does not exist.""" from ldai import LDAIClient from ldclient import Config, LDClient from ldclient.integrations.test_data import TestData td = TestData.data_source() - td.update( - td.flag('judge-fallback-keys') - .variations( - { - 'model': {'name': 'gpt-4'}, - 'provider': {'name': 'openai'}, - 'messages': [{'role': 
'system', 'content': 'You are a judge.'}], - '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1}, - } - ) - .variation_for_all(0) - ) test_client = LDClient(Config('sdk-key', update_processor_class=td, send_events=False)) ldai_client = LDAIClient(test_client)
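
Usage sketch: a minimal example of how a caller might exercise the single-key judge API introduced in this series. The entry points (LDAIClient.create_judge, AIJudgeConfigDefault, Judge.evaluate) and the '$ld:ai:judge:relevance' key are taken from the patch 1 docstring and tests; the LDClient/Context setup and the exact create_judge signature are assumptions inferred from those same snippets rather than verified against the full SDK.

    import asyncio

    from ldclient import Config, Context, LDClient

    from ldai import LDAIClient
    from ldai.models import (AIJudgeConfigDefault, LDMessage, ModelConfig,
                             ProviderConfig)


    async def main() -> None:
        # Assumed setup: a normal server-side LDClient plus the AI wrapper.
        # A real deployment would use a valid SDK key and data source.
        ld_client = LDClient(Config('sdk-key'))
        ai_client = LDAIClient(ld_client)
        context = Context.create('user-key')

        # The default judge config now carries a single evaluation_metric_key;
        # the deprecated evaluation_metric_keys list remains only for
        # backward compatibility.
        default_judge = AIJudgeConfigDefault(
            enabled=True,
            model=ModelConfig('gpt-4'),
            provider=ProviderConfig('openai'),
            evaluation_metric_key='$ld:ai:judge:relevance',
            messages=[LDMessage(role='system',
                                content='You are a relevance judge.')],
        )

        # Signature assumed from the create_judge docstring in patch 1.
        judge = await ai_client.create_judge(
            'judge-config', context, default_judge,
            variables={'metric': 'relevance'},
        )
        if judge is None:
            # Disabled flag, missing tracker, or provider creation failure.
            return

        result = await judge.evaluate('What is AI?',
                                      'AI is artificial intelligence.')
        if result is not None and result.success:
            evaluation = result.evals['$ld:ai:judge:relevance']
            print(evaluation.score, evaluation.reasoning)


    if __name__ == '__main__':
        asyncio.run(main())

Note that after patch 5 the evaluation metric key is resolved only from the flag variation (evaluationMetricKey first, then the first entry of the deprecated evaluationMetricKeys); the default value's key is used only when the flag itself is missing and the default dictionary is returned by the variation call.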