From 4c0e70a1c968184c800a7919ef7607a0c79730ec Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 11 Jun 2026 13:10:18 +0000 Subject: [PATCH 1/2] fix(eval): improve rubric text normalization for judge-garbled output Replace _normalize_text's simple lower().strip() with NFKC unicode normalization, smart-quote/dash translation, and markdown artifact stripping. Add substring fallback with uniqueness guard to convert_auto_rater_response_to_score for cases where normalization alone isn't sufficient. Fixes #6072 --- .../adk/evaluation/rubric_based_evaluator.py | 33 ++++- .../evaluation/test_rubric_based_evaluator.py | 113 ++++++++++++++++++ 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py index 451a14f1a5..df6a6f0b18 100644 --- a/src/google/adk/evaluation/rubric_based_evaluator.py +++ b/src/google/adk/evaluation/rubric_based_evaluator.py @@ -17,6 +17,7 @@ import abc import logging import re +import unicodedata from typing import Optional from typing_extensions import override @@ -277,10 +278,30 @@ def summarize( ) +_SMART_CHARS = { + 0x2018: "'", + 0x2019: "'", + 0x201C: '"', + 0x201D: '"', + 0x2013: "-", + 0x2014: "-", + 0x2026: "...", +} + + def _normalize_text(text: str) -> str: - """Returns a normalized version of the passed in text.""" + """Returns a normalized version of the passed in text. + + Handles common judge-model garbling: markdown bullets, smart quotes, + bold/italic markers, en/em dashes, and extra whitespace. + """ if not isinstance(text, str): return "" + text = unicodedata.normalize("NFKC", text) + text = text.translate(_SMART_CHARS) + text = re.sub(r'^[\s*•\-"\']+', "", text) + text = re.sub(r'[\s*•\-"\']+$', "", text) + text = re.sub(r"\s+", " ", text) return text.lower().strip() @@ -394,6 +415,16 @@ def convert_auto_rater_response_to_score( for rubric_response in rubric_responses: normalized_rubric_text = _normalize_text(rubric_response.property_text) rubric = normalized_rubric_to_rubric_map.get(normalized_rubric_text, None) + + if not rubric: + candidates = [ + r + for ct, r in normalized_rubric_to_rubric_map.items() + if ct in normalized_rubric_text or normalized_rubric_text in ct + ] + if len(candidates) == 1: + rubric = candidates[0] + if rubric: rubric_scores.append( RubricScore( diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py index 87a10cbc82..2f4ffc8dba 100644 --- a/tests/unittests/evaluation/test_rubric_based_evaluator.py +++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py @@ -28,6 +28,7 @@ from google.adk.evaluation.rubric_based_evaluator import DefaultAutoRaterResponseParser from google.adk.evaluation.rubric_based_evaluator import MajorityVotePerInvocationResultsAggregator from google.adk.evaluation.rubric_based_evaluator import MeanInvocationResultsSummarizer +from google.adk.evaluation.rubric_based_evaluator import _normalize_text from google.adk.evaluation.rubric_based_evaluator import RubricBasedEvaluator from google.adk.models.llm_response import LlmResponse from google.genai import types as genai_types @@ -658,3 +659,115 @@ def test_create_effective_rubrics_filters_by_rubric_type( "2", "test_type_rubric", } + + +class TestNormalizeText: + """Validate _normalize_text handles common judge-model garbling patterns.""" + + RUBRIC = "the response correctly uses tools" + + @pytest.mark.parametrize( + "label,input_text", + [ + ("exact", "The response correctly uses tools"), + ("markdown_bullet", "- The response correctly uses tools"), + ("bullet_bold", "* **The response correctly uses tools**"), + ( + "smart_double_quotes", + "“The response correctly uses tools”", + ), + ("double_spaces", "The response correctly uses tools"), + ( + "em_dash_prefix", + "— The response correctly uses tools", + ), + ( + "en_dash_prefix", + "– The response correctly uses tools", + ), + ( + "unicode_bullet", + "• The response correctly uses tools", + ), + ( + "leading_whitespace", + " The response correctly uses tools", + ), + ], + ) + def test_garbled_text_matches_rubric(self, label, input_text): + assert _normalize_text(input_text) == self.RUBRIC + + def test_ellipsis_normalized(self): + assert ( + _normalize_text("The response… uses tools") + == "the response... uses tools" + ) + + def test_accented_chars_preserved(self): + assert _normalize_text("réponse") == "réponse" + + def test_non_string_returns_empty(self): + assert _normalize_text(None) == "" + assert _normalize_text(42) == "" + + def test_empty_string(self): + assert _normalize_text("") == "" + + +class TestSubstringFallbackUniquenessGuard: + """Verify substring fallback only matches when exactly one candidate exists. + + The convert_auto_rater_response_to_score method falls back to substring + matching when exact normalized match fails. When multiple rubrics share + a common substring, the fallback must reject the ambiguous match. + """ + + def _build_evaluator_and_score(self, rubric_texts, judge_property_text): + """Helper: build a FakeRubricBasedEvaluator and score a judge response.""" + rubrics = [] + for i, text in enumerate(rubric_texts): + rubrics.append( + Rubric( + rubric_id=f"rubric_{i}", + rubric_content=RubricContent(text_property=text), + ) + ) + + metric = EvalMetric( + metric_name="test_metric", + threshold=0.5, + criterion=RubricsBasedCriterion(rubrics=rubrics, threshold=0.5), + ) + evaluator = FakeRubricBasedEvaluator(eval_metric=metric) + evaluator.create_effective_rubrics_list([]) + + response_text = ( + f"Property: {judge_property_text}\n" + "Rationale: test rationale\n" + "Verdict: yes" + ) + response = LlmResponse( + content=genai_types.Content( + parts=[genai_types.Part(text=response_text)] + ) + ) + return evaluator.convert_auto_rater_response_to_score(response) + + def test_unique_substring_match_accepted(self): + result = self._build_evaluator_and_score( + rubric_texts=["Uses tools correctly"], + judge_property_text="Uses tools correctly", + ) + assert len(result.rubric_scores) == 1 + assert result.rubric_scores[0].rubric_id == "rubric_0" + + def test_ambiguous_substring_match_rejected(self): + result = self._build_evaluator_and_score( + rubric_texts=[ + "Uses tools correctly", + "Uses tools efficiently", + ], + judge_property_text="Uses tools", + ) + assert len(result.rubric_scores) == 0 From 9bb59fcbd6ee036cbe0fea285973bc84eb937f03 Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 11 Jun 2026 22:40:41 +0000 Subject: [PATCH 2/2] style: fix import ordering for pre-commit compliance --- src/google/adk/evaluation/rubric_based_evaluator.py | 3 ++- tests/unittests/evaluation/test_rubric_based_evaluator.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py index df6a6f0b18..4fecdf3953 100644 --- a/src/google/adk/evaluation/rubric_based_evaluator.py +++ b/src/google/adk/evaluation/rubric_based_evaluator.py @@ -17,9 +17,10 @@ import abc import logging import re -import unicodedata from typing import Optional +import unicodedata + from typing_extensions import override from ..models.llm_response import LlmResponse diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py index 2f4ffc8dba..11eaf3b543 100644 --- a/tests/unittests/evaluation/test_rubric_based_evaluator.py +++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py @@ -25,10 +25,10 @@ from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.evaluator import PerInvocationResult from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score +from google.adk.evaluation.rubric_based_evaluator import _normalize_text from google.adk.evaluation.rubric_based_evaluator import DefaultAutoRaterResponseParser from google.adk.evaluation.rubric_based_evaluator import MajorityVotePerInvocationResultsAggregator from google.adk.evaluation.rubric_based_evaluator import MeanInvocationResultsSummarizer -from google.adk.evaluation.rubric_based_evaluator import _normalize_text from google.adk.evaluation.rubric_based_evaluator import RubricBasedEvaluator from google.adk.models.llm_response import LlmResponse from google.genai import types as genai_types