From 4c0e70a1c968184c800a7919ef7607a0c79730ec Mon Sep 17 00:00:00 2001
From: Jordan <jordantotten@google.com>
Date: Thu, 11 Jun 2026 13:10:18 +0000
Subject: [PATCH 1/2] fix(eval): improve rubric text normalization for
 judge-garbled output

Extend _normalize_text beyond lower().strip(): apply NFKC Unicode
normalization, smart-quote/dash translation, leading/trailing markdown
artifact stripping, and whitespace collapsing. Add a substring fallback
with a uniqueness guard to convert_auto_rater_response_to_score for
cases where normalization alone isn't sufficient.

Fixes #6072
---
 .../adk/evaluation/rubric_based_evaluator.py  |  33 ++++-
 .../evaluation/test_rubric_based_evaluator.py | 113 ++++++++++++++++++
 2 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py
index 451a14f1a5..df6a6f0b18 100644
--- a/src/google/adk/evaluation/rubric_based_evaluator.py
+++ b/src/google/adk/evaluation/rubric_based_evaluator.py
@@ -17,6 +17,7 @@
 import abc
 import logging
 import re
+import unicodedata
 from typing import Optional
 
 from typing_extensions import override
@@ -277,10 +278,30 @@ def summarize(
     )
 
 
+_SMART_CHARS = {
+    0x2018: "'",  # left single quotation mark
+    0x2019: "'",  # right single quotation mark
+    0x201C: '"',  # left double quotation mark
+    0x201D: '"',  # right double quotation mark
+    0x2013: "-",  # en dash
+    0x2014: "-",  # em dash
+    0x2026: "...",  # horizontal ellipsis
+}
+
+
 def _normalize_text(text: str) -> str:
-  """Returns a normalized version of the passed in text."""
+  """Returns a normalized form of the text, or an empty string if not a str.
+
+  Applies NFKC, maps smart quotes, dashes and ellipsis to ASCII, strips bullet,
+  asterisk, dash and quote runs at both ends, collapses whitespace, lowercases.
+  """
   if not isinstance(text, str):
     return ""
+  text = unicodedata.normalize("NFKC", text)
+  text = text.translate(_SMART_CHARS)
+  text = re.sub(r'^[\s*•\-"\']+', "", text)  # leading bullets/markers/quotes
+  text = re.sub(r'[\s*•\-"\']+$', "", text)  # trailing bullets/markers/quotes
+  text = re.sub(r"\s+", " ", text)
   return text.lower().strip()
 
 
@@ -394,6 +415,16 @@ def convert_auto_rater_response_to_score(
     for rubric_response in rubric_responses:
       normalized_rubric_text = _normalize_text(rubric_response.property_text)
       rubric = normalized_rubric_to_rubric_map.get(normalized_rubric_text, None)
+
+      if not rubric and normalized_rubric_text:  # "" matches everything
+        candidates = [
+            r
+            for ct, r in normalized_rubric_to_rubric_map.items()
+            if ct in normalized_rubric_text or normalized_rubric_text in ct
+        ]
+        if len(candidates) == 1:
+          rubric = candidates[0]
+
       if rubric:
         rubric_scores.append(
             RubricScore(
diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py
index 87a10cbc82..2f4ffc8dba 100644
--- a/tests/unittests/evaluation/test_rubric_based_evaluator.py
+++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py
@@ -28,6 +28,7 @@
 from google.adk.evaluation.rubric_based_evaluator import DefaultAutoRaterResponseParser
 from google.adk.evaluation.rubric_based_evaluator import MajorityVotePerInvocationResultsAggregator
 from google.adk.evaluation.rubric_based_evaluator import MeanInvocationResultsSummarizer
+from google.adk.evaluation.rubric_based_evaluator import _normalize_text
 from google.adk.evaluation.rubric_based_evaluator import RubricBasedEvaluator
 from google.adk.models.llm_response import LlmResponse
 from google.genai import types as genai_types
@@ -658,3 +659,115 @@ def test_create_effective_rubrics_filters_by_rubric_type(
         "2",
         "test_type_rubric",
     }
+
+
+class TestNormalizeText:
+  """Validate _normalize_text handles common judge-model garbling patterns."""
+
+  RUBRIC = "the response correctly uses tools"
+
+  @pytest.mark.parametrize(
+      "label,input_text",
+      [
+          ("exact", "The response correctly uses tools"),
+          ("markdown_bullet", "- The response correctly uses tools"),
+          ("bullet_bold", "* **The response correctly uses tools**"),
+          (
+              "smart_double_quotes",
+              "“The response correctly uses tools”",
+          ),
+          ("double_spaces", "The  response  correctly  uses  tools"),
+          (
+              "em_dash_prefix",
+              "— The response correctly uses tools",
+          ),
+          (
+              "en_dash_prefix",
+              "– The response correctly uses tools",
+          ),
+          (
+              "unicode_bullet",
+              "• The response correctly uses tools",
+          ),
+          (
+              "leading_whitespace",
+              "   The response correctly uses tools",
+          ),
+      ],
+  )
+  def test_garbled_text_matches_rubric(self, label, input_text):
+    assert _normalize_text(input_text) == self.RUBRIC
+
+  def test_ellipsis_normalized(self):
+    assert (
+        _normalize_text("The response… uses tools")
+        == "the response... uses tools"
+    )
+
+  def test_accented_chars_preserved(self):
+    assert _normalize_text("réponse") == "réponse"
+
+  def test_non_string_returns_empty(self):
+    assert _normalize_text(None) == ""
+    assert _normalize_text(42) == ""
+
+  def test_empty_string(self):
+    assert _normalize_text("") == ""
+
+
+class TestSubstringFallbackUniquenessGuard:
+  """Verify substring fallback only matches when exactly one candidate exists.
+
+  The convert_auto_rater_response_to_score method falls back to substring
+  matching when exact normalized match fails. When multiple rubrics share
+  a common substring, the fallback must reject the ambiguous match.
+  """
+
+  def _build_evaluator_and_score(self, rubric_texts, judge_property_text):
+    """Helper: build a FakeRubricBasedEvaluator and score a judge response."""
+    rubrics = []
+    for i, text in enumerate(rubric_texts):
+      rubrics.append(
+          Rubric(
+              rubric_id=f"rubric_{i}",
+              rubric_content=RubricContent(text_property=text),
+          )
+      )
+
+    metric = EvalMetric(
+        metric_name="test_metric",
+        threshold=0.5,
+        criterion=RubricsBasedCriterion(rubrics=rubrics, threshold=0.5),
+    )
+    evaluator = FakeRubricBasedEvaluator(eval_metric=metric)
+    evaluator.create_effective_rubrics_list([])
+
+    response_text = (
+        f"Property: {judge_property_text}\n"
+        "Rationale: test rationale\n"
+        "Verdict: yes"
+    )
+    response = LlmResponse(
+        content=genai_types.Content(
+            parts=[genai_types.Part(text=response_text)]
+        )
+    )
+    return evaluator.convert_auto_rater_response_to_score(response)
+
+  def test_unique_substring_match_accepted(self):
+    result = self._build_evaluator_and_score(
+        rubric_texts=["Uses tools correctly"],
+        judge_property_text="The agent uses tools correctly",  # superstring
+    )
+    assert len(result.rubric_scores) == 1
+    assert result.rubric_scores[0].rubric_id == "rubric_0"
+
+  def test_ambiguous_substring_match_rejected(self):
+    result = self._build_evaluator_and_score(
+        rubric_texts=[
+            "Uses tools correctly",
+            "Uses tools efficiently",
+        ],
+        judge_property_text="Uses tools",
+    )
+    assert len(result.rubric_scores) == 0

From 9bb59fcbd6ee036cbe0fea285973bc84eb937f03 Mon Sep 17 00:00:00 2001
From: Jordan <jordantotten@google.com>
Date: Thu, 11 Jun 2026 22:40:41 +0000
Subject: [PATCH 2/2] style: fix import ordering for pre-commit compliance

---
 src/google/adk/evaluation/rubric_based_evaluator.py       | 3 ++-
 tests/unittests/evaluation/test_rubric_based_evaluator.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py
index df6a6f0b18..4fecdf3953 100644
--- a/src/google/adk/evaluation/rubric_based_evaluator.py
+++ b/src/google/adk/evaluation/rubric_based_evaluator.py
@@ -17,9 +17,10 @@
 import abc
 import logging
 import re
-import unicodedata
 from typing import Optional
 
+import unicodedata
+
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py
index 2f4ffc8dba..11eaf3b543 100644
--- a/tests/unittests/evaluation/test_rubric_based_evaluator.py
+++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py
@@ -25,10 +25,10 @@
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import PerInvocationResult
 from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
+from google.adk.evaluation.rubric_based_evaluator import _normalize_text
 from google.adk.evaluation.rubric_based_evaluator import DefaultAutoRaterResponseParser
 from google.adk.evaluation.rubric_based_evaluator import MajorityVotePerInvocationResultsAggregator
 from google.adk.evaluation.rubric_based_evaluator import MeanInvocationResultsSummarizer
-from google.adk.evaluation.rubric_based_evaluator import _normalize_text
 from google.adk.evaluation.rubric_based_evaluator import RubricBasedEvaluator
 from google.adk.models.llm_response import LlmResponse
 from google.genai import types as genai_types