Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,28 @@ def test_create_eval_run_with_inference_configs(client):
assert evaluation_run.error is None


def test_create_eval_run_with_metric_resource_name(client):
    """Tests create_evaluation_run with metric_resource_name."""
    # Registered metric resources require the v1beta1 surface on the
    # autopush sandbox endpoint, so override the client's HTTP options.
    http_options = client._api_client._http_options
    http_options.api_version = "v1beta1"
    http_options.base_url = (
        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
    )

    # A metric that is identified by a pre-registered EvaluationMetric resource.
    registered_metric = types.EvaluationRunMetric(
        metric="my_custom_metric",
        metric_resource_name="projects/977012026409/locations/us-central1/evaluationMetrics/6048334299558576128",
    )

    run = client.evals.create_evaluation_run(
        dataset=types.EvaluationDataset(
            eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY
        ),
        metrics=[registered_metric],
        dest=GCS_DEST,
    )

    assert isinstance(run, types.EvaluationRun)
    assert run.evaluation_config.metrics[0].metric == "my_custom_metric"


# Dataframe tests fail in replay mode because of UUID generation mismatch.
# def test_create_eval_run_data_source_evaluation_dataset(client):
# """Tests that create_evaluation_run() creates a correctly structured
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,19 +143,21 @@
User prompt:
{prompt}"""

_PROMPTS_DF = pd.DataFrame(
{
"prompt": [
"Explain the theory of relativity in one sentence.",
"Write a short poem about a cat.",
]
}
)


def test_public_method_generate_rubrics(client):
"""Tests the public generate_rubrics method."""
prompts_df = pd.DataFrame(
{
"prompt": [
"Explain the theory of relativity in one sentence.",
"Write a short poem about a cat.",
]
}
)

eval_dataset = client.evals.generate_rubrics(
src=prompts_df,
src=_PROMPTS_DF,
prompt_template=_TEST_RUBRIC_GENERATION_PROMPT,
rubric_group_name="text_quality_rubrics",
)
Expand All @@ -176,6 +178,36 @@ def test_public_method_generate_rubrics(client):
assert isinstance(first_rubric_group["text_quality_rubrics"][0], types.evals.Rubric)


def test_public_method_generate_rubrics_with_metric(client):
    """Tests the public generate_rubrics method with a metric."""
    # Registered metric resources require the v1beta1 surface on the
    # staging sandbox endpoint, so override the client's HTTP options.
    http_options = client._api_client._http_options
    http_options.api_version = "v1beta1"
    http_options.base_url = (
        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
    )

    registered_metric = types.Metric(
        name="my_custom_metric",
        metric_resource_name="projects/977012026409/locations/us-central1/evaluationMetrics/6048334299558576128",
    )

    dataset = client.evals.generate_rubrics(
        src=_PROMPTS_DF,
        rubric_group_name="my_registered_rubrics",
        metric=registered_metric,
    )
    result_df = dataset.eval_dataset_df

    assert isinstance(dataset, types.EvaluationDataset)
    assert isinstance(result_df, pd.DataFrame)
    assert "rubric_groups" in result_df.columns
    assert len(result_df) == 2

    # Each row carries a dict keyed by rubric group name, holding the
    # generated Rubric objects for that prompt.
    rubric_group = result_df["rubric_groups"][0]
    assert isinstance(rubric_group, dict)
    assert "my_registered_rubrics" in rubric_group
    rubrics = rubric_group["my_registered_rubrics"]
    assert isinstance(rubrics, list)
    assert rubrics
    assert isinstance(rubrics[0], types.evals.Rubric)


pytestmark = pytest_helper.setup(
file=__file__,
globals_for_file=globals(),
Expand Down
13 changes: 12 additions & 1 deletion vertexai/_genai/_evals_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from . import _gcs_utils
from . import evals
from . import types
from . import _transformers as t

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1328,7 +1329,7 @@ def _resolve_dataset_inputs(


def _resolve_evaluation_run_metrics(
metrics: list[types.EvaluationRunMetric], api_client: Any
metrics: Union[list[types.EvaluationRunMetric], list[types.Metric]], api_client: Any
) -> list[types.EvaluationRunMetric]:
"""Resolves a list of evaluation run metric instances, loading RubricMetric if necessary."""
if not metrics:
Expand Down Expand Up @@ -1361,6 +1362,16 @@ def _resolve_evaluation_run_metrics(
e,
)
raise
elif isinstance(metric_instance, types.Metric):
config_dict = t.t_metrics([metric_instance])[0]
res_name = config_dict.pop("metric_resource_name", None)
resolved_metrics_list.append(
types.EvaluationRunMetric(
metric=metric_instance.name,
metric_config=config_dict if config_dict else None,
metric_resource_name=res_name,
)
)
else:
try:
metric_name_str = str(metric_instance)
Expand Down
5 changes: 5 additions & 0 deletions vertexai/_genai/_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def t_metrics(

for metric in metrics:
metric_payload_item: dict[str, Any] = {}
if hasattr(metric, "metric_resource_name") and metric.metric_resource_name:
metric_payload_item["metric_resource_name"] = metric.metric_resource_name

metric_name = getv(metric, ["name"]).lower()

Expand Down Expand Up @@ -79,6 +81,9 @@ def t_metrics(
"return_raw_output": return_raw_output
}
metric_payload_item["pointwise_metric_spec"] = pointwise_spec
elif "metric_resource_name" in metric_payload_item:
# Valid case: Metric is identified by resource name; no inline spec required.
pass
else:
raise ValueError(
f"Unsupported metric type or invalid metric name: {metric_name}"
Expand Down
63 changes: 59 additions & 4 deletions vertexai/_genai/evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,13 @@ def _EvaluationRunMetric_from_vertex(
if getv(from_object, ["metric"]) is not None:
setv(to_object, ["metric"], getv(from_object, ["metric"]))

if getv(from_object, ["metricResourceName"]) is not None:
setv(
to_object,
["metric_resource_name"],
getv(from_object, ["metricResourceName"]),
)

if getv(from_object, ["metricConfig"]) is not None:
setv(
to_object,
Expand All @@ -410,6 +417,13 @@ def _EvaluationRunMetric_to_vertex(
if getv(from_object, ["metric"]) is not None:
setv(to_object, ["metric"], getv(from_object, ["metric"]))

if getv(from_object, ["metric_resource_name"]) is not None:
setv(
to_object,
["metricResourceName"],
getv(from_object, ["metric_resource_name"]),
)

if getv(from_object, ["metric_config"]) is not None:
setv(
to_object,
Expand Down Expand Up @@ -512,6 +526,13 @@ def _GenerateInstanceRubricsRequest_to_vertex(
),
)

if getv(from_object, ["metric_resource_name"]) is not None:
setv(
to_object,
["metricResourceName"],
getv(from_object, ["metric_resource_name"]),
)

if getv(from_object, ["config"]) is not None:
setv(to_object, ["config"], getv(from_object, ["config"]))

Expand Down Expand Up @@ -1049,6 +1070,7 @@ def _generate_rubrics(
types.PredefinedMetricSpecOrDict
] = None,
rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
metric_resource_name: Optional[str] = None,
config: Optional[types.RubricGenerationConfigOrDict] = None,
) -> types.GenerateInstanceRubricsResponse:
"""
Expand All @@ -1059,6 +1081,7 @@ def _generate_rubrics(
contents=contents,
predefined_rubric_generation_spec=predefined_rubric_generation_spec,
rubric_generation_spec=rubric_generation_spec,
metric_resource_name=metric_resource_name,
config=config,
)

Expand Down Expand Up @@ -1561,16 +1584,20 @@ def generate_rubrics(
rubric_type_ontology: Optional[list[str]] = None,
predefined_spec_name: Optional[Union[str, "types.PrebuiltMetric"]] = None,
metric_spec_parameters: Optional[dict[str, Any]] = None,
metric: Optional[types.MetricOrDict] = None,
config: Optional[types.RubricGenerationConfigOrDict] = None,
) -> types.EvaluationDataset:
"""Generates rubrics for each prompt in the source and adds them as a new column
structured as a dictionary.

You can generate rubrics by providing either:
1. A `predefined_spec_name` to use a Vertex AI backend recipe.
2. A `prompt_template` along with other configuration parameters
1. A `metric` to use a pre-registered metric resource.
2. A `predefined_spec_name` to use a Vertex AI backend recipe.
3. A `prompt_template` along with other configuration parameters
(`generator_model_config`, `rubric_content_type`, `rubric_type_ontology`)
for custom rubric generation.
with `metric` taking precedence over `predefined_spec_name`,
and `predefined_spec_name` taking precedence over `prompt_template`

Provide only one of these modes; if more than one is given, the precedence order above determines which is used.

Expand Down Expand Up @@ -1600,6 +1627,9 @@ def generate_rubrics(
metric_spec_parameters: Optional. Parameters for the Predefined Metric,
used to customize rubric generation. Only used if `predefined_spec_name` is set.
Example: {"guidelines": ["The response must be in Japanese."]}
metric: Optional. A types.Metric object containing a metric_resource_name,
or a resource name string. If provided, this will take precedence over
predefined_spec_name and prompt_template.
config: Optional. Configuration for the rubric generation process.

Returns:
Expand Down Expand Up @@ -1639,10 +1669,32 @@ def generate_rubrics(
)
all_rubric_groups: list[dict[str, list[types.Rubric]]] = []

actual_metric_resource_name = None
if metric:
if isinstance(metric, str) and metric.startswith("projects/"):
actual_metric_resource_name = metric
else:
metric_obj = (
types.Metric.model_validate(metric)
if isinstance(metric, dict)
else metric
)
actual_metric_resource_name = getattr(
metric_obj, "metric_resource_name", None
)
if not actual_metric_resource_name:
raise ValueError(
"The provided Metric object must have metric_resource_name set."
)

rubric_gen_spec = None
predefined_spec = None

if predefined_spec_name:
if actual_metric_resource_name:
# Precedence: Registered metric resource overrides everything else.
predefined_spec = None
rubric_gen_spec = None
elif predefined_spec_name:
if prompt_template:
logger.warning(
"prompt_template is ignored when predefined_spec_name is provided."
Expand Down Expand Up @@ -1699,7 +1751,7 @@ def generate_rubrics(
rubric_gen_spec = types.RubricGenerationSpec.model_validate(spec_dict)
else:
raise ValueError(
"Either predefined_spec_name or prompt_template must be provided."
"Either metric, predefined_spec_name or prompt_template must be provided."
)

for _, row in prompts_df.iterrows():
Expand All @@ -1722,6 +1774,7 @@ def generate_rubrics(
contents=contents,
rubric_generation_spec=rubric_gen_spec,
predefined_rubric_generation_spec=predefined_spec,
metric_resource_name=actual_metric_resource_name,
config=config,
)
rubric_group = {rubric_group_name: response.generated_rubrics}
Expand Down Expand Up @@ -2307,6 +2360,7 @@ async def _generate_rubrics(
types.PredefinedMetricSpecOrDict
] = None,
rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
metric_resource_name: Optional[str] = None,
config: Optional[types.RubricGenerationConfigOrDict] = None,
) -> types.GenerateInstanceRubricsResponse:
"""
Expand All @@ -2317,6 +2371,7 @@ async def _generate_rubrics(
contents=contents,
predefined_rubric_generation_spec=predefined_rubric_generation_spec,
rubric_generation_spec=rubric_generation_spec,
metric_resource_name=metric_resource_name,
config=config,
)

Expand Down
21 changes: 21 additions & 0 deletions vertexai/_genai/types/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2479,6 +2479,10 @@ class EvaluationRunMetric(_common.BaseModel):
metric: Optional[str] = Field(
default=None, description="""The name of the metric."""
)
metric_resource_name: Optional[str] = Field(
default=None,
description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""",
)
metric_config: Optional[UnifiedMetric] = Field(
default=None, description="""The unified metric used for evaluation run."""
)
Expand All @@ -2490,6 +2494,9 @@ class EvaluationRunMetricDict(TypedDict, total=False):
metric: Optional[str]
"""The name of the metric."""

metric_resource_name: Optional[str]
"""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}"""

metric_config: Optional[UnifiedMetricDict]
"""The unified metric used for evaluation run."""

Expand Down Expand Up @@ -4439,6 +4446,10 @@ class Metric(_common.BaseModel):
default=None,
description="""Optional steering instruction parameters for the automated predefined metric.""",
)
metric_resource_name: Optional[str] = Field(
default=None,
description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""",
)

# Allow extra fields to support metric-specific config fields.
model_config = ConfigDict(extra="allow")
Expand Down Expand Up @@ -4643,6 +4654,9 @@ class MetricDict(TypedDict, total=False):
metric_spec_parameters: Optional[dict[str, Any]]
"""Optional steering instruction parameters for the automated predefined metric."""

metric_resource_name: Optional[str]
"""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}"""


MetricOrDict = Union[Metric, MetricDict]

Expand Down Expand Up @@ -5354,6 +5368,10 @@ class _GenerateInstanceRubricsRequest(_common.BaseModel):
default=None,
description="""Specification for how the rubrics should be generated.""",
)
metric_resource_name: Optional[str] = Field(
default=None,
description="""Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored.""",
)
config: Optional[RubricGenerationConfig] = Field(default=None, description="""""")


Expand All @@ -5374,6 +5392,9 @@ class _GenerateInstanceRubricsRequestDict(TypedDict, total=False):
rubric_generation_spec: Optional[RubricGenerationSpecDict]
"""Specification for how the rubrics should be generated."""

metric_resource_name: Optional[str]
"""Registered metric resource name. If this field is set, the configuration provided in this field is used for rubric generation. The `predefined_rubric_generation_spec` and `rubric_generation_spec` fields will be ignored."""

config: Optional[RubricGenerationConfigDict]
""""""

Expand Down
Loading