VERA-MH/judge.py at main · SpringCare/VERA-MH · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
"""
Main script for judging existing conversations using the LLM Judge system.
This script is separate from conversation generation.
"""

import argparse
import asyncio
from typing import Optional

from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from judge.utils import parse_judge_models
from utils.utils import parse_key_value_list


def get_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by this script.

    Kept as a standalone factory so that both the CLI entry point and tests
    can obtain a fresh parser.

    Options registered:
        --conversation/-c or --folder/-f: input source; exactly one is
            required (mutually exclusive group).
        --rubrics/-r: one or more rubric files (default ``data/rubric.tsv``).
        --judge-model/-j: required; one or more ``model`` or ``model:count``
            specs.
        --judge-model-extra-params/-jep: converted by
            ``parse_key_value_list`` (default ``{}``).
        --limit/-l: optional integer cap on conversations.
        --output/-o: results folder (default ``evaluations``).
        --max-concurrent/-m: optional integer worker cap.
        --per-judge/-pj, --verbose-workers/-vw: boolean switches.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Judge existing LLM conversations using rubrics"
    )

    # Input source: a single file or a run folder, never both, never neither.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "--conversation",
        "-c",
        help="Path to a single conversation file to judge",
    )
    source.add_argument(
        "--folder",
        "-f",
        help=(
            "Path to a conversation run folder "
            "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)"
        ),
    )

    # Longer help texts are named up front so the option table stays readable.
    judge_model_help = (
        "Model(s) to use for judging. "
        "Format: 'model' or 'model:count' for multiple instances. "
        "Can specify multiple models: --judge-model model1 model2:3. "
        "Examples: claude-sonnet-4-5-20250929, "
        "claude-sonnet-4-5-20250929:3, "
        "claude-sonnet-4-5-20250929:2 gpt-4o:1"
    )
    extra_params_help = (
        "Extra parameters for the judge model. "
        "Examples: temperature=0.7, max_tokens=1000. "
        "Default: temperature=0 (unless overridden)"
    )
    max_concurrent_help = (
        "Maximum number of concurrent workers (default: None). "
        "Set to a high number or omit for unlimited concurrency."
    )
    per_judge_help = (
        "If set, --max-concurrent applies per judge model. "
        "Otherwise, it applies to total workers across all judges."
    )

    # (flags, keyword arguments) pairs, registered strictly in this order so
    # the --help listing keeps its layout.
    option_table = [
        (
            ("--rubrics", "-r"),
            {
                "nargs": "+",
                "default": ["data/rubric.tsv"],
                "help": "Rubric file(s) to use (default: data/rubric.tsv)",
            },
        ),
        (
            ("--judge-model", "-j"),
            {"nargs": "+", "required": True, "help": judge_model_help},
        ),
        (
            ("--judge-model-extra-params", "-jep"),
            {
                "help": extra_params_help,
                "type": parse_key_value_list,
                "default": {},
            },
        ),
        (
            ("--limit", "-l"),
            {
                "type": int,
                "default": None,
                "help": "Limit number of conversations to judge (for debugging)",
            },
        ),
        (
            ("--output", "-o"),
            {
                "default": "evaluations",
                "help": "Output folder for evaluation results (default: evaluations)",
            },
        ),
        (
            ("--max-concurrent", "-m"),
            {"type": int, "default": None, "help": max_concurrent_help},
        ),
        (
            ("--per-judge", "-pj"),
            {"action": "store_true", "help": per_judge_help},
        ),
        (
            ("--verbose-workers", "-vw"),
            {
                "action": "store_true",
                "help": "Enable verbose worker logging to show concurrency behavior",
            },
        ),
    ]
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli


async def main(args) -> Optional[str]:
    """Judge either one conversation file or a whole run folder.

    Args:
        args: Parsed CLI namespace as produced by ``get_parser()``. Reads
            ``judge_model``, ``judge_model_extra_params``, ``conversation``,
            ``folder``, ``limit``, ``output``, ``max_concurrent``,
            ``per_judge`` and ``verbose_workers``.

    Returns:
        ``None`` in single-conversation mode; otherwise the output folder
        reported by ``judge_conversations``.
    """
    # CLI specs come in as "model" or "model:count"; iterate the parsed
    # result as (model, count) pairs for the banner.
    model_counts = parse_judge_models(args.judge_model)
    labels = [f"{name}x{instances}" for name, instances in model_counts.items()]
    models_summary = ", ".join(labels)
    print(f"🎯 LLM Judge | Models: {models_summary}")

    # The rubric configuration is loaded once and shared by every judge.
    # NOTE(review): args.rubrics is not consulted here; the rubric folder is
    # fixed to "data" -- confirm this is intended.
    print("📚 Loading rubric configuration...")
    rubrics = await RubricConfig.load(rubric_folder="data")

    if args.conversation:
        # Single-file mode: only the first listed model is used, once.
        primary_model = next(iter(model_counts))
        loaded_conversation = await ConversationData.load(args.conversation)
        single_judge = LLMJudge(
            judge_model=primary_model,
            rubric_config=rubrics,
            judge_model_extra_params=args.judge_model_extra_params,
        )
        await judge_single_conversation(
            single_judge, loaded_conversation, args.output
        )
        # No output folder is handed back to the pipeline in this mode.
        print("ℹ️  Single conversation mode: output folder not needed for pipeline")
        return None

    # Folder mode: load everything up front, then evaluate as a batch.
    print(f"📂 Loading conversations from {args.folder}...")
    batch = await load_conversations(args.folder, limit=args.limit)
    print(f"✅ Loaded {len(batch)} conversations")

    from pathlib import Path

    # Only the final path component of the run folder is passed along.
    run_name = Path(args.folder).name

    _ignored, destination = await judge_conversations(
        judge_models=model_counts,
        conversations=batch,
        rubric_config=rubrics,
        max_concurrent=args.max_concurrent,
        output_root=args.output,
        conversation_folder_name=run_name,
        verbose=True,
        judge_model_extra_params=args.judge_model_extra_params,
        per_judge=args.per_judge,
        verbose_workers=args.verbose_workers,
    )
    return destination


if __name__ == "__main__":
    # Script entry point: parse the CLI, announce the input, run the coroutine.
    args = get_parser().parse_args()
    # The parser requires exactly one of --folder / --conversation.
    target = args.folder if args.folder else args.conversation
    print(f"Running judge on: {target}")
    asyncio.run(main(args))