# sandbox/judging_dataclasses.py
from pydantic import BaseModel, conint
from typing import List, Literal, Union


class Criteria(BaseModel):
    """A single judging criterion with an inclusive, non-negative score range."""

    name: str
    description: str
    min_score: conint(ge=0)
    max_score: conint(ge=0)


class DirectAssessment(BaseModel):
    """Configuration for scoring each response independently against a set of criteria."""

    type: Literal["direct_assessment"]
    criteria: List[Criteria]
    prompt: str


class PairwiseComparison(BaseModel):
    """Configuration for judging responses head to head, optionally with ties and position swapping."""

    type: Literal["pairwise_comparison"]
    granularity: Literal["coarse", "fine", "super fine"]
    ties_allowed: bool
    position_swapping: bool
    reference_model: str
    prompt: str


class JudgingConfig(BaseModel):
    """Top-level judging configuration; the `type` field distinguishes the two assessment modes."""

    assessment: Union[DirectAssessment, PairwiseComparison]


class DirectAssessmentCriterionScore(BaseModel):
    """One judge's score and explanation for a single criterion."""

    criterion: str
    score: int
    explanation: str


class DirectAssessmentCriteriaScores(BaseModel):
    """All criterion scores produced by a single judging model."""

    model: str
    criteria_scores: List[DirectAssessmentCriterionScore]


class DirectAssessmentJudgingResponse(BaseModel):
    """Aggregated direct-assessment results across all judging models."""

    judging_models: List[DirectAssessmentCriteriaScores]
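

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of how
# these models might be instantiated and validated. The criterion name,
# prompt text, and judge model name below are hypothetical placeholders,
# and the example assumes Pydantic v2 (`model_validate` / `model_dump_json`;
# under v1 the equivalents would be `parse_obj` / `json()`).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build a direct-assessment config; the `type` literal selects the
    # DirectAssessment branch of the JudgingConfig union.
    config = JudgingConfig(
        assessment=DirectAssessment(
            type="direct_assessment",
            criteria=[
                Criteria(
                    name="helpfulness",  # hypothetical criterion
                    description="How well the response addresses the request.",
                    min_score=0,
                    max_score=10,
                ),
            ],
            prompt="Rate the response on each criterion.",
        )
    )

    # Validate a raw dict against the response schema, as if it had been
    # parsed from a judge's JSON output.
    response = DirectAssessmentJudgingResponse.model_validate(
        {
            "judging_models": [
                {
                    "model": "judge-model-a",  # hypothetical judge name
                    "criteria_scores": [
                        {
                            "criterion": "helpfulness",
                            "score": 8,
                            "explanation": "Addresses the request directly.",
                        }
                    ],
                }
            ]
        }
    )

    print(config.model_dump_json(indent=2))
    print(response.model_dump_json(indent=2))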