
Metrics

MetricType

Bases: Enum

Enumeration of metric types in Ragas.

Attributes

Name Type Description
SINGLE_TURN str

Represents the single-turn metric type.

MULTI_TURN str

Represents the multi-turn metric type.
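
A minimal sketch of how these enum members are used to key a required-columns mapping on a metric; the column names below are illustrative.

from ragas.metrics.base import MetricType

# Illustrative required-columns mapping keyed by MetricType.
required_columns = {
    MetricType.SINGLE_TURN: {"user_input", "response"},
    MetricType.MULTI_TURN: {"user_input"},
}

print(MetricType.SINGLE_TURN.name)  # "SINGLE_TURN"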

Metric dataclass

Metric(_required_columns: Dict[MetricType, Set[str]] = dict(), name: str = '')

Bases: ABC

Abstract base class for metrics in Ragas.

Attributes

Name Type Description
name str

The name of the metric.

required_columns Dict[str, Set[str]]

A dictionary mapping metric type names to sets of required column names. This is a property and raises a ValueError if a column is not in VALID_COLUMNS.

score

score(row: Dict, callbacks: Callbacks = None) -> float

Calculates the score for a single row of data.

Note

This method is deprecated and will be removed in 0.3. Use single_turn_ascore or multi_turn_ascore instead.

Source code in src/ragas/metrics/base.py
@deprecated("0.2", removal="0.3", alternative="single_turn_ascore")
def score(self, row: t.Dict, callbacks: Callbacks = None) -> float:
    """
    Calculates the score for a single row of data.

    Note
    ----
    This method is deprecated and will be removed in 0.3. Please use `single_turn_ascore` or `multi_turn_ascore` instead.
    """
    callbacks = callbacks or []
    rm, group_cm = new_group(
        self.name,
        inputs=row,
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        if is_event_loop_running():
            try:
                import nest_asyncio

                nest_asyncio.apply()
            except ImportError:
                raise ImportError(
                    "It seems like your running this in a jupyter-like environment. Please install nest_asyncio with `pip install nest_asyncio` to make it work."
                )
        loop = asyncio.get_event_loop()
        score = loop.run_until_complete(self._ascore(row=row, callbacks=group_cm))
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})
    return score

ascore async

ascore(row: Dict, callbacks: Callbacks = None, timeout: Optional[float] = None) -> float

Asynchronously calculates the score for a single row of data.

Note

This method is deprecated and will be removed in 0.3. Use single_turn_ascore instead.

Source code in src/ragas/metrics/base.py
@deprecated("0.2", removal="0.3", alternative="single_turn_ascore")
async def ascore(
    self,
    row: t.Dict,
    callbacks: Callbacks = None,
    timeout: t.Optional[float] = None,
) -> float:
    """
    Asynchronously calculates the score for a single row of data.

    Note
    ----
    This method is deprecated and will be removed in 0.3. Please use `single_turn_ascore` instead.
    """
    callbacks = callbacks or []
    rm, group_cm = new_group(
        self.name,
        inputs=row,
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        score = await asyncio.wait_for(
            self._ascore(row=row, callbacks=group_cm),
            timeout=timeout,
        )
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})
    return score
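
Since score and ascore are deprecated, the usual migration is to wrap the row in a SingleTurnSample and call single_turn_ascore on a single-turn metric. A minimal sketch, assuming an OpenAI API key and the langchain-openai package; the model name and sample contents are illustrative:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness

metric = Faithfulness(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")))
sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower?",
    response="The Eiffel Tower is in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris, France."],
)

# Preferred over the deprecated `score(row)` / `ascore(row)` interfaces.
print(asyncio.run(metric.single_turn_ascore(sample)))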

MetricWithLLM dataclass

MetricWithLLM(_required_columns: Dict[MetricType, Set[str]] = dict(), name: str = '', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None)

Bases: Metric, PromptMixin

A metric class that uses a language model for evaluation.

Attributes

Name Type Description
llm Optional[BaseRagasLLM]

The language model used by the metric.

train

train(path: Optional[str] = None, run_id: Optional[str] = None, demonstration_config: Optional[DemonstrationConfig] = None, instruction_config: Optional[InstructionConfig] = None, callbacks: Optional[Callbacks] = None, run_config: Optional[RunConfig] = None, batch_size: Optional[int] = None, with_debugging_logs=False, raise_exceptions: bool = True) -> None

Train the metric using local JSON data or annotations from the Ragas platform.

Parameters

Name Type Description Default
path str

Path to a local JSON training data file

None
run_id str

Direct run ID used to fetch annotations

None
demonstration_config DemonstrationConfig

Configuration for demonstration optimization

None
instruction_config InstructionConfig

Configuration for instruction optimization

None
callbacks Callbacks

List of callback functions

None
run_config RunConfig

Run configuration

None
batch_size int

Batch size for training

None
with_debugging_logs bool

Enable debugging logs

False
raise_exceptions bool

Whether to raise exceptions during training

True

Raises

Type Description
ValueError

If an invalid combination of path and run_id is provided

Source code in src/ragas/metrics/base.py
def train(
    self,
    path: t.Optional[str] = None,
    run_id: t.Optional[str] = None,
    demonstration_config: t.Optional[DemonstrationConfig] = None,
    instruction_config: t.Optional[InstructionConfig] = None,
    callbacks: t.Optional[Callbacks] = None,
    run_config: t.Optional[RunConfig] = None,
    batch_size: t.Optional[int] = None,
    with_debugging_logs=False,
    raise_exceptions: bool = True,
) -> None:
    """
    Train the metric using local JSON data or annotations from Ragas platform

    Parameters
    ----------
    path : str, optional
        Path to local JSON training data file
    run_id : str, optional
        Direct run ID to fetch annotations
    demonstration_config : DemonstrationConfig, optional
        Configuration for demonstration optimization
    instruction_config : InstructionConfig, optional
        Configuration for instruction optimization
    callbacks : Callbacks, optional
        List of callback functions
    run_config : RunConfig, optional
        Run configuration
    batch_size : int, optional
        Batch size for training
    with_debugging_logs : bool, default=False
        Enable debugging logs
    raise_exceptions : bool, default=True
        Whether to raise exceptions during training

    Raises
    ------
    ValueError
        If invalid combination of path, and run_id is provided
    """
    # Validate input parameters
    provided_inputs = sum(x is not None for x in [path, run_id])
    if provided_inputs == 0:
        raise ValueError("One of path or run_id must be provided")
    if provided_inputs > 1:
        raise ValueError("Only one of path or run_id should be provided")

    run_config = run_config or RunConfig()
    callbacks = callbacks or []

    # Load the dataset based on input type
    if path is not None:
        if not path.endswith(".json"):
            raise ValueError("Train data must be in json format")
        dataset = MetricAnnotation.from_json(path, metric_name=self.name)
    elif run_id is not None:
        dataset = MetricAnnotation.from_app(
            run_id=run_id,
            metric_name=self.name,
        )
    else:
        raise ValueError("One of path or run_id must be provided")

    # only optimize the instruction if instruction_config is provided
    if instruction_config is not None:
        self._optimize_instruction(
            instruction_config=instruction_config,
            dataset=dataset,
            callbacks=callbacks,
            run_config=run_config,
            batch_size=batch_size,
            with_debugging_logs=with_debugging_logs,
            raise_exceptions=raise_exceptions,
        )

    # if demonstration_config is provided, optimize the demonstrations
    if demonstration_config is not None:
        self._optimize_demonstration(
            demonstration_config=demonstration_config,
            dataset=dataset,
        )
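
A hedged sketch of training a metric from a local annotations file. The file name is illustrative, and the ragas.config import path and the InstructionConfig argument shown are assumptions based on the Ragas training guide rather than something documented on this page:

from langchain_openai import ChatOpenAI
from ragas.config import InstructionConfig
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AspectCritic

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
metric = AspectCritic(
    name="harmfulness",
    definition="Does the response cause or enable harm?",
    llm=llm,
)

# "annotations.json" is an illustrative path to annotations exported from the Ragas app.
# Exactly one of `path` or `run_id` may be passed; passing both (or neither) raises ValueError.
metric.train(
    path="annotations.json",
    instruction_config=InstructionConfig(llm=llm),
)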

SingleTurnMetric dataclass

SingleTurnMetric(_required_columns: Dict[MetricType, Set[str]] = dict(), name: str = '')

Bases: Metric

A metric class for evaluating single-turn interactions.

This class provides methods to score single-turn samples both synchronously and asynchronously.

single_turn_score

single_turn_score(sample: SingleTurnSample, callbacks: Callbacks = None) -> float

Synchronously score a single-turn sample.

May raise ImportError if nest_asyncio is not installed in a Jupyter-like environment.

Source code in src/ragas/metrics/base.py
def single_turn_score(
    self,
    sample: SingleTurnSample,
    callbacks: Callbacks = None,
) -> float:
    """
    Synchronously score a single-turn sample.

    May raise ImportError if nest_asyncio is not installed in a Jupyter-like environment.
    """
    callbacks = callbacks or []
    # only get the required columns
    sample = self._only_required_columns_single_turn(sample)
    rm, group_cm = new_group(
        self.name,
        inputs=sample.to_dict(),
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        if is_event_loop_running():
            try:
                import nest_asyncio

                nest_asyncio.apply()
            except ImportError:
                raise ImportError(
                    "It seems like your running this in a jupyter-like environment. Please install nest_asyncio with `pip install nest_asyncio` to make it work."
                )
        loop = asyncio.get_event_loop()
        score = loop.run_until_complete(
            self._single_turn_ascore(sample=sample, callbacks=group_cm)
        )
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})

    # track the evaluation event
    _analytics_batcher.add_evaluation(
        EvaluationEvent(
            metrics=[self.name],
            num_rows=1,
            evaluation_type=MetricType.SINGLE_TURN.name,
            language=get_metric_language(self),
        )
    )
    return score

single_turn_ascore async

single_turn_ascore(sample: SingleTurnSample, callbacks: Callbacks = None, timeout: Optional[float] = None) -> float

Asynchronously score a single-turn sample, with an optional timeout.

May raise asyncio.TimeoutError if scoring exceeds the specified timeout.

Source code in src/ragas/metrics/base.py
async def single_turn_ascore(
    self,
    sample: SingleTurnSample,
    callbacks: Callbacks = None,
    timeout: t.Optional[float] = None,
) -> float:
    """
    Asynchronously score a single-turn sample with an optional timeout.

    May raise asyncio.TimeoutError if the scoring process exceeds the specified timeout.
    """
    callbacks = callbacks or []
    # only get the required columns
    sample = self._only_required_columns_single_turn(sample)
    rm, group_cm = new_group(
        self.name,
        inputs=sample.to_dict(),
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        score = await asyncio.wait_for(
            self._single_turn_ascore(sample=sample, callbacks=group_cm),
            timeout=timeout,
        )
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})

    # track the evaluation event
    _analytics_batcher.add_evaluation(
        EvaluationEvent(
            metrics=[self.name],
            num_rows=1,
            evaluation_type=MetricType.SINGLE_TURN.name,
            language=get_metric_language(self),
        )
    )
    return score
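
A minimal sketch of both scoring paths on a single-turn metric, assuming an OpenAI API key and the langchain-openai package; the model name and sample contents are illustrative:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall

metric = LLMContextRecall(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")))
sample = SingleTurnSample(
    user_input="When was the first Super Bowl played?",
    retrieved_contexts=["The First AFL-NFL World Championship Game was played on January 15, 1967."],
    reference="The first Super Bowl was held on January 15, 1967.",
)

# Synchronous wrapper (runs an event loop internally).
print(metric.single_turn_score(sample))

# Asynchronous scoring with an optional timeout in seconds.
print(asyncio.run(metric.single_turn_ascore(sample, timeout=60)))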

MultiTurnMetric dataclass

MultiTurnMetric(_required_columns: Dict[MetricType, Set[str]] = dict(), name: str = '')

Bases: Metric

A metric class for evaluating multi-turn conversations.

This class extends the Metric base class with functionality for scoring multi-turn conversation samples.

multi_turn_score

multi_turn_score(sample: MultiTurnSample, callbacks: Callbacks = None) -> float

Synchronously score a multi-turn conversation sample.

May raise ImportError if nest_asyncio is not installed in a Jupyter-like environment.

Source code in src/ragas/metrics/base.py
def multi_turn_score(
    self,
    sample: MultiTurnSample,
    callbacks: Callbacks = None,
) -> float:
    """
    Score a multi-turn conversation sample synchronously.

    May raise ImportError if nest_asyncio is not installed in Jupyter-like environments.
    """
    callbacks = callbacks or []
    sample = self._only_required_columns_multi_turn(sample)
    rm, group_cm = new_group(
        self.name,
        inputs=sample.to_dict(),
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        if is_event_loop_running():
            try:
                import nest_asyncio

                nest_asyncio.apply()
            except ImportError:
                raise ImportError(
                    "It seems like your running this in a jupyter-like environment. Please install nest_asyncio with `pip install nest_asyncio` to make it work."
                )
        loop = asyncio.get_event_loop()
        score = loop.run_until_complete(
            self._multi_turn_ascore(sample=sample, callbacks=group_cm)
        )
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})

    # track the evaluation event
    _analytics_batcher.add_evaluation(
        EvaluationEvent(
            metrics=[self.name],
            num_rows=1,
            evaluation_type=MetricType.SINGLE_TURN.name,
            language=get_metric_language(self),
        )
    )
    return score

multi_turn_ascore async

multi_turn_ascore(sample: MultiTurnSample, callbacks: Callbacks = None, timeout: Optional[float] = None) -> float

Asynchronously score a multi-turn conversation sample.

May raise asyncio.TimeoutError if scoring exceeds the specified timeout.

Source code in src/ragas/metrics/base.py
async def multi_turn_ascore(
    self,
    sample: MultiTurnSample,
    callbacks: Callbacks = None,
    timeout: t.Optional[float] = None,
) -> float:
    """
    Score a multi-turn conversation sample asynchronously.

    May raise asyncio.TimeoutError if the scoring process exceeds the specified timeout.
    """
    callbacks = callbacks or []
    sample = self._only_required_columns_multi_turn(sample)

    rm, group_cm = new_group(
        self.name,
        inputs=sample.to_dict(),
        callbacks=callbacks,
        metadata={"type": ChainType.METRIC},
    )
    try:
        score = await asyncio.wait_for(
            self._multi_turn_ascore(sample=sample, callbacks=group_cm),
            timeout=timeout,
        )
    except Exception as e:
        if not group_cm.ended:
            rm.on_chain_error(e)
        raise e
    else:
        if not group_cm.ended:
            rm.on_chain_end({"output": score})

    # track the evaluation event
    _analytics_batcher.add_evaluation(
        EvaluationEvent(
            metrics=[self.name],
            num_rows=1,
            evaluation_type=MetricType.SINGLE_TURN.name,
            language=get_metric_language(self),
        )
    )

    return score
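
A hedged sketch of scoring a multi-turn conversation with a metric that declares MULTI_TURN columns, here AspectCritic; the conversation, criterion, and model name are illustrative and an OpenAI API key is assumed:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import MultiTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.messages import AIMessage, HumanMessage
from ragas.metrics import AspectCritic

metric = AspectCritic(
    name="helpfulness",
    definition="Does the assistant resolve the user's request?",
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
)
sample = MultiTurnSample(
    user_input=[
        HumanMessage(content="Book me a table for two tonight."),
        AIMessage(content="Done - a table for two is reserved at 7 pm."),
    ],
)
print(asyncio.run(metric.multi_turn_ascore(sample)))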

Ensember

Combines multiple LLM outputs for the same input (n > 1) into a single output.

from_discrete

from_discrete(inputs: list[list[Dict]], attribute: str) -> List[Dict]

Simple majority voting for binary values, e.g. [0,0,1] -> 0. Inputs: a list of lists of dicts, each containing the verdicts for a single input.

Source code in src/ragas/metrics/base.py
def from_discrete(
    self, inputs: list[list[t.Dict]], attribute: str
) -> t.List[t.Dict]:
    """
    Simple majority voting for binary values, ie [0,0,1] -> 0
    inputs: list of list of dicts each containing verdict for a single input
    """

    if not isinstance(inputs, list):
        inputs = [inputs]

    if not all(len(item) == len(inputs[0]) for item in inputs):
        logger.warning("All inputs must have the same length")
        return inputs[0]

    if not all(attribute in item for input in inputs for item in input):
        logger.warning(f"All inputs must have {attribute} attribute")
        return inputs[0]

    if len(inputs) == 1:
        return inputs[0]

    verdict_agg = []
    for i in range(len(inputs[0])):
        item = inputs[0][i]
        verdicts = [inputs[k][i][attribute] for k in range(len(inputs))]
        verdict_counts = dict(Counter(verdicts).most_common())
        item[attribute] = list(verdict_counts.keys())[0]
        verdict_agg.append(item)

    return verdict_agg
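
To illustrate the aggregation: for three runs over the same two statements, the most common value at each position wins. A small self-contained sketch (the data is illustrative):

from ragas.metrics.base import Ensember

ensembler = Ensember()
runs = [
    [{"statement": "s1", "verdict": 1}, {"statement": "s2", "verdict": 0}],
    [{"statement": "s1", "verdict": 1}, {"statement": "s2", "verdict": 1}],
    [{"statement": "s1", "verdict": 0}, {"statement": "s2", "verdict": 0}],
]

# Majority vote per position on the "verdict" attribute: s1 -> 1, s2 -> 0.
print(ensembler.from_discrete(runs, "verdict"))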

AnswerCorrectness dataclass

AnswerCorrectness(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response', 'reference'}}(), name: str = 'answer_correctness', embeddings: Optional[BaseRagasEmbeddings] = None, llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None, correctness_prompt: PydanticPrompt = CorrectnessClassifier(), statement_generator_prompt: PydanticPrompt = StatementGeneratorPrompt(), weights: list[float] = lambda: [0.75, 0.25](), beta: float = 1.0, answer_similarity: Optional[AnswerSimilarity] = None, max_retries: int = 1)

Bases: MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric

Measures answer correctness against the ground truth as a combination of factuality and semantic similarity.

Attributes

Name Type Description
name string

The name of the metric

weights list[float]

A list holding the weights for factuality and semantic similarity, defaults to [0.75, 0.25]

answer_similarity Optional[AnswerSimilarity]

The AnswerSimilarity object
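
A minimal usage sketch, assuming an OpenAI API key; weights shifts the balance between factuality and semantic similarity, and the model name and sample are illustrative:

import asyncio

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AnswerCorrectness

metric = AnswerCorrectness(
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
    embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()),
    weights=[0.9, 0.1],  # emphasize factuality over semantic similarity
)
sample = SingleTurnSample(
    user_input="What powers the sun?",
    response="The sun is powered by nuclear fusion.",
    reference="The sun produces energy through nuclear fusion of hydrogen into helium.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))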

ResponseRelevancy dataclass

ResponseRelevancy(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response'}}(), name: str = 'answer_relevancy', embeddings: Optional[BaseRagasEmbeddings] = None, llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None, question_generation: PydanticPrompt = ResponseRelevancePrompt(), strictness: int = 3)

Bases: MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric

Scores the relevancy of the answer to the given question. Answers with incomplete, redundant, or unnecessary information are penalized. The score ranges from 0 to 1, with 1 being the best.

Attributes

Name Type Description
name string

The name of the metric

strictness int

Here indicates the number of questions generated per answer. The ideal range is between 3 and 5.

embeddings Embedding

The langchain wrapper of an Embedding object, e.g. HuggingFaceEmbeddings('BAAI/bge-base-en')
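
A hedged usage sketch, assuming an OpenAI API key; strictness controls how many questions are generated per answer, and the model and sample are illustrative:

import asyncio

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ResponseRelevancy

metric = ResponseRelevancy(
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
    embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()),
    strictness=3,  # number of questions generated per answer
)
sample = SingleTurnSample(
    user_input="Where is France and what is its capital?",
    response="France is in Western Europe and its capital is Paris.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))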

SemanticSimilarity dataclass

SemanticSimilarity(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'reference', 'response'}}(), name: str = 'semantic_similarity', embeddings: Optional[BaseRagasEmbeddings] = None, is_cross_encoder: bool = False, threshold: Optional[float] = None)

Bases: MetricWithEmbeddings, SingleTurnMetric

Scores the semantic similarity between the ground truth and the generated answer. Cross-encoder scores are used to quantify semantic similarity. SAS paper: https://arxiv.org/pdf/2108.06130.pdf

Attributes

Name Type Description
name str
model_name

The model used for calculating semantic similarity. Defaults to open-ai-embeddings. Select a cross-encoder model for best results: https://huggingface.co/spaces/mteb/leaderboard

threshold Optional[float]

If a threshold is given, it is used to map the output to binary; defaults to 0.5
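
A minimal sketch; only embeddings are needed, and the optional threshold maps the similarity to 0/1. The model and sample are illustrative and an OpenAI API key is assumed:

import asyncio

from langchain_openai import OpenAIEmbeddings
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import SemanticSimilarity

metric = SemanticSimilarity(
    embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()),
    threshold=0.8,  # optional: map the similarity score to 0/1 at this cutoff
)
sample = SingleTurnSample(
    response="Paris is the capital of France.",
    reference="The capital of France is Paris.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))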

AspectCritic

AspectCritic(name: str, definition: str, llm: Optional[BaseRagasLLM] = None, required_columns: Optional[Dict[MetricType, Set[str]]] = None, output_type: Optional[MetricOutputType] = BINARY, single_turn_prompt: Optional[PydanticPrompt] = None, multi_turn_prompt: Optional[PydanticPrompt] = None, strictness: int = 1, max_retries: int = 1)

Bases: MetricWithLLM, SingleTurnMetric, MultiTurnMetric

Judges the submission, giving a binary verdict based on the criteria specified in the metric definition.

Attributes

Name Type Description
name str

The name of the metric

definition str

The criteria used to judge the submission, e.g. "Does the submission spread false information?"

strictness int

The number of self-consistency checks to perform. The final verdict is decided by majority vote.

Source code in src/ragas/metrics/_aspect_critic.py
def __init__(
    self,
    name: str,
    definition: str,
    llm: t.Optional[BaseRagasLLM] = None,
    required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
    output_type: t.Optional[MetricOutputType] = MetricOutputType.BINARY,
    single_turn_prompt: t.Optional[PydanticPrompt] = None,
    multi_turn_prompt: t.Optional[PydanticPrompt] = None,
    strictness: int = 1,
    max_retries: int = 1,
):
    self._required_columns = required_columns or {
        MetricType.SINGLE_TURN: {
            "user_input:optional",
            "response:optional",
            "retrieved_contexts:optional",
            "reference:optional",
            "reference_contexts:optional",
        },
        MetricType.MULTI_TURN: {
            "user_input:optional",
            "reference:optional",
        },
    }
    super().__init__(
        name=name,
        _required_columns=self._required_columns,
        llm=llm,
        output_type=output_type,
    )

    self._definition = definition
    self.single_turn_prompt = single_turn_prompt or SingleTurnAspectCriticPrompt()
    self.multi_turn_prompt = multi_turn_prompt or MultiTurnAspectCriticPrompt()
    self.max_retries = max_retries

    # update the instruction for the prompts with the definition
    instruction = f"Evaluate the Input based on the criterial defined. Use only 'Yes' (1) and 'No' (0) as verdict.\nCriteria Definition: {self._definition}"
    self.single_turn_prompt.instruction = instruction
    self.multi_turn_prompt.instruction = instruction

    # ensure odd number of checks to avoid tie in majority vote.
    self.strictness = strictness
    self.strictness = (
        self.strictness if self.strictness % 2 != 0 else self.strictness + 1
    )
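
A hedged sketch of a custom single-turn criterion; note that an even strictness is bumped to the next odd number so the majority vote cannot tie. The criterion, model, and sample are illustrative and an OpenAI API key is assumed:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AspectCritic

metric = AspectCritic(
    name="maliciousness",
    definition="Is the submission intended to harm, deceive, or exploit users?",
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
    strictness=2,  # bumped internally to 3 to avoid ties in the majority vote
)
sample = SingleTurnSample(
    user_input="How do I reset my password?",
    response="Click 'Forgot password' on the login page and follow the emailed link.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))  # 0 or 1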

ContextEntityRecall dataclass

ContextEntityRecall(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'reference', 'retrieved_contexts'}}(), name: str = 'context_entity_recall', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None, context_entity_recall_prompt: PydanticPrompt = ExtractEntitiesPrompt(), max_retries: int = 1)

Bases: MetricWithLLM, SingleTurnMetric

Calculates recall based on the entities present in the ground truth and in the context. Let CN be the set of entities present in the context and GN the set of entities present in the ground truth.

Context entity recall is then defined as: context entity recall = |CN ∩ GN| / |GN|

If this value is 1, the retrieval mechanism has retrieved context covering all entities present in the ground truth and is therefore an effective retrieval. This makes the metric useful for evaluating retrieval in use cases where entities matter, for example a tourism help chatbot.

Attributes

Name Type Description
name str
batch_size int

The batch size used for OpenAI completions.
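
A usage sketch, assuming an OpenAI API key; with the illustrative data below the reference entities {Eiffel Tower, Paris} are all covered by the retrieved context, so the score should come out near 1.0:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextEntityRecall

metric = ContextEntityRecall(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")))
sample = SingleTurnSample(
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower, in Paris, was completed in 1889."],
)
print(asyncio.run(metric.single_turn_ascore(sample)))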

LLMContextPrecisionWithReference dataclass

LLMContextPrecisionWithReference(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'retrieved_contexts', 'reference'}}(), name: str = 'llm_context_precision_with_reference', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None, context_precision_prompt: PydanticPrompt = ContextPrecisionPrompt(), max_retries: int = 1)

Bases: MetricWithLLM, SingleTurnMetric

Average precision is a metric that evaluates whether all of the relevant items selected by the model are ranked higher.

Attributes

Name Type Description
name str
evaluation_mode EvaluationMode
context_precision_prompt Prompt
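
A hedged sketch, assuming an OpenAI API key; in the illustrative data the second, irrelevant chunk is what drags average precision below 1.0:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithReference

metric = LLMContextPrecisionWithReference(
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
)
sample = SingleTurnSample(
    user_input="Who wrote 'Pride and Prejudice'?",
    retrieved_contexts=[
        "Pride and Prejudice is an 1813 novel by Jane Austen.",
        "Paris is the capital of France.",  # irrelevant chunk ranked here lowers average precision
    ],
    reference="Jane Austen wrote 'Pride and Prejudice'.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))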

LLMContextRecall dataclass

LLMContextRecall(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'retrieved_contexts', 'reference'}}(), name: str = 'context_recall', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = CONTINUOUS, context_recall_prompt: PydanticPrompt = ContextRecallClassificationPrompt(), max_retries: int = 1)

Bases: MetricWithLLM, SingleTurnMetric

Estimates context recall by estimating TP and FN using the annotated answer and the retrieved contexts.

Attributes

Name Type Description
name str

FactualCorrectness dataclass

FactualCorrectness(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'response', 'reference'}}(), name: str = 'factual_correctness', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = CONTINUOUS, mode: Literal['precision', 'recall', 'f1'] = 'f1', beta: float = 1.0, atomicity: Literal['low', 'high'] = 'low', coverage: Literal['low', 'high'] = 'low', claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt(), nli_prompt: PydanticPrompt = NLIStatementPrompt(), language: str = 'english')

Bases: MetricWithLLM, SingleTurnMetric

FactualCorrectness is a metric class that evaluates the factual correctness of responses generated by a language model. It uses claim decomposition and natural language inference (NLI) to verify the claims made in the response against the reference text.

Attributes:
name (str): The name of the metric, defaults to "factual_correctness".
_required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns for each metric type. Defaults to {"SINGLE_TURN": {"response", "reference"}}.
mode (Literal["precision", "recall", "f1"]): The evaluation mode, one of "precision", "recall", or "f1". Defaults to "f1".
beta (float): The beta value used in the F1 score calculation. beta > 1 weights recall more, while beta < 1 favors precision. Defaults to 1.0.
atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Defaults to "low".
coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Defaults to "low".
claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
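
A hedged sketch showing the mode and atomicity knobs, assuming an OpenAI API key; the model and sample are illustrative:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness

metric = FactualCorrectness(
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
    mode="precision",   # score only how many response claims are supported by the reference
    atomicity="high",   # decompose the response into finer-grained claims
)
sample = SingleTurnSample(
    response="Einstein was born in 1879 in Germany and developed the theory of relativity.",
    reference="Albert Einstein was born in 1879 in Ulm, Germany.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))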

Faithfulness dataclass

Faithfulness(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response', 'retrieved_contexts'}}(), name: str = 'faithfulness', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = CONTINUOUS, nli_statements_prompt: PydanticPrompt = NLIStatementPrompt(), statement_generator_prompt: PydanticPrompt = StatementGeneratorPrompt(), max_retries: int = 1)

FaithfulnesswithHHEM dataclass

FaithfulnesswithHHEM(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response', 'retrieved_contexts'}}(), name: str = 'faithfulness_with_hhem', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = CONTINUOUS, nli_statements_prompt: PydanticPrompt = NLIStatementPrompt(), statement_generator_prompt: PydanticPrompt = StatementGeneratorPrompt(), max_retries: int = 1, device: str = 'cpu', batch_size: int = 10)

Bases: Faithfulness

NoiseSensitivity dataclass

NoiseSensitivity(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response', 'reference', 'retrieved_contexts'}}(), name: str = 'noise_sensitivity', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = CONTINUOUS, mode: Literal['relevant', 'irrelevant'] = 'relevant', nli_statements_prompt: PydanticPrompt = NLIStatementPrompt(), statement_generator_prompt: PydanticPrompt = StatementGeneratorPrompt(), max_retries: int = 1)

AnswerAccuracy dataclass

AnswerAccuracy(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'response', 'reference'}}(), name: str = 'nv_accuracy', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None)

Bases: MetricWithLLM, SingleTurnMetric

Measures answer accuracy against the ground truth for a given user_input. This metric averages two distinct judge prompts for evaluation.

Top 10 Zero-shot LLM-as-a-Judge leaderboard:
1) mistralai/mixtral-8x22b-instruct-v0.1
2) mistralai/mixtral-8x7b-instruct-v0.1
3) meta/llama-3.1-70b-instruct
4) meta/llama-3.3-70b-instruct
5) meta/llama-3.1-405b-instruct
6) mistralai/mistral-nemo-12b-instruct
7) nvidia/llama-3.1-nemotron-70b-instruct
8) meta/llama-3.1-8b-instruct
9) google/gemma-2-2b-it
10) nvidia/nemotron-mini-4b-instruct
The top model on this leaderboard correlates highly with human judgment (~0.90).

Attributes

Name Type Description
name string

The name of the metric

answer_accuracy

The AnswerAccuracy object
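
A minimal usage sketch, assuming an OpenAI API key; the model and sample are illustrative:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AnswerAccuracy

metric = AnswerAccuracy(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")))
sample = SingleTurnSample(
    user_input="How many states are in the USA?",
    response="There are 50 states in the USA.",
    reference="The United States has 50 states.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))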

ContextRelevance dataclass

ContextRelevance(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'user_input', 'retrieved_contexts'}}(), name: str = 'nv_context_relevance', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None)

Bases: MetricWithLLM, SingleTurnMetric

Scores the relevance of the retrieved contexts based on the user input. A combined usage sketch appears after the ResponseGroundedness section below.

Input: data: a list of dicts with the keys user_input and retrieved_contexts.
Output:
0.0: the retrieved_contexts are not relevant to the user_input
0.5: the retrieved_contexts are partially relevant to the user_input
1.0: the retrieved_contexts are fully relevant to the user_input

ResponseGroundedness dataclass

ResponseGroundedness(_required_columns: Dict[MetricType, Set[str]] = lambda: {SINGLE_TURN: {'response', 'retrieved_contexts'}}(), name: str = 'nv_response_groundedness', llm: Optional[BaseRagasLLM] = None, output_type: Optional[MetricOutputType] = None)

Bases: MetricWithLLM, SingleTurnMetric

Scores the groundedness of the response based on the retrieved contexts.

Input: data: a list of dicts with the keys response and retrieved_contexts.
Output:
0.0: the response is not grounded in the retrieved contexts
0.5: the response is partially grounded in the retrieved contexts
1.0: the response is fully grounded in the retrieved contexts
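
A hedged sketch scoring both of these retrieval metrics on the same illustrative record, assuming an OpenAI API key; each returns 0.0, 0.5, or 1.0:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ContextRelevance, ResponseGroundedness

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
sample = SingleTurnSample(
    user_input="When was the Golden Gate Bridge opened?",
    response="The Golden Gate Bridge opened in 1937.",
    retrieved_contexts=["The Golden Gate Bridge was opened to traffic in May 1937."],
)
for metric in (ContextRelevance(llm=llm), ResponseGroundedness(llm=llm)):
    print(metric.name, asyncio.run(metric.single_turn_ascore(sample)))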

SimpleCriteriaScore

SimpleCriteriaScore(name: str, definition: str, llm: Optional[BaseRagasLLM] = None, required_columns: Optional[Dict[MetricType, Set[str]]] = None, output_type: Optional[MetricOutputType] = DISCRETE, single_turn_prompt: Optional[PydanticPrompt] = None, multi_turn_prompt: Optional[PydanticPrompt] = None, strictness: int = 1)

Bases: MetricWithLLM, SingleTurnMetric, MultiTurnMetric

Judges the submission against the criteria specified in the metric definition and returns a discrete score.

Attributes

Name Type Description
name str

The name of the metric

definition str

The criteria for scoring the submission

strictness int

The number of self-consistency checks to perform. The final verdict is decided by majority vote.

Source code in src/ragas/metrics/_simple_criteria.py
def __init__(
    self,
    name: str,
    definition: str,
    llm: t.Optional[BaseRagasLLM] = None,
    required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
    output_type: t.Optional[MetricOutputType] = MetricOutputType.DISCRETE,
    single_turn_prompt: t.Optional[PydanticPrompt] = None,
    multi_turn_prompt: t.Optional[PydanticPrompt] = None,
    strictness: int = 1,
):
    if required_columns is None:
        required_columns = {
            MetricType.SINGLE_TURN: {
                "user_input:optional",
                "response:optional",
                "retrieved_contexts:optional",
                "reference:optional",
                "reference_contexts:optional",
            },
            MetricType.MULTI_TURN: {
                "user_input:optional",
                "reference:optional",
            },
        }
    super().__init__(
        name=name,
        llm=llm,
        _required_columns=required_columns,
        output_type=output_type,
    )

    self._definition = definition
    self.single_turn_prompt = single_turn_prompt or SingleTurnSimpleCriteriaPrompt()
    self.multi_turn_prompt = multi_turn_prompt or MultiTurnSimpleCriteriaPrompt()

    # update the instruction for the prompts with the definition
    instruction = f"Evaluate the input based on the criteria defined.\nCriteria Definition: {self._definition}"
    self.single_turn_prompt.instruction = instruction
    self.multi_turn_prompt.instruction = instruction

    # ensure odd number of checks to avoid tie in majority vote.
    self.strictness = strictness
    self.strictness = (
        self.strictness if self.strictness % 2 != 0 else self.strictness + 1
    )
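
A hedged sketch of a custom scoring criterion, assuming an OpenAI API key; the criterion, model, and sample are illustrative:

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import SimpleCriteriaScore

metric = SimpleCriteriaScore(
    name="clarity",
    definition="Score 0 to 5 for how clearly the response answers the question.",
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")),
)
sample = SingleTurnSample(
    user_input="Explain what an index does in a database.",
    response="An index is a data structure that speeds up lookups on a column at the cost of extra storage and slower writes.",
)
print(asyncio.run(metric.single_turn_ascore(sample)))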
