Integrations

ragas.integrations.langchain

EvaluatorChain

EvaluatorChain(metric: Metric, **kwargs: Any)

Bases: Chain, RunEvaluator

Wrapper around ragas Metrics for use with langsmith.

Source code in src/ragas/integrations/langchain.py
def __init__(self, metric: Metric, **kwargs: t.Any):
    kwargs["metric"] = metric
    super().__init__(**kwargs)
    if "run_config" in kwargs:
        run_config = kwargs["run_config"]
    else:
        run_config = RunConfig()
    if isinstance(self.metric, MetricWithLLM):
        llm = get_or_init(kwargs, "llm", ChatOpenAI)
        t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
    if isinstance(self.metric, MetricWithEmbeddings):
        embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings)
        t.cast(
            MetricWithEmbeddings, self.metric
        ).embeddings = LangchainEmbeddingsWrapper(embeddings)
    self.metric.init(run_config)

    assert isinstance(self.metric, SingleTurnMetric), (
        "Metric must be SingleTurnMetric"
    )
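
As a usage illustration (not part of the source above), the following minimal sketch wraps the faithfulness metric in an EvaluatorChain and scores a single example. It assumes an OpenAI API key is configured, since the wrapper falls back to ChatOpenAI and OpenAIEmbeddings when no llm/embeddings are passed, and the exact column names can vary between ragas versions.

# Minimal sketch: wrap a ragas metric so it behaves as a LangChain chain /
# LangSmith run evaluator. Requires OPENAI_API_KEY because the wrapper
# defaults to ChatOpenAI and OpenAIEmbeddings.
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

faithfulness_chain = EvaluatorChain(metric=faithfulness)

# Column names ("question", "answer", "contexts") follow the legacy v1 schema
# used by this integration; adjust if your ragas version expects other keys.
result = faithfulness_chain.invoke(
    {
        "question": "How many hours of daylight does Lisbon get in July?",
        "answer": "Lisbon gets roughly 14 to 15 hours of daylight in July.",
        "contexts": ["In July, Lisbon has about 14.5 hours of daylight per day."],
    }
)
print(result[faithfulness.name])  # score between 0 and 1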

evaluate_run

evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResult

Evaluate a langsmith run.

Source code in src/ragas/integrations/langchain.py
@t.no_type_check
def evaluate_run(
    self, run: Run, example: t.Optional[Example] = None
) -> EvaluationResult:
    """
    Evaluate a langsmith run
    """
    # Moved away from this implementation in LangChain evaluations;
    # we can safely ignore type checking for this legacy function.
    self._validate_langsmith_eval(run, example)

    # this is just to suppress the type checker error
    # actual check and error message is in the _validate_langsmith_eval
    assert run.outputs is not None
    assert example is not None
    assert example.inputs is not None
    assert example.outputs is not None

    chain_eval = run.outputs
    chain_eval["question"] = example.inputs["question"]
    if "ground_truth" in get_required_columns_v1(self.metric):
        if example.outputs is None or "ground_truth" not in example.outputs:
            raise ValueError("expected `ground_truth` in example outputs.")
        chain_eval["ground_truth"] = example.outputs["ground_truth"]
    eval_output = self.invoke(chain_eval, include_run_info=True)

    evaluation_result = EvaluationResult(
        key=self.metric.name, score=eval_output[self.metric.name]
    )
    if RUN_KEY in eval_output:
        evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
    return evaluation_result

ragas.integrations.langsmith

upload_dataset

upload_dataset(dataset: Testset, dataset_name: str, dataset_desc: str = '') -> Dataset

Uploads a new dataset to LangSmith, converting it from a TestDataset object to a pandas DataFrame before upload. If a dataset with the specified name already exists, the function raises an error.

Parameters

dataset : TestDataset (required)
    The dataset to be uploaded.
dataset_name : str (required)
    The name for the new dataset in LangSmith.
dataset_desc : str, optional (default: '')
    A description for the new dataset. Defaults to an empty string.

Returns

Dataset
    The dataset object as stored in LangSmith after upload.

Raises

ValueError
    If a dataset with the specified name already exists in LangSmith.

Notes

The function first attempts to read a dataset with the given name to check whether it exists. If it is not found, the dataset is converted to a pandas DataFrame and uploaded, with the input and output keys specified for the uploaded dataset.
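
As an illustrative sketch (names are placeholders), a test set generated with ragas can be pushed to LangSmith as shown below; it assumes LangSmith credentials are configured and that testset is a ragas Testset produced elsewhere.

# Illustrative sketch: upload a previously generated ragas Testset to LangSmith.
# `testset` is assumed to come from the ragas test set generator; the dataset
# name and description are arbitrary examples.
from ragas.integrations.langsmith import upload_dataset

langsmith_dataset = upload_dataset(
    dataset=testset,
    dataset_name="basic-rag-testset",
    dataset_desc="Synthetic test set generated with ragas",
)
print(langsmith_dataset.url)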

Source code in src/ragas/integrations/langsmith.py
def upload_dataset(
    dataset: Testset, dataset_name: str, dataset_desc: str = ""
) -> LangsmithDataset:
    """
    Uploads a new dataset to LangSmith, converting it from a TestDataset object to a
    pandas DataFrame before upload. If a dataset with the specified name already
    exists, the function raises an error.

    Parameters
    ----------
    dataset : TestDataset
        The dataset to be uploaded.
    dataset_name : str
        The name for the new dataset in LangSmith.
    dataset_desc : str, optional
        A description for the new dataset. The default is an empty string.

    Returns
    -------
    LangsmithDataset
        The dataset object as stored in LangSmith after upload.

    Raises
    ------
    ValueError
        If a dataset with the specified name already exists in LangSmith.

    Notes
    -----
    The function attempts to read a dataset by the given name to check its existence.
    If not found, it proceeds to upload the dataset after converting it to a pandas
    DataFrame. This involves specifying input and output keys for the dataset being
    uploaded.
    """
    client = Client()
    try:
        # check if dataset exists
        langsmith_dataset: LangsmithDataset = client.read_dataset(
            dataset_name=dataset_name
        )
        raise ValueError(
            f"Dataset {dataset_name} already exists in langsmith. [{langsmith_dataset}]"
        )
    except LangSmithNotFoundError:
        # if not create a new one with the generated query examples
        langsmith_dataset: LangsmithDataset = client.upload_dataframe(
            df=dataset.to_pandas(),
            name=dataset_name,
            input_keys=["question"],
            output_keys=["ground_truth"],
            description=dataset_desc,
        )

        print(
            f"Created a new dataset '{langsmith_dataset.name}'. Dataset is accessible at {langsmith_dataset.url}"
        )
        return langsmith_dataset

evaluate

evaluate(dataset_name: str, llm_or_chain_factory: Any, experiment_name: Optional[str] = None, metrics: Optional[list] = None, verbose: bool = False) -> Dict[str, Any]

Evaluates a language model or a chain factory on a specified dataset using LangSmith, with the option to customize metrics and verbosity.

Parameters

dataset_name : str (required)
    The name of the dataset to use for evaluation. This dataset must exist in LangSmith.
llm_or_chain_factory : Any (required)
    The language model or chain factory to be evaluated. This parameter is flexible and can accept a variety of objects depending on the implementation.
experiment_name : Optional[str] (default: None)
    The name of the experiment. This can be used to categorize or identify the evaluation run within LangSmith. The default is None.
metrics : Optional[list] (default: None)
    A list of custom metrics (functions or evaluators) to be used for the evaluation. If None, a default set of metrics (answer relevancy, context precision, context recall, and faithfulness) is used. The default is None.
verbose : bool (default: False)
    If True, detailed progress and results are printed during the evaluation. The default is False.

Returns

Dict[str, Any]
    A dictionary containing the results of the evaluation.

Raises

ValueError
    If the specified dataset does not exist in LangSmith.

See Also

Client.read_dataset : Method to read an existing dataset.
Client.run_on_dataset : Method to run the evaluation on the specified dataset.

Examples

>>> results = evaluate(
...     dataset_name="MyDataset",
...     llm_or_chain_factory=my_llm,
...     experiment_name="experiment_1_with_vanila_rag",
...     verbose=True
... )
>>> print(results)
{'evaluation_result': ...}
Notes

The function initializes a client to interact with LangSmith, validates that the specified dataset exists, prepares the evaluation metrics, and runs the evaluation, returning the results. Custom evaluation metrics can be specified; if none are provided, a default set is used. A hedged sketch of a call with a custom metric list follows.
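
# Illustrative sketch: run the evaluation with a custom metric list instead of
# the defaults. `build_rag_chain` is a placeholder for your own chain factory,
# and the dataset must already exist in LangSmith.
from ragas.integrations.langsmith import evaluate
from ragas.metrics import answer_relevancy, faithfulness

results = evaluate(
    dataset_name="basic-rag-testset",
    llm_or_chain_factory=build_rag_chain,
    experiment_name="experiment_2_custom_metrics",
    metrics=[faithfulness, answer_relevancy],
    verbose=True,
)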

Source code in src/ragas/integrations/langsmith.py
def evaluate(
    dataset_name: str,
    llm_or_chain_factory: t.Any,
    experiment_name: t.Optional[str] = None,
    metrics: t.Optional[list] = None,
    verbose: bool = False,
) -> t.Dict[str, t.Any]:
    """
    Evaluates a language model or a chain factory on a specified dataset using
    LangSmith, with the option to customize metrics and verbosity.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to use for evaluation. This dataset must exist in
        LangSmith.
    llm_or_chain_factory : Any
        The language model or chain factory to be evaluated. This parameter is
        flexible and can accept a variety of objects depending on the implementation.
    experiment_name : Optional[str], optional
        The name of the experiment. This can be used to categorize or identify the
        evaluation run within LangSmith. The default is None.
    metrics : Optional[list], optional
        A list of custom metrics (functions or evaluators) to be used for the
        evaluation. If None, a default set of metrics (answer relevancy, context
        precision, context recall, and faithfulness) are used.
        The default is None.
    verbose : bool, optional
        If True, detailed progress and results will be printed during the evaluation
        process.
        The default is False.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing the results of the evaluation.

    Raises
    ------
    ValueError
        If the specified dataset does not exist in LangSmith.

    See Also
    --------
    Client.read_dataset : Method to read an existing dataset.
    Client.run_on_dataset : Method to run the evaluation on the specified dataset.

    Examples
    --------
    >>> results = evaluate(
    ...     dataset_name="MyDataset",
    ...     llm_or_chain_factory=my_llm,
    ...     experiment_name="experiment_1_with_vanila_rag",
    ...     verbose=True
    ... )
    >>> print(results)
    {'evaluation_result': ...}

    Notes
    -----
    The function initializes a client to interact with LangSmith, validates the existence
    of the specified dataset, prepares evaluation metrics, and runs the evaluation,
    returning the results. Custom evaluation metrics can be specified, or a default set
    will be used if none are provided.
    """
    # init client and validate dataset
    client = Client()
    try:
        _ = client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError:
        raise ValueError(
            f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith"
        )

    # make config
    if metrics is None:
        from ragas.metrics import (
            answer_relevancy,
            context_precision,
            context_recall,
            faithfulness,
        )

        metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

    metrics = [EvaluatorChain(m) for m in metrics]
    eval_config = RunEvalConfig(
        custom_evaluators=metrics,
    )

    # run evaluation with langsmith
    run = client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=llm_or_chain_factory,
        evaluation=eval_config,
        verbose=verbose,
        # Any experiment metadata can be specified here
        project_name=experiment_name,
    )

    return run

ragas.integrations.llama_index

convert_to_ragas_messages

convert_to_ragas_messages(events: List[Event]) -> List[Message]

Convert a sequence of LlamaIndex agent events into Ragas message objects.

This function processes a list of Event objects (e.g., AgentInput, AgentOutput, and ToolCallResult) and converts them into a list of Message objects (HumanMessage, AIMessage, and ToolMessage) that can be used for evaluation with the Ragas framework.

Parameters

events : List[Event] (required)
    A list of agent events that represent a conversation trace. These can include user inputs (AgentInput), model outputs (AgentOutput), and tool responses (ToolCallResult).

Returns

List[Message]
    A list of Ragas Message objects corresponding to the structured conversation. Tool calls are de-duplicated by their tool ID to avoid repeated entries.
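
As a hedged sketch of downstream use: once the agent events have been collected (for example while streaming a llama_index agent workflow run), the converted messages can populate a Ragas multi-turn sample. The events variable below is an assumed, pre-collected list.

# Illustrative sketch: turn pre-collected LlamaIndex workflow events into Ragas
# messages and wrap them in a multi-turn sample for evaluation. `events` is
# assumed to be a list of AgentInput / AgentOutput / ToolCallResult objects
# gathered while the agent workflow ran.
from ragas.dataset_schema import MultiTurnSample
from ragas.integrations.llama_index import convert_to_ragas_messages

ragas_messages = convert_to_ragas_messages(events)
sample = MultiTurnSample(user_input=ragas_messages)
# `sample` can then be scored with a multi-turn metric such as tool-call accuracy.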

Source code in src/ragas/integrations/llama_index.py
def convert_to_ragas_messages(events: t.List[Event]) -> t.List[Message]:
    """
    Convert a sequence of LlamaIndex agent events into Ragas message objects.

    This function processes a list of `Event` objects (e.g., `AgentInput`, `AgentOutput`,
    and `ToolCallResult`) and converts them into a list of `Message` objects (`HumanMessage`,
    `AIMessage`, and `ToolMessage`) that can be used for evaluation with the Ragas framework.

    Parameters
    ----------
    events : List[Event]
        A list of agent events that represent a conversation trace. These can include
        user inputs (`AgentInput`), model outputs (`AgentOutput`), and tool responses
        (`ToolCallResult`).

    Returns
    -------
    List[Message]
        A list of Ragas `Message` objects corresponding to the structured conversation.
        Tool calls are de-duplicated using their tool ID to avoid repeated entries.
    """
    try:
        from llama_index.core.agent.workflow import (
            AgentInput,
            AgentOutput,
            ToolCallResult,
        )
        from llama_index.core.base.llms.types import MessageRole, TextBlock
    except ImportError:
        raise ImportError(
            "Please install the llama_index package to use this function."
        )
    ragas_messages = []
    tool_call_ids = set()

    for event in events:
        if isinstance(event, AgentInput):
            last_chat_message = event.input[-1]

            content = ""
            if last_chat_message.blocks:
                content = "\n".join(
                    str(block.text)
                    for block in last_chat_message.blocks
                    if isinstance(block, TextBlock)
                )

            if last_chat_message.role == MessageRole.USER:
                if ragas_messages and isinstance(ragas_messages[-1], ToolMessage):
                    continue
                ragas_messages.append(HumanMessage(content=content))

        elif isinstance(event, AgentOutput):
            content = "\n".join(
                str(block.text)
                for block in event.response.blocks
                if isinstance(block, TextBlock)
            )
            ragas_tool_calls = None

            if hasattr(event, "tool_calls"):
                raw_tool_calls = event.tool_calls
                ragas_tool_calls = []
                for tc in raw_tool_calls:
                    if tc.tool_id not in tool_call_ids:
                        tool_call_ids.add(tc.tool_id)
                        ragas_tool_calls.append(
                            ToolCall(
                                name=tc.tool_name,
                                args=tc.tool_kwargs,
                            )
                        )
            ragas_messages.append(
                AIMessage(
                    content=content,
                    tool_calls=ragas_tool_calls if ragas_tool_calls else None,
                )
            )
        elif isinstance(event, ToolCallResult):
            if event.return_direct:
                ragas_messages.append(AIMessage(content=event.tool_output.content))
            else:
                ragas_messages.append(ToolMessage(content=event.tool_output.content))

    return ragas_messages

ragas.integrations.opik

OpikTracer

Bases: OpikTracer

Callback for Opik that can be used to log traces and evaluation scores to the Opik platform.

Attributes

tags : list[string]
    The tags to set on each trace.
metadata : dict
    Additional metadata to log for each trace.
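
A hedged usage sketch, assuming Opik credentials are configured and eval_dataset is an existing ragas evaluation dataset: the tracer is passed through the callbacks argument of ragas.evaluate so that traces and metric scores are logged to Opik.

# Illustrative sketch: log a ragas evaluation run to Opik via the tracer callback.
# `eval_dataset` is a placeholder for a ragas EvaluationDataset; the tags and
# metadata values are arbitrary examples.
from ragas import evaluate
from ragas.integrations.opik import OpikTracer
from ragas.metrics import faithfulness

opik_tracer = OpikTracer(tags=["ragas-eval"], metadata={"experiment": "baseline"})
results = evaluate(eval_dataset, metrics=[faithfulness], callbacks=[opik_tracer])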

ragas.integrations.helicone

ragas.integrations.langgraph

convert_to_ragas_messages

convert_to_ragas_messages(messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]], metadata: bool = False) -> List[Union[HumanMessage, AIMessage, ToolMessage]]

Convert LangChain messages into Ragas messages with metadata for agent evaluation.

Parameters

messages : List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]] (required)
    List of LangChain message objects to be converted.
metadata : bool, optional (default: False)
    Whether to include metadata in the converted messages.

Returns

List[Union[HumanMessage, AIMessage, ToolMessage]]
    List of corresponding Ragas message objects with metadata.

Raises

ValueError
    If an unsupported message type is encountered.
TypeError
    If message content is not a string.

Notes

SystemMessages are skipped during conversion.
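
As a hedged sketch, the message history returned by a compiled LangGraph agent (conventionally stored under a "messages" key in the final state) can be converted and wrapped in a Ragas multi-turn sample; result below is an assumed invocation result.

# Illustrative sketch: convert a LangGraph run's message history into Ragas
# messages. `result` is assumed to be the final state dict of a compiled graph
# invocation that keeps its history under the conventional "messages" key.
from ragas.dataset_schema import MultiTurnSample
from ragas.integrations.langgraph import convert_to_ragas_messages

ragas_messages = convert_to_ragas_messages(result["messages"], metadata=False)
sample = MultiTurnSample(user_input=ragas_messages)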

Source code in src/ragas/integrations/langgraph.py
def convert_to_ragas_messages(
    messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]],
    metadata: bool = False,
) -> List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]:
    """
    Convert LangChain messages into Ragas messages with metadata for agent evaluation.

    Parameters
    ----------
    messages : List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]
        List of LangChain message objects to be converted.
    metadata : bool, optional (default=False)
        Whether to include metadata in the converted messages.

    Returns
    -------
    List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]
        List of corresponding Ragas message objects with metadata.

    Raises
    ------
    ValueError
        If an unsupported message type is encountered.
    TypeError
        If message content is not a string.

    Notes
    -----
    SystemMessages are skipped in the conversion process.
    """

    def _validate_string_content(message, message_type: str) -> str:
        if not isinstance(message.content, str):
            raise TypeError(
                f"{message_type} content must be a string, got {type(message.content).__name__}. "
                f"Content: {message.content}"
            )
        return message.content

    def _extract_metadata(message) -> dict:
        return {k: v for k, v in message.__dict__.items() if k != "content"}

    if metadata:
        MESSAGE_TYPE_MAP = {
            HumanMessage: lambda m: r.HumanMessage(
                content=_validate_string_content(m, "HumanMessage"),
                metadata=_extract_metadata(m),
            ),
            ToolMessage: lambda m: r.ToolMessage(
                content=_validate_string_content(m, "ToolMessage"),
                metadata=_extract_metadata(m),
            ),
        }
    else:
        MESSAGE_TYPE_MAP = {
            HumanMessage: lambda m: r.HumanMessage(
                content=_validate_string_content(m, "HumanMessage")
            ),
            ToolMessage: lambda m: r.ToolMessage(
                content=_validate_string_content(m, "ToolMessage")
            ),
        }

    def _extract_tool_calls(message: AIMessage) -> List[r.ToolCall]:
        tool_calls = message.additional_kwargs.get("tool_calls", [])
        return [
            r.ToolCall(
                name=tool_call["function"]["name"],
                args=json.loads(tool_call["function"]["arguments"]),
            )
            for tool_call in tool_calls
        ]

    def _convert_ai_message(message: AIMessage, metadata: bool) -> r.AIMessage:
        tool_calls = _extract_tool_calls(message) if message.additional_kwargs else None
        if metadata:
            return r.AIMessage(
                content=_validate_string_content(message, "AIMessage"),
                tool_calls=tool_calls,
                metadata=_extract_metadata(message),
            )
        else:
            return r.AIMessage(
                content=_validate_string_content(message, "AIMessage"),
                tool_calls=tool_calls,
            )

    def _convert_message(message, metadata: bool = False):
        if isinstance(message, SystemMessage):
            return None  # Skip SystemMessages
        if isinstance(message, AIMessage):
            return _convert_ai_message(message, metadata)
        converter = MESSAGE_TYPE_MAP.get(type(message))
        if converter is None:
            raise ValueError(f"Unsupported message type: {type(message).__name__}")
        return converter(message)

    return [
        converted
        for message in messages
        if (converted := _convert_message(message, metadata)) is not None
    ]