跳到内容

集成

ragas.integrations.langchain

EvaluatorChain

EvaluatorChain(metric: Metric, **kwargs: Any)

基类: Chain, RunEvaluator

ragas 指标的包装器,用于与 langsmith 一起使用。

源代码位于 src/ragas/integrations/langchain.py
def __init__(self, metric: Metric, **kwargs: t.Any):
    """
    Set up the chain around a ragas metric.

    Parameters
    ----------
    metric : Metric
        The ragas metric to wrap. It is passed to the parent initializer
        under the ``metric`` keyword.
    **kwargs : Any
        Extra keyword arguments forwarded to the parent initializer. The keys
        ``run_config``, ``llm`` and ``embeddings`` are additionally read here.

    Raises
    ------
    AssertionError
        If the metric is not a ``SingleTurnMetric`` (checked after the metric
        has been initialised).
    """
    kwargs["metric"] = metric
    super().__init__(**kwargs)

    # Honour a caller-supplied run configuration; build a default one only
    # when none was given.
    run_config = kwargs["run_config"] if "run_config" in kwargs else RunConfig()

    # Metrics that need an LLM get one wrapped for ragas.
    # presumably get_or_init returns kwargs["llm"] or a fresh ChatOpenAI — TODO confirm
    if isinstance(self.metric, MetricWithLLM):
        chat_model = get_or_init(kwargs, "llm", ChatOpenAI)
        t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(chat_model)

    # Same treatment for metrics that need an embedding model.
    if isinstance(self.metric, MetricWithEmbeddings):
        embedding_model = get_or_init(kwargs, "embeddings", OpenAIEmbeddings)
        wrapped_embeddings = LangchainEmbeddingsWrapper(embedding_model)
        t.cast(MetricWithEmbeddings, self.metric).embeddings = wrapped_embeddings

    self.metric.init(run_config)

    assert isinstance(self.metric, SingleTurnMetric), "Metric must be SingleTurnMetric"

evaluate_run

evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResult

评估 langsmith 运行

源代码位于 src/ragas/integrations/langchain.py
@t.no_type_check
def evaluate_run(
    self, run: Run, example: t.Optional[Example] = None
) -> EvaluationResult:
    """
    Evaluate a langsmith run

    Parameters
    ----------
    run : Run
        The langsmith run whose ``outputs`` are scored.
    example : Optional[Example]
        The dataset example the run was produced from. Its ``inputs`` must
        contain ``question``; its ``outputs`` must contain ``ground_truth``
        when the wrapped metric requires that column.

    Returns
    -------
    EvaluationResult
        Result keyed by the metric name, with the metric's score. When the
        chain output carries run info, it is attached under ``RUN_KEY`` in
        ``evaluator_info``.

    Raises
    ------
    ValueError
        If the metric requires ``ground_truth`` and the example outputs do
        not provide it.
    """
    # Moved away from this implementation in LangChain evaluations;
    # we can safely ignore type checking for this legacy function.
    self._validate_langsmith_eval(run, example)

    # this is just to suppress the type checker error
    # actual check and error message is in the _validate_langsmith_eval
    assert run.outputs is not None
    assert example is not None
    assert example.inputs is not None
    assert example.outputs is not None

    # Work on a shallow copy: the keys added below are inputs for the metric
    # chain only and must not leak into the caller's ``run.outputs``.
    chain_eval = dict(run.outputs)
    chain_eval["question"] = example.inputs["question"]
    if "ground_truth" in get_required_columns_v1(self.metric):
        # Kept as a real check because the asserts above vanish under ``-O``.
        if example.outputs is None or "ground_truth" not in example.outputs:
            raise ValueError("expected `ground_truth` in example outputs.")
        chain_eval["ground_truth"] = example.outputs["ground_truth"]
    eval_output = self.invoke(chain_eval, include_run_info=True)

    evaluation_result = EvaluationResult(
        key=self.metric.name, score=eval_output[self.metric.name]
    )
    if RUN_KEY in eval_output:
        evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
    return evaluation_result

ragas.integrations.langsmith

upload_dataset

upload_dataset(dataset: Testset, dataset_name: str, dataset_desc: str = '') -> Dataset

将新数据集上传到 LangSmith,在上传之前将其从 Testset 对象转换为 pandas DataFrame。如果指定名称的数据集已存在,函数将引发错误。

参数

名称 类型 描述 默认值
dataset Testset

要上传的数据集。

必需
dataset_name str

LangSmith 中新数据集的名称。

必需
dataset_desc str

新数据集的描述。默认值为空字符串。

''

返回值

类型 描述
Dataset

上传后存储在 LangSmith 中的数据集对象。

引发

类型 描述
ValueError

如果指定名称的数据集已存在于 LangSmith 中。

注意

函数尝试按给定名称读取数据集以检查其是否存在。如果未找到,它将数据集转换为 pandas DataFrame 后继续上传。这涉及指定要上传数据集的输入和输出键。

源代码位于 src/ragas/integrations/langsmith.py
def upload_dataset(
    dataset: Testset, dataset_name: str, dataset_desc: str = ""
) -> LangsmithDataset:
    """
    Upload a test set to LangSmith as a brand-new dataset.

    The test set is converted to a pandas DataFrame and uploaded with
    ``question`` as the input key and ``ground_truth`` as the output key.
    A confirmation line with the dataset name and URL is printed on success.

    Parameters
    ----------
    dataset : Testset
        The test set to be uploaded.
    dataset_name : str
        The name for the new dataset in LangSmith.
    dataset_desc : str, optional
        A description for the new dataset. The default is an empty string.

    Returns
    -------
    LangsmithDataset
        The dataset object returned by LangSmith for the upload.

    Raises
    ------
    ValueError
        If a dataset with the specified name already exists in LangSmith.

    Notes
    -----
    Existence is probed by reading the dataset by name; the upload only
    happens when that lookup reports the dataset as not found.
    """
    client = Client()
    try:
        # Probe for a dataset that already uses this name.
        existing: LangsmithDataset = client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError:
        # Name is free: create the dataset from the generated examples.
        frame = dataset.to_pandas()
        created: LangsmithDataset = client.upload_dataframe(
            df=frame,
            name=dataset_name,
            input_keys=["question"],
            output_keys=["ground_truth"],
            description=dataset_desc,
        )
        print(
            f"Created a new dataset '{created.name}'. Dataset is accessible at {created.url}"
        )
        return created
    else:
        # The lookup succeeded, so refuse to overwrite the existing dataset.
        raise ValueError(
            f"Dataset {dataset_name} already exists in langsmith. [{existing}]"
        )

evaluate

evaluate(dataset_name: str, llm_or_chain_factory: Any, experiment_name: Optional[str] = None, metrics: Optional[list] = None, verbose: bool = False) -> Dict[str, Any]

使用 LangSmith 在指定数据集上评估语言模型或链工厂,可选择自定义指标和详细程度。

参数

名称 类型 描述 默认值
dataset_name str

用于评估的数据集名称。此数据集必须存在于 LangSmith 中。

必需
llm_or_chain_factory Any

要评估的语言模型或链工厂。此参数是灵活的,可以根据实现接受各种对象。

必需
experiment_name Optional[str]

实验的名称。这可用于在 LangSmith 中对评估运行进行分类或标识。默认值为 None。

None
metrics Optional[list]

用于评估的自定义指标(函数或评估器)列表。如果为 None,则使用默认指标集(答案相关性、上下文精确度、上下文召回率和忠实度)。默认值为 None。

None
verbose bool

如果为 True,评估过程中将打印详细进度和结果。默认值为 False。

False

返回值

类型 描述
Dict[str, Any]

包含评估结果的字典。

引发

类型 描述
ValueError

如果指定的数据集不存在于 LangSmith 中。

另请参见

Client.read_dataset : 读取现有数据集的方法。 Client.run_on_dataset : 在指定数据集上运行评估的方法。

示例

>>> results = evaluate(
...     dataset_name="MyDataset",
...     llm_or_chain_factory=my_llm,
...     experiment_name="experiment_1_with_vanila_rag",
...     verbose=True
... )
>>> print(results)
{'evaluation_result': ...}
注意

函数初始化一个客户端以与 LangSmith 交互,验证指定数据集的存在性,准备评估指标,然后运行评估并返回结果。可以指定自定义评估指标,如果未提供,则使用默认集。

源代码位于 src/ragas/integrations/langsmith.py
def evaluate(
    dataset_name: str,
    llm_or_chain_factory: t.Any,
    experiment_name: t.Optional[str] = None,
    metrics: t.Optional[list] = None,
    verbose: bool = False,
) -> t.Dict[str, t.Any]:
    """
    Evaluates a language model or a chain factory on a specified dataset using
    LangSmith, with the option to customize metrics and verbosity.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to use for evaluation. This dataset must exist in
        LangSmith.
    llm_or_chain_factory : Any
        The language model or chain factory to be evaluated. It is passed
        unchanged to ``Client.run_on_dataset``.
    experiment_name : Optional[str], optional
        The name of the experiment; forwarded as the LangSmith project name.
        The default is None.
    metrics : Optional[list], optional
        A list of ragas metrics to be used for the evaluation. Each one is
        wrapped in an ``EvaluatorChain``. If None, a default set of metrics
        (answer relevancy, context precision, faithfulness, and context recall)
        is used. The default is None.
    verbose : bool, optional
        Forwarded to ``Client.run_on_dataset``. The default is False.

    Returns
    -------
    Dict[str, Any]
        Whatever ``Client.run_on_dataset`` returns for the evaluation run.

    Raises
    ------
    ValueError
        If the specified dataset does not exist in LangSmith. The original
        lookup error is attached as the cause.

    See Also
    --------
    Client.read_dataset : Method to read an existing dataset.
    Client.run_on_dataset : Method to run the evaluation on the specified dataset.

    Examples
    --------
    >>> results = evaluate(
    ...     dataset_name="MyDataset",
    ...     llm_or_chain_factory=my_llm,
    ...     experiment_name="experiment_1_with_vanilla_rag",
    ...     verbose=True
    ... )
    >>> print(results)
    {'evaluation_result': ...}

    Notes
    -----
    The function initializes a client to interact with LangSmith, validates the existence
    of the specified dataset, prepares evaluation metrics, and runs the evaluation,
    returning the results. Custom evaluation metrics can be specified, or a default set
    will be used if none are provided.
    """
    # init client and validate dataset
    client = Client()
    try:
        client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError as err:
        # Chain explicitly so the traceback shows the lookup failure as the
        # cause rather than as an unrelated error raised during handling.
        raise ValueError(
            f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith"
        ) from err

    # make config
    if metrics is None:
        # Imported lazily: only needed when the caller supplies no metrics.
        from ragas.metrics import (
            answer_relevancy,
            context_precision,
            context_recall,
            faithfulness,
        )

        metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

    # Wrap each ragas metric so LangSmith can run it as a custom evaluator.
    evaluators = [EvaluatorChain(m) for m in metrics]
    eval_config = RunEvalConfig(
        custom_evaluators=evaluators,
    )

    # run evaluation with langsmith
    run = client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=llm_or_chain_factory,
        evaluation=eval_config,
        verbose=verbose,
        # Any experiment metadata can be specified here
        project_name=experiment_name,
    )

    return run

ragas.integrations.llama_index

ragas.integrations.opik

OpikTracer

基类: OpikTracer

Opik 的回调,可用于将跟踪和评估分数记录到 Opik 平台。

属性

名称 类型 描述
tags List[str]

为每个跟踪设置的标签。

metadata dict

为每个跟踪记录的附加元数据。

ragas.integrations.helicone

ragas.integrations.langgraph

convert_to_ragas_messages

convert_to_ragas_messages(messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]], metadata: bool = False) -> List[Union[HumanMessage, AIMessage, ToolMessage]]

将 LangChain 消息转换为带有元数据的 Ragas 消息,用于 Agent 评估。

参数

名称 类型 描述 默认值
messages List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]

要转换的 LangChain 消息对象列表。

必需
metadata bool

是否在转换后的消息中包含元数据。

False

返回值

类型 描述
List[Union[HumanMessage, AIMessage, ToolMessage]]

相应的带有元数据的 Ragas 消息对象列表。

引发

类型 描述
ValueError

如果遇到不支持的消息类型。

TypeError

如果消息内容不是字符串。

注意

在转换过程中会跳过 SystemMessages。

源代码位于 src/ragas/integrations/langgraph.py
def convert_to_ragas_messages(
    messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]],
    metadata: bool = False,
) -> List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]:
    """
    Convert LangChain messages into Ragas messages for agent evaluation.

    Parameters
    ----------
    messages : List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]
        List of LangChain message objects to be converted.
    metadata : bool, optional (default=False)
        Whether to include metadata in the converted messages. When True,
        every converted message (human, AI and tool alike) receives all
        instance attributes of the source message except ``content``.

    Returns
    -------
    List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]
        List of corresponding Ragas message objects, in input order.

    Raises
    ------
    ValueError
        If an unsupported message type is encountered.
    TypeError
        If message content is not a string.

    Notes
    -----
    SystemMessages are skipped in the conversion process.
    Tool-call arguments of AI messages are decoded with ``json.loads``; a
    decoding error propagates to the caller.
    """

    def _validate_string_content(message, message_type: str) -> str:
        """Return ``message.content``, rejecting anything that is not a str."""
        if not isinstance(message.content, str):
            raise TypeError(
                f"{message_type} content must be a string, got {type(message.content).__name__}. "
                f"Content: {message.content}"
            )
        return message.content

    def _extract_metadata(message) -> dict:
        """Return every instance attribute of the message except ``content``."""
        return {k: v for k, v in message.__dict__.items() if k != "content"}

    def _extract_tool_calls(message: AIMessage) -> List[r.ToolCall]:
        """Build Ragas tool calls from ``additional_kwargs["tool_calls"]``."""
        tool_calls = message.additional_kwargs.get("tool_calls", [])
        return [
            r.ToolCall(
                name=tool_call["function"]["name"],
                args=json.loads(tool_call["function"]["arguments"]),
            )
            for tool_call in tool_calls
        ]

    def _build(ragas_cls, message, message_type: str, **extra):
        """Create one Ragas message, honouring the outer ``metadata`` flag."""
        fields = {"content": _validate_string_content(message, message_type), **extra}
        # Reads the enclosing function's flag directly so that all message
        # types, AI messages included, are treated the same way.
        if metadata:
            fields["metadata"] = _extract_metadata(message)
        return ragas_cls(**fields)

    # Exact-type dispatch for the message kinds that need no extra fields.
    simple_types = {
        HumanMessage: (r.HumanMessage, "HumanMessage"),
        ToolMessage: (r.ToolMessage, "ToolMessage"),
    }

    def _convert_message(message):
        """Convert one message; return None for messages that are skipped."""
        if isinstance(message, SystemMessage):
            return None  # Skip SystemMessages
        if isinstance(message, AIMessage):
            # Tool calls are only looked up when additional_kwargs is non-empty.
            tool_calls = (
                _extract_tool_calls(message) if message.additional_kwargs else None
            )
            return _build(r.AIMessage, message, "AIMessage", tool_calls=tool_calls)
        target = simple_types.get(type(message))
        if target is None:
            raise ValueError(f"Unsupported message type: {type(message).__name__}")
        ragas_cls, message_type = target
        return _build(ragas_cls, message, message_type)

    return [
        converted
        for message in messages
        if (converted := _convert_message(message)) is not None
    ]