
Create custom single-hop queries from your documents

Load sample documents

I am using the GitLab handbook sample documents here. You can download them by running the command below.

! git clone https://hugging-face.cn/datasets/explodinggradients/Sample_Docs_Markdown
from langchain_community.document_loaders import DirectoryLoader


path = "Sample_Docs_Markdown/"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()

Create the KG

Create a base knowledge graph from the documents.

from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


kg = KnowledgeGraph()
for doc in docs:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": doc.page_content,
                "document_metadata": doc.metadata,
            },
        )
    )

Set up the LLM and embedding model

You can use any models of your choice; here I am using models from OpenAI.

from ragas.llms.base import llm_factory
from ragas.embeddings.base import embedding_factory

llm = llm_factory()
embedding = embedding_factory()
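
The factories default to OpenAI models, so they assume an OpenAI API key is configured. If you want to pin a specific model instead, one option is to wrap LangChain models with ragas' wrapper classes. This is a minimal sketch assuming langchain-openai is installed; the model names are placeholders, not the ones used in this guide:

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Placeholder model names; swap in whatever your provider offers.
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embedding = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))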

Set up transforms

Here we use two extractors and one splitter:

- Headline extractor: extracts headlines from the documents
- Keyphrase extractor: extracts keyphrases from the documents
- Headline splitter: splits the documents into nodes based on their headlines

from ragas.testset.transforms import apply_transforms
from ragas.testset.transforms import (
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
)


headline_extractor = HeadlinesExtractor(llm=llm)
headline_splitter = HeadlineSplitter(min_tokens=300, max_tokens=1000)
keyphrase_extractor = KeyphrasesExtractor(
    llm=llm, property_name="keyphrases", max_num=10
)

transforms = [
    headline_extractor,
    headline_splitter,
    keyphrase_extractor,
]

apply_transforms(kg, transforms=transforms)

Output

Applying KeyphrasesExtractor:   6%| | 2/36 [00:01<00:20,  1Property 'keyphrases' already exists in node '514fdc'. Skipping!
Applying KeyphrasesExtractor:  11%| | 4/36 [00:01<00:10,  2Property 'keyphrases' already exists in node '84a0f6'. Skipping!
Applying KeyphrasesExtractor:  64%|▋| 23/36 [00:03<00:01,  Property 'keyphrases' already exists in node '93f19d'. Skipping!
Applying KeyphrasesExtractor:  72%|▋| 26/36 [00:04<00:00, 1Property 'keyphrases' already exists in node 'a126bf'. Skipping!
Applying KeyphrasesExtractor:  81%|▊| 29/36 [00:04<00:00,  Property 'keyphrases' already exists in node 'c230df'. Skipping!
Applying KeyphrasesExtractor:  89%|▉| 32/36 [00:04<00:00, 1Property 'keyphrases' already exists in node '4f2765'. Skipping!
Property 'keyphrases' already exists in node '4a4777'. Skipping!
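
Before moving on, you can optionally spot-check the transformed graph. The headline splitter produces CHUNK nodes, and the keyphrase extractor attaches a keyphrases property to them; these are exactly the nodes the custom synthesizer will filter on later.

chunk_nodes = [
    node
    for node in kg.nodes
    if node.type.name == "CHUNK" and node.get_property("keyphrases")
]
print(len(chunk_nodes))
print(chunk_nodes[0].get_property("keyphrases"))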

Configure personas

You can also do this automatically using the automatic persona generator; a hedged sketch of that is shown right after the manual definition below.

from ragas.testset.persona import Persona

persona1 = Persona(
    name="gitlab employee",
    role_description="A junior gitlab employee curious about the workings of gitlab",
)
persona2 = Persona(
    name="Hiring manager at gitlab",
    role_description="A hiring manager at gitlab trying to understand hiring policies in gitlab",
)
persona_list = [persona1, persona2]
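
As an alternative to hand-written personas, recent ragas releases include an automatic persona generator that derives personas from the knowledge graph. A hedged sketch, assuming the generate_personas_from_kg helper in ragas.testset.persona; check your installed version for the exact signature:

from ragas.testset.persona import generate_personas_from_kg

# Assumed helper and arguments; adjust to your ragas version.
persona_list = generate_personas_from_kg(kg, llm, num_personas=2)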

Single-hop query

Inherit from SingleHopQuerySynthesizer and modify the function that generates the scenarios used for query creation.

Steps:

- Find the set of qualified nodes for query creation. Here I pick all nodes that have keyphrases extracted.
- For each qualified node:
  - Match its keyphrases with one or more personas.
  - Create all possible combinations of (node, persona, query style, query length).
  - Sample the required number of queries from these combinations.

from ragas.testset.synthesizers.single_hop import (
    SingleHopQuerySynthesizer,
    SingleHopScenario,
)
from dataclasses import dataclass
from ragas.testset.synthesizers.prompts import (
    ThemesPersonasInput,
    ThemesPersonasMatchingPrompt,
)


@dataclass
class MySingleHopScenario(SingleHopQuerySynthesizer):

    theme_persona_matching_prompt = ThemesPersonasMatchingPrompt()

    async def _generate_scenarios(self, n, knowledge_graph, persona_list, callbacks):

        property_name = "keyphrases"
        nodes = []
        for node in knowledge_graph.nodes:
            if node.type.name == "CHUNK" and node.get_property(property_name):
                nodes.append(node)

        number_of_samples_per_node = max(1, n // len(nodes))

        scenarios = []
        for node in nodes:
            if len(scenarios) >= n:
                break
            themes = node.properties.get(property_name, [""])
            prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list)
            persona_concepts = await self.theme_persona_matching_prompt.generate(
                data=prompt_input, llm=self.llm, callbacks=callbacks
            )
            base_scenarios = self.prepare_combinations(
                node,
                themes,
                personas=persona_list,
                persona_concepts=persona_concepts.mapping,
            )
            scenarios.extend(
                self.sample_combinations(base_scenarios, number_of_samples_per_node)
            )

        return scenarios

query = MySingleHopScenario(llm=llm)

scenarios = await query.generate_scenarios(
    n=5, knowledge_graph=kg, persona_list=persona_list
)

scenarios[0]
Output
SingleHopScenario(
nodes=1
term=what is an ally
persona=name='Hiring manager at gitlab' role_description='A hiring manager at gitlab trying to understand hiring policies in gitlab'
style=Web search like queries
length=long)

result = await query.generate_sample(scenario=scenarios[-1])
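
The sample generated with the default prompt can be inspected the same way as at the end of this guide, which makes it easy to compare against the Yes/No samples produced after the prompt override.

print(result.user_input)
print(result.reference)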

Modify the prompt to customize query style

Here I replace the default instruction with one that generates only Yes/No questions and answers. This is an optional step.

instruction = """Generate a Yes/No query and answer based on the specified conditions (persona, term, style, length) 
and the provided context. Ensure the answer is entirely faithful to the context, using only the information 
directly from the provided context.

### Instructions:
1. **Generate a Yes/No Query**: Based on the context, persona, term, style, and length, create a question 
that aligns with the persona's perspective, incorporates the term, and can be answered with 'Yes' or 'No'.
2. **Generate an Answer**: Using only the content from the provided context, provide a 'Yes' or 'No' answer 
to the query. Do not add any information not included in or inferable from the context."""
prompt = query.get_prompts()["generate_query_reference_prompt"]
prompt.instruction = instruction
query.set_prompts(**{"generate_query_reference_prompt": prompt})
result = await query.generate_sample(scenario=scenarios[-1])

result.user_input
Output
'Does the Diversity, Inclusion & Belonging (DIB) Team at GitLab have a structured approach to encourage collaborations among team members through various communication methods?'

result.reference
Output
'Yes'
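
To turn the custom synthesizer into a full test set rather than a single sample, it can be plugged into a query distribution. A hedged sketch, assuming the TestsetGenerator API of recent ragas releases (field names such as embedding_model and persona_list may differ across versions):

from ragas.testset import TestsetGenerator

# Assumed constructor fields; check your ragas version.
generator = TestsetGenerator(
    llm=llm,
    embedding_model=embedding,
    knowledge_graph=kg,
    persona_list=persona_list,
)
# query_distribution pairs each synthesizer with its sampling probability.
testset = generator.generate(
    testset_size=5,
    query_distribution=[(MySingleHopScenario(llm=llm), 1.0)],
)
testset.to_pandas()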