跳转至

结构化输出简单指南

结构化输出的简单指南

在 Colab 中打开

这是使用 LLM 进行结构化输出的简单指南。从高层次上讲,我们可以将 Pydantic 类附加到任何 LLM 应用上,并使输出格式具有原生结构化形式,即使 LLM 用于上游模块。

我们从 LLM 的简单语法开始,然后介绍如何将其插入查询管道,以及如何将其插入查询引擎和代理等更高级别的模块。

结构化输出的许多底层行为均由我们的 Pydantic 程序模块提供支持。查看我们的深入结构化输出指南以了解更多详细信息。

Python
import nest_asyncio

# NOTE(review): presumably lets the top-level `await` cells further down run
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Create one chat model and one embedding model, then assign both onto the
# shared `Settings` object. `llm` is also used directly by later cells.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = llm
Settings.embed_model = embed_model

1. 简单结构化提取

您可以通过 as_structured_llm 接口为任何 LLM 附加一个输出类,从而将其转换为“结构化 LLM”。

这里我们传递一个 Album 类,其中包含歌曲列表。然后我们可以使用常规的 LLM 端点,如 chat/complete。

注意:同步、异步以及流式调用均受支持,示例见下文。

Python
from typing import List
from pydantic import BaseModel, Field


# One track of an album; used as the element type of `Album.songs`.
class Song(BaseModel):
    """Data model for a song."""

    # Track title.
    title: str
    # Track duration, expressed as a whole number of seconds.
    length_seconds: int


# Output schema passed to `as_structured_llm` / `structured_predict` below.
class Album(BaseModel):
    """Data model for an album."""

    # Album title.
    name: str
    # Performing artist (free-form string).
    artist: str
    # Tracks on the album, each a nested `Song` model.
    songs: List[Song]
Python
from llama_index.core.llms import ChatMessage

# Wrap the base LLM so that its chat responses carry a parsed `Album`.
sllm = llm.as_structured_llm(output_cls=Album)
# Single message reused by the sync, async and streaming calls that follow.
input_msg = ChatMessage.from_str("Generate an example album from The Shining")

同步

Python
# Blocking chat call through the structured LLM.
output = sllm.chat([input_msg])
# `raw` holds the parsed Album instance; `output` itself is the chat response.
output_obj = output.raw
Python
# First line prints the response as "assistant: <JSON>", second line prints
# the Pydantic repr of the Album object.
print(str(output))
print(output_obj)
Bash
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 720}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 540}, {"title": "Utrenja (Excerpt)", "length_seconds": 300}, {"title": "The Awakening of Jacob", "length_seconds": 480}, {"title": "De Natura Sonoris No. 2", "length_seconds": 540}, {"title": "Home", "length_seconds": 180}, {"title": "Midnight, the Stars and You", "length_seconds": 180}, {"title": "It's All Forgotten Now", "length_seconds": 150}, {"title": "Masquerade", "length_seconds": 180}]}
name='The Shining: Original Soundtrack' artist='Various Artists' songs=[Song(title='Main Title', length_seconds=180), Song(title='Rocky Mountains', length_seconds=210), Song(title='Lontano', length_seconds=720), Song(title='Music for Strings, Percussion and Celesta', length_seconds=540), Song(title='Utrenja (Excerpt)', length_seconds=300), Song(title='The Awakening of Jacob', length_seconds=480), Song(title='De Natura Sonoris No. 2', length_seconds=540), Song(title='Home', length_seconds=180), Song(title='Midnight, the Stars and You', length_seconds=180), Song(title="It's All Forgotten Now", length_seconds=150), Song(title='Masquerade', length_seconds=180)]

异步

Python
# Async counterpart of `chat`. Top-level `await` only works where an event
# loop is already running (e.g. a notebook cell), not in a plain script.
output = await sllm.achat([input_msg])
# `raw` holds the parsed Album instance.
output_obj = output.raw
print(str(output))
Bash
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title (The Shining)", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 240}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 300}, {"title": "Utrenja (Excerpt)", "length_seconds": 180}, {"title": "The Awakening of Jacob", "length_seconds": 150}, {"title": "De Natura Sonoris No. 2", "length_seconds": 270}, {"title": "Home", "length_seconds": 200}, {"title": "Heartbeats and Worry", "length_seconds": 160}, {"title": "The Overlook", "length_seconds": 220}]}

流式

Python
from IPython.display import clear_output
from pprint import pprint

# Stream the structured response; each item yielded by the stream exposes a
# Pydantic object through `.raw`.
stream_output = sllm.stream_chat([input_msg])
for partial_output in stream_output:
    # Clear the previous render so the cell shows a single, updating object.
    clear_output(wait=True)
    pprint(partial_output.raw.dict())

# After the loop, `partial_output` is the last (complete) streamed response.
output_obj = partial_output.raw
# Print the streamed response itself. The previous code printed `output`,
# which is the stale result of the earlier async cell.
print(str(partial_output))
Bash
{'artist': 'Various Artists',
 'name': 'The Shining: Original Soundtrack',
 'songs': [{'length_seconds': 180, 'title': 'Main Title'},
           {'length_seconds': 210, 'title': 'Rocky Mountains'},
           {'length_seconds': 240, 'title': 'Lontano'},
           {'length_seconds': 540,
            'title': 'Music for Strings, Percussion and Celesta'},
           {'length_seconds': 300, 'title': 'Utrenja (Excerpt)'},
           {'length_seconds': 360, 'title': 'The Awakening of Jacob'},
           {'length_seconds': 420, 'title': 'De Natura Sonoris No. 2'},
           {'length_seconds': 180, 'title': 'Home'},
           {'length_seconds': 180, 'title': 'Midnight, the Stars and You'},
           {'length_seconds': 150, 'title': "It's All Forgotten Now"},
           {'length_seconds': 120, 'title': 'Masquerade'}]}
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title (The Shining)", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 240}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 300}, {"title": "Utrenja (Excerpt)", "length_seconds": 180}, {"title": "The Awakening of Jacob", "length_seconds": 150}, {"title": "De Natura Sonoris No. 2", "length_seconds": 270}, {"title": "Home", "length_seconds": 200}, {"title": "Heartbeats and Worry", "length_seconds": 160}, {"title": "The Overlook", "length_seconds": 220}]}

异步流式

Python
from IPython.display import clear_output
from pprint import pprint

# Async streaming: awaiting `astream_chat` returns an async iterator, which
# is then consumed with `async for`. Requires a running event loop.
stream_output = await sllm.astream_chat([input_msg])
async for partial_output in stream_output:
    # Clear the previous render before printing the newer partial object.
    clear_output(wait=True)
    pprint(partial_output.raw.dict())
Bash
{'artist': 'Various Artists',
 'name': 'The Shining: Original Soundtrack',
 'songs': [{'length_seconds': 180, 'title': 'Main Title'},
           {'length_seconds': 210, 'title': 'Rocky Mountains'},
           {'length_seconds': 720, 'title': 'Lontano'},
           {'length_seconds': 540,
            'title': 'Music for Strings, Percussion and Celesta'},
           {'length_seconds': 300, 'title': 'Utrenja (Excerpt)'},
           {'length_seconds': 480, 'title': 'The Awakening of Jacob'},
           {'length_seconds': 540, 'title': 'De Natura Sonoris No. 2'},
           {'length_seconds': 180, 'title': 'Home'},
           {'length_seconds': 180, 'title': 'Midnight, the Stars and You'},
           {'length_seconds': 180, 'title': "It's All Forgotten Now"},
           {'length_seconds': 180, 'title': 'Masquerade'}]}

1.b 使用查询管道的示例

您可以在查询管道中插入结构化 LLM - 输出将直接是结构化对象。

Python
# use query pipelines
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.query_pipeline import QueryPipeline as QP
from llama_index.core.llms import ChatMessage

# Prompt consisting of one user message; `{movie_name}` is supplied at run time.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Generate an example album from {movie_name}", role="user"
        )
    ]
)

# Chain prompt -> structured LLM. `sllm` here is the Album-structured LLM
# created earlier, so the pipeline result is an `Album` instance.
qp = QP(chain=[chat_prompt_tmpl, sllm])
response = qp.run(movie_name="Inside Out")
# Bare expression: displayed as the cell result in a notebook.
response
Bash
Album(name='Inside Out Soundtrack', artist='Various Artists', songs=[Song(title='Bundle of Joy', length_seconds=150), Song(title='Team Building', length_seconds=120), Song(title='Nomanisone Island/National Movers', length_seconds=180), Song(title='Overcoming Sadness', length_seconds=210), Song(title='Free Skating', length_seconds=160), Song(title='First Day of School', length_seconds=140), Song(title='Riled Up', length_seconds=130), Song(title='Goofball No Longer', length_seconds=170), Song(title='Memory Lanes', length_seconds=200), Song(title='The Forgetters', length_seconds=110)])

1.c 使用structured_predict函数

无需明确执行llm.as_structured_llm(...),每个 LLM 类都有一个structured_predict函数,允许您更轻松地使用提示模板 + 模板变量调用 LLM,以在一行代码中返回结构化输出。

Python
# use structured_predict directly (no query pipeline or as_structured_llm needed)
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI

# Prompt consisting of one user message; `{movie_name}` is supplied at call time.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Generate an example album from {movie_name}", role="user"
        )
    ]
)

# Rebinds the module-level `llm` to a new OpenAI instance with the same model
# name as the setup cell.
llm = OpenAI(model="gpt-4o")
# Output class, prompt template, then template variables as keyword arguments.
album = llm.structured_predict(
    Album, chat_prompt_tmpl, movie_name="Lord of the Rings"
)
# Bare expression: displayed as the cell result in a notebook.
album

2. 插入 RAG 管道

您还可以将其插入 RAG 管道。下面我们展示了 Apple 10K 报告中的结构化提取。

Bash
mkdir data
wget "https://s2.q4cdn.com/470004039/files/doc_financials/2021/q4/_10-K-2021-(As-Filed).pdf" -O data/apple_2021_10k.pdf

选项 1:使用 LlamaParse

您需要一个https://cloud.llamaindex.ai/帐户和一个 API 密钥才能使用 LlamaParse(我们的 10K 文件文档解析器)。

Python
from llama_parse import LlamaParse

# Uncommenting the next line also requires `import os`, which this cell does
# not import.
# os.environ["LLAMA_CLOUD_API_KEY"] = "llx-..."
# Parse the PDF downloaded by the shell step above, requesting plain-text results.
orig_docs = LlamaParse(result_type="text").load_data(
    "./data/apple_2021_10k.pdf"
)

开始解析job_id为cac11eca-7e00-452f-93f6-19c861b4c130下的文件

Python
from copy import deepcopy
from llama_index.core.schema import TextNode


def get_page_nodes(docs, separator="\n---\n"):
    """Break every document into one node per separator-delimited chunk.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.
        separator: Delimiter string on which each document's text is split.

    Returns:
        A flat list of ``TextNode`` objects, ordered by document and then by
        position within the document. Every node gets its own deep copy of
        the source document's metadata, so nodes never share a metadata dict.
    """
    return [
        TextNode(text=page_text, metadata=deepcopy(source.metadata))
        for source in docs
        for page_text in source.text.split(separator)
    ]


# Build one node per separator-delimited chunk of the parsed documents, then
# preview the content of the first node.
docs = get_page_nodes(orig_docs)
print(docs[0].get_content())
Python
UNITED STATES
                                    SECURITIES AND EXCHANGE COMMISSION
                                                          Washington, D.C. 20549

                                                               FORM 10-K
(Mark One)
         ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
                                                  For the fiscal year ended September 25, 2021
                                                                            or
      TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
                                                For the transition period from               to          .
                                                       Commission File Number: 001-36743

                                                               Apple Inc.
                                              (Exact name of Registrant as specified in its charter)

                            California                                                                          94-2404110
                     (State or other jurisdiction                                                    (I.R.S. Employer Identification No.)
                  of incorporation or organization)

                     One Apple Park Way
                     Cupertino, California                                                                         95014
              (Address of principal executive offices)                                                            (Zip Code)
                                                                    (408) 996-1010
                                                 (Registrants telephone number, including area code)

                                               Securities registered pursuant to Section 12(b) of the Act:

                                                                         Trading
                         Title of each class                            symbol(s)               Name of each exchange on which registered
      Common Stock, $0.00001 par value per share                         AAPL                       The Nasdaq Stock Market LLC
                    1.000% Notes due 2022                                                          The Nasdaq Stock Market LLC
                    1.375% Notes due 2024                                                          The Nasdaq Stock Market LLC
                    0.000% Notes due 2025                                                          The Nasdaq Stock Market LLC
                    0.875% Notes due 2025                                                          The Nasdaq Stock Market LLC
                    1.625% Notes due 2026                                                          The Nasdaq Stock Market LLC
                    2.000% Notes due 2027                                                          The Nasdaq Stock Market LLC
                    1.375% Notes due 2029                                                          The Nasdaq Stock Market LLC
                    3.050% Notes due 2029                                                          The Nasdaq Stock Market LLC
                    0.500% Notes due 2031                                                          The Nasdaq Stock Market LLC
                    3.600% Notes due 2042                                                          The Nasdaq Stock Market LLC

                                           Securities registered pursuant to Section 12(g) of the Act: None

Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
                                                                    Yes       No 
Indicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.
                                                                   Yes       No 

选项 2:使用 SimpleDirectoryReader

您还可以选择使用我们的捆绑免费 PDF 解析器 SimpleDirectoryReader

Python
# # OPTION 2: Use SimpleDirectoryReader
# # NOTE: the PDF was downloaded to ./data/ above, so the path must include it.
# from llama_index.core import SimpleDirectoryReader

# reader = SimpleDirectoryReader(input_files=["./data/apple_2021_10k.pdf"])
# docs = reader.load_data()

构建 RAG 管道,定义结构化输出模式

我们利用可靠的 VectorStoreIndex 和 reranker 模块构建 RAG 管道。然后,我们将输出定义为 Pydantic 模型。这使我们能够创建带有输出类的结构化 LLM。

Python
from llama_index.core import VectorStoreIndex

# skip chunking since we're doing page-level chunking
# (`docs` already holds the per-page nodes built by `get_page_nodes`).
index = VectorStoreIndex(docs)

from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

# NOTE(review): top_n=5 presumably caps the reranked result at five nodes,
# matching the `similarity_top_k=5` used by the query engine — confirm.
reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)

from pydantic import BaseModel, Field
from typing import List


# Structured answer schema for the RAG query engine. All four fields are
# required (`...` as the default), each with a description string.
class Output(BaseModel):
    """Output containing the response, page numbers, and confidence."""

    # Natural-language answer text.
    response: str = Field(..., description="The answer to the question.")
    # Source page numbers the model reports having used.
    page_numbers: List[int] = Field(
        ...,
        description="The page numbers of the sources used to answer this question. Do not include a page number if the context is irrelevant.",
    )
    # Self-reported confidence; the 0-1 range is stated only in the
    # description text and is not enforced by a validator here.
    confidence: float = Field(
        ...,
        description="Confidence value between 0-1 of the correctness of the result.",
    )
    # Free-text justification accompanying `confidence`.
    confidence_explanation: str = Field(
        ..., description="Explanation for the confidence score"
    )


# Rebinds `sllm` (previously the Album-structured LLM) to one that produces
# `Output` objects.
sllm = llm.as_structured_llm(output_cls=Output)

运行查询

Python
# Query engine over the page-level index: retrieve with similarity_top_k=5,
# pass results through the reranker, and synthesize with the structured LLM.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    node_postprocessors=[reranker],
    llm=sllm,
    response_mode="tree_summarize",  # you can also select other modes like `compact`, `refine`
)

response = query_engine.query("Net sales for each product category in 2021")
# Printing the response shows the structured answer serialized as JSON.
print(str(response))

Bash
{"response": "In 2021, the net sales for each product category were as follows: iPhone: $191,973 million, Mac: $35,190 million, iPad: $31,862 million, Wearables, Home and Accessories: $38,367 million, and Services: $68,425 million.", "page_numbers": [21], "confidence": 1.0, "confidence_explanation": "The figures are directly taken from the provided data, ensuring high accuracy."}
Python
# `response.response` is the `Output` instance here; dump it to a plain dict.
response.response.dict()

Bash
{'response': 'In 2021, the net sales for each product category were as follows: iPhone: $191,973 million, Mac: $35,190 million, iPad: $31,862 million, Wearables, Home and Accessories: $38,367 million, and Services: $68,425 million.',
 'page_numbers': [21],
 'confidence': 1.0,
 'confidence_explanation': 'The figures are directly taken from the provided data, ensuring high accuracy.'}