结构化输出简单指南
结构化输出的简单指南
在 Colab 中打开
这是一份使用 LLM 生成结构化输出的简单指南。从高层次上讲,我们可以将 Pydantic 类附加到任何 LLM 上,使其输出天然具备结构化格式,即使该 LLM 是在上游模块中被调用的。
我们从 LLM 的简单语法开始,然后介绍如何将其插入查询管道,以及如何将其插入查询引擎和代理等更高级别的模块。
结构化输出的许多底层行为均由我们的 Pydantic 程序(Pydantic Program)模块提供支持。如需了解更多细节,请查看我们更深入的结构化输出指南。
import nest_asyncio
# Called once up front, before any of the async examples further down.
# NOTE(review): presumably needed so nested event loops work inside a notebook -- confirm.
nest_asyncio.apply()
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Model handles used throughout this guide.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Register both on the global Settings object.
# NOTE(review): presumably these become the defaults used by the index / query engine below -- confirm.
Settings.llm = llm
Settings.embed_model = embed_model
1. 简单结构化提取
只需调用 as_structured_llm 接口并传入输出类,即可将任何 LLM 转换为“结构化 LLM”。
这里我们传入一个包含歌曲列表的 Album 类,然后就可以使用常规的 LLM 接口,如 chat/complete。
注意:同步、异步以及流式调用均受支持,示例见下文。
from typing import List
from pydantic import BaseModel, Field
class Song(BaseModel):
    """Data model for a song."""

    # Title of the song.
    title: str
    # Length of the song, in seconds.
    length_seconds: int
class Album(BaseModel):
    """Data model for an album."""

    # Name of the album.
    name: str
    # Artist credited for the album.
    artist: str
    # Songs on the album; each entry is a `Song`.
    songs: List[Song]
from llama_index.core.llms import ChatMessage
# Attach the `Album` output class to the LLM to obtain a structured LLM.
sllm = llm.as_structured_llm(output_cls=Album)
# Prompt message reused by the chat examples that follow.
input_msg = ChatMessage.from_str("Generate an example album from The Shining")
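同步
output = sllm.chat([input_msg])
# get actual object
output_obj = output.raw
print(str(output))
print(output_obj)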
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 720}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 540}, {"title": "Utrenja (Excerpt)", "length_seconds": 300}, {"title": "The Awakening of Jacob", "length_seconds": 480}, {"title": "De Natura Sonoris No. 2", "length_seconds": 540}, {"title": "Home", "length_seconds": 180}, {"title": "Midnight, the Stars and You", "length_seconds": 180}, {"title": "It's All Forgotten Now", "length_seconds": 150}, {"title": "Masquerade", "length_seconds": 180}]}
name='The Shining: Original Soundtrack' artist='Various Artists' songs=[Song(title='Main Title', length_seconds=180), Song(title='Rocky Mountains', length_seconds=210), Song(title='Lontano', length_seconds=720), Song(title='Music for Strings, Percussion and Celesta', length_seconds=540), Song(title='Utrenja (Excerpt)', length_seconds=300), Song(title='The Awakening of Jacob', length_seconds=480), Song(title='De Natura Sonoris No. 2', length_seconds=540), Song(title='Home', length_seconds=180), Song(title='Midnight, the Stars and You', length_seconds=180), Song(title="It's All Forgotten Now", length_seconds=150), Song(title='Masquerade', length_seconds=180)]
异步
# Top-level `await`: assumes an environment that allows it (e.g. a notebook cell).
output = await sllm.achat([input_msg])
# get actual object
output_obj = output.raw
# Prints the response as text (role prefix followed by the JSON payload).
print(str(output))
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title (The Shining)", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 240}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 300}, {"title": "Utrenja (Excerpt)", "length_seconds": 180}, {"title": "The Awakening of Jacob", "length_seconds": 150}, {"title": "De Natura Sonoris No. 2", "length_seconds": 270}, {"title": "Home", "length_seconds": 200}, {"title": "Heartbeats and Worry", "length_seconds": 160}, {"title": "The Overlook", "length_seconds": 220}]}
流式
from IPython.display import clear_output
from pprint import pprint
# Stream partial results; every item carries the object parsed so far in `.raw`.
stream_output = sllm.stream_chat([input_msg])
for partial_output in stream_output:
    # Redraw in place so only the latest partial object is shown.
    clear_output(wait=True)
    pprint(partial_output.raw.dict())

# After the loop, `partial_output` is the last (i.e. complete) streamed response.
output_obj = partial_output.raw
# Print the streamed response itself, not the stale `output` left over from
# the earlier async example.
print(str(partial_output))
{'artist': 'Various Artists',
'name': 'The Shining: Original Soundtrack',
'songs': [{'length_seconds': 180, 'title': 'Main Title'},
{'length_seconds': 210, 'title': 'Rocky Mountains'},
{'length_seconds': 240, 'title': 'Lontano'},
{'length_seconds': 540,
'title': 'Music for Strings, Percussion and Celesta'},
{'length_seconds': 300, 'title': 'Utrenja (Excerpt)'},
{'length_seconds': 360, 'title': 'The Awakening of Jacob'},
{'length_seconds': 420, 'title': 'De Natura Sonoris No. 2'},
{'length_seconds': 180, 'title': 'Home'},
{'length_seconds': 180, 'title': 'Midnight, the Stars and You'},
{'length_seconds': 150, 'title': "It's All Forgotten Now"},
{'length_seconds': 120, 'title': 'Masquerade'}]}
assistant: {"name": "The Shining: Original Soundtrack", "artist": "Various Artists", "songs": [{"title": "Main Title (The Shining)", "length_seconds": 180}, {"title": "Rocky Mountains", "length_seconds": 210}, {"title": "Lontano", "length_seconds": 240}, {"title": "Music for Strings, Percussion and Celesta", "length_seconds": 300}, {"title": "Utrenja (Excerpt)", "length_seconds": 180}, {"title": "The Awakening of Jacob", "length_seconds": 150}, {"title": "De Natura Sonoris No. 2", "length_seconds": 270}, {"title": "Home", "length_seconds": 200}, {"title": "Heartbeats and Worry", "length_seconds": 160}, {"title": "The Overlook", "length_seconds": 220}]}
异步流式
from IPython.display import clear_output
from pprint import pprint
stream_output = await sllm.astream_chat([input_msg])
async for partial_output in stream_output:
clear_output(wait=True)
pprint(partial_output.raw.dict())
{'artist': 'Various Artists',
'name': 'The Shining: Original Soundtrack',
'songs': [{'length_seconds': 180, 'title': 'Main Title'},
{'length_seconds': 210, 'title': 'Rocky Mountains'},
{'length_seconds': 720, 'title': 'Lontano'},
{'length_seconds': 540,
'title': 'Music for Strings, Percussion and Celesta'},
{'length_seconds': 300, 'title': 'Utrenja (Excerpt)'},
{'length_seconds': 480, 'title': 'The Awakening of Jacob'},
{'length_seconds': 540, 'title': 'De Natura Sonoris No. 2'},
{'length_seconds': 180, 'title': 'Home'},
{'length_seconds': 180, 'title': 'Midnight, the Stars and You'},
{'length_seconds': 180, 'title': "It's All Forgotten Now"},
{'length_seconds': 180, 'title': 'Masquerade'}]}
1.b 使用查询管道的示例
您可以在查询管道中插入结构化 LLM - 输出将直接是结构化对象。
# use query pipelines
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.query_pipeline import QueryPipeline as QP
from llama_index.core.llms import ChatMessage
# Prompt template with a single user message; `{movie_name}` is supplied at run time.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Generate an example album from {movie_name}", role="user"
        )
    ]
)
# Chain the prompt template into the structured LLM.
qp = QP(chain=[chat_prompt_tmpl, sllm])
response = qp.run(movie_name="Inside Out")
# Bare expression: displays the result when run as a notebook cell.
response
Album(name='Inside Out Soundtrack', artist='Various Artists', songs=[Song(title='Bundle of Joy', length_seconds=150), Song(title='Team Building', length_seconds=120), Song(title='Nomanisone Island/National Movers', length_seconds=180), Song(title='Overcoming Sadness', length_seconds=210), Song(title='Free Skating', length_seconds=160), Song(title='First Day of School', length_seconds=140), Song(title='Riled Up', length_seconds=130), Song(title='Goofball No Longer', length_seconds=170), Song(title='Memory Lanes', length_seconds=200), Song(title='The Forgetters', length_seconds=110)])
1.c 使用structured_predict函数
无需明确执行llm.as_structured_llm(...),每个 LLM 类都有一个structured_predict函数,允许您更轻松地使用提示模板 + 模板变量调用 LLM,以在一行代码中返回结构化输出。
# use structured_predict
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI
# Prompt template with a single user message; `{movie_name}` is supplied at call time.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Generate an example album from {movie_name}", role="user"
        )
    ]
)
llm = OpenAI(model="gpt-4o")
# One call: output class, prompt template, then the template variables as keyword arguments.
album = llm.structured_predict(
    Album, chat_prompt_tmpl, movie_name="Lord of the Rings"
)
# Bare expression: displays the result when run as a notebook cell.
album
2. 插入 RAG 管道
您还可以将其插入 RAG 管道。下面我们展示了 Apple 10K 报告中的结构化提取。
# Create the data directory; -p keeps this step safe to re-run when it already exists.
mkdir -p data
# Download the Apple 2021 10-K filing into the data directory.
wget "https://s2.q4cdn.com/470004039/files/doc_financials/2021/q4/_10-K-2021-(As-Filed).pdf" -O data/apple_2021_10k.pdf
选项 1:使用 LlamaParse
您需要一个https://cloud.llamaindex.ai/帐户和一个 API 密钥才能使用 LlamaParse(我们的 10K 文件文档解析器)。
from llama_parse import LlamaParse
# LlamaParse requires an API key; set it before running this cell, e.g.:
# os.environ["LLAMA_CLOUD_API_KEY"] = "llx-..."
# Parse the downloaded 10-K PDF, requesting plain-text results.
orig_docs = LlamaParse(result_type="text").load_data(
    "./data/apple_2021_10k.pdf"
)
开始解析job_id为cac11eca-7e00-452f-93f6-19c861b4c130下的文件
from copy import deepcopy
from llama_index.core.schema import TextNode
def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page nodes, by separator.

    Args:
        docs: Iterable of documents; each must expose ``text`` and ``metadata``.
        separator: String marking a page boundary inside ``doc.text``.

    Returns:
        A flat list of ``TextNode`` objects, one per chunk, in document order.
    """
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                # Deep-copy so the nodes never share one metadata object.
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)
    return nodes
# Despite its name, `docs` now holds the page-level nodes returned by get_page_nodes.
docs = get_page_nodes(orig_docs)
# Show the text of the first page as a sanity check.
print(docs[0].get_content())
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended September 25, 2021
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from to .
Commission File Number: 001-36743
Apple Inc.
(Exact name of Registrant as specified in its charter)
California 94-2404110
(State or other jurisdiction (I.R.S. Employer Identification No.)
of incorporation or organization)
One Apple Park Way
Cupertino, California 95014
(Address of principal executive offices) (Zip Code)
(408) 996-1010
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Trading
Title of each class symbol(s) Name of each exchange on which registered
Common Stock, $0.00001 par value per share AAPL The Nasdaq Stock Market LLC
1.000% Notes due 2022 — The Nasdaq Stock Market LLC
1.375% Notes due 2024 — The Nasdaq Stock Market LLC
0.000% Notes due 2025 — The Nasdaq Stock Market LLC
0.875% Notes due 2025 — The Nasdaq Stock Market LLC
1.625% Notes due 2026 — The Nasdaq Stock Market LLC
2.000% Notes due 2027 — The Nasdaq Stock Market LLC
1.375% Notes due 2029 — The Nasdaq Stock Market LLC
3.050% Notes due 2029 — The Nasdaq Stock Market LLC
0.500% Notes due 2031 — The Nasdaq Stock Market LLC
3.600% Notes due 2042 — The Nasdaq Stock Market LLC
Securities registered pursuant to Section 12(g) of the Act: None
Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
Yes ☒ No ☐
Indicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.
Yes ☐ No ☒
选项 2:使用 SimpleDirectoryReader
您也可以选择使用我们内置的免费 PDF 解析器 SimpleDirectoryReader。
# # OPTION 2: Use SimpleDirectoryReader
# from llama_index.core import SimpleDirectoryReader
# reader = SimpleDirectoryReader(input_files=["./data/apple_2021_10k.pdf"])
# docs = reader.load_data()
构建 RAG 管道,定义结构化输出模式
我们利用可靠的 VectorStoreIndex 和 reranker 模块构建 RAG 管道。然后,我们将输出定义为 Pydantic 模型,这样就可以创建带有输出类的结构化 LLM。
from llama_index.core import VectorStoreIndex
# skip chunking since we're doing page-level chunking
# (`docs` is passed straight to the index constructor as-is)
index = VectorStoreIndex(docs)
from llama_index.postprocessor.flag_embedding_reranker import (
FlagEmbeddingReranker,
)
# Reranker passed to the query engine below as a node postprocessor.
# NOTE(review): `top_n=5` presumably caps how many nodes survive reranking -- confirm.
reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)
from pydantic import BaseModel, Field
from typing import List
class Output(BaseModel):
    """Output containing the response, page numbers, and confidence."""

    # Every field is required (`...`); each description is attached to the
    # field via `Field`.
    response: str = Field(..., description="The answer to the question.")
    page_numbers: List[int] = Field(
        ...,
        description="The page numbers of the sources used to answer this question. Do not include a page number if the context is irrelevant.",
    )
    confidence: float = Field(
        ...,
        description="Confidence value between 0-1 of the correctness of the result.",
    )
    confidence_explanation: str = Field(
        ..., description="Explanation for the confidence score"
    )
# Rebind `sllm` to a structured LLM that uses the `Output` class.
sllm = llm.as_structured_llm(output_cls=Output)
运行查询
# Build a query engine over the index, using the reranker as a node
# postprocessor and the structured LLM as the LLM.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    node_postprocessors=[reranker],
    llm=sllm,
    response_mode="tree_summarize",  # you can also select other modes like `compact`, `refine`
)
response = query_engine.query("Net sales for each product category in 2021")
# Prints the response as a JSON string matching the `Output` fields.
print(str(response))
{"response": "In 2021, the net sales for each product category were as follows: iPhone: $191,973 million, Mac: $35,190 million, iPad: $31,862 million, Wearables, Home and Accessories: $38,367 million, and Services: $68,425 million.", "page_numbers": [21], "confidence": 1.0, "confidence_explanation": "The figures are directly taken from the provided data, ensuring high accuracy."}
{'response': 'In 2021, the net sales for each product category were as follows: iPhone: $191,973 million, Mac: $35,190 million, iPad: $31,862 million, Wearables, Home and Accessories: $38,367 million, and Services: $68,425 million.',
'page_numbers': [21],
'confidence': 1.0,
'confidence_explanation': 'The figures are directly taken from the provided data, ensuring high accuracy.'}