Author: CSDN Blog
Quickly Building DeepResearch, a New-Generation Retrieval-Augmented Research Tool, with LangGraph
Project Overview
This project is an autonomous research agent built on the LangGraph framework that automatically generates an in-depth research report from a user query. A three-stage pipeline architecture covers the complete loop from question understanding to report generation.
Project Structure
langgpraph_deepresearch/
├── .env              # environment variable configuration
├── langgraph.json    # LangGraph configuration file
├── requirements.txt  # project dependencies
└── graph.py          # core graph implementation
Environment Configuration
Dependency List (requirements.txt)
langgraph
langchain-core
langchain-deepseek
python-dotenv
langsmith
pydantic
matplotlib
seaborn
pandas
IPython
langchain-mcp-adapters
uv
Environment Variables (.env)
DEEPSEEK_API_KEY='****'
TAVILY_API_KEY='tvly-dev-******'
LANGSMITH_API_KEY='lsv2_pt_**********'
LANGSMITH_TRACING=true
LANGSMITH_PROJECT='langgpraph_deepresearch'
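All three services are required at runtime, so it can be worth failing fast when a key is missing. The snippet below is a hypothetical startup check, not part of the original project; the variable names simply match the .env file above.

import os
from dotenv import load_dotenv

load_dotenv()

# Hypothetical fail-fast check for required keys (not in the original project).
REQUIRED_KEYS = ["DEEPSEEK_API_KEY", "TAVILY_API_KEY", "LANGSMITH_API_KEY"]
missing = [k for k in REQUIRED_KEYS if not os.getenv(k)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")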
LangGraph Configuration (langgraph.json)
{
  "dependencies": ["./"],
  "graphs": {
    "langgpraph_deepresearch": "./graph.py:graph"
  },
  "env": ".env"
}
Core Code Architecture
1. Data Model Definitions
class WebSearchItem(BaseModel):
    query: str
    reason: str

class WebSearchPlan(BaseModel):
    searches: List[WebSearchItem]

class ReportData(BaseModel):
    short_summary: str
    markdown_report: str
    follow_up_questions: List[str]
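These models are what `with_structured_output` and the planner's fallback logic validate against. A small standalone illustration of how the validation behaves (the sample dicts are made up for demonstration):

from typing import List
from pydantic import BaseModel, ValidationError

class WebSearchItem(BaseModel):
    query: str
    reason: str

class WebSearchPlan(BaseModel):
    searches: List[WebSearchItem]

# A well-formed planner output validates into typed objects...
ok = WebSearchPlan.model_validate(
    {"searches": [{"query": "AI trends 2024", "reason": "broad overview"}]}
)
print(ok.searches[0].query)  # -> "AI trends 2024"

# ...while a malformed one (plain strings instead of objects) raises ValidationError,
# which is exactly the case handled later by planner_node's fallback branch.
try:
    WebSearchPlan.model_validate({"searches": ["AI trends 2024"]})
except ValidationError as e:
    print("invalid plan:", e.error_count(), "error(s)")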
2. Planner
PLANNER_INSTRUCTIONS = (
    "You are a helpful research assistant. Given a query, come up with 5-7 web searches "
    "to perform to best answer the query. "
    "Return **ONLY valid JSON** that follows this schema: "
    '{{"searches": [ {{"query": "example", "reason": "why"}} ]}}'
)

planner_prompt = ChatPromptTemplate.from_messages(
    [("system", PLANNER_INSTRUCTIONS), ("human", "{query}")]
)

planner_chain = (
    planner_prompt
    | model.with_structured_output(WebSearchPlan, method="json_mode")  # force JSON output
)
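Because `with_structured_output(..., method="json_mode")` is used, a successful invocation returns a `WebSearchPlan` instance; planner_node's fallback (shown later) covers the case where the model's reply only parses as a raw dict. A hedged standalone check, assuming the planner chain above is defined and DEEPSEEK_API_KEY is set (the query string is just an example):

plan = planner_chain.invoke({"query": "分析2024年人工智能发展趋势"})
# Under json_mode this is usually a WebSearchPlan; list each planned search and its rationale.
for item in plan.searches:
    print(f"- {item.query}  ({item.reason})")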
3. Search Agent
SEARCH_INSTRUCTIONS = (
    "You are a research assistant. Given a search term, you search the web for that term and "
    "produce a concise summary of the results. The summary must be 2-3 paragraphs and less than 300 "
    "words. Capture the main points. Write succinctly; no need to have complete sentences or good "
    "grammar. This will be consumed by someone synthesizing a report, so it's vital you capture the "
    "essence and ignore any fluff. Do not include any additional commentary other than the summary "
    "itself."
)

search_tool = TavilySearch(max_results=5, topic="general")

search_agent = create_react_agent(
    model=model,
    prompt=SEARCH_INSTRUCTIONS,
    tools=[search_tool],
)
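The ReAct agent returned by create_react_agent is itself a small graph: given a messages list, it decides when to call the Tavily tool and replies with a final AIMessage containing the summary. A minimal standalone run, assuming the agent above is defined and TAVILY_API_KEY is configured (the search term is illustrative):

from langchain_core.messages import HumanMessage

run = search_agent.invoke(
    {"messages": [HumanMessage(content="LangGraph multi-agent framework")]}
)
# The last message is the agent's final summary of the search results.
print(run["messages"][-1].content)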
4. Writer
WRITER_PROMPT = (
    "You are a senior researcher tasked with writing a cohesive report for a research query. "
    "You will be provided with the original query and some initial research.\n\n"
    "① 先给出完整的大纲。\n"
    "② 然后生成正式报告。\n"
    "**写作要求**:\n"
    "· 报告使用 Markdown 格式;\n"
    "· 章节清晰,层次分明;\n"
    "· markdown_report部分至少包含2000中文字符(注意需要用中文进行回复);\n"
    "· 内容丰富,论据充分,可加入引用和数据,允许分段、添加引用、表格等;\n"
    "· 最终仅返回 JSON:\n"
    '{{"short_summary": "...", "markdown_report": "...", "follow_up_questions": ["..."]}}'
)

writer_prompt = ChatPromptTemplate.from_messages(
    [("system", WRITER_PROMPT), ("human", "{content}")]
)
writer_chain = writer_prompt | model.with_structured_output(ReportData, method="json_mode")
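writer_chain takes a single `content` variable (the human slot of the prompt template) and returns a ReportData object. A small hedged check, assuming the chain above is defined; the toy input below is fabricated purely for illustration:

toy_input = (
    "原始问题:什么是LangGraph?\n\n"
    "搜索摘要:\nLangGraph is a framework for building stateful, multi-actor LLM applications."
)
report = writer_chain.invoke({"content": toy_input})
print(report.short_summary)
print(report.follow_up_questions)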
LangGraph Node Implementations
1. Planner Node (planner_node)
def planner_node(state: MessagesState) -> Command:
    user_query = state["messages"][-1].content
    raw = planner_chain.invoke({"query": user_query})
    print(raw)
    try:
        plan = WebSearchPlan.model_validate(raw)
    except ValidationError:
        if isinstance(raw, dict) and isinstance(raw.get("searches"), list):
            plan = WebSearchPlan(
                searches=[WebSearchItem(query=q, reason="") for q in raw["searches"]]
            )
        else:
            raise
    return Command(
        goto="search_node",
        update={"messages": [AIMessage(content=plan.model_dump_json())], "plan": plan},
    )
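The plan crosses the node boundary as a JSON string inside an AIMessage; a tiny round-trip check of that contract, reusing the models from section 1 (the sample data is illustrative):

plan = WebSearchPlan(searches=[WebSearchItem(query="AI trends 2024", reason="broad overview")])
payload = plan.model_dump_json()                       # what planner_node stores in the AIMessage
restored = WebSearchPlan.model_validate_json(payload)  # what search_node parses back out
assert restored == plan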
2. Search Node (search_node)
def search_node(state: MessagesState) -> Command:
    plan_json = state["messages"][-1].content
    plan = WebSearchPlan.model_validate_json(plan_json)
    summaries = []
    for item in plan.searches:  # serial processing
        run = search_agent.invoke({"messages": [HumanMessage(content=item.query)]})
        msgs = run["messages"]
        readable = next(
            (m for m in reversed(msgs) if isinstance(m, (ToolMessage, AIMessage))),
            msgs[-1],
        )
        summaries.append(f"## {item.query}\n\n{readable.content}")

    combined = "\n\n".join(summaries)
    return Command(goto="writer_node", update={"messages": [AIMessage(content=combined)]})
3. Writer Node (writer_node)
def writer_node(state: MessagesState) -> Command:
    original_query = state["messages"][0].content
    combined_summary = state["messages"][-1].content

    writer_input = (
        f"原始问题:{original_query}\n\n"
        f"搜索摘要:\n{combined_summary}"
    )

    report = writer_chain.invoke({"content": writer_input})
    return Command(
        goto=END,
        update={
            "messages": [
                AIMessage(content=json.dumps(report.model_dump(), ensure_ascii=False, indent=4))
            ]
        },
    )
Graph Construction and Execution
builder = StateGraph(MessagesState)
builder.add_node("planner_node", planner_node)
builder.add_node("search_node", search_node)
builder.add_node("writer_node", writer_node)

builder.add_edge(START, "planner_node")
builder.add_edge("planner_node", "search_node")
builder.add_edge("search_node", "writer_node")
builder.add_edge("writer_node", END)

graph = builder.compile()
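Since IPython is in the requirements, the compiled graph can also be rendered as a quick sanity check of the planner → search → writer topology. This is optional and not part of the original file:

# Inspect the topology of the compiled graph as a Mermaid diagram.
print(graph.get_graph().draw_mermaid())

# In a notebook it can also be rendered inline as an image:
# from IPython.display import Image, display
# display(Image(graph.get_graph().draw_mermaid_png()))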
Execution Flow
- Input: the user query is passed in through the messages parameter
- Planning: the LLM generates a plan of 5-7 searches
- Search: each search task is executed serially
- Synthesis: all search results are merged into one summary
- Writing: a structured Chinese report is generated
- Output: JSON containing the short summary, report body, and follow-up questions
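Each of these stages can also be observed live by streaming the graph instead of invoking it once; a hedged sketch using the compiled graph's streaming API (the query string is just an example):

from langchain_core.messages import HumanMessage

# Stream node-by-node updates instead of waiting for the final result.
for update in graph.stream(
    {"messages": [HumanMessage(content="分析2024年人工智能发展趋势")]},
    stream_mode="updates",
):
    # Each update is keyed by the node that just finished:
    # planner_node, then search_node, then writer_node.
    print(list(update.keys()))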
Key Features
- Chinese support: Chinese interaction and report generation throughout
- Structured output: Pydantic models guarantee the data format
- Fault tolerance: graceful handling of ValidationError
- LangSmith integration: full tracing of the call chain
- Extensibility: the modular design makes it easy to add features
Performance Analysis
- Serial search: execution is currently sequential, suited to scenarios where stability comes first
- Token optimization: each search summary is capped at under 300 words
- Caching support: LangGraph's automatic state management
- Debug friendly: print statements plus LangSmith give two layers of debugging
Complete Project Code: graph.py
# A more complete, self-contained graph implementation
import json
import os
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError, parse_obj_as
from langchain_deepseek import ChatDeepSeek
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
from langgraph.graph import StateGraph, MessagesState, START, END
from langgraph.types import Command
from langgraph.prebuilt import create_react_agent
from langchain_tavily import TavilySearch
from langchain_openai import ChatOpenAI

load_dotenv()

model = ChatDeepSeek(model="deepseek-chat", max_tokens=8000)
# model = ChatOpenAI(model="gpt-4-1", max_tokens=32000)

# -------- 1) Planner Chain --------
PLANNER_INSTRUCTIONS = (
    "You are a helpful research assistant. Given a query, come up with 5-7 web searches "
    "to perform to best answer the query. "
    "Return **ONLY valid JSON** that follows this schema: "
    '{{"searches": [ {{"query": "example", "reason": "why"}} ]}}'
)

class WebSearchItem(BaseModel):
    query: str
    reason: str

class WebSearchPlan(BaseModel):
    searches: List[WebSearchItem]

planner_prompt = ChatPromptTemplate.from_messages(
    [("system", PLANNER_INSTRUCTIONS), ("human", "{query}")]
)
planner_chain = (
    planner_prompt
    | model.with_structured_output(WebSearchPlan, method="json_mode")  # force JSON output
)

# -------- 2) Search agent --------
SEARCH_INSTRUCTIONS = (
    "You are a research assistant. Given a search term, you search the web for that term and "
    "produce a concise summary of the results. The summary must be 2-3 paragraphs and less than 300 "
    "words. Capture the main points. Write succinctly; no need to have complete sentences or good "
    "grammar. This will be consumed by someone synthesizing a report, so it's vital you capture the "
    "essence and ignore any fluff. Do not include any additional commentary other than the summary "
    "itself."
)

search_tool = TavilySearch(max_results=5, topic="general")
search_agent = create_react_agent(
    model=model,
    prompt=SEARCH_INSTRUCTIONS,
    tools=[search_tool],
)

# -------- 3) Writer Chain --------
WRITER_PROMPT = (
    "You are a senior researcher tasked with writing a cohesive report for a research query. "
    "You will be provided with the original query and some initial research.\n\n"
    "① 先给出完整的大纲。\n"
    "② 然后生成正式报告。\n"
    "**写作要求**:\n"
    "· 报告使用 Markdown 格式;\n"
    "· 章节清晰,层次分明;\n"
    "· markdown_report部分至少包含2000中文字符(注意需要用中文进行回复);\n"
    "· 内容丰富,论据充分,可加入引用和数据,允许分段、添加引用、表格等;\n"
    "· 最终仅返回 JSON:\n"
    '{{"short_summary": "...", "markdown_report": "...", "follow_up_questions": ["..."]}}'
)

class ReportData(BaseModel):
    short_summary: str
    markdown_report: str
    follow_up_questions: List[str]

writer_prompt = ChatPromptTemplate.from_messages(
    [("system", WRITER_PROMPT), ("human", "{content}")]
)
writer_chain = writer_prompt | model.with_structured_output(ReportData, method="json_mode")

# ------------- LangGraph nodes ----------------
def planner_node(state: MessagesState) -> Command:
    user_query = state["messages"][-1].content
    raw = planner_chain.invoke({"query": user_query})
    print(raw)
    try:
        # plan = parse_obj_as(WebSearchPlan, raw)
        plan = WebSearchPlan.model_validate(raw)
    except ValidationError:
        if isinstance(raw, dict) and isinstance(raw.get("searches"), list):
            plan = WebSearchPlan(
                searches=[WebSearchItem(query=q, reason="") for q in raw["searches"]]
            )
        else:
            raise
    return Command(
        goto="search_node",
        update={"messages": [AIMessage(content=plan.model_dump_json())], "plan": plan},
    )

# ------------- search_node ----------------
def search_node(state: MessagesState) -> Command:
    plan_json = state["messages"][-1].content
    plan = WebSearchPlan.model_validate_json(plan_json)
    summaries = []
    for item in plan.searches:
        run = search_agent.invoke({"messages": [HumanMessage(content=item.query)]})
        msgs = run["messages"]
        readable = next(
            (m for m in reversed(msgs) if isinstance(m, (ToolMessage, AIMessage))),
            msgs[-1],
        )
        summaries.append(f"## {item.query}\n\n{readable.content}")
    combined = "\n\n".join(summaries)
    return Command(goto="writer_node", update={"messages": [AIMessage(content=combined)]})

# ------------- writer_node ----------------
def writer_node(state: MessagesState) -> Command:
    original_query = state["messages"][0].content
    combined_summary = state["messages"][-1].content

    writer_input = (
        f"原始问题:{original_query}\n\n"
        f"搜索摘要:\n{combined_summary}"
    )

    report = writer_chain.invoke({"content": writer_input})
    return Command(
        goto=END,
        update={
            "messages": [
                AIMessage(content=json.dumps(report.model_dump(), ensure_ascii=False, indent=4))
            ]
        },
    )

# Build and run the graph
builder = StateGraph(MessagesState)
builder.add_node("planner_node", planner_node)
builder.add_node("search_node", search_node)
builder.add_node("writer_node", writer_node)

# Define the edges between nodes
builder.add_edge(START, "planner_node")
builder.add_edge("planner_node", "search_node")
builder.add_edge("search_node", "writer_node")
builder.add_edge("writer_node", END)

# Compile the graph
graph = builder.compile()
Usage Example
import json
from langchain_core.messages import HumanMessage
from graph import graph

# Run a research task
result = graph.invoke({"messages": [HumanMessage(content="分析2024年人工智能发展趋势")]})

# Parse the result
report_data = json.loads(result["messages"][-1].content)
print("摘要:", report_data["short_summary"])
print("报告:", report_data["markdown_report"])
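Since markdown_report is already Markdown, it can be written straight to disk for reading or sharing; a small follow-on snippet (the filename is arbitrary):

# Persist the generated Markdown report to disk.
with open("report.md", "w", encoding="utf-8") as f:
    f.write(report_data["markdown_report"])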
Deployment
For project deployment, refer to: Quick project deployment with langgraph-cli.
Extension Directions
- Parallelized search: use asyncio.gather to improve performance (see the sketch below)
- Caching: avoid repeating searches on the same topic
- Multilingual support: extend to English and other report languages
- Custom templates: support report templates for different domains
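As a concrete example of the first item, the serial loop in search_node could be replaced with concurrent searches by switching to the async API (ainvoke) that LangGraph agents expose. This is a minimal sketch, not part of the original code, reusing WebSearchPlan, WebSearchItem, and search_agent from graph.py and keeping the same message-passing contract:

import asyncio
from langchain_core.messages import AIMessage, HumanMessage
from langgraph.graph import MessagesState
from langgraph.types import Command

async def search_node_parallel(state: MessagesState) -> Command:
    plan = WebSearchPlan.model_validate_json(state["messages"][-1].content)

    async def run_one(item: WebSearchItem) -> str:
        run = await search_agent.ainvoke({"messages": [HumanMessage(content=item.query)]})
        return f"## {item.query}\n\n{run['messages'][-1].content}"

    # Execute all planned searches concurrently instead of one by one.
    summaries = await asyncio.gather(*(run_one(item) for item in plan.searches))
    combined = "\n\n".join(summaries)
    return Command(goto="writer_node", update={"messages": [AIMessage(content=combined)]})

# Registered the same way as the original node; the graph must then be run with
# graph.ainvoke(...) or graph.astream(...).
# builder.add_node("search_node", search_node_parallel)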
Summary
The DeepResearch project shows how to build a complete AI research agent on LangGraph: a three-stage pipeline covers the whole loop from question understanding to in-depth report generation. The code is clearly structured, the configuration is complete, and the design is both extensible and practical.
Reference: 赋范空间大模型技术社区 – DeepResearch application development in practice
Original article: https://blog.csdn.net/Galen_xia/article/details/149817138