AutoGPT示例-查找马拉松获胜时间

这是对 Auto-GPT（https://github.com/Significant-Gravitas/Auto-GPT）的一个实现，
使用LangChain原语（LLMs、PromptTemplates、VectorStores、Embeddings、Tools）构建。

# !pip install bs4
# !pip install nest_asyncio

# 通用
import os
import pandas as pd
from langchain_experimental.autonomous_agents import AutoGPT
from langchain.chat_models import ChatOpenAI

from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.docstore.document import Document
import asyncio
import nest_asyncio


# Needed since Jupyter already runs an asyncio event loop: nest_asyncio
# patches asyncio so that run_until_complete() can be called while that
# loop is running.
nest_asyncio.apply()

# Chat model shared by the CSV tool, the web-page QA chain and the AutoGPT agent.
llm = ChatOpenAI(model_name="gpt-4", temperature=1.0)

设置工具

我们将使用一个search工具、一个write-file工具、一个read-file工具、一个网页浏览工具和一个通过Python REPL与CSV文件交互的工具来设置AutoGPT。

在下面定义您想要使用的其他工具：

# 工具
import os
from contextlib import contextmanager
from typing import Optional
from langchain.agents import tool
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool

# Directory that process_csv switches into before reading CSV files; it is the
# same folder that the read/write file tools are given as their root_dir.
ROOT_DIR = "./data/"


@contextmanager
def pushd(new_dir):
    """Temporarily make ``new_dir`` the process working directory.

    The directory that was current when the context was entered is restored
    on exit, whether the managed block finishes normally or raises.

    Args:
        new_dir: Path to change into for the duration of the ``with`` block.
    """
    original_cwd = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        # Always go back, even if the body of the ``with`` block failed.
        os.chdir(original_cwd)


@tool
def process_csv(
    csv_file_path: str, instructions: str, output_path: Optional[str] = None
) -> str:
    """使用pandas在有限的REPL中处理CSV文件。\
仅在将数据写入磁盘作为CSV文件后使用此工具。\
任何图形必须保存到磁盘上才能被人类查看。\
指令应该用自然语言而不是代码编写。假设数据框已经加载。"""
    # NOTE: the docstring above is intentionally left verbatim. LangChain's
    # @tool decorator uses it as the tool description shown to the model, so
    # it is runtime text rather than plain documentation. In English:
    #   "Process a CSV with pandas in a limited REPL. Only use this after
    #   writing data to disk as a CSV file. Any figures must be saved to disk
    #   to be viewed by the human. Instructions should be written in natural
    #   language, not code. Assume the dataframe is already loaded."
    #
    # Args:
    #   csv_file_path: CSV path, resolved relative to ROOT_DIR.
    #   instructions: Natural-language task for the pandas dataframe agent.
    #   output_path: Optional path the agent is asked to save its output to.
    # Returns:
    #   The agent's answer, or a "错误：..." string if reading the CSV or
    #   running the agent raised.

    # Append the save request (if any) to the task handed to the agent.
    prompt = instructions
    if output_path is not None:
        prompt = prompt + f" 将输出保存到磁盘上的路径为 {output_path}"

    with pushd(ROOT_DIR):
        try:
            frame = pd.read_csv(csv_file_path)
        except Exception as read_error:
            # Load failures are returned as text rather than raised.
            return f"错误：{read_error}"

        csv_agent = create_pandas_dataframe_agent(
            llm, frame, max_iterations=30, verbose=True
        )
        try:
            return csv_agent.run(prompt)
        except Exception as run_error:
            # Agent failures are likewise returned as text.
            return f"错误：{run_error}"

使用PlayWright浏览网页

# !pip install playwright
# !playwright install

async def async_load_playwright(url: str) -> str:
    """Load ``url`` in headless Chromium via Playwright and return its text.

    The rendered HTML is parsed with BeautifulSoup, ``<script>`` and
    ``<style>`` elements are removed, and the remaining text is reduced to
    one non-empty, stripped chunk per line.

    Args:
        url: Address of the page to load.

    Returns:
        The cleaned page text. If loading or parsing raises an ``Exception``,
        a string of the form ``"错误：<message>"`` is returned instead.
    """
    from bs4 import BeautifulSoup
    from playwright.async_api import async_playwright

    results = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            page_source = await page.content()
            soup = BeautifulSoup(page_source, "html.parser")

            # Drop elements whose content is code/styling rather than page text.
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            # Strip every line, split each line on double spaces, and keep
            # only the non-empty pieces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            results = "\n".join(chunk for chunk in chunks if chunk)
        except Exception as e:
            results = f"错误：{e}"
        finally:
            # Close the browser on every exit path. ``except Exception`` does
            # not catch task cancellation or other BaseExceptions, which
            # previously skipped the close.
            await browser.close()
    return results


def run_async(coro):
    """Run ``coro`` to completion on the current event loop and return its result.

    Args:
        coro: Awaitable to execute.

    Returns:
        Whatever ``coro`` returns.
    """
    return asyncio.get_event_loop().run_until_complete(coro)


@tool
def browse_web_page(url: str) -> str:
    """以详细的方式抓取整个网页。可能会导致解析问题。"""
    # NOTE: the docstring above is left untranslated on purpose. LangChain's
    # @tool decorator uses it as the tool description shown to the model, so
    # it is runtime text. In English: "Verbose way to scrape a whole webpage.
    # Likely to cause issues parsing."
    #
    # Synchronous wrapper: drives the async Playwright loader to completion
    # and returns the text it produced (or its "错误：..." error string).
    return run_async(async_load_playwright(url))

在网页上进行问答

帮助模型针对网页提出更有针对性的问题，以免无关内容占满其记忆

from langchain.tools import BaseTool, DuckDuckGoSearchRun
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pydantic import Field
from langchain.chains.qa_with_sources.loading import (
    load_qa_with_sources_chain,
    BaseCombineDocumentsChain,
)


def _get_text_splitter():
    """Build the default text splitter used by ``WebpageQATool``.

    Returns:
        A ``RecursiveCharacterTextSplitter`` producing 500-character chunks
        with a 20-character overlap, measured with ``len``.
    """
    # The chunk size is deliberately very small; it is only meant for demonstration.
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 20,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


# Tool that loads a web page, splits its text into chunks and asks the QA
# chain the same question over successive windows of chunks, then once more
# over the collected per-window answers.
class WebpageQATool(BaseTool):
    # `name` and `description` are runtime values read by the agent framework;
    # the description is kept in its original language.
    name = "query_webpage"
    description = "浏览网页并检索与问题相关的信息。"
    # Splitter applied to the scraped page text.
    text_splitter: RecursiveCharacterTextSplitter = Field(
        default_factory=_get_text_splitter
    )
    # Chain invoked with {"input_documents", "question"} for each window.
    qa_chain: BaseCombineDocumentsChain

    def _run(self, url: str, question: str) -> str:
        """Browse ``url`` and answer ``question`` from the scraped text.

        Args:
            url: Page to load through ``browse_web_page``.
            question: Question put to the QA chain.

        Returns:
            The output of the final ``qa_chain`` call, made over the
            concatenated per-window answers.
        """
        page_text = browse_web_page.run(url)
        page_doc = Document(page_content=page_text, metadata={"source": url})
        chunks = self.text_splitter.split_documents([page_doc])

        window_size = 4
        window_answers = []
        # TODO: handle this with a MapReduceChain
        for start in range(0, len(chunks), window_size):
            answer = self.qa_chain(
                {
                    "input_documents": chunks[start : start + window_size],
                    "question": question,
                },
                return_only_outputs=True,
            )
            window_answers.append(f"来自窗口 {start} 的响应 - {answer}")

        # Second pass: ask the same question over the collected answers.
        combined = Document(
            page_content="\n".join(window_answers), metadata={"source": url}
        )
        return self.qa_chain(
            {"input_documents": [combined], "question": question},
            return_only_outputs=True,
        )

    async def _arun(self, url: str, question: str) -> str:
        """Async execution is not supported by this tool."""
        raise NotImplementedError

# Tool instance handed to the agent; its QA-with-sources chain runs on the shared `llm`.
query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))

设置内存 (Set up memory)

这里的记忆（memory）用于存储代理的中间步骤 (The memory here is used for the agent's intermediate steps)

# 内存
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.tools.human.tool import HumanInputRun

# Embedding model whose embed_query function the vector store uses.
embeddings_model = OpenAIEmbeddings()
# Dimensionality of the FAISS index below; it has to equal the length of the
# vectors returned by the embedding model (presumably 1536 for the default
# OpenAI embedding model — confirm if the model is changed).
embedding_size = 1536
# Flat L2-distance index, empty at this point.
index = faiss.IndexFlatL2(embedding_size)
# Vector store over the empty index, backed by an empty in-memory docstore and
# an empty index-to-docstore-id mapping.
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

设置模型和AutoGPT

模型设置

# !pip install duckduckgo_search
web_search = DuckDuckGoSearchRun()

# Tools made available to the agent. The file tools are rooted at ./data, the
# same directory process_csv works in.
tools = [
    web_search,
    WriteFileTool(root_dir="./data"),
    ReadFileTool(root_dir="./data"),
    process_csv,
    query_website_tool,
    # HumanInputRun(), # Uncomment if you want the agent to be able to ask a human for help
]

# AutoGPT agent wired to the shared chat model, the tools above, and the FAISS
# vector store as memory (retrieving 8 entries per lookup).
agent = AutoGPT.from_llm_and_tools(
    ai_name="Tom",
    ai_role="助手",
    tools=tools,
    llm=llm,
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
# agent.chain.verbose = True

使用AutoGPT查询网络

我多年来花了很多时间爬取数据源和清理数据。让我们看看AutoGPT是否可以帮助解决这个问题！

以下是查找最近波士顿马拉松时间并将其转换为表格形式的提示。

# Run the agent with a single goal. The goal text is sent to the model as-is:
# it asks for the Boston Marathon winning times of the past 5 years (ending in
# 2022) as a table of year, name, country of origin and time.
agent.run(
    [
        "过去5年（截至2022年）的波士顿马拉松获胜时间是多少？生成一个包含年份、姓名、原籍国和时间的表格。"
    ]
)

    {
        "thoughts": {
            "text": "我需要找到过去5年的波士顿马拉松获胜时间。我可以使用DuckDuckGo搜索命令来搜索这些信息。",
            "reasoning": "使用DuckDuckGo搜索将帮助我收集关于获胜时间的信息，而不会出现复杂情况。",
            "plan": "- 使用DuckDuckGo搜索找到波士顿马拉松的获胜时间\n- 生成一个包含年份、姓名、原籍国和时间的表格\n- 确保没有法律问题",
            "criticism": "无",
            "speak": "我将使用DuckDuckGo搜索命令找到过去5年的波士顿马拉松获胜时间。"
        },
        "command": {
            "name": "DuckDuckGo搜索",
            "args": {
                "query": "过去5年波士顿马拉松的获胜时间截至2022年"
            }
        }
    }
    {
        "thoughts": {
            "text": "DuckDuckGo搜索命令没有提供我所需的具体信息。我必须改变方法，使用query_webpage命令浏览包含过去5年波士顿马拉松获胜时间的网页。",
            "reasoning": "与搜索命令相比，query_webpage命令可能会给我提供更准确和全面的结果。",
            "plan": "- 使用query_webpage命令找到波士顿马拉松的获胜时间\n- 生成一个包含年份、姓名、原籍国和时间的表格\n- 确保没有法律问题",
            "criticism": "我可能在找到包含所需信息的正确网页方面遇到困难。",
            "speak": "我将使用query_webpage命令找到过去5年的波士顿马拉松获胜时间。"
        },
        "command": {
            "name": "DuckDuckGo搜索",
            "args": {
                "query": "包含过去5年波士顿马拉松获胜时间的网站截至2022年"
            }
        }
    }
    {
        "thoughts": {
            "text": "我需要使用query_webpage命令找到过去5年的波士顿马拉松获胜时间的信息。",
            "reasoning": "之前的DuckDuckGo搜索命令没有提供足够具体的结果。query_webpage命令可能提供更准确和全面的结果。",
            "plan": "- 使用query_webpage命令找到波士顿马拉松的获胜时间\n- 生成一个包含年份、姓名、原籍国和时间的表格\n- 确保没有法律问题",
            "criticism": "我可能在找到包含所需信息的正确网页方面遇到困难。",
            "speak": "我将使用query_webpage命令找到过去5年的波士顿马拉松获胜时间。"
        },
        "command": {
            "name": "query_webpage",
            "args": {
                "url": "https://en.wikipedia.org/wiki/List_of_winners_of_the_Boston_Marathon",
                "question": "过去5年波士顿马拉松的获胜时间是多少？"
            }
        }
    }
    {
        "thoughts": {
            "text": "我已经找到了过去5年的波士顿马拉松获胜时间。现在，我需要生成一个包含这些信息的表格。",
            "reasoning": "使用我已经拥有的信息，我可以创建一个包含年份、姓名、原籍国和时间的表格。",
            "plan": "- 将马拉松数据写入CSV文件\n- 处理CSV文件以显示表格",
            "criticism": "无",
            "speak": "我将为过去5年的波士顿马拉松获胜时间生成一个包含年份、姓名、原籍国和时间的表格。"
        },
        "command": {
            "name": "write_file",
            "args": {
                "file_path": "boston_marathon_winners.csv",
                "text": "年份,姓名,国家,时间\n2022,Evans Chebet,肯尼亚,2:06:51\n2021,Benson Kipruto,肯尼亚,2:09:51\n2019,Lawrence Cherono,肯尼亚,2:07:57\n2018,Yuki Kawauchi,日本,2:15:58"
            }
        }
    }
    {
        "thoughts": {
            "text": "我已经检索到了过去5年的波士顿马拉松获胜时间。现在，我需要生成一个包含年份、姓名、原籍国和时间的表格。",
            "reasoning": "创建一个表格将有助于以清晰和易于访问的格式组织数据。",
            "plan": "- 将数据写入CSV文件\n- 处理CSV文件以生成表格\n- 完成任务",
            "criticism": "无",
            "speak": "我将使用最近检索到的数据生成一个包含年份、姓名、原籍国和获胜时间的表格。"
        },
        "command": {
            "name": "write_file",
            "args": {
                "file_path": "winning_boston_marathon_data.csv",
                "text": "年份,姓名,国家,时间\n2022,Evans Chebet,肯尼亚,2:06:51\n2021,Benson Kipruto,肯尼亚,2:09:51\n2019,Lawrence Cherono,肯尼亚,2:07:57\n2018,Yuki Kawauchi,日本,2:15:58\n"
            }
        }
    }
    {
        "thoughts": {
            "text": "我已经找到了过去五年截至2022年的波士顿马拉松获胜时间。接下来，我需要创建一个包含年份、姓名、原籍国和时间的表格。",
            "reasoning": "生成一个表格将有助于以结构化的格式组织信息。",
            "plan": "- 创建一个包含年份、姓名、原籍国和时间的表格\n- 确保没有法律问题",
            "criticism": "无",
            "speak": "我将生成一个包含过去5年截至2022年的波士顿马拉松获胜时间的表格。"
        },
        "command": {
            "name": "write_file",
            "args": {
                "file_path": "winning_times.csv",
                "text": "年份,姓名,国家,时间\n2022,Evans Chebet,肯尼亚,2:06:51\n2021,Benson Kipruto,肯尼亚,2:09:51\n2020,因COVID-19大流行取消比赛,,\n2019,Lawrence Cherono,肯尼亚,2:07:57\n2018,Yuki Kawauchi,日本,2:15:58"
            }
        }
    }
    {
        "thoughts": {
            "text": "我需要处理CSV文件以生成包含年份、姓名、原籍国和获胜时间的表格。",
            "reasoning": "我已经将数据写入名为'winning_times.csv'的文件中。现在，我需要处理这个CSV文件，以正确地将数据显示为表格。",
            "plan": "- 使用process_csv命令读取'winning_times.csv'文件并生成表格",
            "criticism": "无",
            "speak": "我将处理'winning_times.csv'文件以显示过去5年的波士顿马拉松获胜时间的表格。"
        },
        "command": {
            "name": "process_csv",
            "args": {
                "csv_file_path": "winning_times.csv",
                "instructions": "读取CSV文件并将数据显示为表格"
            }
        }
    }
    
    
    > 进入新的AgentExecutor链...
    思考：CSV文件已经被读取并保存到名为`df`的pandas数据框中。因此，我可以通过打印整个数据框来显示数据。由于`df.head()`返回前5行，我可以使用它来展示内容。
    
    动作：python_repl_ast
    动作输入：print(df.head())   年份                               姓名 国家     时间
    0  2022                       Evans Chebet   肯尼亚  2:06:51
    1  2021                     Benson Kipruto   肯尼亚  2:09:51
    2  2020  Canceled due to COVID-19 pandemic     NaN      NaN
    3  2019                   Lawrence Cherono   肯尼亚  2:07:57
    4  2018                      Yuki Kawauchi   日本  2:15:58
    
    观察：无
    思考：我使用了错误的工具执行了操作。我应该使用给定的数据而不是与Python shell交互。现在，我可以将显示的数据作为答案，因为在文本中键入打印的数据框的信息看起来像一个表格。
    
    最终答案： 
       年份                               姓名 国家     时间
    0  2022                       Evans Chebet   肯尼亚  2:06:51
    1  2021                     Benson Kipruto   肯尼亚  2:09:51
    2  2020  Canceled due to COVID-19 pandemic     NaN      NaN
    3  2019                   Lawrence Cherono   肯尼亚  2:07:57
    4  2018                      Yuki Kawauchi   日本  2:15:58
    
    > 完成链。
    {
        "thoughts": {
            "text": "我已经在名为'winning_times.csv'的文件中保存了过去5年的波士顿马拉松获胜时间。现在，我需要处理CSV并显示表格。",
            "reasoning": "我选择使用process_csv命令，因为我已经将所需的数据保存为CSV文件，我可以使用该命令读取和显示数据作为表格。",
            "plan": "- 使用process_csv命令读取'winning_times.csv'文件并生成表格",
            "criticism": "无",
            "speak": "我将处理'winning_times.csv'文件以显示过去5年的波士顿马拉松获胜时间的表格。"
        },
        "command": {
            "name": "process_csv",
            "args": {
                "csv_file_path": "winning_times.csv",
                "instructions": "读取CSV文件并将数据显示为表格"
            }
        }
    }
    
    
    > 进入新的AgentExecutor链...
    思考：由于数据已经加载到pandas数据框中，我只需要显示数据框的前几行。
    动作：python_repl_ast
    动作输入：df.head()
    观察：    年份                               姓名 国家     时间
    0  2022                       Evans Chebet   肯尼亚  2:06:51
    1  2021                     Benson Kipruto   肯尼亚  2:09:51
    2  2020  Canceled due to COVID-19 pandemic     NaN      NaN
    3  2019                   Lawrence Cherono   肯尼亚  2:07:57
    4  2018                      Yuki Kawauchi   日本  2:15:58
    思考：我现在知道最终答案了。
    最终答案： 
       年份                               姓名 国家     时间
    0  2022                       Evans Chebet   肯尼亚  2:06:51
    1  2021                     Benson Kipruto   肯尼亚  2:09:51
    2  2020  Canceled due to COVID-19 pandemic     NaN      NaN
    3  2019                   Lawrence Cherono   肯尼亚  2:07:57
    4  2018                      Yuki Kawauchi   日本  2:15:58
    
    > 完成链。
    {
        "thoughts": {
            "text": "我已经生成了一个包含过去5年的波士顿马拉松获胜时间的表格。现在，我可以完成任务了。",
            "reasoning": "我已经完成了所需的操作并获得了所需的数据。任务完成。",
            "plan": "- 使用finish命令",
            "criticism": "无",
            "speak": "我已经生成了一个包含过去5年的波士顿马拉松获胜时间的表格。任务完成。"
        },
        "command": {
            "name": "finish",
            "args": {
                "response": "我已经生成了一个包含过去5年的波士顿马拉松获胜时间的表格。任务完成。"
            }
        }
    }





    '我已经生成了一个包含过去5年的波士顿马拉松获胜时间的表格。任务完成。'

AutoGPT示例-查找马拉松获胜时间

设置工具​

设置内存 (Set up memory)​

设置模型和AutoGPT​

使用AutoGPT查询网络​

设置工具

设置内存 (Set up memory)

设置模型和AutoGPT

使用AutoGPT查询网络