Cassandra
Apache Cassandra® 是一个NoSQL、面向行的、高度可扩展和高可用的数据库。
最新版本的Cassandra原生支持向量相似性搜索。
要运行这个笔记本,您需要一个配备了向量搜索功能的运行中的Cassandra集群(在撰写本文时处于预发布状态),或者在云中运行的DataStax Astra DB实例(您可以在datastax.com免费获取一个)。请查看cassio.org获取更多信息。
pip install "cassio>=0.0.7"
请提供数据库连接参数和密钥:
import os
import getpass
# Ask which backend to connect to and which keyspace to use.
# Surrounding whitespace is stripped so that an answer such as " a" is still
# recognised, matching the way the contact-points answer is stripped below.
database_mode = input("\n(C)assandra or (A)stra DB? ").strip().upper()
keyspace_name = input("\nKeyspace name? ").strip()

# Collect the connection parameters specific to the chosen backend.
# Any answer other than "A" or "C" sets no connection parameters here.
if database_mode == "A":
    # getpass is used for the token so it is not echoed while being typed.
    ASTRA_DB_APPLICATION_TOKEN = getpass.getpass('\nAstra DB Token ("AstraCS:...") ')
    ASTRA_DB_SECURE_BUNDLE_PATH = input("Full path to your Secure Connect Bundle? ")
elif database_mode == "C":
    # An empty answer is kept as an empty string (see the prompt text:
    # "empty for localhost").
    CASSANDRA_CONTACT_POINTS = input(
        "Contact points? (comma-separated, empty for localhost) "
    ).strip()
根据所选的数据库类型(本地Cassandra或云端的Astra DB),创建相应的数据库连接"Session"对象:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
# Build the driver `cluster` and open a `session` for the selected backend:
# "C" connects to a Cassandra cluster, "A" connects to Astra DB.
# Raises NotImplementedError for any other value of `database_mode`.
if database_mode == "C":
    # Split the comma-separated answer and drop blank entries, so input such
    # as "," or "host1, ,host2" never hands empty host names to the driver.
    contact_points = [
        cp.strip() for cp in CASSANDRA_CONTACT_POINTS.split(",") if cp.strip()
    ]
    # With no usable contact point, fall back to the driver's defaults
    # instead of passing an empty list.
    cluster = Cluster(contact_points) if contact_points else Cluster()
    session = cluster.connect()
elif database_mode == "A":
    # The literal string "token" is passed as the user name, with the
    # application token as the password.
    ASTRA_DB_CLIENT_ID = "token"
    cluster = Cluster(
        cloud={
            "secure_connect_bundle": ASTRA_DB_SECURE_BUNDLE_PATH,
        },
        auth_provider=PlainTextAuthProvider(
            ASTRA_DB_CLIENT_ID,
            ASTRA_DB_APPLICATION_TOKEN,
        ),
    )
    session = cluster.connect()
else:
    raise NotImplementedError(
        f"Unsupported database mode {database_mode!r}: "
        "expected 'C' (Cassandra) or 'A' (Astra DB)."
    )
请提供OpenAI访问密钥
我们想要使用OpenAIEmbeddings,所以我们需要获取OpenAI API密钥。
# Prompt for the OpenAI API key without echoing it, and store it in this
# process's environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
创建和使用向量存储
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Cassandra
from langchain.document_loaders import TextLoader
from langchain.document_loaders import TextLoader
# Fixed inputs for this walkthrough: the destination table name and the
# question used for the similarity search.
table_name = "my_vector_db_table"
query = "What did the president say about Ketanji Brown Jackson"

# Load the source text and split it into chunks of up to 1000 characters
# with no overlap between consecutive chunks.
loader = TextLoader("../../../state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Create the vector store from the chunks, using the open session and the
# chosen keyspace/table.
embedding_function = OpenAIEmbeddings()
docsearch = Cassandra.from_documents(
    documents=docs,
    embedding=embedding_function,
    session=session,
    keyspace=keyspace_name,
    table_name=table_name,
)

# NOTE: `docs` is rebound here; from this point on it holds the search
# results rather than the chunks that were inserted.
docs = docsearch.similarity_search(query)
## If you already have an index, you can load it and use it like this:
# docsearch_preexisting = Cassandra(
# embedding=embedding_function,
# session=session,
# keyspace=keyspace_name,
# table_name=table_name,
# )
# docsearch_preexisting.similarity_search(query, k=2)
# Show the text of the first result returned by the similarity search.
print(docs[0].page_content)
最大边际相关搜索
除了在检索器对象中使用相似性搜索外,您还可以使用mmr作为检索器。
# Wrap the vector store in a retriever that uses the "mmr" search type,
# then print every document it returns for the query under a numbered heading.
retriever = docsearch.as_retriever(search_type="mmr")
matched_docs = retriever.get_relevant_documents(query)
for position, matched in enumerate(matched_docs):
    print(f"\n## Document {position}\n")
    print(matched.page_content)
或者直接使用max_marginal_relevance_search:
# Call the MMR search on the store directly (k=2, fetch_k=10) and print the
# results as a list numbered from 1.
found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)
for rank, hit in enumerate(found_docs, start=1):
    print(f"{rank}.", hit.page_content, "\n")