Skip to main content

Cassandra

Apache Cassandra® 是一个NoSQL、面向行的、高度可扩展和高可用的数据库。

最新版本的Cassandra原生支持向量相似性搜索。

要运行这个笔记本,您需要一个配备了向量搜索功能的运行中的Cassandra集群(在撰写本文时处于预发布状态),或者在云中运行的DataStax Astra DB实例(您可以在datastax.com免费获取一个)。请查看cassio.org获取更多信息。

pip install "cassio>=0.0.7"

请提供数据库连接参数和密钥:

import os
import getpass

# Ask which backend to connect to. Only the codes "C" (Cassandra) and
# "A" (Astra DB) are handled below; surrounding whitespace is stripped
# and the answer is upper-cased so " c" and "a" are accepted too.
database_mode = input("\n(C)assandra or (A)stra DB? ").strip().upper()

keyspace_name = input("\nKeyspace name? ")

if database_mode == "A":
    # getpass is used so the token is not echoed while it is typed.
    ASTRA_DB_APPLICATION_TOKEN = getpass.getpass('\nAstra DB Token ("AstraCS:...") ')
    ASTRA_DB_SECURE_BUNDLE_PATH = input("Full path to your Secure Connect Bundle? ")
elif database_mode == "C":
    # The stripped answer may be empty; the prompt documents that case as
    # "localhost".
    CASSANDRA_CONTACT_POINTS = input(
        "Contact points? (comma-separated, empty for localhost) "
    ).strip()

根据使用的是本地Cassandra还是基于云的Astra DB,创建相应的数据库连接"Session"对象:

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Create the driver `cluster` and `session` objects for the selected backend.
if database_mode == "C":
    if CASSANDRA_CONTACT_POINTS:
        # Split the comma-separated answer, trimming each entry and
        # discarding blank ones.
        cluster = Cluster(
            [cp.strip() for cp in CASSANDRA_CONTACT_POINTS.split(",") if cp.strip()]
        )
    else:
        # No contact points supplied: rely on the driver's defaults.
        cluster = Cluster()
    session = cluster.connect()
elif database_mode == "A":
    # The literal string "token" is passed as the user name, with the
    # application token as the password.
    ASTRA_DB_CLIENT_ID = "token"
    cluster = Cluster(
        cloud={
            "secure_connect_bundle": ASTRA_DB_SECURE_BUNDLE_PATH,
        },
        auth_provider=PlainTextAuthProvider(
            ASTRA_DB_CLIENT_ID,
            ASTRA_DB_APPLICATION_TOKEN,
        ),
    )
    session = cluster.connect()
else:
    # Any other answer is unsupported; say which one was received.
    raise NotImplementedError(f"Unsupported database mode: {database_mode!r}")

请提供OpenAI访问密钥

我们想要使用OpenAIEmbeddings,所以我们需要获取OpenAI API密钥。

# Prompt for the OpenAI key without echoing it, then store it in the
# OPENAI_API_KEY environment variable of this process.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

创建和使用向量存储

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Cassandra
from langchain.document_loaders import TextLoader
from langchain.document_loaders import TextLoader

# Load the sample text and split it with chunk_size=1000 and no overlap.
loader = TextLoader("../../../state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

embedding_function = OpenAIEmbeddings()
table_name = "my_vector_db_table"

# Build the vector store from the split documents, using the session and
# keyspace obtained earlier.
docsearch = Cassandra.from_documents(
    documents=docs,
    embedding=embedding_function,
    session=session,
    keyspace=keyspace_name,
    table_name=table_name,
)

query = "What did the president say about Ketanji Brown Jackson"
# NOTE: `docs` is rebound here, from the split chunks to the search results.
docs = docsearch.similarity_search(query)

# If you already have an index, you can load it and use it like this:
#
# docsearch_preexisting = Cassandra(
#     embedding=embedding_function,
#     session=session,
#     keyspace=keyspace_name,
#     table_name=table_name,
# )
#
# docsearch_preexisting.similarity_search(query, k=2)

# Show the text of the first search result.
print(docs[0].page_content)

最大边际相关搜索

除了在检索器对象中使用相似性搜索外,您还可以使用mmr作为检索器。

# Wrap the vector store in a retriever configured with search_type="mmr"
# and print every document it returns for `query`.
retriever = docsearch.as_retriever(search_type="mmr")
matched_docs = retriever.get_relevant_documents(query)
for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)

或者直接使用max_marginal_relevance_search

# Call the MMR search on the vector store directly (k=2, fetch_k=10) and
# print the results as a list numbered from 1.
found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)
for position, doc in enumerate(found_docs, start=1):
    print(f"{position}.", doc.page_content, "\n")