Skip to main content

匹配引擎 (MatchingEngine)

这个笔记本展示了如何使用与GCP Vertex AI MatchingEngine向量数据库相关的功能。

Vertex AI Matching Engine 提供了业界领先的大规模、低延迟向量数据库。这类向量数据库通常被称为向量相似度匹配服务或近似最近邻(ANN)服务。

注意:此模块要求事先已经创建好端点并部署了索引,因为创建过程需要将近一小时。要了解如何创建索引,请参阅下文的“创建索引并将其部署到端点”部分。

从文本创建向量存储

from langchain.vectorstores import MatchingEngine

# Sample sentence fragments to store in the index.
texts = [
    "The cat sat on",
    "the mat.",
    "I like to",
    "eat pizza for",
    "dinner.",
    "The sun sets",
    "in the west.",
]

# Connect to an index that has already been deployed to an endpoint.
# from_components() only takes connection settings: it has no `texts`
# parameter and the bucket keyword is `gcs_bucket_name`. The texts are
# added separately with add_texts() below.
vector_store = MatchingEngine.from_components(
    project_id="<my_project_id>",
    region="<my_region>",
    gcs_bucket_name="<my_gcs_bucket>",
    index_id="<my_matching_engine_index_id>",
    endpoint_id="<my_matching_engine_endpoint_id>",
)

# Add the sample texts to the vector store.
vector_store.add_texts(texts=texts)

# Query the store for the 2 entries most similar to "lunch".
vector_store.similarity_search("lunch", k=2)

创建索引并将其部署到端点

导入、常量和配置

# 安装依赖项。
pip install tensorflow \
google-cloud-aiplatform \
tensorflow-hub \
tensorflow-text
import os
import json

from google.cloud import aiplatform
import tensorflow_hub as hub
import tensorflow_text
# Placeholders: replace the "<...>" values with your own project, region,
# VPC network name and bucket.
PROJECT_ID = "<my_project_id>"
REGION = "<my_region>"
VPC_NETWORK = "<my_vpc_network_name>"
PEERING_RANGE_NAME = "ann-langchain-me-range" # Name used when creating the VPC peering.
BUCKET_URI = "gs://<bucket_uri>"
# Number of dimensions of the TensorFlow Universal Sentence Encoder.
# If another embedder is used, the dimensions will probably need to change.
DIMENSIONS = 512
DISPLAY_NAME = "index-test-name"
# Cloud Storage location under the bucket that holds the embedding files.
EMBEDDING_DIR = f"{BUCKET_URI}/banana"
DEPLOYED_INDEX_ID = "endpoint-test-name"

# IPython shell capture: the command output is assigned to the variable and
# its first element is then taken as the project number.
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]
# Full network resource name built from the project number and network name.
VPC_NETWORK_FULL = f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}"

# Change this if you need the VPC to be created.
CREATE_VPC = False
# 设置项目ID
gcloud config set project {PROJECT_ID}
# 删除if条件以运行封装的代码
if CREATE_VPC:
# 创建VPC网络
gcloud compute networks create {VPC_NETWORK} --bgp-routing-mode=regional --subnet-mode=auto --project={PROJECT_ID}

# 添加必要的防火墙规则
gcloud compute firewall-rules create {VPC_NETWORK}-allow-icmp --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow icmp
gcloud compute firewall-rules create {VPC_NETWORK}-allow-internal --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow all --source-ranges 10.128.0.0/9
gcloud compute firewall-rules create {VPC_NETWORK}-allow-rdp --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow tcp:3389
gcloud compute firewall-rules create {VPC_NETWORK}-allow-ssh --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow tcp:22

# 保留IP范围
gcloud compute addresses create {PEERING_RANGE_NAME} --global --prefix-length=16 --network={VPC_NETWORK} --purpose=VPC_PEERING --project={PROJECT_ID} --description="peering range"

# 设置与服务网络的对等连接
# 您的帐号必须具有“Compute Network Admin”角色才能运行以下命令。
gcloud services vpc-peerings connect --service=servicenetworking.googleapis.com --network={VPC_NETWORK} --ranges={PEERING_RANGE_NAME} --project={PROJECT_ID}
# 创建存储桶。
gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

使用Tensorflow通用句子编码器作为嵌入器

# Load the Universal Sentence Encoder module from TensorFlow Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
model = hub.load(module_url)
# Generate an embedding for each input string (here the single word "banana").
embeddings = model(["banana"])

插入一个测试嵌入

initial_config = {
"id": "banana_id",
"embedding": [float(x) for x in list(embeddings.numpy()[0])],
}

with open("data.json", "w") as f:
json.dump(initial_config, f)
gsutil cp data.json {EMBEDDING_DIR}/file.json
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

创建索引

# Create a Tree-AH index from the embedding files stored under EMBEDDING_DIR.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=EMBEDDING_DIR,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

创建端点

# Create an index endpoint attached to the VPC network.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"{DISPLAY_NAME}-endpoint",
    network=VPC_NETWORK_FULL,
)

部署索引

# Deploy the index to the endpoint; the returned object replaces the
# previous my_index_endpoint reference.
my_index_endpoint = my_index_endpoint.deploy_index(
    index=my_index,
    deployed_index_id=DEPLOYED_INDEX_ID,
)

# Show the indexes currently deployed on the endpoint.
my_index_endpoint.deployed_indexes