from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # evaluation mode: disables dropout for deterministic embeddings

def sentence_to_embedding(sentence):
    # Tokenize and convert to tensors
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the output vector of the [CLS] token as the sentence embedding
    sentence_embedding = outputs.last_hidden_state[:, 0, :]
    return sentence_embedding

def cosine_similarity(embedding1, embedding2):
    # Compute the cosine similarity between the two embeddings
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Input sentences
sentence1 = "I am Kamisato Ayaka's dog"
sentence2 = "It feels like being stepped on by Miss Kamisato"

# Convert the sentences to BERT vector representations
embedding1 = sentence_to_embedding(sentence1)
embedding2 = sentence_to_embedding(sentence2)

# Compute the cosine similarity
similarity = cosine_similarity(embedding1, embedding2)
print(f"Similarity between '{sentence1}' and '{sentence2}': {similarity:.4f}")
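
# Note: the [CLS] vector of a raw, non-fine-tuned BERT is often a weak sentence
# representation; mean pooling over the token embeddings (with padding masked out)
# usually yields more meaningful similarities. Below is a minimal sketch of that
# variant, reusing the tokenizer and model loaded above. The helper name
# sentence_to_embedding_mean is hypothetical, introduced only for this example.

def sentence_to_embedding_mean(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state             # (1, seq_len, hidden_size)
    mask = inputs['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1); 1 marks real tokens
    # Sum the vectors of real tokens, then divide by the number of real tokens
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)
    return summed / counts

# Example usage: compare mean pooling against the [CLS] pooling above
similarity_mean = cosine_similarity(
    sentence_to_embedding_mean(sentence1),
    sentence_to_embedding_mean(sentence2),
)
print(f"Mean-pooled similarity: {similarity_mean:.4f}")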