# Note: this is plain text matching, not semantic matching.
# References: https://zhuanlan.zhihu.com/p/43396514
# https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
#
# https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249
# https://github.com/nltk/nltk
# http://www.nltk.org/
# https://github.com/fxsjy/jieba
# similarity = cosine = (A.B) / (||A||.||B||) where A and B are vectors (Euclidean dot product).
# pip install nltk
# import nltk
# nltk.download('all')  # or individual packages such as 'punkt', 'stopwords', ...;
#                       # 'all' seems to pull down 300 MB+ of data.
# nltk.tokenize: used for tokenization. Tokenization is the process by which a large quantity of
#                text is divided into smaller parts called tokens. word_tokenize(X) splits the
#                given sentence X into words and returns a list.
# nltk.corpus: used in this program to get a list of stopwords.
#              A stop word is a commonly used word (such as "the", "a", "an", "in").
import jieba
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def check_zh_similarity(s1, s2):
    # Segment with jieba, drop stopwords, and merge the words; each word is one dimension.
    sw = ['。', ',']
    s1_cut = [i for i in jieba.cut(s1, cut_all=True) if i not in sw]
    s2_cut = [i for i in jieba.cut(s2, cut_all=True) if i not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
    # List every word and its position in the set (i.e. the dimension index assigned to each word).
    word_dict = dict()
    for i, word in enumerate(word_set):
        word_dict[word] = i
    print('word dict: %s \n' % word_dict)
    # Map s1 onto the dimension indices and count s1's term frequency for each dimension.
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)
    # Map s2 onto the dimension indices and count s2's term frequency for each dimension.
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)
    # Compute the cosine similarity.
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
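
# An alternative, more compact sketch of the same idea using collections.Counter (added for
# illustration; the helper name is made up and nothing below depends on it).
def check_zh_similarity_counter(s1, s2):
    from collections import Counter
    sw = ['。', ',']
    c1 = Counter(w for w in jieba.cut(s1, cut_all=True) if w not in sw)
    c2 = Counter(w for w in jieba.cut(s2, cut_all=True) if w not in sw)
    dot = sum(c1[w] * c2[w] for w in c1.keys() & c2.keys())
    norm1 = math.sqrt(sum(v * v for v in c1.values()))
    norm2 = math.sqrt(sum(v * v for v in c2.values()))
    return round(dot / (norm1 * norm2), 6) if norm1 and norm2 else 0.0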

def check_en_similarity1(s1, s2):
    # Tokenize with nltk, drop stopwords, and merge the words; each word is one dimension.
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    s1_cut = word_tokenize(s1)
    s2_cut = word_tokenize(s2)
    # Use lists here (not sets) so repeated words still contribute to the term frequency.
    s1_cut = [w for w in s1_cut if not w.lower() in sw]
    s2_cut = [w for w in s2_cut if not w.lower() in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
    # List every word and its position in the set (i.e. the dimension index assigned to each word).
    word_dict = dict()
    for i, word in enumerate(word_set):
        word_dict[word] = i
    print('word dict: %s \n' % word_dict)
    # Map s1 onto the dimension indices and count s1's term frequency for each dimension.
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)
    # Map s2 onto the dimension indices and count s2's term frequency for each dimension.
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)
    # Compute the cosine similarity.
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)

def check_en_similarity2(s1, s2):
    # tokenization
    x_list = word_tokenize(s1)
    y_list = word_tokenize(s2)
    # sw contains the list of stopwords
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    l1 = []
    l2 = []
    # remove stop words from the strings
    x_set = {w for w in x_list if not w.lower() in sw}
    y_set = {w for w in y_list if not w.lower() in sw}
    # form a set containing keywords of both strings
    rvector = x_set.union(y_set)
    for w in rvector:
        if w in x_set:
            l1.append(1)  # create a binary (0/1) vector
        else:
            l1.append(0)
        if w in y_set:
            l2.append(1)
        else:
            l2.append(0)
    # cosine formula; because l1 and l2 are 0/1 vectors, sum(l1) equals the squared norm of l1
    c = 0
    for i in range(len(rvector)):
        c += l1[i] * l2[i]
    try:
        cosine = round(c / float((sum(l1) * sum(l2)) ** 0.5), 6)
    except ZeroDivisionError:
        # one of the sentences contained only stopwords/punctuation
        cosine = 0.0
    print("similarity: ", cosine)
    return cosine
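
# Note: check_en_similarity1 weights repeated words by their term frequency, while
# check_en_similarity2 only records presence/absence (0/1), so the two functions can return
# different scores for the same pair of sentences.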

if __name__ == '__main__':
    # Chinese sentence test
    # sentence1 = ' 这只皮靴号码大了。那只号码合适'
    # sentence2 = ' 这只皮靴号码不小,那只更合适'
    # check_zh_similarity(sentence1, sentence2)
    # English sentence test
    # sentence3 = 'How to create ticket in ticket system?'
    # sentence4 = 'How can I create ticket?'
    sentence3 = 'Do you know I love you?'
    sentence4 = 'I love you so much?'
    check_en_similarity1(sentence3, sentence4)
    check_en_similarity2(sentence3, sentence4)
    print('end!')
If you need semantic matching, try spaCy or other similar tools. They have you download official pre-trained models, which generally build in tf-idf and similar techniques and are usually enough to reach semantic-level matching accuracy. Of course, you can also build and train your own neural network (TensorFlow + Keras, etc.) (personal opinion).
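As a rough illustration (not part of the script above), a minimal spaCy sketch might look like the following, assuming the en_core_web_md model has been installed via `python -m spacy download en_core_web_md`:

import spacy

nlp = spacy.load('en_core_web_md')      # medium English model that ships with word vectors
doc1 = nlp('Do you know I love you?')
doc2 = nlp('I love you so much?')
print(doc1.similarity(doc2))            # similarity based on the documents' averaged word vectors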