# NOTE: this is plain text matching only, not semantic matching
 
# References: https://zhuanlan.zhihu.com/p/43396514
#         https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
#
#         https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249
#         https://github.com/nltk/nltk
#         http://www.nltk.org/
#         https://github.com/fxsjy/jieba
# similarity = cosine = (A.B) / (||A|| * ||B||), where A and B are vectors (Euclidean dot-product formula).
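# A tiny worked example (illustration only): A = (1, 1, 0), B = (1, 0, 1)
#   A.B = 1*1 + 1*0 + 0*1 = 1,  ||A|| = sqrt(2),  ||B|| = sqrt(2)
#   cosine = 1 / (sqrt(2) * sqrt(2)) = 0.5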
 
# pip install nltk
# import nltk
# nltk.download('all')  # or just a subset such as 'punkt' / 'stopwords' / ...; 'all' downloads roughly 300 MB+ of data
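# A minimal download for this script (sketch; just the data actually used below):
#     import nltk
#     nltk.download('punkt')      # tokenizer data used by word_tokenize
#     nltk.download('stopwords')  # stopword lists used by stopwords.words('english')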
# nltk.tokenize: used for tokenization. Tokenization is the process by which a large quantity
#                of text is divided into smaller parts called tokens. word_tokenize(X) splits
#                the given sentence X into words and returns a list.
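# For example (output sketch, assuming the 'punkt' data is installed):
#     word_tokenize('How can I create a ticket?')
#     -> ['How', 'can', 'I', 'create', 'a', 'ticket', '?']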
 
# nltk.corpus: In this program, it is used to get a list of stopwords.
#              A stop word is a commonly used word (such as “the”, “a”, “an”, “in”).
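# For example (sketch; the exact list depends on the NLTK stopwords data version):
#     sw = stopwords.words('english')
#     [w for w in ['How', 'can', 'I', 'create', 'a', 'ticket', '?'] if w.lower() not in sw]
#     -> ['create', 'ticket', '?']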
 
import jieba
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
 
def check_zh_similarity(s1, s2):
    # Tokenize with jieba and drop stopwords; merge the words so that each word is one dimension
    sw = ['。', ',']
    s1_cut = [i for i in jieba.cut(s1, cut_all=True) if i not in sw]
    s2_cut = [i for i in jieba.cut(s2, cut_all=True) if i not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
 
    # List every word and its position in the set (i.e. the dimension index assigned to each word)
    word_dict = {word: i for i, word in enumerate(word_set)}
    print('word dict: %s \n' % word_dict)
 
    # Record the dimension index of each word in s1 and count the term frequency per dimension
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 dimension codes: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 term frequency per dimension: %s ' % s1_cut_word_fq)
 
    # Record the dimension index of each word in s2 and count the term frequency per dimension
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 dimension codes: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 term frequency per dimension: %s \n' % s2_cut_word_fq)
 
    # Compute the cosine similarity
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
 
 
def check_en_similarity1(s1, s2):
    # Tokenize with nltk and drop stopwords; merge the words so that each word is one dimension
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    s1_cut = word_tokenize(s1)
    s2_cut = word_tokenize(s2)
    # Keep lists (not sets) so repeated words still contribute to the term frequencies below
    s1_cut = [w for w in s1_cut if w.lower() not in sw]
    s2_cut = [w for w in s2_cut if w.lower() not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
 
    # List every word and its position in the set (i.e. the dimension index assigned to each word)
    word_dict = {word: i for i, word in enumerate(word_set)}
    print('word dict: %s \n' % word_dict)
 
    # Record the dimension index of each word in s1 and count s1's term frequency per dimension
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 dimension codes: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 term frequency per dimension: %s ' % s1_cut_word_fq)
 
    # Record the dimension index of each word in s2 and count s2's term frequency per dimension
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 dimension codes: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 term frequency per dimension: %s \n' % s2_cut_word_fq)
 
    # Compute the cosine similarity
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
 
 
def check_en_similarity2(s1, s2):
    # tokenization
    x_list = word_tokenize(s1)
    y_list = word_tokenize(s2)
 
    # sw contains the list of stopwords
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    l1 = []
    l2 = []
 
    # remove stop words from the string
    x_set = {w for w in x_list if w.lower() not in sw}
    y_set = {w for w in y_list if w.lower() not in sw}
 
    # form a set containing keywords of both strings
    rvector = x_set.union(y_set)
    for w in rvector:
        if w in x_set:
            l1.append(1)  # create a vector
        else:
            l1.append(0)
        if w in y_set:
            l2.append(1)
        else:
            l2.append(0)
 
    # cosine formula; for 0/1 vectors, sum(l1) equals the squared norm of l1
    c = 0
    for i in range(len(rvector)):
        c += l1[i] * l2[i]
    try:
        cosine = round(c / float((sum(l1) * sum(l2)) ** 0.5), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print("similarity: ", cosine)
    return cosine
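
# Note: check_en_similarity1 builds term-frequency vectors (each word counted as often as it
# appears), while check_en_similarity2 builds 0/1 presence vectors; on short sentences the two
# scores are often close but not identical.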
 
 
if __name__ == '__main__':
    # Chinese sentence test
    # sentence1 = ' 这只皮靴号码大了。那只号码合适'
    # sentence2 = ' 这只皮靴号码不小,那只更合适'
    # check_zh_similarity(sentence1, sentence2)
 
    # English sentence test
    # sentence3 = 'How to create ticket in ticket system?'
    # sentence4 = 'How can I create ticket?'
    sentence3 = 'Do you know I love you?'
    sentence4 = 'I love you so much?'
    check_en_similarity1(sentence3, sentence4)
    check_en_similarity2(sentence3, sentence4)
    print('end!')

# If semantic matching is needed, you can try spaCy or other similar tools. These tools have you
# download official pre-trained models; such models generally build in tf-idf and similar techniques
# and can roughly reach semantic-level recognition accuracy. Of course, we can also build our own
# neural network (TensorFlow + Keras, etc.) and train it ourselves (personal opinion).
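
# A minimal spaCy sketch for semantic similarity (assumption: spaCy and the 'en_core_web_md'
# model are installed via `pip install spacy` and `python -m spacy download en_core_web_md`;
# kept as comments so this script still runs without those dependencies):
#     import spacy
#     nlp = spacy.load('en_core_web_md')   # medium English model, ships with word vectors
#     doc1 = nlp('Do you know I love you?')
#     doc2 = nlp('I love you so much?')
#     print(doc1.similarity(doc2))         # vector-based similarity instead of token overlap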