# This is somewhat more accurate than computing cosine similarity by hand, and gets close to real semantic similarity matching.
# This looks like the cosine similarity formula, with the vectors taken from spaCy's .vector attribute, which the documentation says was trained with GloVe.
 
# github: https://github.com/explosion/spaCy
# doc: https://spacy.io/usage/vectors-similarity
# install: pip install spacy
# install an English model: python -m spacy download en_core_web_sm (or en_core_web_md / en_core_web_lg), ref: https://spacy.io/models/en
# install a Chinese model: python -m spacy download zh_core_web_sm (or zh_core_web_md / zh_core_web_lg), ref: https://spacy.io/models/zh
 
import spacy
 
 
def check_word_property():
    nlp = spacy.load("en_core_web_sm")
    input_sentence = input('Please input a sentence: ')
    while input_sentence:
        doc = nlp(input_sentence)
        word_property = [{'word': w.text, 'property': w.pos_} for w in doc]
        noun_list = [wp['word'] for wp in word_property if wp['property'] == 'NOUN']
        verb_list = [wp['word'] for wp in word_property if wp['property'] == 'VERB']
        print('Input sentence: %s' % input_sentence)
        print('Noun list: %s' % ', '.join(noun_list))
        print('Verb list: %s\n' % ', '.join(verb_list))
        input_sentence = input('Please input a sentence: ')
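

# A related sketch (not part of the original script): spaCy also exposes doc.noun_chunks,
# which groups base noun phrases instead of single NOUN tokens, so "the tracking system"
# comes back as one chunk rather than scattered nouns. The example sentence is made up.
def check_noun_chunks():
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('I want to create a new ticket in the tracking system.')
    # noun_chunks yields Span objects; .text is the surface string of each chunk
    print('Noun chunks: %s' % ', '.join(chunk.text for chunk in doc.noun_chunks))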
 
 
def check_en_word_similarity():
    # To keep them small and fast, spaCy's small models (all packages ending in sm) ship without
    # word vectors and only include context-sensitive tensors. We can still call similarity() on them,
    # but the results will not be great, so to get real word vectors it is best to use en_core_web_lg.
    nlp = spacy.load('en_core_web_lg')
    tokens = nlp('dog cat banana ssafsf')  # 'ssafsf' triggers a warning that this word has no vector
 
    # Models with built-in word vectors make them available on their tokens, and a text's vector
    # defaults to the average of its token vectors. has_vector tells us whether a token has a vector
    # assigned, and vector_norm is the vector's L2 norm, which can be used to normalize it.
    print('Token attributes:\n')
    for token in tokens:
        print(token.text, token.has_vector, token.vector_norm, token.is_oov)
 
    print('\nPairwise token similarities:\n')
    for token1 in tokens:
        for token2 in tokens:
            print('%s and %s: %s' % (token1, token2, token1.similarity(token2)))
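

# A minimal sketch (assuming numpy is installed) checking the note at the top of this file:
# similarity() is plain cosine similarity over the .vector attributes, so computing it by hand
# should agree with spaCy's result up to floating-point precision.
def check_manual_cosine_similarity():
    import numpy as np  # local import so the rest of the script runs without numpy
    nlp = spacy.load('en_core_web_lg')
    tokens = nlp('dog cat')
    v1, v2 = tokens[0].vector, tokens[1].vector
    manual = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    print('spaCy similarity: %s' % tokens[0].similarity(tokens[1]))
    print('manual cosine   : %s' % manual)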
 
 
def check_en_doc_similarity():
    nlp = spacy.load('en_core_web_lg')
    doc1 = nlp('How to create ticket?')
    doc2 = nlp('How can I create ticket?')
    doc3 = nlp('I want to create ticket?')
    doc4 = nlp('Do you know how to create the ticket?')
    print(doc1.similarity(doc2))  # 0.9540794124760449
    print(doc2.similarity(doc3))  # 0.9751696134778753
    print(doc1.similarity(doc3))  # 0.9549075548119442
    print(doc1.similarity(doc4))  # 0.9547450107860063
 
    sentence3 = nlp('Do you know I love you?')
    sentence4 = nlp('I love you so much?')
    print(sentence3.similarity(sentence4))  # 0.9629862471369796
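

# A minimal sketch (again assuming numpy) checking the claim above that a doc's vector
# defaults to the average of its token vectors; it should print True for vector models.
def check_doc_vector_is_token_average():
    import numpy as np  # local import, only needed for this check
    nlp = spacy.load('en_core_web_lg')
    doc = nlp('How to create ticket?')
    token_average = np.mean([token.vector for token in doc], axis=0)
    # True if doc.vector really is the average of the token vectors
    print(np.allclose(doc.vector, token_average, atol=1e-5))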
 
 
def check_zh_doc_similarity():
    nlp = spacy.load('zh_core_web_lg')
    doc1 = nlp('你好吗?')
    doc2 = nlp('你还好吗?')
    doc3 = nlp('今天你还好吗?')
    doc4 = nlp('你的身体今天还好吗?')
    print(doc1.similarity(doc2))  # 0.7544851165307768
    print(doc2.similarity(doc3))  # 0.9664107589955437
    print(doc1.similarity(doc3))  # 0.730822854943996
    print(doc1.similarity(doc4))  # 0.6528684500574182
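

# An illustrative sketch of the "semantic similarity matching" use case mentioned at the
# top of this file; the function name, candidate list and 0.9 threshold are made up here.
def match_question(user_question, candidates, threshold=0.9):
    nlp = spacy.load('en_core_web_lg')
    user_doc = nlp(user_question)
    # Score every candidate against the user question and keep the best one
    best_candidate, best_score = max(
        ((candidate, user_doc.similarity(nlp(candidate))) for candidate in candidates),
        key=lambda pair: pair[1])
    # Only accept the match if it clears the (hand-picked) threshold
    return best_candidate if best_score >= threshold else None
    # e.g. match_question('How do I open a ticket?',
    #                     ['How to create ticket?', 'How to reset my password?'])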
 
 
if __name__ == '__main__':
    check_word_property()
 
    check_en_word_similarity()
 
    check_zh_doc_similarity()
    check_en_doc_similarity()
 
    print('end!')