使用 spacy 检测文本相似度 - NLP - Python 语言学习 - 学习 | Jalen's Blog = Follow your heart = A happy life

	# 这个比我们自己通过余弦相似度计算更精确一些，可以基本达到语义相似匹配
	# This appears to be the cosine-similarity formula; the vectors seem to come from spaCy's `.vector` attribute, which, according to the documentation, is trained from GloVe's w2v model.

	# github: https://github.com/explosion/spaCy
	# doc： https://spacy.io/usage/vectors-similarity
	# install: pip install spacy
	# install english model: python -m spacy download en_core_web_sm (or en_core_web_md / en_core_web_lg), ref: https://spacy.io/models/en
	# install chinese model: python -m spacy download zh_core_web_sm (or zh_core_web_md / zh_core_web_lg), ref: https://spacy.io/models/zh

	import spacy


	def check_word_property():
	    """Interactively list the nouns and verbs of sentences typed by the user.

	    Loads the ``en_core_web_sm`` pipeline once, then keeps prompting on
	    stdin. For every non-empty sentence it prints the sentence, followed by
	    the comma-separated tokens whose part-of-speech tag (``pos_``) is
	    ``NOUN`` and those whose tag is ``VERB``. Entering an empty line ends
	    the loop.
	    """
	    pipeline = spacy.load("en_core_web_sm")
	    prompt = 'Please input a sentence: '

	    # iter(callable, sentinel) keeps calling input() until it returns ''.
	    for sentence in iter(lambda: input(prompt), ''):
	        tagged = [(tok.text, tok.pos_) for tok in pipeline(sentence)]
	        nouns = [text for text, pos in tagged if pos == 'NOUN']
	        verbs = [text for text, pos in tagged if pos == 'VERB']
	        print('Input sentence: %s' % sentence)
	        print('Noun list: %s' % ', '.join(nouns))
	        print('Verb list: %s\n' % ', '.join(verbs))


	def check_en_word_similarity():
	    """Print vector attributes and pairwise similarity for a few English words.

	    Runs the text ``'dog cat banana ssafsf'`` through ``en_core_web_lg`` and
	    prints, for every token, its text, ``has_vector``, ``vector_norm`` and
	    ``is_oov``. It then prints ``similarity`` for every ordered pair of
	    tokens, including each token paired with itself.
	    """
	    # Original author's note: to keep comparison simple and fast, spaCy's
	    # small models (packages ending in "sm") do not use word vectors and
	    # only carry context-sensitive vectors, so similarity() works but the
	    # results are not ideal. To get real word vectors, use en_core_web_lg.
	    pipeline = spacy.load('en_core_web_lg')
	    # 'ssafsf' is a made-up word; expect a warning that it has no vector.
	    words = pipeline('dog cat banana ssafsf')

	    # Original author's note: models with built-in word vectors expose them
	    # on tokens, and a text's vector defaults to the average of its token
	    # vectors. has_vector tells whether a token has a vector assigned, and
	    # vector_norm is the L2 norm, which can be used to normalize vectors.
	    print('Token属性：\n')
	    for word in words:
	        row = (word.text, word.has_vector, word.vector_norm, word.is_oov)
	        print(*row)

	    print('\nToken两两之间关系：\n')
	    pairs = ((left, right) for left in words for right in words)
	    for left, right in pairs:
	        print('%s and %s: %s' % (left, right, left.similarity(right)))


	def check_en_doc_similarity():
	    """Print similarity scores between short English sentences.

	    Loads ``en_core_web_lg``, parses four phrasings of a "create ticket"
	    question and prints four pairwise ``similarity`` scores, then parses
	    two "love you" sentences and prints their score. The numbers in the
	    trailing comments are the outputs recorded by the original author.
	    """
	    pipeline = spacy.load('en_core_web_lg')

	    questions = (
	        'How to create ticket?',
	        'How can I create ticket?',
	        'I want to create ticket?',
	        'Do you know how to create the ticket?',
	    )
	    docs = [pipeline(text) for text in questions]

	    # Each entry is a (left, right) pair of indexes into docs.
	    comparisons = (
	        (0, 1),  # 0.9540794124760449
	        (1, 2),  # 0.9751696134778753
	        (0, 2),  # 0.9549075548119442
	        (0, 3),  # 0.9547450107860063
	    )
	    for left, right in comparisons:
	        print(docs[left].similarity(docs[right]))

	    first = pipeline('Do you know I love you?')
	    second = pipeline('I love you so much?')
	    print(first.similarity(second))  # 0.9629862471369796


	def check_zh_doc_similarity():
	    """Print similarity scores between short Chinese greetings.

	    Loads ``zh_core_web_lg``, parses four variations of a "how are you"
	    question and prints four pairwise ``similarity`` scores. The numbers in
	    the trailing comments are the outputs recorded by the original author.
	    """
	    pipeline = spacy.load('zh_core_web_lg')

	    greetings = (
	        '你好吗?',
	        '你还好吗?',
	        '今天你还好吗?',
	        '你的身体今天还好吗?',
	    )
	    docs = [pipeline(text) for text in greetings]

	    # Each entry is a (left, right) pair of indexes into docs.
	    comparisons = (
	        (0, 1),  # 0.7544851165307768
	        (1, 2),  # 0.9664107589955437
	        (0, 2),  # 0.730822854943996
	        (0, 3),  # 0.6528684500574182
	    )
	    for left, right in comparisons:
	        print(docs[left].similarity(docs[right]))


	if __name__ == '__main__':
	    # Run every demo in order. The first one is interactive and only
	    # returns once an empty line is entered at its prompt.
	    demos = (
	        check_word_property,
	        check_en_word_similarity,
	        check_zh_doc_similarity,
	        check_en_doc_similarity,
	    )
	    for demo in demos:
	        demo()

	    print('end!')

使用余弦相似度实现文本相似度检测

使用Universal Sentence Encoder检测文本相似度