Medical Word Vector Training --- Word Vector Training and Visualization
Word vector training

Python code
import jieba
import os
import json
import codecs
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random
# Use the SimHei font so matplotlib can render the Chinese word labels in the plots.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
class medicalWord2vec:
    """
    Medical word vectors.
    """
    def __init__(self):
        self.stopwords_path = '../../词向量/word/stop.txt'  # stopword list path
        self.new_cut_words_path = "../../词向量/word/jieba.txt"  # enriched segmentation dictionary path
        self.origin_cut_words_path = "../../词向量/word/my_words.txt"  # original segmentation dictionary path
        self.data_path = '../../词向量/data/content/'  # directory of corpus JSON files
        self.corpus_path = "../../词向量/word/content.txt"  # segmented corpus to be used for training
        self.word_vector_path = "../../词向量/word/word2vec.vector"  # word vector output path
        self.word_vector_dimension = 400  # word vector dimensionality
        self.test_words = ['新冠', '全身疼痛', '感冒', '病毒感染', '肚子疼', '咳嗽', "头疼", "头痛"]  # words for testing the vectors
    def seg_sentence(self, sentence):
        """
        Segment a sentence with jieba and drop stopwords.
        :param sentence: the sentence to segment
        :return: list of tokens after stopword removal
        """
        sentence_seg = jieba.cut(sentence.strip())
        with open(self.stopwords_path, 'r', encoding='utf-8') as f:
            stopwords = set(line.strip() for line in f)
        return [word for word in sentence_seg if word not in stopwords and word != "\t"]
    def get_cut_word(self):
        """
        Enrich the segmentation dictionary from the 39net Q&A data.
        """
        print("------ Updating the segmentation dictionary ------")
        f_data = [line.replace("\n", "") for line in open(self.origin_cut_words_path, encoding="utf8")]
        f = open(self.new_cut_words_path, "w", encoding="utf8")
        result = []
        for file_one in os.listdir(self.data_path):
            for file_two in os.listdir(self.data_path + file_one + "/"):
                with open(self.data_path + file_one + "/" + file_two, encoding='utf8') as data_json:
                    data_json = json.load(data_json)
                result.extend([element['label'] for element in data_json['key_word'] if len(element['label']) > 1])
            print(file_one + " ----------- done")
        # Merge the extracted keywords with the original dictionary and deduplicate.
        result = list(set(result + f_data))
        for element in result:
            f.write(element + "\n")
        f.close()
        print("------ Segmentation dictionary updated ------")
    def get_corpus(self):
        """
        Segment the corpus for training.
        """
        print("------ Segmenting the corpus ------")
        target = codecs.open(self.corpus_path, 'w', encoding="utf8")
        jieba.load_userdict(self.new_cut_words_path)
        for file_one in os.listdir(self.data_path):
            for file_two in os.listdir(self.data_path + file_one + "/"):
                with open(self.data_path + file_one + "/" + file_two, encoding='utf8') as data_json:
                    content = json.load(data_json)['content']
                line_seg = " ".join(self.seg_sentence(content))
                # Write one document per line so LineSentence can iterate over it.
                target.write(line_seg + "\n")
            print(file_one + " ----------- done")
        target.close()
        print("------ Corpus segmentation finished ------")
    def train_word2vec(self):
        """
        Train word vectors with word2vec.
        """
        print("------ Training word vectors ------")
        # gensim 3.x API; in gensim 4.x the `size` argument is named `vector_size`.
        model = Word2Vec(LineSentence(self.corpus_path), size=self.word_vector_dimension, window=5, min_count=5,
                         workers=multiprocessing.cpu_count())
        model.wv.save_word2vec_format(self.word_vector_path, binary=True)
        print("------ Word vector training finished ------")
    def test_word2vec(self):
        """
        Test the trained word vectors.
        """
        word2vec_model = KeyedVectors.load_word2vec_format(self.word_vector_path, binary=True)
        for word in self.test_words:
            res = word2vec_model.most_similar(word)
            print("Query word: -----", word)
            print("Similar words: -----", res)
    def visualization(self):
        """
        Visualize the word vectors with t-SNE.
        """
        word2vec_model = KeyedVectors.load_word2vec_format(self.word_vector_path, binary=True)
        words = list(word2vec_model.vocab)  # gensim 3.x; in gensim 4.x use word2vec_model.index_to_key
        random.shuffle(words)
        print("Vocabulary size: ----------", len(words))
        vector = word2vec_model[words]
        random_word2vec = TSNE(n_components=2, init='pca', verbose=1).fit_transform(vector)
        plt.figure(figsize=(14, 10))
        # Words were shuffled above, so the first 300 points are a random sample;
        # plotting only 300 keeps the figure readable.
        plt.scatter(random_word2vec[:300, 0], random_word2vec[:300, 1])
        for i in range(300):
            x = random_word2vec[i][0]
            y = random_word2vec[i][1]
            plt.text(x, y, words[i])
        plt.title('Medical word vector visualization', size=16)
        plt.savefig('../../词向量/word/TSNE.jpg', dpi=200)
        plt.show()
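The methods are meant to be called in order, since each step consumes the previous step's output. A minimal driver sketch, assuming the directory layout referenced in `__init__` already exists:

if __name__ == "__main__":
    mw = medicalWord2vec()
    mw.get_cut_word()    # 1. enrich the jieba user dictionary from the Q&A keywords
    mw.get_corpus()      # 2. segment the raw corpus, one document per line
    mw.train_word2vec()  # 3. train and save the 400-dimensional vectors
    mw.test_word2vec()   # 4. print nearest neighbours of the test words
    mw.visualization()   # 5. t-SNE plot of a 300-word random sample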
Results

Training results
(figure)

Word vector visualization
(figure)
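Beyond eyeballing the `most_similar` output, a quick quantitative check is to score a known near-synonym pair directly. A small sketch, assuming the binary vectors were saved at the path used in the class (the exact score depends on the corpus):

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("../../词向量/word/word2vec.vector", binary=True)
# "头疼" and "头痛" both mean "headache"; a well-trained model
# should give this pair a high cosine similarity.
print(kv.similarity("头疼", "头痛"))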