Word2Vec learns, from a text corpus, to represent the semantic information of words as word vectors: it learns an embedding space in which semantically similar words lie close to each other.
An embedding is simply a mapping that takes words from their original space into a new multi-dimensional space; in other words, the space the words originally live in is embedded into a new one.
The Word2Vec model really consists of two parts: the first builds the model, the second uses the trained model to obtain the word embeddings. The overall modeling process is very similar in spirit to an auto-encoder: we first train a neural network on the training data, but once training is done we do not use the network itself on new tasks. What we actually want are the parameters the model has learned from the training data, in particular the hidden-layer weight matrix; as we will see below, these weights are exactly the "word vectors" Word2Vec is trying to learn. Because building the model is not the final goal, this training step is often called the "Fake Task".
The same idea shows up in unsupervised feature learning, the most common example being the auto-encoder: the input is compressed into a code in the hidden layer and then decoded back to its original form at the output layer. After training, the output layer is "chopped off" and only the hidden layer is kept.
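To make the "hidden-layer weights are the word vectors" point concrete, here is a minimal sketch (not part of the implementations below; the matrix name and sizes are purely illustrative): multiplying a one-hot input by the hidden-layer weight matrix is the same as selecting one of its rows.

import numpy as np

V, N = 10000, 300                 # illustrative vocabulary size and embedding dimension
W_hidden = np.random.rand(V, N)   # stands in for the trained hidden-layer weight matrix

def word_vector(word_index):
    # A one-hot vector of length V times W_hidden just picks out row `word_index`,
    # so the learned word vector is a plain row lookup.
    return W_hidden[word_index]

print(word_vector(42).shape)      # (300,)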
Reference: https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html
A pure-Python (NumPy) implementation:
import math
import sys
import numpy as np
class Ngram:
    def __init__(self, tokens):
        self.tokens = tokens
        self.count = 0
        self.score = 0.0

    def set_score(self, score):
        self.score = score

    def get_string(self):
        return '_'.join(self.tokens)
class Corpus:  # corpus reader and phrase builder
    def __init__(self, filename, word_phrase_passes, word_phrase_delta, word_phrase_threshold, word_phrase_filename):
        i = 0
        file_pointer = open(filename, 'r')

        all_tokens = []
        for line in file_pointer:
            line_tokens = line.split()
            for token in line_tokens:
                token = token.lower()  # convert to lowercase
                if len(token) > 1 and token.isalnum():  # keep tokens made up of letters and digits only
                    all_tokens.append(token)
                i += 1
                if i % 10000 == 0:
                    sys.stdout.flush()
                    sys.stdout.write("\rReading corpus: %d" % i)
                    sys.stdout.flush()
        print("\rCorpus read: %d" % i)
        file_pointer.close()

        self.tokens = all_tokens

        for x in range(1, word_phrase_passes + 1):
            self.build_ngrams(x, word_phrase_delta, word_phrase_threshold, word_phrase_filename)

        self.save_to_file(filename)

    def build_ngrams(self, x, word_phrase_delta, word_phrase_threshold, word_phrase_filename):
        ngrams = []
        ngram_map = {}

        token_count_map = {}
        for token in self.tokens:
            if token not in token_count_map:
                token_count_map[token] = 1
            else:
                token_count_map[token] += 1

        i = 0
        ngram_l = []
        for token in self.tokens:
            if len(ngram_l) == 2:
                ngram_l.pop(0)
            ngram_l.append(token)
            ngram_t = tuple(ngram_l)
            if ngram_t not in ngram_map:
                ngram_map[ngram_t] = len(ngrams)
                ngrams.append(Ngram(ngram_t))
            ngrams[ngram_map[ngram_t]].count += 1
            i += 1
            if i % 10000 == 0:
                sys.stdout.flush()
                sys.stdout.write("\rBuilding n-grams (%d pass): %d" % (x, i))
                sys.stdout.flush()
        print("\rn-grams (%d pass) built: %d" % (x, i))

        filtered_ngrams_map = {}
        file_pointer = open(word_phrase_filename + ('-%d' % x), 'w')

        # Scoring formula from:
        # http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf
        i = 0
        for ngram in ngrams:
            product = 1
            for word_string in ngram.tokens:
                product *= token_count_map[word_string]
            ngram.set_score((float(ngram.count) - word_phrase_delta) / float(product))
            if ngram.score > word_phrase_threshold:
                filtered_ngrams_map[ngram.get_string()] = ngram
                file_pointer.write('%s %d\n' % (ngram.get_string(), ngram.count))
            i += 1
            if i % 10000 == 0:
                sys.stdout.flush()
                sys.stdout.write("\rScoring n-grams: %d" % i)
                sys.stdout.flush()
        print("\rScored n-grams: %d, filtered n-grams: %d" % (i, len(filtered_ngrams_map)))
        file_pointer.close()

        # Combining the tokens: accepted bigrams become single phrase tokens
        all_tokens = []
        i = 0
        while i < len(self.tokens):
            if i + 1 < len(self.tokens):
                ngram_l = []
                ngram_l.append(self.tokens[i])
                ngram_l.append(self.tokens[i + 1])
                ngram_string = '_'.join(ngram_l)
                if len(ngram_l) == 2 and (ngram_string in filtered_ngrams_map):
                    ngram = filtered_ngrams_map[ngram_string]
                    all_tokens.append(ngram.get_string())
                    i += 2
                else:
                    all_tokens.append(self.tokens[i])
                    i += 1
            else:
                all_tokens.append(self.tokens[i])
                i += 1
        print("Tokens combined")
        self.tokens = all_tokens

    def save_to_file(self, filename):
        i = 1
        filepointer = open('preprocessed-' + filename, 'w')
        line = ''
        for token in self.tokens:
            if i % 20 == 0:
                line += token
                filepointer.write('%s\n' % line)
                line = ''
            else:
                line += token + ' '
            i += 1
            if i % 10000 == 0:
                sys.stdout.flush()
                sys.stdout.write("\rWriting to preprocessed input file")
                sys.stdout.flush()
        print("\rPreprocessed input file written")
        filepointer.close()

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    def __iter__(self):
        return iter(self.tokens)
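The score computed in build_ngrams is the phrase-detection criterion from the Mikolov et al. paper linked in the code. Written out, for a candidate bigram $(w_i, w_j)$:

$$\mathrm{score}(w_i, w_j) = \frac{\mathrm{count}(w_i w_j) - \delta}{\mathrm{count}(w_i)\,\mathrm{count}(w_j)}$$

Here $\delta$ corresponds to word_phrase_delta; bigrams whose score exceeds word_phrase_threshold are merged into a single token such as new_york on the next pass.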
class Word:
    def __init__(self, word):
        self.word = word
        self.count = 0
class Vocabulary:
    def __init__(self, corpus, min_count):
        self.words = []
        self.word_map = {}
        self.build_words(corpus, min_count)
        self.filter_for_rare_and_common(min_count)

    def build_words(self, corpus, min_count):
        words = []
        word_map = {}

        i = 0
        for token in corpus:
            if token not in word_map:
                word_map[token] = len(words)
                words.append(Word(token))
            words[word_map[token]].count += 1
            i += 1
            if i % 10000 == 0:
                sys.stdout.flush()
                sys.stdout.write("\rBuilding vocabulary: %d" % len(words))
                sys.stdout.flush()
        print("\rVocabulary built: %d" % len(words))

        self.words = words
        self.word_map = word_map  # Mapping from each token to its index in the vocabulary

    def __getitem__(self, i):
        return self.words[i]

    def __len__(self):
        return len(self.words)

    def __iter__(self):
        return iter(self.words)

    def __contains__(self, key):
        return key in self.word_map

    def indices(self, tokens):
        return [self.word_map[token] if token in self else self.word_map['{rare}'] for token in tokens]

    def filter_for_rare_and_common(self, min_count):
        # Replace rare words with a shared {rare} token and sort by frequency
        tmp = []
        tmp.append(Word('{rare}'))
        unk_hash = 0

        count_unk = 0
        for token in self.words:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)

        tmp.sort(key=lambda token: token.count, reverse=True)

        # Update word_map
        word_map = {}
        for i, token in enumerate(tmp):
            word_map[token.word] = i

        self.words = tmp
        self.word_map = word_map
class TableForNegativeSamples:
    def __init__(self, vocab):
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab])  # Normalizing constant

        table_size = int(1e6)
        table = np.zeros(table_size, dtype=np.uint32)

        p = 0  # Cumulative probability
        i = 0
        for j, word in enumerate(vocab):
            p += float(math.pow(word.count, power)) / norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]
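The table implements the standard negative-sampling distribution: each word is drawn with probability proportional to its count raised to the 3/4 power,

$$P(w_i) = \frac{c(w_i)^{0.75}}{\sum_j c(w_j)^{0.75}}$$

which is what the power = 0.75 exponent and the cumulative fill of the table realize. Sampling a uniform index into the table then samples a word index from this distribution.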
def sigmoid(z):
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))
def save(vocab, nn0, filename):
    file_pointer = open(filename, 'w')
    for token, vector in zip(vocab, nn0):
        word = token.word.replace(' ', '_')
        vector_str = ' '.join([str(s) for s in vector])
        file_pointer.write('%s %s\n' % (word, vector_str))
    file_pointer.close()
if __name__ == '__main__':
    for input_filename in ['in.txt']:  # for input_filename in ['news-2012-phrases-10000.txt']:
        # Number of negative examples
        k_negative_sampling = 5

        # Min count for words to be used in the model, otherwise mapped to {rare}
        min_count = 3

        # Number of word-phrase passes
        word_phrase_passes = 3  # 3

        # Minimum count (delta) in the word-phrase formula
        word_phrase_delta = 3  # 5

        # Threshold for word-phrase creation
        word_phrase_threshold = 1e-4

        # Read the corpus
        corpus = Corpus(input_filename, word_phrase_passes, word_phrase_delta, word_phrase_threshold, 'phrases-%s' % input_filename)

        # Build the vocabulary from the corpus
        vocab = Vocabulary(corpus, min_count)
        table = TableForNegativeSamples(vocab)

        # Max window length
        for window in [5]:  # 5 for large set
            # Dimensionality of word embeddings
            for dim in [100]:  # 100
                print("Training: %s-%d-%d-%d" % (input_filename, window, dim, word_phrase_passes))

                # Initialize network
                nn0 = np.random.uniform(low=-0.5 / dim, high=0.5 / dim, size=(len(vocab), dim))
                nn1 = np.zeros(shape=(len(vocab), dim))

                # Initial learning rate
                initial_alpha = 0.01  # 0.01

                # Modified in loop
                global_word_count = 0
                alpha = initial_alpha
                word_count = 0
                last_word_count = 0

                tokens = vocab.indices(corpus)

                for token_idx, token in enumerate(tokens):
                    if word_count % 10000 == 0:
                        global_word_count += (word_count - last_word_count)
                        last_word_count = word_count

                        # Recalculate alpha
                        # alpha = initial_alpha * (1 - float(global_word_count) / len(corpus))
                        # if alpha < initial_alpha * 0.0001:
                        #     alpha = initial_alpha * 0.0001

                        sys.stdout.flush()
                        sys.stdout.write("\rTraining: %d of %d" % (global_word_count, len(corpus)))

                    # Randomize window size, where `window` is the max window size
                    current_window = np.random.randint(low=1, high=window + 1)
                    context_start = max(token_idx - current_window, 0)
                    context_end = min(token_idx + current_window + 1, len(tokens))
                    context = tokens[context_start:token_idx] + tokens[token_idx + 1:context_end]

                    # Turn into an iterator?
                    for context_word in context:
                        # Init neu1e with zeros
                        neu1e = np.zeros(dim)
                        classifiers = [(token, 1)] + [(target, 0) for target in table.sample(k_negative_sampling)]
                        for target, label in classifiers:
                            z = np.dot(nn0[context_word], nn1[target])
                            p = sigmoid(z)
                            g = alpha * (label - p)
                            neu1e += g * nn1[target]              # Error to backpropagate to nn0
                            nn1[target] += g * nn0[context_word]  # Update nn1
                        # Update nn0
                        nn0[context_word] += neu1e

                    word_count += 1

                global_word_count += (word_count - last_word_count)
                sys.stdout.flush()
                print("\rTraining finished: %d" % global_word_count)

                # Save model to file
                save(vocab, nn0, 'output-%s-%d-%d-%d' % (input_filename, window, dim, word_phrase_passes))
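The inner loop above is plain SGD on the skip-gram negative-sampling objective. For each (context, target) pair, with label $\ell = 1$ for the true centre word and $\ell = 0$ for each of the $k$ negative samples drawn from the table, the update is

$$g = \alpha\,\bigl(\ell - \sigma(\mathbf{v}_{c}^{\top}\mathbf{v}'_{t})\bigr), \qquad \mathbf{v}'_{t} \leftarrow \mathbf{v}'_{t} + g\,\mathbf{v}_{c}, \qquad \mathbf{v}_{c} \leftarrow \mathbf{v}_{c} + \sum_{t} g\,\mathbf{v}'_{t}$$

where $\mathbf{v}_c$ is the input vector of the context word (a row of nn0) and $\mathbf{v}'_t$ the output vector of the target (a row of nn1); neu1e accumulates the last sum before nn0 is updated.

The second implementation below uses TensorFlow (1.x) to train a skip-gram model with sampled softmax on the text8 corpus: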
import time
import numpy as np
import tensorflow as tf
import random
from collections import Counter
with open('text8') as f:
    text = f.read()
def preprocess(text, freq=5):
    '''
    Preprocess the raw text.

    Parameters
    ---
    text: the text data
    freq: word-frequency threshold
    '''
    # Replace punctuation with special tokens
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()

    # Remove low-frequency words to reduce noise
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]

    return trimmed_words
words = preprocess(text)
print(words[:20])
vocab = set(words)
vocab_to_int = {w: c for c, w in enumerate(vocab)}
int_to_vocab = {c: w for c, w in enumerate(vocab)}
print("total words: {}".format(len(words)))
print("unique words: {}".format(len(set(words))))
int_words = [vocab_to_int[w] for w in words]
t = 1e-5  # subsampling threshold t
threshold = 0.8  # drop-probability threshold
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_freqs = {w: c/total_count for w, c in int_word_counts.items()}
prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
train_words = [w for w in int_words if prob_drop[w] < threshold]
print(len(train_words))
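The step above follows the word2vec frequent-word subsampling heuristic: each word $w$ is assigned a drop probability

$$P_{\mathrm{drop}}(w) = 1 - \sqrt{\frac{t}{f(w)}}$$

where $f(w)$ is the word's relative frequency in the corpus and $t = 10^{-5}$. In this code a word is kept only if its drop probability is below the threshold of 0.8, so all occurrences of very frequent words are removed deterministically rather than sampled per occurrence as in the original paper.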
def get_targets(words, idx, window_size=5):
    '''
    Return the list of context words around an input word.

    Parameters
    ---
    words: list of words
    idx: index of the input word
    window_size: maximum window size
    '''
    target_window = np.random.randint(1, window_size + 1)
    # Handle the case where there are not enough words before the input word
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    # The output words (i.e. the context words within the window)
    targets = set(words[start_point: idx] + words[idx + 1: end_point + 1])
    return list(targets)
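A quick illustrative call (the word list below is made up; the exact output varies because the window size is sampled randomly and a set does not preserve order):

sample = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
print(get_targets(sample, idx=4, window_size=3))
# e.g. ['fox', 'over'] when the sampled window size is 1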
def get_batches(words, batch_size, window_size=5):
    '''
    Generator that yields batches of (input, target) pairs.
    '''
    n_batches = len(words) // batch_size

    # Keep only full batches
    words = words[:n_batches * batch_size]

    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx: idx + batch_size]
        for i in range(len(batch)):
            batch_x = batch[i]
            batch_y = get_targets(batch, i, window_size)
            # One input word maps to several output words, so repeat the input
            # to keep x and y the same length
            x.extend([batch_x] * len(batch_y))
            y.extend(batch_y)
        yield x, y
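As a sanity check (illustrative only, to be run after train_words has been built), the generator can be inspected like this; each input id in x is repeated once per context id in y, so the two lists always have equal length:

batch_gen = get_batches(train_words, batch_size=1000, window_size=10)
x, y = next(batch_gen)
print(len(x), len(y))   # equal lengths
print(x[:5], y[:5])     # parallel lists of word ids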
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

# Embedding
# The embedding matrix has shape vocab_size * hidden_units_size
vocab_size = len(int_to_vocab)
embedding_size = 200  # embedding dimensionality
with train_graph.as_default():
    # Embedding-layer weight matrix
    embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))  # tf.random_uniform draws from a uniform distribution
    # Look up the embeddings of the input word ids
    embed = tf.nn.embedding_lookup(embedding, inputs)
    # tf.nn.embedding_lookup(tensor, id) selects the rows of `tensor` given by the indices `id`.

# Negative sampling: it addresses the slow gradient computation of the full softmax.
# TensorFlow's tf.nn.sampled_softmax_loss samples at the softmax layer when computing the loss,
# so the resulting value is an approximation of (and typically lower than) the full softmax loss.
n_sampled = 100
with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(vocab_size))

    # Loss under negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, vocab_size)
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)
with train_graph.as_default():
    # Pick some words at random for validation
    valid_size = 16
    valid_window = 100
    # Pick 8 words from each of two frequency ranges
    valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(valid_examples,
                               random.sample(range(1000, 1000 + valid_window), valid_size // 2))

    valid_size = len(valid_examples)
    # Validation word set
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Compute the norm of each word vector and normalize to unit length
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    # Look up the vectors of the validation words
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    # Cosine similarity against the whole vocabulary
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

epochs = 10        # number of training epochs
batch_size = 1000  # batch size
window_size = 10   # window size
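The similarity op computes cosine similarity,

$$\mathrm{sim}(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a}\cdot\mathbf{b}}{\lVert\mathbf{a}\rVert\,\lVert\mathbf{b}\rVert}$$

and because every row of normalized_embedding already has unit length, the matrix product of the validation vectors with the transposed embedding matrix directly gives, for each validation word, its cosine similarity to every word in the vocabulary.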
with train_graph.as_default():
    saver = tf.train.Saver()  # for saving the model checkpoint
with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs + 1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()

        for x, y in batches:
            feed = {inputs: x, labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            loss += train_loss

            if iteration % 100 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss / 100),
                      "{:.4f} sec/batch".format((end - start) / 100))
                loss = 0
                start = time.time()

            # Report the most similar words
            if iteration % 1000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    top_k = 8  # number of nearest words to show
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to [%s]:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)

            iteration += 1

    # Note: the checkpoints/ directory needs to exist before saving
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
viz_words = 500
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
fig, ax = plt.subplots(figsize=(14, 14))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)