This post is mainly based on https://github.com/zhedongzheng/finch. It differs from the original code in that `tf.estimator` is not used, and the data preprocessing has been partly modified (to use `tf.data.Dataset`).
```python
# -*- coding:utf-8 -*-
from collections import Counter
import tensorflow as tf
import numpy as np
import re

PARAMS = {
    'min_freq': 5,
    'window_size': 3,
    'n_sampled': 100,
    'embed_dim': 200,
}

def preprocess_text(text):
    # 1. Replace newlines with spaces
    text = text.replace('\n', ' ')
    # 2. Collapse all whitespace to single spaces and lowercase the text
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # 3. Split on spaces (a very simple tokenization)
    words = text.split()
    # 4. Count word frequencies
    word2freq = Counter(words)
    # 5. Drop low-frequency words
    words = [word for word in words if word2freq[word] > PARAMS['min_freq']]
    print("Total words:", len(words))
    # 6. Deduplicate to build the vocabulary
    _words = set(words)
    PARAMS['word2idx'] = {c: i for i, c in enumerate(_words)}
    PARAMS['idx2word'] = {i: c for i, c in enumerate(_words)}
    PARAMS['vocab_size'] = len(PARAMS['idx2word'])
    print('Vocabulary size:', PARAMS['vocab_size'])
    indexed = [PARAMS['word2idx'][w] for w in words]
    # 7. Subsample very frequent words
    indexed = filter_high_freq(indexed)
    print("Word preprocessing completed ...")
    return indexed

def filter_high_freq(int_words, t=1e-5, threshold=0.8):
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # 1. Compute each word's frequency: count / total
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # 2. Compute each word's drop probability: the more frequent a word, the more
    #    likely it is dropped. For example, 'the' appears very often but carries
    #    little information, so it should be subsampled.
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # 3. Drop words whose drop probability exceeds the threshold
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return train_words

def make_data(int_words):
    x, y = [], []
    for i in range(PARAMS['window_size'], len(int_words) - PARAMS['window_size']):
        # 1. Build the context of the center word
        inputs = get_x(int_words, i)
        # 2. Append the whole context of one word as a single example to x,
        #    e.g. x = [['a','b','d','e'], ['b','c','e','f'], ...]
        x.append(inputs)
        # 3. Append each label as a sub-list to y,
        #    e.g. y = [['c'], ['d'], ...]
        # 4. So each example is a (context, word) pair
        y.append([int_words[i]])
    return np.array(x), np.array(y)

def get_x(words, idx):
    left = idx - PARAMS['window_size']
    right = idx + PARAMS['window_size']
    return words[left: idx] + words[idx + 1: right + 1]

# 1. Preprocess the data
with open(r'E:\nlp_data\ptb_train.txt') as f:
    x_train, y_train = make_data(preprocess_text(f.read()))

# 2. Wrap the data in a Dataset
# What counts as one example here?
# One example is x = 6 context words, y = 1 center word, i.e. (x_train[i], y_train[i]),
# which follows from x.append(inputs) and y.append([int_words[i]]) in make_data.
dataset = tf.data.Dataset.from_tensor_slices(tensors=(x_train, y_train))
dataset = dataset.batch(batch_size=100).repeat(5)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

# 3. Build the CBOW model
# The shapes are (None, 6) and (None, 1),
# again because of x.append(inputs) and y.append([int_words[i]]) in make_data:
# with window_size = 3 the context size is 6.
# None becomes 100 at run time, because of dataset.batch(batch_size=100).
x = tf.placeholder(shape=(None, 6), dtype=tf.int32)
y_ = tf.placeholder(shape=(None, 1), dtype=tf.int32)

E = tf.get_variable(name="E", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']))
embedding = tf.nn.embedding_lookup(params=E, ids=x)
embedding = tf.reduce_mean(embedding, axis=[1])

W = tf.get_variable(name="w", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']), dtype=tf.float32)
b = tf.get_variable(name="b", shape=(PARAMS['vocab_size'],), dtype=tf.float32)

loss_op = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=W,
    biases=b,
    labels=y_,
    inputs=embedding,
    num_sampled=PARAMS['n_sampled'],
    num_classes=PARAMS['vocab_size']))

opt = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss=loss_op)
init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)
    try:
        while True:
            inputs, labels = session.run(next_data)
            session.run(fetches=opt, feed_dict={x: inputs, y_: labels})
    except tf.errors.OutOfRangeError:
        print("train complete")
```
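The script above trains the embedding matrix `E` but never reads it back out. As a minimal sketch (not part of the original code), the trained matrix could be fetched with `session.run(E)` before the session closes and then queried for nearest neighbours by cosine similarity; the helper `nearest_words` below is a hypothetical addition that reuses `PARAMS['word2idx']` / `PARAMS['idx2word']` from the script:

```python
# Hypothetical helper (assumption, not in the original post): query the trained
# embeddings for the k nearest words by cosine similarity.
import numpy as np

def nearest_words(embeddings, query, k=5):
    # Normalize rows so the dot product equals cosine similarity
    unit = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    q = unit[PARAMS['word2idx'][query]]
    sims = unit @ q
    # Sort by descending similarity and skip index 0 (the query word itself)
    best = np.argsort(-sims)[1:k + 1]
    return [PARAMS['idx2word'][int(i)] for i in best]

# Inside `with tf.Session() as session:` after training finishes, one could run:
#     trained_E = session.run(E)
#     print(nearest_words(trained_E, 'bank'))
```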