In the previous post we gave a rough introduction to intent recognition; abstracted away, it is really a classification problem. Architecturally, we use an LSTM to extract features and a softmax layer for the final multi-class decision. Because of corpus limitations we currently handle only three intent classes: radio station, music, and question answering. Supporting more intents simply means adding corpora for the extra classes and changing the number of softmax outputs. The goal is to reach at least 90% classification accuracy on these three classes.
We will use Keras (strictly speaking, only a high-level interface) to implement this intent recognition work.
Figure 1: Architecture of the intent classification training pipeline
Our overall pipeline is shown in the figure. First we preprocess the corpus, which includes removing punctuation, removing stopwords, and so on. Once the corpus is prepared, we use word2vec to generate word vectors; the LSTM then extracts features from those vectors; finally, a softmax layer performs the intent classification. The overall flow is quite straightforward.
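The punctuation and stopword cleanup mentioned above is not shown anywhere in the code later in this post, so here is a minimal sketch of what it could look like. Note the caveats: it is written for Python 3 (unlike the Python 2 listings below), and both the regular expression and the stopword file stopwords.txt are our own assumptions, not part of the project.

# -*- coding: utf-8 -*-
# Minimal preprocessing sketch: strip punctuation, segment with jieba,
# drop stopwords. 'stopwords.txt' (one word per line) is a hypothetical file.
import re
import jieba

def load_stopwords(path='stopwords.txt'):
    with open(path, encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())

def preprocess(line, stopwords):
    # Keep only CJK characters, letters and digits; everything else becomes a space
    line = re.sub(r'[^\w\u4e00-\u9fff]+', ' ', line)
    # Segment with jieba and filter out stopwords
    words = [w for w in jieba.cut(line) if w.strip() and w not in stopwords]
    return ' '.join(words)

if __name__ == '__main__':
    stopwords = load_stopwords()
    print(preprocess(u'天为什么这么蓝?', stopwords))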
Our data consists of three files: question.txt, music.txt and station.txt. The format is shown below; organize your own training data the same way, and adding more intent classes works in exactly the same manner.
music.txt
我想听千千阙歌 (play "Qian Qian Que Ge" for me)
汪峰的歌曲 (songs by Wang Feng)
question.txt
天为什么这么蓝 (why is the sky so blue)
中国有多大 (how big is China)
station.txt
我要听郭德纲的相声 (play Guo Degang's crosstalk for me)
交通广播电台 (traffic radio station)
Our corpus preparation here is still very rough: we simply extract the three corpora in a 1:1:1 ratio for training. There is a question worth thinking about: why do we try to keep the different classes at roughly a 1:1:1 ratio during training?
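For reference, one minimal way to enforce the 1:1:1 ratio is to truncate every class to the size of the smallest one. A sketch only (Python 3; the file names are the ones loadfile() uses later, and the seed is arbitrary):

# -*- coding: utf-8 -*-
# Sketch: sample the three corpora down to a 1:1:1 ratio before training.
import random

def balanced_sample(paths, seed=1337):
    corpora = []
    for path in paths:
        with open(path, encoding='utf-8') as f:
            corpora.append([line.strip() for line in f if line.strip()])
    n = min(len(c) for c in corpora)            # size of the smallest class
    random.seed(seed)
    return [random.sample(c, n) for c in corpora]

if __name__ == '__main__':
    question, music, station = balanced_sample(
        ['data/question_query.txt', 'data/music_query.txt', 'data/station_query.txt'])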
Generating word vectors
Generating word vectors converts the corpus from text into numbers that the rest of the pipeline can work with. We train the vectors directly with word2vec; we will not go into the theory of word2vec here. All fifteen thousand sentences are fed into this training step.
# -*- coding: UTF-8 -*-
import os
from gensim.models.word2vec import Word2Vec


class Embedding(object):
    """Memory-friendly iterator: yield one tokenised sentence per corpus line."""

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()


if __name__ == '__main__':
    # Train the word2vec model over all corpus files
    sentences = Embedding('../data/')  # a memory-friendly iterator
    # Parameters match the training script below (100-dim vectors, window 7,
    # minimum frequency 10); the model is saved to the path the LSTM script loads.
    model = Word2Vec(sentences, size=100, window=7, min_count=10, workers=4)
    model.save('lstm_data/model/Word2vec_model.pkl')
Figure 2: Multi-layer LSTM feature extraction with a softmax 3-way classifier on top
# -*- coding: utf-8 -*-

import yaml
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from sklearn.cross_validation import train_test_split
import multiprocessing
import numpy as np
from keras.utils import np_utils
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.models import model_from_yaml
from sklearn.preprocessing import LabelEncoder
np.random.seed(1337)  # For Reproducibility
import jieba
import pandas as pd
sys.setrecursionlimit(1000000)

# set parameters:
vocab_dim = 100
maxlen = 100
n_iterations = 1  # ideally more..
n_exposures = 10
window_size = 7
batch_size = 32
n_epoch = 15
input_length = 100
cpu_count = multiprocessing.cpu_count()


# Load the training files
def loadfile():
    fopen = open('data/question_query.txt', 'r')
    question = []
    for line in fopen:
        question.append(line)

    fopen = open('data/music_query.txt', 'r')
    music = []
    for line in fopen:
        music.append(line)

    fopen = open('data/station_query.txt', 'r')
    station = []
    for line in fopen:
        station.append(line)

    combined = np.concatenate((question, station, music))
    question_array = np.array([-1]*len(question), dtype=int)
    station_array = np.array([0]*len(station), dtype=int)
    music_array = np.array([1]*len(music), dtype=int)
    y = np.hstack((question_array, station_array, music_array))
    print "y is:"
    print y.size
    print "combined is:"
    print combined.size
    return combined, y
# Tokenize each sentence with jieba and strip newlines
def tokenizer(document):
    ''' Simple Parser converting each document to lower-case, then
        removing the breaks for new lines and finally splitting on the
        whitespace
    '''
    result_list = []
    for text in document:
        result_list.append(' '.join(jieba.cut(text)).encode('utf-8').strip())
    return result_list
# Build the vocabulary dictionary; return the index and vector of every word,
# plus the word-index sequence of every sentence
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
        4- Returns the padded word-index sequence of every sentence
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index of every word that appears at least 10 times (word2vec min_count)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        # Vector of every such word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        word = unicode(word, errors='ignore')
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Words below the frequency threshold get index 0; pad every sentence to maxlen
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'
# Load the trained word2vec model and index the corpus with it
def word2vec_train(combined):
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined
def get_data(index_dict, word_vectors, combined, y):
    # Number of word indices; low-frequency words share index 0, hence the +1
    n_symbols = len(index_dict) + 1
    # Row 0 (the index-0 word) keeps an all-zero vector
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():  # map every indexed word (index >= 1) to its vector
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    # encode class values as integers
    encoder = LabelEncoder()
    encoded_y_train = encoder.fit_transform(y_train)
    encoded_y_test = encoder.transform(y_test)
    # convert integers to dummy variables (one-hot encoding)
    y_train = np_utils.to_categorical(encoded_y_train)
    y_test = np_utils.to_categorical(encoded_y_test)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test
## Define the network structure
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    nb_classes = 3
    print 'Defining a Simple Keras Model...'
    ## Basic network structure
    model = Sequential()  # or Graph or whatever
    ## The Embedding layer turns the variable-length index sequences into
    ## fixed-length dense vectors for the LSTM
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    ## Single LSTM layer: 50-dimensional output, vocab_dim-dimensional input, relu activation
    model.add(LSTM(output_dim=50, activation='relu', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    ## Softmax layer on top for the final 3-way classification
    model.add(Dense(output_dim=nb_classes, input_dim=50, activation='softmax'))
    print 'Compiling the Model...'
    ## Optimizer: adam
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print "Train..."
    print y_train
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,
              verbose=1, validation_data=(x_test, y_test))
    print "Evaluate..."
    score = model.evaluate(x_test, y_test, batch_size=batch_size)
    yaml_string = model.to_yaml()
    with open('lstm_data/lstm_koubei.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm_koubei.h5')
    print 'Test score:', score
# Train the model and save it
def self_train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)
    print 'Tokenising...'
    combined = tokenizer(combined)
    print 'Training a Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)
    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)
def input_transform(string):
    # Turn a raw query into the padded index sequence the model expects
    words = ' '.join(jieba.cut(string)).encode('utf-8').strip()
    tmp_list = []
    tmp_list.append(words)
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, tmp_list)
    return combined


if __name__ == '__main__':
    self_train()
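The script above only trains and saves the model. As a rough sketch of how prediction could work (this part is not in the original listing; it reuses input_transform(), the imports, and the save paths above, and sticks with the same era of the Keras API):

# Sketch: load the saved architecture and weights, then classify one query.
def lstm_predict(string):
    with open('lstm_data/lstm_koubei.yml', 'r') as f:
        yaml_string = yaml.load(f)          # undo the yaml.dump() done at save time
    model = model_from_yaml(yaml_string)
    model.load_weights('lstm_data/lstm_koubei.h5')
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)          # padded index sequence, shape (1, maxlen)
    probs = model.predict(data)
    # LabelEncoder sorts the labels -1/0/1, so index 0/1/2 maps to question/station/music
    label = np.argmax(probs, axis=1)[0]
    print(label)
    return label

# e.g. lstm_predict('我想听汪峰的歌曲') should come out as the music intent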
With this single-layer LSTM, the training accuracy already exceeds 96% after 15 epochs. A natural follow-up is whether stacking LSTM layers can push the accuracy higher. Keeping everything else unchanged, we only modify the network definition:
## Define the network structure: two stacked LSTM layers instead of one
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    nb_classes = 3
    print 'Defining a Simple Keras Model...'
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    print vocab_dim
    print n_symbols
    # Two stacked LSTM layers; return_sequences=True keeps the full sequence,
    # so the next layer sees one vector per timestep
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(LSTM(32, return_sequences=True))
    model.add(Dropout(0.5))
    print model.summary()
    # NonMasking is a custom layer (see the sketch after this listing) that drops
    # the mask from mask_zero=True, since Flatten cannot handle masked input;
    # Flatten must be imported from keras.layers.core
    model.add(NonMasking())
    model.add(Flatten())
    model.add(Dense(output_dim=nb_classes, activation='softmax'))
    print 'Compiling the Model...'
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print "Train..."
    print y_train
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,
              verbose=1, validation_data=(x_test, y_test))
    print "Evaluate..."
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    yaml_string = model.to_yaml()
    with open('lstm_data/lstm_koubei.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm_koubei.h5')
    print 'Test score:', score
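One thing to watch out for: NonMasking is not a built-in Keras layer. The Embedding layer is created with mask_zero=True, and Flatten cannot consume a masked input, so a small custom layer is needed to drop the mask before flattening. A minimal sketch against the Keras 1.x-era API used in this post:

from keras.engine.topology import Layer

class NonMasking(Layer):
    """Identity layer that swallows the mask coming from mask_zero=True."""
    def __init__(self, **kwargs):
        self.supports_masking = True
        super(NonMasking, self).__init__(**kwargs)

    def build(self, input_shape):
        pass

    def compute_mask(self, input, input_mask=None):
        # Do not pass the mask on to the next layers (Flatten, Dense)
        return None

    def call(self, x, mask=None):
        return x

    def get_output_shape_for(self, input_shape):
        return input_shape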
We find that with the same 15 epochs of training, the training accuracy reaches about 97%. This suggests that stacking LSTM layers is indeed effective and captures the features of the training corpus better.
Reflections and summary
At this point we can only claim to have built an intent recognition demo. It already reaches fairly high training accuracy, but there is plenty of room for improvement. First, and most obviously, our training corpus is still small and covers only a few classes; we would like to scale up both the corpus and the number of classes while keeping the accuracy. Second, the corpus preprocessing is very rough: we did not remove stopwords or punctuation. We skipped this only because our training corpus happens to be fairly clean. Third, the word segmentation is crude: we use jieba with its default dictionary, which is not adapted to our domain vocabulary. Fourth, we would also like to try a CNN and compare its feature extraction against the LSTM (a rough sketch follows below).
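As a starting point for that fourth point, a CNN text classifier over the same embedding input could look roughly like the sketch below. This is our own illustration, not something from the experiments above: it reuses the vocab_dim and input_length parameters of the training script, keeps the old Keras 1.x-style API, and the filter count and size are arbitrary.

from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, GlobalMaxPooling1D, Dense, Dropout

def build_cnn(n_symbols, embedding_weights, nb_classes=3):
    model = Sequential()
    # Same embedding setup as the LSTM model, but without mask_zero
    # (the convolution layer does not support masked input)
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        weights=[embedding_weights],
                        input_length=input_length))
    # 1-D convolution over the word-vector sequence, then max-pool over time
    model.add(Convolution1D(nb_filter=64, filter_length=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model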
Still, you can see the power of deep learning in natural language processing: instead of painstakingly hand-crafting unigram, bigram and similar features, we describe the text with embeddings, which saves a great deal of manual work, and the training accuracy far exceeded our expectations.