The source code for this chapter has been debugged and runs. I record it below first, and will work through the Python in detail later.
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import pickle
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Dropout, Embedding, Reshape, Dot, Concatenate, Multiply
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

plt.rcParams['figure.figsize'] = (20, 10)

# Read in the data. The file can be downloaded from
# http://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/ubuntu_blobs.tgz

# In[2]:

with open("dataset.pkl", "rb") as f:
    data = pickle.load(f)

# In[3]:

print("size ======= %s" % len(data))

# In[4]:

import gc
gc.collect()

# Take a look at what is in the data

# In[5]:

for j in range(len(data)):
    print("======= %s" % j)
    for i, k in enumerate(data[j]):
        print(k)

# Find the length of the longest sentence

# In[6]:

length = map(len, data[0]['c'])
res = list(length)
context_length = np.max(res[:])
print(context_length)

length = map(len, data[0]['r'])
res = list(length)
response_length = np.max(res[:])
print(response_length)

# Find the size of the whole vocabulary

# In[7]:

context_size = np.max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['c'])))
print(context_size)
response_size = max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['r'])))
print(response_size)

# In[8]:

max(data[0]['r'][1])

# In[9]:

embedding_dim = 64
lstm_dim = 64
context_length = np.max(list(map(len, data[0]['c'])))
#print(context_length)
response_length = np.max(list(map(len, data[0]['r'])))
#print(response_length)
Y = data[0]['r']
print('Begin Modeling...')
context_size = np.max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['c'])))
response_size = max(list(map(lambda x: max(x) if len(x) > 0 else 0, data[0]['r'])))
vocabulary_size = max(context_size, response_size)
context_length = 120

# Embed and model the context part
context = Input(shape=(context_length,), name='context_input')
context_embedded = Embedding(input_length=context_length, output_dim=embedding_dim,
                             input_dim=vocabulary_size + 1)(context)  # +1: word ids run up to vocabulary_size
context_lstm = LSTM(lstm_dim)(context_embedded)

# Embed and model the response part
response_length = 120
response = Input(shape=(response_length,), name='response_input')
response_embedded = Embedding(input_length=response_length, output_dim=embedding_dim,
                              input_dim=vocabulary_size + 1)(response)
response_lstm = LSTM(lstm_dim)(response_embedded)
#print(response_lstm.outputs)

# Score a context/response pair by the dot product of the two LSTM encodings
x = Dot(axes=[1, 1])([context_lstm, response_lstm])
#x = Multiply()([context_lstm, response_lstm])
yhat = Dense(2, activation='softmax')(x)

model = Model(inputs=[context, response], outputs=yhat)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
print('Finish compiling...')
model.summary()

# In[10]:

# A generator tailored to this dataset. Normally the three parts would be separated
# first and the generator written on top of them.
def data_gen(data, batch_size=100):
    contextRaw = data['c']
    responseRaw = data['r']
    yRaw = data['y']

    number_of_batches = len(contextRaw) // batch_size
    counter = 0

    context_length = np.max(list(map(len, contextRaw))) // 3
    response_length = np.max(list(map(len, responseRaw))) // 3
    context_length = 120
    response_length = 120

    while 1:
        lowerBound = batch_size * counter
        upperBound = batch_size * (counter + 1)

        Ctemp = contextRaw[lowerBound:upperBound]
        C_batch = pad_sequences(Ctemp, maxlen=context_length, padding='post')
        C_res = np.zeros((batch_size, context_length), dtype=int)

        Rtemp = responseRaw[lowerBound:upperBound]
        R_batch = pad_sequences(Rtemp, maxlen=response_length, padding='post')
        R_res = np.zeros((batch_size, response_length), dtype=int)

        # Copy the padded batches into fixed-size arrays
        for k in np.arange(batch_size):
            C_res[k, :] = C_batch[k, :]
            R_res[k, :] = R_batch[k, :]

        y_res = keras.utils.to_categorical(yRaw[lowerBound:upperBound])
        counter += 1
        yield ([C_res.astype('float32'), R_res.astype('float32')], y_res.astype('float32'))
        # Start over once every batch has been seen
        if counter >= number_of_batches:
            counter = 0

# Train the model. On a GTX 1060 with 6 GB of video memory, the mini-batch size cannot
# exceed 200. Readers with time can try more epochs and see how the results change.

# In[11]:

#Y = keras.utils.to_categorical(data[0]['y'], num_classes=2)
batch_size = 168
model.fit_generator(data_gen(data[0], batch_size=batch_size),
                    steps_per_epoch=len(data[0]['c']) // batch_size,
                    validation_data=data_gen(data[1], batch_size=batch_size),
                    validation_steps=len(data[1]['c']) // batch_size,
                    epochs=1)

# Next we save the model to disk. We could also use the checkpoint option during fitting
# to write the result of every step to its own file on disk.

# In[12]:

# Save the model architecture as JSON
model_json = model.to_json()
with open("dual_lstm_model.json", "w") as json_file:
    json_file.write(model_json)
# Save the fitted weights to an HDF5 file
model.save_weights("dual_lstm_model.h5")
print("Model saved to disk")

# In[13]:

# To reuse an existing model, proceed as follows.
# Load the model architecture from disk
json_file = open('dual_lstm_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# Load the model weights from disk
model.load_weights("dual_lstm_model.h5")
print("Model loaded")
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
print('Model compiled...')

# Now make predictions. The inputs must be organized exactly as in the data generator,
# but we can reuse the existing generator through predict_generator, simply pointing it
# at the test set instead of the training set.

# In[14]:

batch_size = 256
ypred = model.predict_generator(
    data_gen(data[2], batch_size=batch_size),
    steps=(len(data[2]['c']) // batch_size),
    verbose=1)

# In[15]:

# Compare the predicted class with the labels of the split we predicted on (data[2])
yTest = data[2]['y']
ypred2 = (2 - (ypred[:, 0] > ypred[:, 1])) - 1
z = [str(ypred2[i]) == yTest[i] for i in range(len(ypred2))]
np.mean(z)
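Once the model is loaded and compiled, it can also score a single context/response pair outside of the generator, as long as the pair is padded to the same 120-token length used above. The snippet below is a minimal sketch of this, continuing the script above (it reuses model and pad_sequences); example_context and example_response are hypothetical lists of word ids standing in for real entries of data[2]['c'] and data[2]['r'].

# A minimal sketch: scoring one hypothetical context/response pair with the trained model.
# example_context / example_response are made-up word-id sequences, not taken from the dataset.
example_context = [[12, 45, 7, 89, 3]]
example_response = [[5, 23, 9]]

# Pad to the fixed length (120) used when the model was built, with post-padding as in data_gen
C_one = pad_sequences(example_context, maxlen=120, padding='post').astype('float32')
R_one = pad_sequences(example_response, maxlen=120, padding='post').astype('float32')

# The softmax output has two columns; column 1 can be read as the score
# that the response matches the context
probs = model.predict([C_one, R_one])
print("match score: %.4f" % probs[0, 1])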