pip intall nltk==3.4.5
nltk_data 存放了不少语料数据, 包括大量的数据集,本文中就是用到了其中的 positive_tweets 和 negative_tweets 两个数据集来训练模型python
安装方式有两种, 离线和在线, 推荐【使用离线】, 由于数据量很大, 在线下载一般会失败git
python交互式命令行中输入github
import nltk nltk.download()
执行后会弹出下载窗口, 若是不须要全量下载, 选择对应分类下, 进行点击下载便可, json
下载成功后会相应提示 installed 或者指定包进行下载, 一样仍是python交互式命令行输入数据结构
import nltk
nltk.download('punkt')
主要用到的是packages 文件夹下的内容app
这就是所有的nltk_data 的内容less
下载后须要进行简单配置dom
一、 将下载的packages 文件夹重命名为nltk_data测试
二、将重命名后的nltk_data文件夹放置到nltk能够找到的路径下, 查看方法为 : this
>>>from nltk import data >>>data.find('.') FileSystemPathPointer('C:\\Users\\用户\\AppData\\Roaming\\nltk_data') # 会输出本地加载路径, 直接放置在Roaming下便可
或者出现如下输出, 将nltk_data文件夹放在任意目录下也能够
到此, 环境就已经准备好啦~~~
import re import string import random from nltk.corpus import twitter_samples from nltk.tag import pos_tag, pos_tag_sents from nltk.stem.wordnet import WordNetLemmatizer from nltk.corpus import stopwords from nltk import FreqDist
使用的是twitter_samples下的 negative_tweets.json:【5000条带有负面情感的推文】 和 positive_tweets.json:【5000条带有正面情感的推文 用于训练模型】,能够讲压缩包解压出来, 看下里面的json 文件的内容
使用代码获取推文内容
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json') # 能够打印看下分析的推文内容
# 分词 po_fenci_res = fenci(po_file_name)[:2] be_fenci_res = fenci(ne_file_name)[:2] # 数据量比较大, 因此仅取前2条 print('积极分词结果: {}'.format(po_fenci_res)) print('消极分词结果: {}'.format(be_fenci_res)) # 积极分词结果: [['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'], ['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!']] # 消极分词结果: [['hopeless', 'for', 'tmr', ':('], ['Everything', 'in', 'the', 'kids', 'section', 'of', 'IKEA', 'is', 'so', 'cute', '.', 'Shame', "I'm", 'nearly', '19', 'in', '2', 'months', ':(']]
数据规范化包括如下步骤
def cleaned_list_func(evert_tweet): """ 数据预处理 :param evert_tweet: 每条推文 / 每条待分析的英文句子 :return: 处理后的单词, 一维列表 """ new_text = [] cixing_list = pos_tag(evert_tweet) # [('', 'NN'), ('', 'NNS'), ()] print('每条推文的词性标注结果:{}'.format(cixing_list)) for word, cixing in cixing_list: word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word) # 去掉网址的正则规则 word = re.sub('(@[A-Za-z0-9_]+)', '', word) # 去掉人民的规则, 带有@的部分 if cixing.startswith('NN'): # 将标注的词性进行判断, 替换为英文的标准词性表示 pos = 'n' elif cixing.startswith('VB'): pos = 'v' else: pos = 'a' lemmatizer = WordNetLemmatizer() # 使用WordNetLemmatizer类下的lemmatize方法进行词性还原
new_word = lemmatizer.lemmatize(word, pos)
if len(new_word) > 0 and new_word not in string.punctuation and \ new_word.lower() not in stopwords.words('english'):
new_text.append(new_word.lower())
return new_text
# 数据规范化
positive_cleaned_list = []
negative_cleaned_list = []
for i in po_fenci_res:
positive_cleaned = cleaned_list_func(i)
positive_cleaned_list.append(positive_cleaned)
print('处理后的积极推文结果: {}'.format(positive_cleaned_list))
print('原积极数据对比: {}'.format(positive_tweets[:2]))
通过词性标注的结果为
# 每条推文的词性标注结果:[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]
数据处理后的推文及原数据的对比
# 处理后的积极推文结果: [['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)'], ['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', ':)', 'many', 'thanks']] # 原积极数据对比: ['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!']
def get_tweets_for_model(clean_tokens_list, tag): """ 准备模型数据 :param clean_tokens_list: 处理后的推文 二维列表 :param tag: 标签类别 :return: 一维列表, 元素是二元元组 """ li = [] for every_tweet in clean_tokens_list: data_dict = dict([token, True] for token in every_tweet) # {'':true,'':true} li.append((data_dict, tag)) return li
# 准备模型数据
po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive')
ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative')
print('为模型准备的积极数据: {}'.format(po_for_model))
print('为模型准备的消极数据: {}'.format(ne_for_model))
此时数据结构为
# 为模型准备的消极数据: [({'hopeless': True, 'tmr': True, ':(': True}, 'Negative'), ({'everything': True, 'kid': True, 'section': True, 'ikea': True, 'cute': True, 'shame': True, "i'm": True, 'nearly': True, '19': True, '2': True, 'month': True, ':(': True}, 'Negative')]
model_data = po_for_model + ne_for_model random.shuffle(model_data) # 打乱 train_data = model_data[:7000] # 前7000做为训练集 test_data = model_data[7000:] # 其他做为测试机, 测试训练出来的模型准确度
def train_model(train_data, test_data): """ 训练及测试模型 :param train_data: 训练集 :param test_data: 测试集 :return: 训练后的模型 """ from nltk import classify from nltk import NaiveBayesClassifier model = NaiveBayesClassifier.train(train_data) print('模型准确率为: {}'.format(classify.accuracy(model, test_data))) print(model.show_most_informative_features(10)) return model
# 训练及测试模型
model = train_model(train_data, test_data)
def test(model, test_text): """ 使用训练好的模型预测数据 :param model: :param test_text: 待分析的句子 :return: """ from nltk.tokenize import word_tokenize custom_tokens = cleaned_list_func(word_tokenize(test_text)) result = dict([token, True] for token in custom_tokens) yuce_res = model.classify(result) print('内容: {} 预测结果: {}'.format(test_text, yuce_res))
test_list = [
"I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.",
"My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood",
"I will always be there for you.",
'I fuck you fuck your mother fuck your father fuck your family',
"Don't worry when you are not recognized, but strive to be worthy of recognition.",
"The power of imagination makes us infinite.",
"The glow of one warm thought is to me worth more than money."
]
for i in test_list:
test(model, i)
预测结果为
模型准确率为: 0.9943333333333333 Most Informative Features sad = True Negati : Positi = 35.1 : 1.0 follower = True Positi : Negati = 20.6 : 1.0 bam = True Positi : Negati = 20.1 : 1.0 arrive = True Positi : Negati = 18.6 : 1.0 x15 = True Negati : Positi = 17.3 : 1.0 blog = True Positi : Negati = 16.7 : 1.0 followed = True Negati : Positi = 15.5 : 1.0 damn = True Negati : Positi = 15.4 : 1.0 top = True Positi : Negati = 15.3 : 1.0 appreciate = True Positi : Negati = 13.9 : 1.0 None 内容: I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know. 预测结果: Negative 内容: My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood 预测结果: Negative 内容: I will always be there for you. 预测结果: Negative 内容: I fuck you fuck your mother fuck your father fuck your family 预测结果: Negative 内容: Don't worry when you are not recognized, but strive to be worthy of recognition. 预测结果: Positive 内容: The power of imagination makes us infinite. 预测结果: Negative 内容: The glow of one warm thought is to me worth more than money. 预测结果: Positive
因为训练集数据量有限, 因此预测结果也不必定彻底准确
import random import re import string from nltk.corpus import twitter_samples from nltk.tag import pos_tag, pos_tag_sents from nltk.stem.wordnet import WordNetLemmatizer from nltk.corpus import stopwords from nltk import FreqDist def fenci(file): return twitter_samples.tokenized(file) def cleaned_list_func(evert_tweet): new_text = [] cixing_list = pos_tag(evert_tweet) for word, cixing in cixing_list: word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word) word = re.sub('(@[A-Za-z0-9_]+)', '', word) if cixing.startswith('NN'): pos = 'n' elif cixing.startswith('VB'): pos = 'v' else: pos = 'a' lemmatizer = WordNetLemmatizer() new_word = lemmatizer.lemmatize(word, pos) if len(new_word) > 0 and new_word not in string.punctuation and new_word.lower() not in stopwords.words('english'): new_text.append(new_word.lower()) return new_text def get_all_words(clean_tokens_list): for tokens in clean_tokens_list: for token in tokens: yield token def get_tweets_for_model(clean_tokens_list, tag): li = [] for every_tweet in clean_tokens_list: data_dict = dict([token, True] for token in every_tweet) li.append((data_dict, tag)) return li def train_model(train_data, test_data): from nltk import classify from nltk import NaiveBayesClassifier model = NaiveBayesClassifier.train(train_data)return model def test(model, test_text): from nltk.tokenize import word_tokenize custom_tokens = cleaned_list_func(word_tokenize(test_text)) result = dict([token, True] for token in custom_tokens) if __name__ == '__main__': po_file_path = 'positive_tweets.json' ne_file_path = 'negative_tweets.json' positive_tweets = twitter_samples.strings(po_file_path) negative_tweets = twitter_samples.strings(ne_file_path) po_fenci_res = fenci(po_file_path) be_fenci_res = fenci(ne_file_path) positive_cleaned_list = [] negative_cleaned_list = [] for i in po_fenci_res: positive_cleaned = cleaned_list_func(i) positive_cleaned_list.append(positive_cleaned) for j in be_fenci_res: negative_cleaned = cleaned_list_func(j) negative_cleaned_list.append(negative_cleaned) po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive') ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative') model_data = po_for_model + ne_for_model random.shuffle(model_data) train_data = model_data[:7000] test_data = model_data[7000:] model = train_model(train_data, test_data) test_list = [ "I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.", "My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood", "I will always be there for you.", 'I fuck you fuck your mother fuck your father fuck your family', "Don't worry when you are not recognized, but strive to be worthy of recognition.", "The power of imagination makes us infinite.", "The glow of one warm thought is to me worth more than money." ] for i in test_list: test(model, i)
以上内容为简单分析, 初学,有任何问题欢迎留言讨论~~