Chinese Text Classification

Text classification is one of the most common tasks in natural language processing. A key part of the work here is turning text into vectors; this article does not cover vectorization theory in detail, but simply applies TF-IDF to vectorize the documents and then classifies them with four methods: SVM, naive Bayes, random forest, and a BP (back-propagation) neural network.

The training corpus is stored in an Excel file; its two-column format is sketched below.
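
(A minimal sketch of the expected layout; the example rows are invented. Column 0 holds the class label and column 1 the raw document text, matching the column names assigned when the file is loaded in step 1.)

class_label    text
体育           <full text of a sports news article>
财经           <full text of a finance news article>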

The training corpus contains 10,000 documents in 10 categories: '体育' (sports), '娱乐' (entertainment), '家居' (home & living), '房产' (real estate), '教育' (education), '时尚' (fashion), '时政' (politics), '游戏' (games), '科技' (technology) and '财经' (finance).

The classification pipeline in this article consists of the following steps:

1. Segment the text with jieba and remove stop words, keeping only meaningful tokens.
import pandas as pd
import jieba

# Load the raw corpus: column 0 is the class label, column 1 is the text
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load a custom user dictionary for jieba
jieba.load_userdict('../dict_out.csv')
# Load the stop-word list
stopkey = [line.strip().decode('utf-8') for line in open("../stopwords.dat", "rb").readlines()]
stopkey.append(" ")

list1 = []  # comma-joined token strings
list2 = []  # token lists
for i in data["text"]:
    try:
        jiebas = jieba.cut(i)
        jiebas = [w for w in jiebas if w not in stopkey]
        fenci_key = ",".join(jiebas)
    except AttributeError:
        # Non-string cells cannot be segmented; use empty placeholders
        # so the results stay aligned with the rows of data
        jiebas, fenci_key = [], ""
    list2.append(jiebas)
    list1.append(fenci_key.strip())

# Write the segmentation results back into data and save them
data["tokens"] = list1
data.to_excel("1data.xls", header=False, index=False)

2. Split the corpus into a training set and a test set.

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

data = pd.read_excel('1data.xls', header=None)
data.columns = ['class_label', 'text', 'tokens']

# Encode the text labels as integers; le.classes_ is sorted, so its order
# matches the integer codes and can be used as target_names when reporting
le = preprocessing.LabelEncoder()
data["class_label"] = le.fit_transform(data['class_label'])
categories = list(le.classes_)
print(categories)

# Split into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(data["tokens"],
                                                    data["class_label"],
                                                    test_size=0.2,
                                                    random_state=1)
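
Since the ten classes may not be perfectly balanced, an optional variant of the same call uses the stratify argument of train_test_split to keep the class proportions identical in both splits (same variable names as above; only the extra argument is new):

X_train, X_test, y_train, y_test = train_test_split(data["tokens"],
                                                    data["class_label"],
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=data["class_label"])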

3. Apply TF-IDF to the token strings to turn each document into a term-weight vector. The underlying theory is well covered elsewhere and is not repeated here.

from sklearn.feature_extraction.text import TfidfVectorizer

# Text feature extraction: fit the TF-IDF vectorizer on the training set,
# then reuse the fitted vocabulary to transform the test set
def tfidf(data):
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
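
With its default settings, TfidfVectorizer weights a term t in document d as tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1), where n is the number of documents and df(t) is the number of documents containing t, and then L2-normalizes each document vector. A small sketch for inspecting what the fitted vectorizer produced (get_feature_names_out exists in recent scikit-learn releases; older ones use get_feature_names):

print(X_train_tfidf.shape)   # (number of training documents, vocabulary size)
print(X_train_tfidf.nnz)     # non-zero entries in the sparse matrix
vocab = tfidf_vectorizer.get_feature_names_out()   # older scikit-learn: get_feature_names()
print(len(vocab), vocab[:10])                      # vocabulary size and a few example terms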

4. Train and test each of the classifiers listed above and inspect the results.

import joblib  # on old scikit-learn versions: from sklearn.externals import joblib
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def get_metrics(y_test, y_predicted):
    """
    y_test: true labels
    y_predicted: predicted labels
    """
    # precision = TP / (TP + FP), weighted across classes
    precision = precision_score(y_test, y_predicted, average='weighted')
    # recall = TP / (TP + FN), weighted across classes
    recall = recall_score(y_test, y_predicted, average='weighted')
    # weighted F1 score
    f1 = f1_score(y_test, y_predicted, average='weighted')
    # overall accuracy
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

def BayesClassify():
    # Multinomial naive Bayes on the TF-IDF features
    clf_tfidf = MultinomialNB(alpha=0.01)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "BayesModel.m")

def BayesTest():
    clf_tfidf = joblib.load("BayesModel.m")
    y_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)
    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
def SVMClassify():
    # RBF-kernel SVM on the TF-IDF features
    clf_tfidf = SVC(gamma=1, kernel='rbf', probability=True)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "SVMModel.m")

def SVMTest():
    clf_tfidf = joblib.load("SVMModel.m")
    y_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)
    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
def RandomForestClassify():
    # Random forest on the TF-IDF features
    clf_tfidf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "RandomForestModel.m")

def RandomForestTest():
    clf_tfidf = joblib.load("RandomForestModel.m")
    y_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)
    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical  # older Keras: from keras.utils.np_utils import to_categorical

def BPClassify(inputPoint):
    # A simple feed-forward (BP) network with one hidden layer of 128 units
    net = Sequential()
    net.add(Dense(128, input_shape=(inputPoint,)))
    net.add(Activation('relu'))

    net.add(Dense(len(categories)))
    net.add(Activation('sigmoid'))

    # One-hot encode the integer labels for training
    y_train_onehot = to_categorical(y_train, num_classes=len(categories))

    # Depending on the Keras version, the sparse TF-IDF matrices may need .toarray() here
    net.compile(optimizer='adam', loss='binary_crossentropy')
    net.fit(X_train_tfidf, y_train_onehot, batch_size=128, epochs=2)

    # Predict per-class scores and take the arg-max as the predicted label
    y_predicted_tfidf = net.predict(X_test_tfidf)
    print(y_predicted_tfidf)
    y_predicted_tfidf = y_predicted_tfidf.argmax(axis=1)

    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
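
The functions above are only defined, not called; a hypothetical driver such as the following could train and evaluate the four models in sequence (BPClassify takes the TF-IDF feature dimension, i.e. the vocabulary size, as its argument):

# Naive Bayes
BayesClassify()
BayesTest()

# SVM
SVMClassify()
SVMTest()

# Random forest
RandomForestClassify()
RandomForestTest()

# BP neural network; the input dimension is the TF-IDF vocabulary size
BPClassify(X_train_tfidf.shape[1])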

The final classification results are fairly good: accuracy and recall are above 90% for each of the four models, with the SVM taking somewhat longer to run than the others.

Tags: text classification, SVM, naive Bayes, random forest, neural network