spark MLPC 文本分类例子

时间 2019-12-17

标签 spark mlpc 文本分类例子栏目 Spark 繁體版

原文原文链接

一、公司名归类，简称cc码

二、算法：多层感知分类

三、总思路：文本分词-->Word2Vec--->矩阵---->MultilayerPerceptronClassifier算法

①中文分词使用的是 IK Analyzer

例如：浙江工人日报社印刷厂 --> 分词后 --> 浙江|工人日报|社|印刷厂|

代码

import java.io.StringReader
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.spark.{SparkConf, SparkContext}
import org.wltea.analyzer.lucene.IKAnalyzer

/**
  * Spark job that segments company names into words with IK Analyzer.
  *
  * Input: a tab-separated text file; only lines that split into exactly three
  * fields (company name, label, `cNumber`) are kept.
  * Output: one text line per record in the form `word1|word2|...|,label,cNumber`,
  * repartitioned to a single partition before being saved.
  *
  * Created by dongdong on 17/4/24.
  */
object Participles {

  /**
    * Entry point: reads the input file, segments every company name and saves the result.
    *
    * @param args command-line arguments (not used; paths are hard-coded below)
    */
  def main(args: Array[String]): Unit = {

    val inpath = "/Users/dongdong/Desktop/cc/small_data/mlj_total_cc.txt"
    val outpath = "/Users/dongdong/Desktop/cc/participles_small"
    val conf = new SparkConf().setMaster("local[2]").setAppName("Participles")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails, so the local Spark runtime is released.
    try {
      // Read the raw lines and keep only well-formed three-column records.
      val originalData = sc
        .textFile(inpath)
        .map(_.split("\t"))
        .filter(_.length == 3)

      // Segment the company name of every record.
      // The analyzer is created once per partition instead of once per record;
      // it is built inside the closure, so it is never serialized.
      val participles_data = originalData.mapPartitions { rows =>
        // `true` is passed exactly as before (presumably IK's "smart" segmentation
        // flag - confirm against the IK Analyzer version in use).
        val analyzer = new IKAnalyzer(true)
        rows.map { t =>
          val company_name = t(0).trim
          val label = t(1).trim
          val cNumber = t(2).trim
          // Commas are stripped from the segmented words because "," is the output field separator.
          val words = segment(analyzer, company_name).replaceAll(",", "")
          words + "," + label + "," + cNumber
        }
      }

      // Save everything through a single partition.
      participles_data.repartition(1).saveAsTextFile(outpath)
    } finally {
      sc.stop()
    }
  }

  /**
    * Tokenizes `text` with the given analyzer and joins the tokens with `|`.
    * Every token is followed by `|`, so a non-empty result ends with `|`.
    *
    * The token stream is consumed following the Lucene contract
    * (`reset` -> `incrementToken` loop -> `end` -> `close`) and is always closed,
    * which also allows the analyzer to be reused for the next record.
    *
    * @param analyzer analyzer used to segment the text
    * @param text     text to segment
    * @return the tokens, each terminated by `|`; empty if no token was produced
    */
  private def segment(analyzer: IKAnalyzer, text: String): String = {
    val ts = analyzer.tokenStream("", new StringReader(text))
    try {
      ts.reset()
      val term: CharTermAttribute = ts.getAttribute(classOf[CharTermAttribute])
      val joined = new StringBuilder
      while (ts.incrementToken()) {
        joined.append(term.toString).append('|')
      }
      ts.end()
      joined.toString
    } finally {
      ts.close()
    }
  }

}

②MultilayerPerceptronClassifier

核心代码

// Map the string labels in column "label" to numeric indices in "indexedLabel".
    // The indexer is fitted on the whole data set (before the train/test split).
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(originalData)
    // NOTE(review): this and the other intermediate DataFrames below are not used
    // by the pipeline assembled at the end of this snippet; they are kept in case
    // code outside this excerpt inspects them.
    val labelIndexer_data: DataFrame = labelIndexer.transform(originalData)

    // Split the pre-segmented text back into words on the "|" separator.
    val tokenizer = new RegexTokenizer()
      .setInputCol("text")
      .setOutputCol("words")
      .setPattern("\\|")
    val tokenizer_ts_data: DataFrame = tokenizer.transform(labelIndexer_data)

    // Drop generic company suffixes (and empty tokens) so they do not reach Word2Vec.
    val arr = Array("有限公司", "有限责任公司", "", "公司", "分公司", "责任公司", "有限", "责任")
    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")
      .setStopWords(arr)
    val fitered_data: DataFrame = remover.transform(tokenizer_ts_data)

    // Turn the filtered words of each record into a feature vector.
    val word2Vec = new Word2Vec()
      .setInputCol("filtered")
      .setOutputCol("features")
      // Dimension of the feature vector.
      .setVectorSize(VECTOR_SIZE)
      .setMinCount(1)
      .setMaxIter(100)
    // .setNumPartitions(3)

    // Network layout: input layer, two hidden layers, output layer.
    // The original note gives the hidden-layer rule of thumb 2n+1 for n input nodes
    // (101 presumably corresponds to VECTOR_SIZE = 50 - TODO confirm).
    // The output layer size is taken from the fitted label indexer instead of the
    // previously hard-coded 43, so it always matches the number of distinct labels.
    val layers = Array[Int](VECTOR_SIZE, 101, 100, labelIndexer.labels.length)
    val mlpc = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(512)
      .setSeed(1234L)
      .setMaxIter(128)
      .setFeaturesCol("features")
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")

    // Convert the numeric prediction back to the original string label.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // 80/20 train/test split; seeded (same seed as the classifier) so runs are reproducible.
    val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2), seed = 1234L)

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, labelIndexer, word2Vec, mlpc, labelConverter))

③试过用 TF-IDF 和 LogisticRegression（逻辑回归）组合

TF-IDF 和 NaiveBayes（朴素贝叶斯）组合

效果都不太好，其中LogisticRegression只支持二分类

④ 因为先验数据集分布不均匀

最终的正确率：0.606549930730621

total_rate		659490		527397		132093		80121		0.606549930730621