一、公司名归类,简称cc码
二、算法:多层感知机分类
三、总思路:文本分词-->Word2Vec--->矩阵---->MultilayerPerceptronClassifier算法
①中文分词使用的是 IK Analyzer
例如:浙江工人日报社印刷厂---->分词后--->浙江|工人日报|社|印刷厂|
代码
import java.io.StringReader

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.spark.{SparkConf, SparkContext}
import org.wltea.analyzer.lucene.IKAnalyzer

/**
 * Created by dongdong on 17/4/24.
 *
 * Segments company names with IK Analyzer and writes one line per record in
 * the form `token1|token2|...|,label,cNumber`.
 *
 * Input lines are tab-separated with exactly three fields
 * (company name, label, cNumber); lines with any other field count are dropped.
 */
object Participles {

  private val DefaultInputPath = "/Users/dongdong/Desktop/cc/small_data/mlj_total_cc.txt"
  private val DefaultOutputPath = "/Users/dongdong/Desktop/cc/participles_small"

  /**
   * Tokenizes `text` with the given analyzer and joins the tokens, each one
   * followed by a `|` (so the result ends with a trailing `|` when non-empty).
   *
   * The token stream is always closed, which is required by the Lucene
   * TokenStream workflow (reset -> incrementToken* -> end -> close) and is what
   * makes it safe to reuse the same analyzer for the next record.
   */
  private def segment(analyzer: IKAnalyzer, text: String): String = {
    val ts = analyzer.tokenStream("", new StringReader(text))
    try {
      ts.reset()
      val term: CharTermAttribute = ts.getAttribute(classOf[CharTermAttribute])
      val joined = new StringBuilder
      while (ts.incrementToken()) {
        joined.append(term.toString).append('|')
      }
      ts.end()
      joined.toString
    } finally {
      ts.close()
    }
  }

  /**
   * Entry point.
   *
   * @param args optional overrides: `args(0)` is the input path and `args(1)`
   *             the output directory; when absent the original hard-coded
   *             paths are used.
   */
  def main(args: Array[String]): Unit = {
    val inpath = if (args.length > 0) args(0) else DefaultInputPath
    val outpath = if (args.length > 1) args(1) else DefaultOutputPath

    val conf = new SparkConf().setMaster("local[2]").setAppName("Participles")
    val sc = new SparkContext(conf)

    try {
      // read data: keep only well-formed three-column rows
      val originalData = sc
        .textFile(inpath)
        .map(line => line.split("\t"))
        .filter(fields => fields.length == 3)

      // split data: one analyzer per partition instead of one per record
      val participles_data = originalData.mapPartitions { rows =>
        // smart-mode segmentation, same as the original `new IKAnalyzer(true)`
        val analyzer = new IKAnalyzer(true)
        rows.map { t =>
          val company_name = t(0).trim
          val label = t(1).trim
          val cNumber = t(2).trim
          // commas are the output field separator, so strip them from tokens
          val words = segment(analyzer, company_name).replaceAll(",", "")
          words + "," + label + "," + cNumber
        }
      }

      // save data as a single part file
      participles_data.repartition(1).saveAsTextFile(outpath)
    } finally {
      // release the context even when the job fails
      sc.stop()
    }
  }
}
②MultilayerPerceptronClassifier
核心代码
// Builds the classification pipeline:
// label indexing -> tokenizing -> stop-word removal -> Word2Vec -> MLP -> label decoding.

// Map the string label to a numeric index
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .fit(originalData)
val labelIndexer_data: DataFrame = labelIndexer.transform(originalData)

// Split the pre-segmented text on the '|' separator
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("words")
  .setPattern("\\|")
val tokenizer_ts_data: DataFrame = tokenizer.transform(labelIndexer_data)

// Filter out words that carry no class information (generic company suffixes)
val arr = Array("有限公司", "有限责任公司", "", "公司", "分公司", "责任公司", "有限", "责任")
val remover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filtered")
  .setStopWords(arr)
val fitered_data: DataFrame = remover.transform(tokenizer_ts_data)

// Turn the remaining words into a feature vector
val word2Vec = new Word2Vec()
  .setInputCol("filtered")
  .setOutputCol("features")
  // Set features number
  .setVectorSize(VECTOR_SIZE)
  .setMinCount(1)
  .setMaxIter(100)
// .setNumPartitions(3)

// The output layer must have exactly one node per distinct label, so take the
// count from the fitted indexer instead of hard-coding it (it was 43 for the
// original data set).
val numClasses = labelIndexer.labels.length

// Input layer = VECTOR_SIZE nodes; hidden layers follow the "2n+1" rule of
// thumb noted by the author (n input nodes); output layer = number of classes.
val layers = Array[Int](VECTOR_SIZE, 101, 100, numClasses)
val mlpc = new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setBlockSize(512)
  .setSeed(1234L)
  .setMaxIter(128)
  .setFeaturesCol("features")
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")

// Convert the predicted numeric index back to the original string label
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)

val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2))

val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, labelIndexer, word2Vec, mlpc, labelConverter))
③试过用TF-IDF 和 LogisticRegression(逻辑回归)组合
TF-IDF 和 NaiveBayes(朴素贝叶斯)组合
效果都不太好,其中LogisticRegression只支持二分类
④ 因为先验数据集分布不均匀,
最终的正确率为:0.606549930730621
total_rate 659490 527397 132093 80121 0.606549930730621