我使用的是Weka自带的示例数据集
基本的读取数据方式为:
// Read a dataset via ConverterUtils.DataSource; the concrete loader is
// chosen automatically from the file extension (".arff" here).
Instances data1 = DataSource.read("data\\iris.arff");
若是文件的扩展名未知,咱们能够指定加载器进行加载,例如咱们能够把以前的iris.arff文件改为iris.data,而后经过指定加载器加载本地数据
package weka.loaddata;

import java.io.File;

import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * Loads an ARFF-formatted file whose extension is not ".arff" by choosing
 * the loader explicitly instead of relying on extension auto-detection.
 */
public class Test {
    public static void main(String[] args) {
        try {
            ArffLoader loader = new ArffLoader();
            // The file is ARFF content renamed to ".data"; the explicit
            // ArffLoader ignores the extension.
            loader.setSource(new File("data\\iris.data"));
            Instances data1 = loader.getDataSet();
        } catch (Exception e) {
            // Best-effort demo: just report the failure.
            e.printStackTrace();
        }
        // NOTE(review): printed even when loading failed above.
        System.out.println("done");
    }
}
arff和csv须要人为指定做为类别的字段
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Shows that a freshly loaded dataset has no class attribute assigned:
 * classIndex() returns -1 until one is set explicitly.
 */
public class Test {
    public static void main(String[] args) {
        try {
            Instances data1 = DataSource.read("data\\iris.arff");
            // Prints -1: ARFF/CSV loaders do not pick a class attribute.
            System.out.println(data1.classIndex());
        } catch (Exception e) {
            // Best-effort demo: just report the failure.
            e.printStackTrace();
        }
        System.out.println("done");
    }
}
返回-1表明此时并无指定类别属性
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Assigns the last attribute as the class attribute and prints its index.
 */
public class Test {
    public static void main(String[] args) {
        try {
            Instances data1 = DataSource.read("data\\iris.arff");
            // iris convention: the class label is the last attribute.
            data1.setClassIndex(data1.numAttributes() - 1);
            System.out.println(data1.classIndex());
        } catch (Exception e) {
            // Best-effort demo: just report the failure.
            e.printStackTrace();
        }
        System.out.println("done");
    }
}
因而咱们经过上述程序将最后一个属性做为类别属性
一种是InstanceQuery,容许一次性检索全部数据;一种是DatabaseLoader,容许增量检索
package weka.loaddata;

import weka.core.Instances;
import weka.experiment.InstanceQuery;

/**
 * Retrieves a full dataset from a MySQL database in one shot via
 * InstanceQuery (standard JDBC connection settings underneath).
 */
public class Test {
    public static void main(String[] args) throws Exception {
        InstanceQuery query = new InstanceQuery();
        query.setDatabaseURL("jdbc:mysql://localhost:3306/new_schema");
        query.setUsername("root");
        query.setPassword("*******");
        query.setQuery("select * from iris");
        // Executes the query and converts the result set into Instances.
        Instances data = query.retrieveInstances();
        System.out.println("done");
    }
}
我首先将iris数据加载进mysql数据库了
若是你用过jdbc的话,会发现这几个东西就是用的jdbc
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

/**
 * Batch retrieval from a database with DatabaseLoader: the entire result
 * set is materialized by a single getDataSet() call.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        DatabaseLoader loader = new DatabaseLoader();
        loader.setSource("jdbc:mysql://localhost:3306/new_schema", "root", "*******");
        loader.setQuery("select * from iris");
        Instances data = loader.getDataSet();
    }
}
批量检索
package weka.loaddata;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

/**
 * Incremental retrieval from a database: fetch the header first, then
 * pull rows one at a time instead of loading everything into memory.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        DatabaseLoader loader = new DatabaseLoader();
        loader.setSource("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        loader.setQuery("select * from iris");
        // Header (attribute definitions) only, no rows yet.
        Instances structure = loader.getStructure();
        Instances data = new Instances(structure);
        Instance inst;
        // getNextInstance returns null once the result set is exhausted.
        while ((inst = loader.getNextInstance(structure)) != null)
            data.add(inst);
        System.out.println("done");
    }
}
增量检索
package weka.loaddata;

import java.io.File;
import java.io.FileOutputStream;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;

/**
 * Saves a dataset in three ways: directly by file name (format chosen
 * from the extension), through an explicit output stream, and via a
 * dedicated saver (XRFFSaver).
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");

        // 1) Let DataSink pick the saver from the ".csv" extension.
        DataSink.write("data/write_iris.csv", data);

        // 2) Write to an explicit stream; try-with-resources closes it
        // even if DataSink.write throws (the original leaked the stream
        // on error because close() was not in a finally block).
        try (FileOutputStream arff = new FileOutputStream("data/write_iris.arff")) {
            DataSink.write(arff, data);
        }

        // 3) Use a concrete saver for full control over the format.
        XRFFSaver saver = new XRFFSaver();
        saver.setInstances(data);
        saver.setFile(new File("data/write_iris.xrff"));
        saver.writeBatch();
        System.out.println("done");
    }
}
能够直接写,也能够指定加载器
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

/**
 * Batch export of a dataset into a database table via DatabaseSaver.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        DatabaseSaver saver = new DatabaseSaver();
        saver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        saver.setTableName("write_iris");
        // false: use the explicit table name above instead of the
        // dataset's relation name.
        saver.setRelationForTableName(false);
        saver.setInstances(data);
        saver.writeBatch();
        System.out.println("done");
    }
}
saver.setRelationForTableName(false);
若是是true的话,只能将数据的relation名做为表名,固然也能够改关系名啦
data.setRelationName(newName);
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

/**
 * Incremental export into a database: rows are written one at a time,
 * and a final writeIncremental(null) signals the end of the data.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        DatabaseSaver saver = new DatabaseSaver();
        saver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        saver.setTableName("write_iris");
        saver.setRelationForTableName(false);
        // Switch the saver into incremental mode before writing rows.
        saver.setRetrieval(DatabaseSaver.INCREMENTAL);
        saver.setInstances(data);
        for (int i = 0; i < data.numInstances(); i++) {
            saver.writeIncremental(data.instance(i));
        }
        // null marks the end of the incremental write.
        saver.writeIncremental(null);
        System.out.println("done");
    }
}
增量保存,看起来就是一条一条存
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Applies the Remove filter to delete an attribute: "-R 1" drops the
 * first attribute (attribute indices are 1-based in filter options).
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        System.out.println(data);
        System.out.println("----------------");
        String[] options = new String[2];
        options[0] = "-R";
        options[1] = "1";
        Remove rm = new Remove();
        rm.setOptions(options);
        // setInputFormat must come after setOptions and before useFilter.
        rm.setInputFormat(data);
        Instances inst1 = Filter.useFilter(data, rm);
        System.out.println(inst1);
    }
}
意思是去除一个属性,其余的东西该如何应用,仍是看看api吧,
Instances inst1 = Filter.useFilter(data, rm);
这个应该是
的使用方法,猜的,应该是。
能够经过内存提取数据,总共分两步
首先设置属性定义数据格式
其次一行一行地添加实际数据
1.1 定义数据格式
// Numeric attribute: only a name is needed.
Attribute numeric = new Attribute("attribute_name");
// Date attribute: the second argument is a SimpleDateFormat pattern.
Attribute date = new Attribute("attribute_name", "yyyy-MM-dd");
具体日期格式参照SimpleDateFormat中对日期的规定
// Nominal attribute: the label list fixes the set of legal values.
ArrayList<String> labels = new ArrayList<String>();
labels.add("label_a");
labels.add("label_b");
Attribute nominal = new Attribute("attribute_name", labels);
// String attribute: a null ArrayList<String> selects the string-attribute constructor.
Attribute string = new Attribute("attribute_name", (ArrayList<String>) null);
提供一个ArrayList类型的null对象
// Relational attribute: its "type" is the header of a nested dataset.
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("rel.numeric"));
ArrayList<String> values = new ArrayList<String>();
values.add("val_A");
values.add("val_B");
values.add("val_C");
// NOTE(review): "values" is never passed in, so "rel.nominal" is actually
// created as a numeric attribute — likely intended:
// new Attribute("rel.nominal", values).
atts.add(new Attribute("rel.nominal"));
// Empty Instances (capacity 0) that only carries the structure.
Instances rel_struct = new Instances("rel", atts, 0);
Attribute relational = new Attribute("attribute_name", rel_struct);
atts里有一个numeric属性和一个nominal属性,而后建立了一个大小为0的instances对象。而后利用这个instances建立了这个relation数据属性。
// Define two numeric attributes and a binary nominal class attribute.
Attribute num1 = new Attribute("num1");
Attribute num2 = new Attribute("num2");
ArrayList<String> labels = new ArrayList<String>();
labels.add("no");
labels.add("yes");
Attribute cls = new Attribute("class", labels);
ArrayList<Attribute> attributes = new ArrayList<>();
attributes.add(num1);
attributes.add(num2);
attributes.add(cls);
// Capacity 0: an empty dataset carrying just the header.
Instances dataset = new Instances("relation_name", attributes, 0);
咱们建立了num1,num2,cls三个属性,而后建立了这个数据集的instances对象,
1.2 添加数据
package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Builds a dataset in memory: define the attributes (numeric, date,
 * nominal, string), then fill a value array and add it as an instance.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Attribute numeric = new Attribute("numeric");
        // Pattern follows SimpleDateFormat conventions.
        Attribute date = new Attribute("date", "yyyy-MM-dd");
        ArrayList<String> label = new ArrayList<String>();
        label.add("label_a");
        label.add("label_b");
        label.add("label_c");
        Attribute nominal = new Attribute("nominal", label);
        // Typed cast instead of the original raw "(ArrayList) null",
        // which triggered an unchecked-conversion warning; the null list
        // selects the string-attribute constructor.
        Attribute string = new Attribute("string", (ArrayList<String>) null);

        // Relational-attribute variant, kept for reference (see the text):
        // ArrayList<Attribute> rel_attributes = new ArrayList<>();
        // rel_attributes.add(numeric);
        // rel_attributes.add(nominal);
        // Instances rel_struct = new Instances("rel_struct", rel_attributes, 1);
        // Attribute relation = new Attribute("relation", rel_struct);

        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(numeric);
        attributes.add(date);
        attributes.add(nominal);
        attributes.add(string);
        // attributes.add(relation);
        Instances data = new Instances("data", attributes, 1);

        // Every value is stored as a double: dates as parsed timestamps,
        // nominal/string values as indices into the attribute's value list.
        double[] values = new double[data.numAttributes()];
        values[0] = 1.23;
        values[1] = data.attribute(1).parseDate("2017-8-19");
        // indexOfValue returns -1 (without throwing) for unknown labels —
        // errors only surface later when the value is used.
        values[2] = data.attribute(2).indexOfValue("label_c");
        System.out.println(values[2]);
        values[3] = data.attribute(3).addStringValue("A string");

        // Instances dataRel = new Instances(data.attribute(4).relation(), 0);
        // double[] valuesRel = new double[dataRel.numAttributes()];
        // valuesRel[0] = 2.34;
        // valuesRel[1] = dataRel.attribute(1).indexOfValue("label_c");
        // dataRel.add(new DenseInstance(1.0, valuesRel));
        // values[4] = data.attribute(4).addRelation(dataRel);

        Instance inst = new DenseInstance(1, values);
        data.add(inst);
        System.out.println(data);
    }
}
relation这个东西我还不太会用。。。因此注释掉了
须要注意的是在使用nominal属性的时候,若是添加的值不在以前的声明之中,他会返回-1,却不会报错,而在使用的时候才会报错,并且还找不到哪里错误,从这点来看他们这个API写的实在有点= =粗糙。。。。
package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Full in-memory dataset example covering every attribute type:
 * numeric, nominal, string, date, and a relational attribute whose
 * values are themselves small datasets.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        ArrayList<Attribute> atts;
        ArrayList<Attribute> attsRel;
        ArrayList<String> attVals;
        ArrayList<String> attValsRel;
        Instances data;
        Instances dataRel;
        double[] vals;
        double[] valsRel;
        int i = 0;

        // --- attribute definitions ---
        atts = new ArrayList<Attribute>();
        atts.add(new Attribute("att1")); // numeric
        attVals = new ArrayList<String>();
        for (i = 0; i < 5; i++) {
            attVals.add("val" + (i + 1));
        }
        atts.add(new Attribute("att2", attVals)); // nominal
        atts.add(new Attribute("att3", (ArrayList<String>) null)); // string
        atts.add(new Attribute("att4", "yyyy-MM-dd")); // date
        // Relational attribute: header is a nested (numeric, nominal) pair.
        attsRel = new ArrayList<Attribute>();
        attsRel.add(new Attribute("att5.1"));
        attValsRel = new ArrayList<String>();
        for (i = 0; i < 5; i++) {
            attValsRel.add("val5." + (i + 1));
        }
        attsRel.add(new Attribute("att5.2", attValsRel));
        dataRel = new Instances("att5", attsRel, 0);
        atts.add(new Attribute("att5", dataRel, 0));

        data = new Instances("MyRelation", atts, 0);

        // --- one row of data ---
        vals = new double[data.numAttributes()];
        vals[0] = Math.PI;
        vals[1] = attVals.indexOf("val3");
        vals[2] = data.attribute(2).addStringValue("a string");
        vals[3] = data.attribute(3).parseDate("2017-8-19");
        // The relational value is a one-row dataset stored via addRelation.
        dataRel = new Instances(data.attribute(4).relation(), 0);
        valsRel = new double[2];
        valsRel[0] = Math.PI + 1;
        valsRel[1] = attValsRel.indexOf("val5.3");
        dataRel.add(new DenseInstance(1, valsRel));
        vals[4] = data.attribute(4).addRelation(dataRel);

        data.add(new DenseInstance(1, vals));
        System.out.println(data);
    }
}
这个例子比以前个人好,不过关系型属性是真的麻烦,不过理解起来就好像是,一组数据被当作一个特征。
package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Shuffles a copy of the dataset with a seeded Random so the order is
 * reproducible; the original Instances object is left untouched.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\iris.arff");
        System.out.println(data);
        long seed = 123456;
        // Copy first: randomize() shuffles in place.
        Instances data3 = new Instances(data);
        data3.randomize(new Random(seed));
        System.out.println(data3);
    }
}
这是其中一种方法,在这种方法中,推荐使用种子,另外还有可使用filter的方法进行随机排序,后文继续介绍
如今要增长一个数值属性和一个标称属性,并添加随机值
package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;

/**
 * Appends one numeric and one nominal attribute with the Add filter,
 * then fills both new columns with random values.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\weather.numeric.arff");
        Instances result = null;
        Add filter;
        result = new Instances(data);

        // New numeric attribute, appended at the end ("last").
        filter = new Add();
        filter.setAttributeIndex("last");
        filter.setAttributeName("NumericAttribute");
        filter.setInputFormat(result);
        result = Filter.useFilter(result, filter);

        // New nominal attribute with labels A, B, C.
        filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels("A,B,C");
        filter.setAttributeName("NominalAttribute");
        filter.setInputFormat(result);
        result = Filter.useFilter(result, filter);

        // Fill the two new columns; the nominal value is set by label index.
        Random rand = new Random(1234);
        for (int i = 0; i < result.numInstances(); i++) {
            result.instance(i).setValue(result.numAttributes() - 2, rand.nextDouble());
            result.instance(i).setValue(result.numAttributes() - 1, rand.nextInt(3));
        }
        System.out.println("过滤后的数据集:");
        System.out.println(result);
    }
}
运用了Standardize,将数据集中全部数字属性标准化,零均值与单位方差
package weka.api;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;

/**
 * Standardizes all numeric attributes to zero mean and unit variance.
 * The filter statistics come from the training set only and are then
 * applied unchanged to the test set.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");
        Standardize filter = new Standardize();
        // Initialize means/variances from the training data only.
        filter.setInputFormat(train);
        Instances newTrain = Filter.useFilter(train, filter);
        // Reuses the training statistics — no leakage from the test set.
        Instances newTest = Filter.useFilter(test, filter);
        System.out.println("new trainer");
        System.out.println(newTrain);
        System.out.println("new test");
        System.out.println(newTest);
    }
}
package weka.api;

import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Wraps a filter and a classifier into a FilteredClassifier: the Remove
 * filter drops attribute 1 before the (unpruned) J48 tree is trained,
 * and the same filtering is applied automatically at prediction time.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);
        if (!train.equalHeaders(test)) {
            // Fixed typo in the message: 测试机 -> 测试集.
            throw new Exception("训练集与测试集不兼容:\n" + train.equalHeadersMsg(test));
        }
        Remove rm = new Remove();
        rm.setAttributeIndices("1");
        J48 j48 = new J48();
        j48.setUnpruned(true);
        FilteredClassifier fc = new FilteredClassifier();
        fc.setFilter(rm);
        fc.setClassifier(j48);
        fc.buildClassifier(train);
        // Predict each test instance and print actual vs. predicted label.
        for (int i = 0; i < test.numInstances(); i++) {
            double pred = fc.classifyInstance(test.instance(i));
            System.out.print("index: " + (i + 1));
            System.out.print(", class: " + test.classAttribute()
                    .value((int) test.instance(i).classValue()));
            System.out.println(", predict class: " + test.classAttribute().value((int) pred));
        }
    }
}
解释一下
分类器分为批量分类器和增量分类器
构建批量分类器分为两步
示例
增量分类器都实现了UpdateableClassifier接口
增量分类器用于处理规模较大的数据,不会将数据一次加载进内存,arff文件能够增量读取,一样也分两步
示例
为啥不带数据,由于以前loader进行的是加载结构的方法
构建分类器的评价标准有两种方式,交叉验证和专用测试集验证
评价由Evaluation类实现
示例
4.1 批量分类器构建
package weka.api;

import java.io.File;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * Batch classifier construction: load the whole dataset, set the class
 * attribute, then build a J48 decision tree in one call.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File("data\\weather.nominal.arff"));
        Instances data = loader.getDataSet();
        data.setClassIndex(data.numAttributes() - 1);
        // "-U": build an unpruned tree.
        String[] options = new String[1];
        options[0] = "-U";
        J48 tree = new J48();
        tree.setOptions(options);
        tree.buildClassifier(data);
        System.out.println(tree);
    }
}
4.2 增量分类器构建
package weka.api;

import java.io.File;

import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * Incremental classifier construction: initialize the model from the
 * header only, then feed instances one at a time, so the whole dataset
 * never has to fit in memory.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File("data\\weather.nominal.arff"));
        // Structure only — no data rows are loaded here.
        Instances structure = loader.getStructure();
        structure.setClassIndex(structure.numAttributes() - 1);
        NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
        // Builds an "empty" model from the header alone.
        nb.buildClassifier(structure);
        Instance instance;
        // Stream instances into the model one at a time.
        while ((instance = loader.getNextInstance(structure)) != null)
            nb.updateClassifier(instance);
        System.out.println(nb);
    }
}
4.3 输出类别分布
package weka.api;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Trains J48 on the training set, then prints, for every test instance,
 * the actual label, the predicted label, whether the prediction was
 * correct, and the predicted class-probability distribution.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);
        if (!train.equalHeaders(test)) {
            throw new Exception("不相容");
        }
        J48 classifier = new J48();
        classifier.buildClassifier(train);
        for (int i = 0; i < test.numInstances(); i++) {
            // Predicted class index.
            double pred = classifier.classifyInstance(test.instance(i));
            // Per-class probability estimates for the same instance.
            double[] dist = classifier.distributionForInstance(test.instance(i));
            System.out.print((i + 1) + " - " + test.instance(i).toString(test.classIndex()) + " - "
                    + test.classAttribute().value((int) pred) + " - ");
            if (pred != test.instance(i).classValue()) {
                System.out.print("wrong");
            } else {
                System.out.print("correct");
            }
            System.out.println(" - " + Utils.arrayToString(dist));
        }
    }
}
训练了一个分类器,而后一条一条地过测试集,
double pred = classifier.classifyInstance(test.instance(i));是预测结果
double[] dist = classifier.distributionForInstance(test.instance(i));获得的是这条数据的预测各个类的几率
4.5 交叉验证并预测
package weka.api;

import java.util.Random;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Manual 10-fold cross-validation: shuffle, stratify (for a nominal
 * class), then for each fold train a fresh copy of the classifier and
 * accumulate results in a single Evaluation object.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\ionosphere.arff");
        data.setClassIndex(data.numAttributes() - 1);
        // Instantiate J48 reflectively with options ("-C 0.25").
        String[] options = new String[2];
        String classname = "weka.classifiers.trees.J48";
        options[0] = "-C";
        options[1] = "0.25";
        Classifier classifier = (Classifier) Utils.forName(Classifier.class, classname, options);
        int seed = 1234;
        int folds = 10;
        Random rand = new Random(seed);
        // Work on a shuffled copy; keep the original order intact.
        Instances newData = new Instances(data);
        newData.randomize(rand);
        // Stratify so each fold has roughly the same class proportions.
        if (newData.classAttribute().isNominal()) {
            newData.stratify(folds);
        }
        Evaluation eval = new Evaluation(newData);
        for (int i = 0; i < folds; i++) {
            Instances train = newData.trainCV(folds, i);
            Instances test = newData.testCV(folds, i);
            // Fresh, untrained copy per fold so folds don't contaminate
            // each other.
            Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);
            // Evaluation accumulates statistics across all folds.
            eval.evaluateModel(clsCopy, test);
        }
        System.out.println("===分类器设置===");
        System.out.println("分类器:" + Utils.toCommandLine(classifier));
        System.out.println("数据集:" + data.relationName());
        System.out.println("折数:" + folds);
        System.out.println("随机种子:" + seed);
        System.out.println();
        System.out.println(
                eval.toSummaryString("=== " + folds + "折交叉认证===", false));
    }
}
其实不难理解,不过有几个地方须要说
newData.randomize(rand);这个是将数据随机打乱
newData.stratify(folds);这个的api是这么写的
Stratifies a set of instances according to its class values if the class attribute is nominal (so that afterwards a stratified cross-validation can be performed).
意思应该是,若是这个类信息是标称的,那么咱们以后若是用的是n折的,好比99个个体共3类,每类都33个,那假如分3折,那前33个里应该每类大约11个左右这样。
4.6 交叉验证并保存预测结果
package weka.api;

import java.util.Random;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.attribute.AddClassification;

/**
 * Cross-validation that additionally collects the per-instance
 * predictions: AddClassification appends the predicted class, the class
 * distribution and an error flag as extra attributes to each test fold,
 * and the augmented folds are concatenated into predictedData.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\ionosphere.arff");
        data.setClassIndex(data.numAttributes() - 1);
        String[] options = new String[2];
        String classname = "weka.classifiers.trees.J48";
        options[0] = "-C";
        options[1] = "0.25";
        Classifier classifier = (Classifier) Utils.forName(Classifier.class, classname, options);
        int seed = 1234;
        int folds = 10;
        Random rand = new Random(seed);
        Instances newData = new Instances(data);
        newData.randomize(rand);
        if (newData.classAttribute().isNominal()) {
            newData.stratify(folds);
        }
        Instances predictedData = null;
        Evaluation eval = new Evaluation(newData);
        for (int i = 0; i < folds; i++) {
            Instances train = newData.trainCV(folds, i);
            Instances test = newData.testCV(folds, i);
            Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);
            eval.evaluateModel(clsCopy, test);
            // Filter that appends prediction, distribution and error flag.
            AddClassification filter = new AddClassification();
            filter.setClassifier(classifier);
            filter.setOutputClassification(true);
            filter.setOutputDistribution(true);
            filter.setOutputErrorFlag(true);
            filter.setInputFormat(train);
            // First pass over the training fold; its output is discarded.
            // NOTE(review): presumably this trains the filter's internal
            // classifier on the fold — confirm against AddClassification docs.
            Filter.useFilter(train, filter);
            // Second pass annotates the test fold with the predictions.
            Instances pred = Filter.useFilter(test, filter);
            if (predictedData == null)
                predictedData = new Instances(pred, 0);
            for (int j = 0; j < pred.numInstances(); j++)
                predictedData.add(pred.instance(j));
        }
        System.out.println("===分类器设置===");
        if (classifier instanceof OptionHandler)
            System.out.println("分类器: " + classifier.getClass().getName() + " "
                    + Utils.joinOptions(((OptionHandler) classifier).getOptions()));
        else
            System.out.println("分类器:" + Utils.toCommandLine(classifier));
        System.out.println("数据集:" + data.relationName());
        System.out.println("折数:" + folds);
        System.out.println("随机种子:" + seed);
        System.out.println();
        System.out.println(
                eval.toSummaryString("=== " + folds + "折交叉认证===", false));
    }
}
这个得好好掰扯掰扯
Classifier clsCopy = AbstractClassifier.makeCopy(classifier); clsCopy.buildClassifier(train);
建立了一空的原始的啥都不知道的分类器,而后再训练集进行了训练
eval.evaluateModel(clsCopy, test);
这是将这个训练好的分类器,运用到测试集上进行测试,这是个累加的过程,能够看到好比第一折测试的时候,测试集有35个,那么这个eval记录了这35个的测试结果,第二折测试集有31个,那么这个eval记录了35+31总共的分类结果。
AddClassification filter = new AddClassification(); filter.setClassifier(classifier); filter.setOutputClassification(true); filter.setOutputDistribution(true); filter.setOutputErrorFlag(true);
doc上写
用于将分类,类分布和错误标记添加到具备分类器的数据集的过滤器。 分类器是对数据自己进行培训或做为序列化模型提供。
其实应该相似于把这个空的Classifier包装了起来,包装成一个过滤器
filter.setInputFormat(train); Filter.useFilter(train, filter); Instances pred = Filter.useFilter(test, filter);
先设置数据,Filter.useFilter(train, filter);是训练,后一个是预测,运用这个过滤器,在预测的同时还会给数据后面加上三条属性。
可是这两条命令明明相同啊
以后就是把预测结果丢进去就能够了。