Weka 3.7.12 Source Code Study, Reading, and Analysis (3)

First, a note: in my view, before reading the source code of any algorithm implemented in Weka, you should already understand the algorithm's underlying principles; only then will reading the implementation yield real insight. For that reason, this post does not explain the theory of the algorithm itself, for which I ask the reader's understanding.

Likewise, NaiveBayes builds on AbstractClassifier and additionally implements several interfaces:

OptionHandler: exposes the options the classifier understands as an enumeration

WeightedInstancesHandler: implemented because the classifier can make use of the information carried by instance weights

TechnicalInformationHandler: returns technical information about the classifier and references to the relevant publications

Aggregateable<NaiveBayes>: allows several NaiveBayes models to be merged (aggregated) into one (I do not yet fully understand this part; see the sketch below)
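For context, the Aggregateable contract consists of aggregate() and finalizeAggregation(), so the idea appears to be that several independently trained NaiveBayes models (for example, built on different partitions of one data set) can be merged into a single model. The following is only a rough sketch under that reading; the ARFF path "some.arff", the class name AggregateSketch, and the half/half split are placeholders of mine, not part of the Weka source:

import weka.classifiers.bayes.NaiveBayes;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AggregateSketch {
  public static void main(String[] args) throws Exception {
    // "some.arff" is a placeholder path; the last attribute is assumed to be the class
    Instances data = DataSource.read("some.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // split the data into two halves, just to have something to aggregate
    int half = data.numInstances() / 2;
    Instances split1 = new Instances(data, 0, half);
    Instances split2 = new Instances(data, half, data.numInstances() - half);

    NaiveBayes partA = new NaiveBayes();
    partA.buildClassifier(split1);
    NaiveBayes partB = new NaiveBayes();
    partB.buildClassifier(split2);

    partA.aggregate(partB);       // fold partB's estimators into partA
    partA.finalizeAggregation();  // complete the aggregation, as the interface requires
    System.out.println(partA);    // partA now reflects both halves of the data
  }
}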

The concrete implementation:

public class NaiveBayes extends AbstractClassifier implements OptionHandler,
  WeightedInstancesHandler, TechnicalInformationHandler,
  Aggregateable<NaiveBayes> {

  static final long serialVersionUID = 5995231201785697655L; // for serialization

  protected Estimator[][] m_Distributions; // the attribute estimators, one per (attribute, class) pair

  protected Estimator m_ClassDistribution; // the estimator for the class distribution

  protected boolean m_UseKernelEstimator = false; // whether to use a kernel density estimator instead of a normal distribution for numeric attributes

  protected boolean m_UseDiscretization = false; // whether to apply supervised discretization to numeric attributes

  protected int m_NumClasses; // the number of classes

  protected Instances m_Instances; // a copy of the training instances

  protected static final double DEFAULT_NUM_PRECISION = 0.01; // the default precision for numeric attribute estimators

  protected weka.filters.supervised.attribute.Discretize m_Disc = null; // the discretization filter used when m_UseDiscretization is true
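In passing, the two booleans above are what the classifier's -K (kernel estimator) and -D (supervised discretization) command-line options toggle. Here is a minimal sketch of flipping them through the OptionHandler interface; the class name NaiveBayesOptionsSketch is just a placeholder of mine:

import weka.classifiers.bayes.NaiveBayes;

public class NaiveBayesOptionsSketch {
  public static void main(String[] args) throws Exception {
    NaiveBayes nb = new NaiveBayes();
    nb.setOptions(new String[] { "-K" });                    // switches the kernel estimator on
    System.out.println(nb.getUseKernelEstimator());          // true
    System.out.println(nb.getUseSupervisedDiscretization()); // false, since -D was not given
  }
}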

// Below is the global description of the algorithm

public String globalInfo() {
    return "Class for a Naive Bayes classifier using estimator classes. Numeric"
      + " estimator precision values are chosen based on analysis of the "
      + " training data. For this reason, the classifier is not an"
      + " UpdateableClassifier (which in typical usage are initialized with zero"
      + " training instances) -- if you need the UpdateableClassifier functionality,"
      + " use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable"
      + " classifier will  use a default precision of 0.1 for numeric attributes"
      + " when buildClassifier is called with zero training instances.\n\n"
      + "For more information on Naive Bayes classifiers, see\n\n"
      + getTechnicalInformation().toString();
  }

// getTechnicalInformation() sets up the technical information; the TechnicalInformation object is built by enumerating the individual entries under Field.

@Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "George H. John and Pat Langley");
    result.setValue(Field.TITLE,
      "Estimating Continuous Distributions in Bayesian Classifiers");
    result.setValue(Field.BOOKTITLE,
      "Eleventh Conference on Uncertainty in Artificial Intelligence");
    result.setValue(Field.YEAR, "1995");
    result.setValue(Field.PAGES, "338-345");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    result.setValue(Field.ADDRESS, "San Mateo");

    return result;
  }

// Purpose: override the buildClassifier() method to train the classifier

@Override
  public void buildClassifier(Instances instances) throws Exception {

    // check whether the classifier can handle the given training data
    getCapabilities().testWithFail(instances);

    // remove instances whose class value is missing
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    // get the number of classes
    m_NumClasses = instances.numClasses();

    // work on a copy of the training instances rather than modifying the caller's data
    m_Instances = new Instances(instances);

    // if supervised discretization was requested, discretize the data first; some estimators handle continuous values poorly or with reduced precision
    if (m_UseDiscretization) {
      m_Disc = new weka.filters.supervised.attribute.Discretize();
      m_Disc.setInputFormat(m_Instances);
      m_Instances = weka.filters.Filter.useFilter(m_Instances, m_Disc);
    } else {
      m_Disc = null;
    }

    // reserve space for the estimators: one per (attribute, class) pair, plus one for the class distribution
    m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances
      .numClasses()];
    m_ClassDistribution = new DiscreteEstimator(m_Instances.numClasses(), true);
    int attIndex = 0;
    Enumeration<Attribute> enu = m_Instances.enumerateAttributes();
    while (enu.hasMoreElements()) {
      Attribute attribute = enu.nextElement();

      // if the attribute is numeric, estimate its precision as the mean gap between
      // adjacent distinct sorted values (e.g. for sorted values 1.0, 1.5, 2.5 this gives (0.5 + 1.0) / 2 = 0.75)
      double numPrecision = DEFAULT_NUM_PRECISION;
      if (attribute.type() == Attribute.NUMERIC) {
        m_Instances.sort(attribute);
        if ((m_Instances.numInstances() > 0)
          && !m_Instances.instance(0).isMissing(attribute)) {
          double lastVal = m_Instances.instance(0).value(attribute);
          double currentVal, deltaSum = 0;
          int distinct = 0;
          for (int i = 1; i < m_Instances.numInstances(); i++) {
            Instance currentInst = m_Instances.instance(i);
            if (currentInst.isMissing(attribute)) {
              break;
            }
            currentVal = currentInst.value(attribute);
            if (currentVal != lastVal) {
              deltaSum += currentVal - lastVal;
              lastVal = currentVal;
              distinct++;
            }
          }
          if (distinct > 0) {
            numPrecision = deltaSum / distinct;
          }
        }
      }

      for (int j = 0; j < m_Instances.numClasses(); j++) {
        switch (attribute.type()) {
        case Attribute.NUMERIC:
          if (m_UseKernelEstimator) {
            m_Distributions[attIndex][j] = new KernelEstimator(numPrecision);
          } else {
            m_Distributions[attIndex][j] = new NormalEstimator(numPrecision);
          }
          break;
        case Attribute.NOMINAL:
          m_Distributions[attIndex][j] = new DiscreteEstimator(
            attribute.numValues(), true);
          break;
        default:
          throw new Exception("Attribute type unknown to NaiveBayes");
        }
      }
      attIndex++;
    }

    // compute the counts by updating the estimators with every training instance
    Enumeration<Instance> enumInsts = m_Instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      Instance instance = enumInsts.nextElement();
      updateClassifier(instance);
    }

    // save space: keep only the header of the training data
    m_Instances = new Instances(m_Instances, 0);
  }
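To tie the walkthrough together, here is a minimal end-to-end sketch that trains NaiveBayes through the buildClassifier() path analysed above and then asks it for a prediction. The ARFF path "weather.arff" and the class name NaiveBayesDemo are placeholders; the last attribute is assumed to be the class:

import java.util.Arrays;

import weka.classifiers.bayes.NaiveBayes;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NaiveBayesDemo {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("weather.arff"); // placeholder path to any ARFF file
    data.setClassIndex(data.numAttributes() - 1);     // assume the last attribute is the class

    NaiveBayes nb = new NaiveBayes();
    nb.setUseKernelEstimator(false);          // NormalEstimator for numeric attributes
    nb.setUseSupervisedDiscretization(false); // no discretization filter
    nb.buildClassifier(data);                 // runs the method analysed above

    Instance first = data.instance(0);
    System.out.println("Predicted class index: " + nb.classifyInstance(first));
    System.out.println("Class distribution:    " + Arrays.toString(nb.distributionForInstance(first)));
  }
}

Note that after training, m_Instances keeps only the data set header (the last statement of buildClassifier()), so the model itself stays small.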
