LTP is an open-source Chinese language processing system from Harbin Institute of Technology (HIT). It covers the basic NLP tasks: word segmentation, part-of-speech tagging, named entity recognition, dependency parsing, semantic role labeling, semantic dependency parsing, and so on.
【Analysis of Open-Source Chinese Word Segmenters】 series:
Like THULAC, LTP is based on the structured perceptron (SP), which models the score of a tag sequence \(Y\) given an input sequence \(X\) as

\[ S(Y,X) = \sum_s \alpha_s \Phi_s(Y,X) \]

where \(\Phi_s(Y,X)\) is a local feature function and \(\alpha_s\) its weight. Chinese word segmentation is then equivalent to finding, for a given sequence \(X\), the tag sequence \(Y\) that maximizes the score function:

\[ \mathop{\arg \max}_Y S(Y,X) \]
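With first-order (tag-bigram) features the score decomposes over character positions, which is what makes exact decoding tractable. A minimal sketch of that decomposition, with hypothetical emit/trans arrays standing in for the summed feature weights:

// emit[i][t]: total weight of local features fired at character i with tag t
// trans[s][t]: weight of the tag transition s -> t
static double score(double[][] emit, double[][] trans, int[] tags) {
    double s = 0.0;
    for (int i = 0; i < tags.length; i++) {
        s += emit[i][tags[i]];                // local (emission) features
        if (i > 0) {
            s += trans[tags[i - 1]][tags[i]]; // transition features
        }
    }
    return s;
}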
The source code analysis below is based on version 3.4.0.
The segmentation pipeline is much the same as in other segmenters: extract character features, compute the feature scores, then run Viterbi decoding. See __ltp_dll_segmentor_wrapper::segment():
int segment(const char *str, std::vector<std::string> &words) {
  ltp::framework::ViterbiFeatureContext ctx;
  ltp::framework::ViterbiScoreMatrix scm;
  ltp::framework::ViterbiDecoder decoder;
  ltp::segmentor::Instance inst;

  // split the input into characters: raw forms, normalized forms, character types
  int ret = preprocessor.preprocess(str, inst.raw_forms, inst.forms, inst.chartypes);

  if (-1 == ret || 0 == ret) {
    words.clear();
    return 0;
  }

  ltp::segmentor::SegmentationConstrain con;
  con.regist(&(inst.chartypes));
  build_lexicon_match_state(lexicons, &inst);        // match characters against the internal lexicon
  extract_features(inst, model, &ctx, false);        // extract per-character features
  calculate_scores(inst, (*model), ctx, true, &scm); // fill the Viterbi score matrix

  // allocate a new decoder so that the segmentor support multithreaded
  // decoding. this modification was committed by niuox
  decoder.decode(scm, con, inst.predict_tagsidx);
  build_words(inst.raw_forms, inst.predict_tagsidx, words);

  return words.size();
}
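LTP's own ViterbiDecoder additionally honours the SegmentationConstrain, which is omitted here; the sketch below is only the standard first-order Viterbi recursion over a matrix of per-character emission scores plus tag-transition scores (the segmentor tags characters with a four-tag b/i/e/s-style scheme), and all names are hypothetical:

// Hypothetical Viterbi decode: emit[len][T] holds per-character scores for each
// of the T tags, trans[T][T] the tag-transition scores; returns the best tag ids.
static int[] viterbi(double[][] emit, double[][] trans) {
    int len = emit.length, T = emit[0].length;
    double[][] dp = new double[len][T];
    int[][] back = new int[len][T];
    dp[0] = emit[0].clone();
    for (int i = 1; i < len; i++) {
        for (int t = 0; t < T; t++) {
            double best = Double.NEGATIVE_INFINITY;
            for (int s = 0; s < T; s++) {
                double cand = dp[i - 1][s] + trans[s][t] + emit[i][t];
                if (cand > best) { best = cand; back[i][t] = s; }
            }
            dp[i][t] = best;
        }
    }
    // pick the best final tag and backtrack
    int[] tags = new int[len];
    int bestT = 0;
    for (int t = 1; t < T; t++) if (dp[len - 1][t] > dp[len - 1][bestT]) bestT = t;
    tags[len - 1] = bestT;
    for (int i = len - 1; i > 0; i--) tags[i - 1] = back[i][tags[i]];
    return tags;
}

The resulting tag ids play the role of inst.predict_tagsidx, which build_words then merges back into words.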
The model file cws.model contains the labels, the feature space, the weights, the internal lexicon, and so on. I re-implemented the model parsing in Java:
DataInputStream is = new DataInputStream(new FileInputStream(path));
char[] octws = readCharArray(is, 128);

// 1. read label
SmartMap label = readSmartMap(is);
int[] entries = readIntArray(is, label.numEntries);

// 2. read feature space
char[] space = readCharArray(is, 16);
int offset = readInt(is);
int sz = readInt(is);
SmartMap[] dicts = new SmartMap[sz];
for (int i = 0; i < sz; i++) {
    dicts[i] = readSmartMap(is);
}

// 3. read param
char[] param = readCharArray(is, 16);
int dim = readInt(is);
double[] w = readDoubleArray(is, dim);
double[] wSum = readDoubleArray(is, dim);
int lastTimestamp = readInt(is);

// 4. read internal lexicon
SmartMap internalLexicon = readSmartMap(is);

// read char array
private static char[] readCharArray(DataInputStream is, int length) throws IOException {
    char[] chars = new char[length];
    for (int i = 0; i < length; i++) {
        chars[i] = (char) is.read();
    }
    return chars;
}

// read int array
private static int[] readIntArray(DataInputStream is, int length) throws IOException {
    byte[] bytes = new byte[4 * length];
    is.readFully(bytes); // read() may return early; readFully guarantees 4 * length bytes
    IntBuffer intBuffer = ByteBuffer.wrap(bytes)
            .order(ByteOrder.LITTLE_ENDIAN)
            .asIntBuffer();
    int[] array = new int[length];
    intBuffer.get(array);
    return array;
}
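readInt, readSmartMap, and readDoubleArray are the remaining helpers. readDoubleArray is not shown above; assuming the weight vectors are serialized as consecutive little-endian doubles, mirroring readIntArray, a minimal sketch looks like this:

// assumption: the doubles are stored back-to-back in little-endian order,
// exactly like the int arrays above
private static double[] readDoubleArray(DataInputStream is, int length) throws IOException {
    byte[] bytes = new byte[8 * length];
    is.readFully(bytes);
    DoubleBuffer buffer = ByteBuffer.wrap(bytes)
            .order(ByteOrder.LITTLE_ENDIAN)
            .asDoubleBuffer();
    double[] array = new double[length];
    buffer.get(array);
    return array;
}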
LTP uses 15 classes of features in total, hence sz is 15. Each feature class is stored as a map, which LTP calls SmartMap; judging from the code it is essentially a hash map. Segmenter benchmarks show that LTP segments more slowly than THULAC. The reason is that THULAC represents its model with a double-array trie, so its feature lookup is faster than LTP's.
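To see where those hash lookups happen, here is a rough sketch of how the per-character scores could be assembled from the parsed model, treating each SmartMap as a plain Map<String, Integer> from feature string to feature id and assuming the weight of (feature, label) lives at w[id * numLabels + label]; the indexing is an illustration of the idea, not necessarily LTP's exact storage layout:

// featureStrings: the 15 feature strings generated for one character
// dicts: one map per feature template, parsed from the model file
static double[] emissionScores(String[] featureStrings, List<Map<String, Integer>> dicts,
                               double[] w, int numLabels) {
    double[] scores = new double[numLabels];
    for (int tpl = 0; tpl < featureStrings.length; tpl++) {
        Integer id = dicts.get(tpl).get(featureStrings[tpl]); // one hash lookup per template
        if (id == null) continue;                             // unseen feature: no contribution
        for (int label = 0; label < numLabels; label++) {
            scores[label] += w[id * numLabels + label];
        }
    }
    return scores;
}

Every character therefore triggers 15 string hash lookups (hashing plus equality checks on full feature strings), whereas a double-array trie walks a feature string character by character, which is where THULAC gains its speed.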
The features used by LTP fall roughly into the following categories: character unigram features (c-2 through c+2), bigram features of adjacent characters, character-type features (ct), and lexicon-match features (lex1, lex2, lex3). The source is in extractor.cpp:
Extractor::Extractor() {
  // delimit feature templates
  templates.push_back(new Template("1={c-2}"));
  templates.push_back(new Template("2={c-1}"));
  templates.push_back(new Template("3={c-0}"));
  templates.push_back(new Template("4={c+1}"));
  templates.push_back(new Template("5={c+2}"));
  templates.push_back(new Template("6={c-2}-{c-1}"));
  templates.push_back(new Template("7={c-1}-{c-0}"));
  templates.push_back(new Template("8={c-0}-{c+1}"));
  templates.push_back(new Template("9={c+1}-{c+2}"));
  templates.push_back(new Template("14={ct-1}"));
  templates.push_back(new Template("15={ct-0}"));
  templates.push_back(new Template("16={ct+1}"));
  templates.push_back(new Template("17={lex1}"));
  templates.push_back(new Template("18={lex2}"));
  templates.push_back(new Template("19={lex3}"));
}

// filling the template variables for the character at position idx:
#define TYPE(x) (strutils::to_str(inst.chartypes[(x)]&0x07))
data.set("c-2", (idx - 2 < 0 ? BOS : inst.forms[idx - 2]));
data.set("c-1", (idx - 1 < 0 ? BOS : inst.forms[idx - 1]));
data.set("c-0", inst.forms[idx]);
data.set("c+1", (idx + 1 >= len ? EOS : inst.forms[idx + 1]));
data.set("c+2", (idx + 2 >= len ? EOS : inst.forms[idx + 2]));
data.set("ct-1", (idx - 1 < 0 ? BOT : TYPE(idx - 1)));
data.set("ct-0", TYPE(idx));
data.set("ct+1", (idx + 1 >= len ? EOT : TYPE(idx + 1)));
data.set("lex1", strutils::to_str(inst.lexicon_match_state[idx] & 0x0f));
data.set("lex2", strutils::to_str((inst.lexicon_match_state[idx] >> 4) & 0x0f));
data.set("lex3", strutils::to_str((inst.lexicon_match_state[idx] >> 8) & 0x0f));
#undef TYPE
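To make the templates concrete, here is a Java sketch of the feature strings generated for position idx; the method name, the _BOS_/_EOS_/_BOT_/_EOT_ placeholder values, and the plain arrays standing in for inst.forms, inst.chartypes and inst.lexicon_match_state are all mine:

// chars: the characters of the sentence; chartypes / lexiconState mirror
// inst.chartypes and inst.lexicon_match_state from the C++ code above
static String[] featuresAt(String[] chars, int[] chartypes, int[] lexiconState, int idx) {
    int len = chars.length;
    String cm2 = idx - 2 < 0 ? "_BOS_" : chars[idx - 2];
    String cm1 = idx - 1 < 0 ? "_BOS_" : chars[idx - 1];
    String c0  = chars[idx];
    String cp1 = idx + 1 >= len ? "_EOS_" : chars[idx + 1];
    String cp2 = idx + 2 >= len ? "_EOS_" : chars[idx + 2];
    return new String[] {
        // templates 1-5: character unigrams in a window of two
        "1=" + cm2, "2=" + cm1, "3=" + c0, "4=" + cp1, "5=" + cp2,
        // templates 6-9: bigrams of adjacent characters
        "6=" + cm2 + "-" + cm1, "7=" + cm1 + "-" + c0,
        "8=" + c0 + "-" + cp1, "9=" + cp1 + "-" + cp2,
        // templates 14-16: character types (low 3 bits of the type flag)
        "14=" + (idx - 1 < 0 ? "_BOT_" : "" + (chartypes[idx - 1] & 0x07)),
        "15=" + (chartypes[idx] & 0x07),
        "16=" + (idx + 1 >= len ? "_EOT_" : "" + (chartypes[idx + 1] & 0x07)),
        // templates 17-19: the three 4-bit fields of the lexicon-match state
        "17=" + (lexiconState[idx] & 0x0f),
        "18=" + ((lexiconState[idx] >> 4) & 0x0f),
        "19=" + ((lexiconState[idx] >> 8) & 0x0f)
    };
}

These are exactly the strings that are then looked up in the 15 SmartMaps of the feature space.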