词项频率:
tf:term frequency。term在文档中出现的频率。tf越大,词项越重要。
文档频率:
df:document frequency。有多少文档包含此term。df越大,词项越不重要。
词项权重计算公式:
tf-idf(t,d) = tf(t,d) * log(N / df(t)),其中N为文档总数。
package com.javacore.algorithm;

import java.util.Arrays;
import java.util.List;

/**
 * Computes TF-IDF term weights over documents that are already tokenized into word lists.
 *
 * <p>Word comparison is case-insensitive throughout. The weight is
 * {@code tf(t, d) * ln(N / df(t))}, where {@code N} is the number of documents.
 *
 * Created by bee on 17/3/13.
 *
 * @version 1.0
 * @author blog.csdn.net/napoay
 */
public class TfIdfCal {

    /**
     * Calculates the term frequency: occurrences of {@code term} in {@code doc}
     * divided by the number of words in {@code doc}.
     *
     * @param doc  word vector of a doc
     * @param term a word
     * @return the relative frequency of the word in the doc; {@code 0.0} when the doc is
     *         empty or {@code term} is {@code null}
     */
    public double tf(List<String> doc, String term) {
        // An empty doc would otherwise produce 0/0 = NaN.
        if (doc.isEmpty() || term == null) {
            return 0.0;
        }
        int occurrences = 0;
        for (String word : doc) {
            // Called on term (known non-null) so a null token in the doc cannot throw.
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return (double) occurrences / doc.size();
    }

    /**
     * Calculates the document frequency: how many docs contain {@code term} at least once.
     *
     * <p>Prints a warning to standard output and returns 0 when {@code term} is
     * {@code null} or empty.
     *
     * @param docs the set of all docs
     * @param term a word
     * @return the number of docs which contain the word
     */
    public int df(List<List<String>> docs, String term) {
        // isEmpty() compares content; the reference test (term != "") only catches the
        // interned literal.
        if (term == null || term.isEmpty()) {
            System.out.println("term不能为null或者空串");
            return 0;
        }
        int containing = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    containing++;
                    break; // count each doc at most once
                }
            }
        }
        return containing;
    }

    /**
     * Calculates the inverse document frequency {@code ln(N / df)}.
     *
     * <p>Prints {@code N} and {@code DF} to standard output as a diagnostic.
     *
     * @param docs the set of all docs
     * @param term a word
     * @return idf; {@code 0.0} when no doc contains the term, so the term gets no weight
     *         instead of an infinite or NaN one
     */
    public double idf(List<List<String>> docs, String term) {
        int totalDocs = docs.size();
        System.out.println("N:" + totalDocs);
        // Computed once and reused for both the diagnostic and the result.
        int documentFrequency = df(docs, term);
        System.out.println("DF:" + documentFrequency);
        if (documentFrequency == 0) {
            // Avoids N/0 = Infinity (or 0/0 = NaN), which would make tfIdf return NaN.
            return 0.0;
        }
        return Math.log(totalDocs / (double) documentFrequency);
    }

    /**
     * Calculates the tf-idf weight of {@code term} in {@code doc} relative to {@code docs}.
     *
     * @param doc  a doc
     * @param docs document set
     * @param term a word
     * @return the tf-idf weight, i.e. {@code tf(doc, term) * idf(docs, term)}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("人工", "智能", "成为", "互联网", "大会", "焦点");
        List<String> doc2 = Arrays.asList("谷歌", "推出", "开源", "人工", "智能", "系统", "工具");
        List<String> doc3 = Arrays.asList("互联网", "的", "将来", "在", "人工", "智能");
        List<String> doc4 = Arrays.asList("谷歌", "开源", "机器", "学习", "工具");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3, doc4);

        TfIdfCal calculator = new TfIdfCal();
        System.out.println(calculator.tf(doc2, "开源"));
        System.out.println(calculator.df(documents, "开源"));

        double tfidf = calculator.tfIdf(doc2, documents, "谷歌");
        System.out.println("TF-IDF (谷歌) = " + tfidf);

        // Hand-computed cross-check: N = 4, df = 2, tf = 1/7.
        System.out.println(Math.log(4 / 2) * 1.0 / 7);
    }
}
运行结果:
0.14285714285714285
2
N:4
DF:2
TF-IDF (谷歌) = 0.09902102579427789