"""Print the 20 most frequent words of ``article.txt`` after jieba segmentation."""
from collections import Counter

# Characters that are replaced by a space before segmentation.
# NOTE(review): the first character is the CJK numeral '一', not a dash, so
# every '一' in the text is blanked out and entries such as '一个' / '一点' in
# EXCLUDE can never match -- confirm whether '—' was intended here.
PUNCTUATION = '''一!“”,。?;’"',.、:\n'''

# Tokens that are left out of the ranking.
EXCLUDE = {
    '说', '有', '得', '没', '的', '他', '了', '她', '是', '在', '—', '你', '走',
    '对', '他们', '着', '把', '不', '也', '我', '人', '而', '与', '就', '但是',
    '那', '要', '又', '想', '和', '一个', ' ', '呢', '很', '一点', '都', '去',
    '没有', '个', '上', '给', '来', '还', '到', '这', '\u3000', '点', '小', '看',
}

# Built once: maps every character of PUNCTUATION to a single space.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(PUNCTUATION, ' '))


def strip_punctuation(text):
    """Return *text* with every character of PUNCTUATION replaced by a space."""
    # One translate pass instead of one str.replace call per character.
    return text.translate(_PUNCTUATION_TABLE)


def top_words(words, exclude=EXCLUDE, limit=20):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Args:
        words: iterable of tokens (for example the output of ``jieba.cut``).
        exclude: container of tokens that must not be counted.
        limit: maximum number of pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep the order in which they first appear in
        *words*. The list is shorter than *limit* when fewer distinct words
        remain, so a short input no longer raises ``IndexError``.
    """
    # Counter tallies in a single pass; the original called list.count()
    # once per distinct word, which is quadratic in the text length.
    counts = Counter(word for word in words if word not in exclude)
    return counts.most_common(limit)


def main(path='article.txt', encoding=None):
    """Read *path*, segment it with jieba and print the most frequent words.

    Args:
        path: text file to analyse.
        encoding: passed to ``open``; ``None`` keeps the platform default
            that the script has always relied on.
    """
    # Imported here so the pure helpers above stay usable without jieba.
    import jieba

    # The context manager closes the file even if read() raises.
    with open(path, 'r', encoding=encoding) as source:
        text = strip_punctuation(source.read())

    # Register the name so the segmenter keeps it as a single token.
    jieba.add_word('钱先生')

    for entry in top_words(jieba.cut(text)):
        print(entry)


if __name__ == '__main__':
    main()
运行结果:
('日本', 665)
('本身', 647)
('什么', 608)
('老人', 536)
('瑞宣', 422)
('好', 415)
('知道', 394)
('北平', 350)
('钱', 338)
('起来', 295)
('钱先生', 291)
('里', 291)
('先生', 290)
('并', 286)
('象', 284)
('能', 282)
('似的', 280)
('那么', 279)
('不能', 279)
('会', 267)