"""Count how often each word occurs in ``dream.txt``.

Steps:
1. Read the text from ``dream.txt``.
2. Replace punctuation and special blanks in every line with spaces.
3. Split every line on whitespace and collect the words in a list.
4. Count the occurrences of every word in a dict.

Topics practised: lists, sets, the ``re`` module, dicts.
"""
import re
import math
import copy

# Characters treated as word separators.
#
# ``\xa0`` is the non-breaking space (nbsp). The ordinary space is ``\x20``
# and lies in the printable ASCII range 0x20-0x7e, whereas ``\xa0`` belongs
# to the latin1 (ISO/IEC 8859-1) extended set. ``\u3000`` is the full-width
# (ideographic) space. ``\xb7`` is the middle dot.
#
# Besides those, the class lists the ASCII control characters and the space
# (``\x00-\x20``) and the ASCII punctuation to remove. Digits, letters and
# the underscore are deliberately not separators.
SEPARATORS = re.compile(
    r"[\x00-\x20\xa0\u3000\xb7!\"#$%&'()*+,\-.@\\^={}\[\]/<>~`?:;|]"
)


def extract_words(lines):
    """Return all words found in *lines*, in order of appearance.

    Args:
        lines: An iterable of strings (for example an open text file).

    Returns:
        A list of the non-empty tokens that remain after every separator
        character has been replaced by a space and each line has been
        split on whitespace.
    """
    found = []
    for line in lines:
        # str.split() without arguments drops empty tokens and also splits
        # on tabs, so no per-token emptiness check or strip is needed.
        found.extend(SEPARATORS.sub(" ", line).split())
    return found


def word_count(wrods, set_words):
    """Count how many times each word of *set_words* occurs in *wrods*.

    Args:
        wrods: Iterable of words, duplicates included. (The misspelled
            parameter name is kept so keyword callers keep working.)
        set_words: The distinct words to report; each one is present in
            the result, with a count of 0 if it never occurs.

    Returns:
        A new dict mapping every word of *set_words* to its count.

    Raises:
        KeyError: If *wrods* contains a word that is not in *set_words*.
    """
    counts = dict.fromkeys(set_words, 0)
    for word in wrods:
        counts[word] += 1
    return counts


def main(path="dream.txt"):
    """Read *path*, count its words and print the statistics."""
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line up front.
        words = extract_words(f)

    set_words = set(words)
    words_count = word_count(words, set_words)

    print("总共单词个数(含重复):", len(words))
    print("总共单词个数(不含重复):", len(set_words))
    print("单词和出现的次数:", words_count)


if __name__ == "__main__":
    main()