哈夫曼编码是一种变长编码,根据字符频率确定编码的长度。在学习数据结构时,我们知道,通过贪心的策略自底向上构造二叉树,最后得到哈夫曼树。从根节点遍历,即可得到编码。
本文给出了经典教材《数据结构》一书上算法6.12的具体实现细节。
构造二叉树的过程为:初始为所有字符的 \(n\) 个叶子节点,每次选择权值最小的两个根节点合并,形成新的节点,其权值为合并的两节点权值之和。引入 parent
作为是否为根节点的判断标志。
\(n\) 个节点完成 \(n-1\) 次合并操作,形成共包含 \(2n-1\) 个节点的二叉树,树的根节点编号为 \(2n-1\) 。
// Huffman tree node. Nodes live in an array and refer to each other by
// index; index 0 means "none", so a node whose parent is 0 is still a root.
typedef struct
{
    char   data;    // character stored in the node (0 for internal nodes)
    double weight;  // weight of the node
    int    parent;  // index of the parent node
    int    lchild;  // index of the left child
    int    rchild;  // index of the right child
} HfmTNode, *HuffmanTree;

// One entry of the code table: maps a character to its code string.
typedef struct
{
    char  letter;   // the character
    char *code;     // its Huffman code as a NUL-terminated string of '0'/'1'
} HfmCNode, *HuffmanCode;

// Everything belonging to one Huffman coding.
typedef struct
{
    HuffmanTree tree;       // node array
    HuffmanCode code;       // code table
    int         n;          // size of the character set
    char       *letters;    // the character set
    int        *frequency;  // frequency of each character
    int         rt;         // index of the tree root, i.e. `tree[2n-1]`
} Huffman;
参考《数据结构(C语言版)》
P147 算法 6.12
要得到哈夫曼编码,依次调用 initHuffman、buildHuffmanTree、getHuffmanCode:
// 初始化哈夫曼 void initHuffman(Huffman *hfm, const char *letters, const int frequency[], int n) { if (n<1) return; int m = 2*n-1; hfm->n = n; hfm->letters = (char*)malloc((n+1)*sizeof(char)); hfm->frequency = (int*)malloc((n+1)*sizeof(int)); hfm->tree = (HuffmanTree)malloc((m+1)* sizeof(HfmTNode)); hfm->rt = m; for (int i=1;i<=n;i++) { hfm->letters[i] = letters[i-1]; hfm->frequency[i] = frequency[i-1]; } for (int i=1;i<=n;i++) hfm->tree[i] = (HfmTNode){letters[i-1], frequency[i-1], 0, 0, 0}; for (int i=n+1;i<2*n;i++) hfm->tree[i] = (HfmTNode){0, 0, 0, 0, 0}; for(int i=n+1;i<=m;i++) { hfm->tree[i].weight = 0; hfm->tree[i].lchild = hfm->tree[i].rchild = hfm->tree[i].parent = 0; } } // 创建哈夫曼树 void buildHuffmanTree(Huffman *hfm) { // 创建哈夫曼树 int n = hfm->n; int m = 2*n-1; for(int i=n+1;i<=m;i++) { int p1 = 1, p2 = 1; // p1记录最小结点位置, p2记录第二小 while(p1<=i-1 && hfm->tree[p1].parent) p1++; p2 = p1+1; while(p2<=i-1 && hfm->tree[p2].parent) p2++; for(int j=p1+1;j<=i-1;j++) { if (hfm->tree[j].parent) continue; // 非根节点 if(hfm->tree[j].weight<=hfm->tree[p1].weight) { p2 = p1, p1 = j; } else if(hfm->tree[j].weight<hfm->tree[p2].weight) { p2 = j; } } hfm->tree[i].weight = hfm->tree[p1].weight + hfm->tree[p2].weight; hfm->tree[i].lchild = p1; hfm->tree[i].rchild = p2; hfm->tree[p1].parent = i; hfm->tree[p2].parent = i; } } // 获取哈夫曼编码 void getHuffmanCode(Huffman *hfm) { // 求赫夫曼编码 int n = hfm->n; hfm->code = (HuffmanCode)malloc((n+1)*sizeof(HfmCNode)); for (int i=1;i<=n;i++) hfm->code[i] = (HfmCNode){hfm->letters[i], ""}; char *code = (char *)malloc(n*sizeof(char)); code[n-1] = '\0'; for(int i=1;i<=n;i++) { int start = n-1; int c = i, f = hfm->tree[i].parent; while(f) { if(c==hfm->tree[f].lchild) code[--start] = '0'; else code[--start] = '1'; c = f; f = hfm->tree[c].parent; } hfm->code[i].code = (char*)malloc((n-start)*sizeof(char)); strcpy(hfm->code[i].code, &code[start]); } free(code); } // 凹入表示法输出 void showHuffmanTree(Huffman *hfm, int rt=-1, int level=0) { if (rt==0) return ; if 
(rt==-1) { printf("HuffmanCode:\n"); for (int i=1;i<=hfm->n;i++) { // printf("%c\n", hfm->letters[i]); // printf("%c\n", hfm->tree[i].data); printf("%c:%s\n", hfm->code[i].letter, hfm->code[i].code); } rt = hfm->rt; printf("HuffmanTree:\n"); } int i; for(i=0;i<level;i++) printf(" "); if (hfm->tree[rt].data==0) printf("**\n"); else printf("%c:%s\n", hfm->tree[rt].data, hfm->code[rt].code); showHuffmanTree(hfm, hfm->tree[rt].lchild, level+1); showHuffmanTree(hfm, hfm->tree[rt].rchild, level+1); }
图方便,编码和译码直接使用了 C++ 的 string
类型,而不是 C 风格的字符串(本质上是 char*
字符数组)。
// 编码 string Encode(Huffman *hfm, const char *input) { int cnt = 0; string output = ""; for (int i=0;input[i];i++) { char c = input[i]; for (int i=1;i<=hfm->n;i++) { if (hfm->code[i].letter==c) { output += hfm->code[i].code; break; } } if (++cnt<=10) cout<<output<<endl; } return output; } // 译码 string Decode(Huffman *hfm, const char *input) { int p = hfm->rt; string output = ""; for (int i=0;input[i];i++) { char c = input[i]; if(c=='0') p = hfm->tree[p].lchild; else p = hfm->tree[p].rchild; if(p<=hfm->n) // 翻译到叶子节点 { output += hfm->tree[p].data; p = hfm->rt; } } return output; }
// 统计文章字符频率 创建哈夫曼树 void readTxt2Huffman(const char *filename, Huffman *hfm) { FILE *fp = fopen(filename, "r"); if (fp==NULL) return; char *letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ,.;\'\""; int frequency[58] = {0}; // 2*26个字母 空格 逗号 句号 分号 单引号 双引号 while(1) { char c = fgetc(fp); if (feof(fp)) break; // if (c>='a' && c<='z') c += 'A' - 'a'; if (c>='a' && c<='z') frequency[c-'a']++; else if (c>='A' && c<='Z') frequency[c-'A'+26]++; else if (c==' ') frequency[52]++; else if(c==',') frequency[53]++; else if(c=='.') frequency[54]++; else if(c==';') frequency[55]++; else if(c=='\'') frequency[56]++; else if(c=='\"') frequency[57]++; // else printf("%c\n", c); } initHuffman(hfm, letters, frequency, 58); buildHuffmanTree(hfm); getHuffmanCode(hfm); } // 读文件,返回char*字符串 char* readText(const char* filename) { char* text; FILE *pf = fopen(filename, "r"); if (pf==NULL) { printf("文件%s不存在\n", filename); return ""; } fseek(pf, 0, SEEK_END); long lSize = ftell(pf); text = (char*)malloc(lSize+1); rewind(pf); fread(text, sizeof(char), lSize, pf); text[lSize] = '\0'; return text; } int main() { /* Huffman hfm; int w[6] = {1, 2, 3, 4, 6, 8}; initHuffman(&hfm, "abcdef", w, 6); buildHuffmanTree(&hfm); getHuffmanCode(&hfm); for (int i=1;i<=6;i++) { printf("%c\n", hfm.letters[i]); printf("%c\n", hfm.tree[i].data); printf("%s\n", hfm.code[i].code); } showHuffmanTree(&hfm); cout<<Encode(&hfm, "bacbefd")<<endl; cout<<Decode(&hfm, "100110001011001011100")<<endl; */ // 测试读文件,完成编码,译码 const char *filename = "article.txt"; Huffman hfm; readTxt2Huffman(filename, &hfm); showHuffmanTree(&hfm); char text[5000]; strcpy(text, readText(filename)); // printf("加密前:\n"); // printf("%s\n", text); // printf("加密后:\n"); string text_encode = Encode(&hfm, text); cout<<text_encode<<endl; cout<<Decode(&hfm, text_encode.c_str())<<endl; return 0; }
任务一需要从控制台读入,需要按 Ctrl+Z 终止输入;以 2==scanf() 作为循环条件,输入结束时跳出循环。
分配内存使用 malloc 时,单块内存大小 sizeof(xxx) 写错了类型,导致程序无输出也没有报错,花费很长时间才定位到错误。正确的写法为:
hfm->code = (HuffmanCode)malloc((n+1)*sizeof(HfmCNode))
读取文章能正常创建哈夫曼树并编码,译码过程出错。通过输出译码过程,检查到字符集(包含小写)与译码规则不一致,需要对大小写特判。完善字符集,将包含大小写和各种符号的字符集作为输入,即可直接译码得到原始输入。
本人学习《数据结构》这门课是在大一C语言刚结束之后,彼时对C语言的核心——指针还没完全琢磨透彻。学习数据结构也仅仅按部就班完成了书上的课程实验,现在回头看过去写的代码,不仅代码风格凌乱,也存在内存泄漏的隐患。本次帮学弟写作业的同时,顺便重构了过去的代码。最近需要用C/C++进行k-means的算法优化,也借此好好熟悉一番传统的C/C++。
(完)