基于字符串的分离连接hash算法

时间 2020-06-09

标签基于字符串分离连接 hash 算法繁體版

原文原文链接

Hashes
问题：你有一个很大的字符串数组，需要知道另一个字符串是否在这个字符串数组中。你可能会将这个字符串与数组中的字符串依次做比较，但在实际中你会发现这种方法太慢，必须另找方法。可是，除了依次比较字符串外，还有没有其它方法可以知道某个字符串是否存在呢？
解决方案：Hashes。Hashes是用小的数据类型（如数字）来表示其它大的数据类型（一般是字符串）。在这种情形下，你可以将字符串存储在hash数组中，然后计算要查找字符串的hash值，用这个hash值与数组中的hash值进行比较。如果在hash数组中有一个hash值与要查询的hash值相等，则说明这个字符串可能存在；由于不同的字符串可能得到相同的hash值（冲突），还需要再比较字符串本身才能最终确认。这个方法称为索引(indexing)。
本文采用分离连接hash算法来实现基于字符串的hash算法，并且可以统计某个字符串出现的次数。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One node of a bucket's collision chain. */
struct hash_node {
    char *value;   /* string data, dynamically allocated */
    int count;      /* number of times this string has occurred */
    struct hash_node * next; /* next node in the chain (separate chaining resolves collisions) */
};
/* Hash table definition.
 * Two possible approaches:
 *   1. define it with an array
 *   2. use a linked list */
/* Array approach: number of buckets; main passes this to init_hash_table. */
#define MAX_HASH_TABLE 10000

/* Linked-list representation of the hash table. */
struct hash_table {
    int num;   /* size of the hash table (number of buckets) */
    struct hash_node **hashlist; /* dynamically allocated array of bucket head pointers */
};

/* Pointer aliases: hash_list points to a chain node, Hash_Table to a table. */
typedef struct hash_node * hash_list;
typedef struct hash_table* Hash_Table;

/*根据hash表大小，初始化hash表*/
Hash_Table init_hash_table(int hash_size)
{
    Hash_Table hashtable;
    int i;

    hashtable = (Hash_Table)malloc(sizeof(struct hash_table));

    if(hashtable == NULL) {
        printf("malloc hashtable error\n");
        return NULL;
    }
    hashtable->num = hash_size;/*hash数组大小*/

    /*为hash数组动态分配内存*/
    hashtable->hashlist = (struct hash_node **)malloc(sizeof(struct hash_node*) * hash_size);
    if(hashtable->hashlist == NULL) {
        printf("malloc hashlist error\n");
        free(hashtable);
        hashtable = NULL;
        return NULL;
    }
    /*根据hash数组的大小，为每个成员分配内存,而且初始化内存*/
    for(i = 0; i < hash_size; i++) {
        hashtable->hashlist[i] = (struct hash_node*)malloc(sizeof(struct hash_node));
        if(hashtable->hashlist[i] == NULL) {
            printf("malloc hashtable->hashlist error\n");
            exit(1);
        }else {
            hashtable->hashlist[i]->value = NULL;
            hashtable->hashlist[i]->count= 0;
            hashtable->hashlist[i]->next = NULL;
        }
    }
    return hashtable;
}
/*
 * Map a NUL-terminated string to a bucket index.
 *
 * The hash starts at 0 and is updated per byte as h = h * 33 + byte
 * (written as h += (h << 5) + byte), then reduced modulo hash_size.
 * Bytes are read as unsigned char so that strings containing bytes
 * >= 0x80 hash to the same value whether plain char is signed or not.
 *
 * key:       string to hash; must not be NULL.
 * hash_size: number of buckets.
 *
 * Returns an index in [0, hash_size). For hash_size <= 0 there is no
 * valid index; 0 is returned instead of dividing by zero.
 */
unsigned long get_hash_index(const char *key,int hash_size)
{
    const unsigned char *p = (const unsigned char *)key;
    unsigned long ulHash = 0;

    if (hash_size <= 0) {
        return 0;
    }

    while (*p) {
        ulHash += (ulHash << 5) + *p++;
    }
    return ulHash % (unsigned long)hash_size;
}

/* Return a heap-allocated copy of src, or NULL if the allocation fails. */
static char *hash_copy_string(const char *src)
{
    size_t size = strlen(src) + 1;
    char *copy = malloc(size);

    if (copy != NULL) {
        memcpy(copy, src, size);
    }
    return copy;
}

/*
 * Record one occurrence of string in the hash table.
 *
 * The bucket's chain is searched:
 *   - a node already holding the same string gets its count incremented;
 *   - an empty placeholder node (value == NULL) is filled with a copy of
 *     the string and its count set to 1;
 *   - otherwise a new node is appended to the end of the chain, so
 *     different strings that share a bucket are all kept.
 *
 * string:     string to insert; it is copied, the caller keeps ownership.
 * hash_table: table to insert into.
 *
 * Returns 0 on success, -1 if an argument is NULL, the table has no
 * buckets, or memory cannot be allocated (the table is left unchanged).
 */
int hash_insert(char *string, Hash_Table hash_table)
{
    unsigned long index;
    hash_list hash;
    hash_list tail = NULL;
    hash_list node;

    if (string == NULL || hash_table == NULL ||
        hash_table->hashlist == NULL || hash_table->num <= 0) {
        return -1;
    }

    index = get_hash_index(string, hash_table->num);

    for (hash = hash_table->hashlist[index]; hash != NULL; hash = hash->next) {
        if (hash->value == NULL) {
            /* Empty placeholder: claim it for this string. */
            hash->value = hash_copy_string(string);
            if (hash->value == NULL) {
                printf("error: malloc hashlist failed\n");
                return -1;
            }
            hash->count = 1;
            return 0;
        }
        if (strcmp(hash->value, string) == 0) {
            hash->count++;
            return 0;
        }
        tail = hash; /* remember the last node for appending */
    }

    /* Not found in the chain: build a new node for this string. */
    node = malloc(sizeof *node);
    if (node == NULL) {
        printf("error: malloc hashlist failed\n");
        return -1;
    }
    node->value = hash_copy_string(string);
    if (node->value == NULL) {
        printf("error: malloc hashlist failed\n");
        free(node);
        return -1;
    }
    node->count = 1;
    node->next = NULL;

    /* Link it in: after the last node, or as the head of an empty bucket. */
    if (tail != NULL) {
        tail->next = node;
    } else {
        hash_table->hashlist[index] = node;
    }
    return 0;
}



/*
 * Look up string in hash_table.
 *
 * Walks the chain of the bucket selected by get_hash_index and compares
 * each stored string with strcmp; nodes without a string are skipped.
 * On a hit a "find ..." line is printed to stdout.
 *
 * Returns the matching node (its count field holds the number of
 * occurrences), or NULL when the string is not in the table.
 */
hash_list hash_find(const char *string, Hash_Table hash_table)
{
    unsigned long bucket = get_hash_index(string, hash_table->num);
    hash_list node;

    for (node = hash_table->hashlist[bucket]; node != NULL; node = node->next) {
        if (node->value == NULL) {
            continue;
        }
        if (strcmp(node->value, string) != 0) {
            continue;
        }
        printf("find %s in hash table.....\n", string);
        return node;
    }

    return NULL;
}


/* Free every chain node with its string, then the bucket array and the table. */
static void destroy_hash_table(Hash_Table hash_table)
{
    int i;

    if (hash_table == NULL) {
        return;
    }
    if (hash_table->hashlist != NULL) {
        for (i = 0; i < hash_table->num; i++) {
            hash_list node = hash_table->hashlist[i];

            while (node != NULL) {
                hash_list next = node->next;

                free(node->value);
                free(node);
                node = next;
            }
        }
    }
    free(hash_table->hashlist);
    free(hash_table);
}

/*
 * Demo: insert "cdef" twice and "abcd" once, then report the count of
 * "cdef" and show that "wgw", which is never inserted, is not found.
 *
 * Returns 0 on success, 1 if the table cannot be created or an insert
 * fails.
 */
int main(int argc, char *argv[])
{
    char *samples[] = { "cdef", "abcd", "cdef" };
    size_t i;
    Hash_Table hash_table;
    hash_list hash;

    (void)argc;
    (void)argv;

    hash_table = init_hash_table(MAX_HASH_TABLE);
    if (hash_table == NULL) {
        fprintf(stderr, "init_hash_table failed\n");
        return 1;
    }

    for (i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        if (hash_insert(samples[i], hash_table) != 0) {
            fprintf(stderr, "hash_insert failed for %s\n", samples[i]);
            destroy_hash_table(hash_table);
            return 1;
        }
    }

    hash = hash_find("cdef",hash_table);
    if(hash) {
        printf("hit num of cdef is %d\n",hash->count);
    }

    hash = hash_find("wgw",hash_table);
    printf("%s\n",hash?"find wgw":"can't find wgw");
    if(hash) printf("num=%d\n",hash->count);

    destroy_hash_table(hash_table);
    return 0;
}

运行结果：
find cdef in hash table.....
hit num of cdef is 2
can't find wgw
面试

海量数据面试题：
搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每一个查询串的长度为1-255字节。
假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门），请你统计最热门的10个查询串，要求使用的内存不能超过1G。

分析：
不超过3百万个，假设都是最大的255个字节，加上next指针和计数count，每个节点总共255+4+4=263 bytes（按指针和int各占4字节计算）。
3000000*263=789000000，约789M bytes，小于1G内存。并且这是按极限情况估算的，通常不会所有查询串都是255字节。
可以考虑用上面的hash算法来统计次数，然后用排序算法获取最大的10个查询串。