Redis源码阅读笔记-动态字符串(SDS)结构

时间 2019-11-13

原文原文链接

Redis中采用自定义的结构来保存字符串，定义在sds.h中：

/* Layout of a type-5 SDS header. Nothing instantiates this struct: the
 * flags byte is read directly instead. It is kept here purely to document
 * how a type-5 string is laid out in memory. */
struct __attribute__((__packed__)) sdshdr5 {
    /* Low 3 bits hold the header type, high 5 bits hold the string length. */
    unsigned char flags;
    /* String bytes start right after the flags byte. */
    char buf[];
};
/* SDS header whose length fields are 8 bits wide. Packed, so the three
 * header bytes are immediately followed by buf. */
struct __attribute__((__packed__)) sdshdr8 {
    /* Number of bytes of buf currently in use. */
    uint8_t len;
    /* Allocated size of buf, not counting the header or the null terminator. */
    uint8_t alloc;
    /* Low 3 bits hold the header type; the other 5 bits are unused. */
    unsigned char flags;
    /* String bytes. */
    char buf[];
};
/* SDS header whose length fields are 16 bits wide. Packed, so no padding is
 * inserted between the fields or before buf. */
struct __attribute__((__packed__)) sdshdr16 {
    /* Number of bytes of buf currently in use. */
    uint16_t len;
    /* Allocated size of buf, not counting the header or the null terminator. */
    uint16_t alloc;
    /* Low 3 bits hold the header type; the other 5 bits are unused. */
    unsigned char flags;
    /* String bytes. */
    char buf[];
};
/* SDS header whose length fields are 32 bits wide. Packed, so no padding is
 * inserted between the fields or before buf. */
struct __attribute__((__packed__)) sdshdr32 {
    /* Number of bytes of buf currently in use. */
    uint32_t len;
    /* Allocated size of buf, not counting the header or the null terminator. */
    uint32_t alloc;
    /* Low 3 bits hold the header type; the other 5 bits are unused. */
    unsigned char flags;
    /* String bytes. */
    char buf[];
};
/* SDS header whose length fields are 64 bits wide. Packed, so no padding is
 * inserted between the fields or before buf. */
struct __attribute__((__packed__)) sdshdr64 {
    /* Number of bytes of buf currently in use. */
    uint64_t len;
    /* Allocated size of buf, not counting the header or the null terminator. */
    uint64_t alloc;
    /* Low 3 bits hold the header type; the other 5 bits are unused. */
    unsigned char flags;
    /* String bytes. */
    char buf[];
};

SDS由4部分组成：

len: SDS字符串已经使用的空间（不包含C字符串结束符所占的1个字节）。
alloc: 申请的空间大小（不包含头部和结束符），减去len就是未使用的空间，初始时和len一致。
flags: 使用低三位表示类型，细分SDS的分类。方便根据字符串长度的不同选择不同的SDS结构体，节省一部分空间。
buf: 使用C的柔性数组成员（不定长字符数组）保存字符串内容。

PS: __attribute__ ((__packed__))关键字的解释：在C/C++中，创建一个结构体时，编译器会进行字节对齐操作，使得结构体的大小比其成员变量实际占用的字节要多一些；当在结构体声明中加上__attribute__ ((__packed__))时，则表示取消字节对齐，按照紧凑排列的方式存放。

// sds.h

/* An sds value is a plain char pointer. The SDS_HDR macros recover the
 * sdshdr header by subtracting the header size from it, i.e. the pointer
 * addresses the buf member and the header sits immediately before it. */
typedef char *sds;

......
// T is the numeric suffix of the sdshdr struct to use (e.g. 8 selects
// struct sdshdr8); s is the address of that struct's buf member.
// SDS_HDR_VAR(T,s) declares a local pointer named `sh` of type
// struct sdshdr##T * that points sizeof(struct sdshdr##T) bytes before s,
// i.e. at the start of the header. The expansion already ends with ';'.
#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));
// SDS_HDR(T,s) is an expression (usage: SDS_HDR(8, s)) that yields a
// struct sdshdr##T * pointing at the start of the header, computed by
// stepping sizeof(struct sdshdr##T) bytes back from the buf address s.
#define SDS_HDR(T,s) ((struct sdshdr##T *)((s)-(sizeof(struct sdshdr##T))))

// Return the number of bytes in use in the sds string s, where s is the
// address of the buf member of its sdshdr header.
static inline size_t sdslen(const sds s) {
    // The flags member is the last header field, so it is the byte just
    // before buf.
    const unsigned char hdr_flags = s[-1];
    size_t used = 0;

    switch (hdr_flags & SDS_TYPE_MASK) {
    case SDS_TYPE_5:
        // Type 5 has no len field; the length is packed into the flags byte.
        used = SDS_TYPE_5_LEN(hdr_flags);
        break;
    case SDS_TYPE_8:
        used = SDS_HDR(8,s)->len;
        break;
    case SDS_TYPE_16:
        used = SDS_HDR(16,s)->len;
        break;
    case SDS_TYPE_32:
        used = SDS_HDR(32,s)->len;
        break;
    case SDS_TYPE_64:
        used = SDS_HDR(64,s)->len;
        break;
    default:
        // Unknown type bits: report an empty string.
        break;
    }
    return used;
}

部分函数代码解析

sds sdsnewlen(const void *init, size_t initlen) 创建一个包含给定C字符串的SDS：

// sds.c

	/* Return the smallest SDS header type whose length field can represent a
	 * string of string_size bytes.
	 *
	 * The 2^32 threshold is compared in unsigned long long so the test is valid
	 * whatever the width of size_t or long: when size_t is 32 bits wide the
	 * comparison is always true and SDS_TYPE_32 is chosen, instead of falling
	 * through to the larger 64-bit header for every string of 64 KiB or more. */
	static inline char sdsReqType(size_t string_size) {
	    if (string_size < 1<<5)
	        /* Fewer than 32 bytes. */
	        return SDS_TYPE_5;
	    if (string_size < 1<<8)
	        /* Fewer than 256 bytes. */
	        return SDS_TYPE_8;
	    if (string_size < 1<<16)
	        /* Fewer than 65536 bytes. */
	        return SDS_TYPE_16;
	    if ((unsigned long long)string_size < (1ull<<32))
	        /* Fewer than 2^32 bytes. */
	        return SDS_TYPE_32;
	    return SDS_TYPE_64;
	}

	/* Create a new sds string with the content specified by the 'init' pointer
	 * and 'initlen'.
	 * If NULL is used for 'init' the string is initialized with zero bytes.
	 *
	 * The string is always null-terminated (all the sds strings are, always) so
	 * even if you create an sds string with:
	 *
	 * mystring = sdsnewlen("abc",3);
	 *
	 * You can print the string with printf() as there is an implicit \0 at the
	 * end of the string. However the string is binary safe and can contain
	 * \0 characters in the middle, as the length is stored in the sds header.
	 *
	 * 'init' is the first byte of the source data and 'initlen' its length, not
	 * counting any terminator.
	 *
	 * Returns the address of the new string's buf, or NULL if the allocation
	 * fails or if header + initlen + terminator does not fit in a size_t. */
	sds sdsnewlen(const void *init, size_t initlen) {
	    /* Start of the allocation (header followed by buf). */
	    void *sh;
	    sds s;

	    /* Pick the header type from the requested length. */
	    char type = sdsReqType(initlen);

	    /* Empty strings are usually created in order to append. Use type 8
	     * since type 5 is not good at this. */
	    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;

	    /* Size of the header for the chosen type. */
	    int hdrlen = sdsHdrSize(type);
	    unsigned char *fp; /* flags pointer. */

	    /* Header + payload + 1 byte for the null terminator. Unsigned
	     * arithmetic wraps on overflow, in which case the sum ends up no
	     * larger than initlen; refuse instead of under-allocating. */
	    size_t total = (size_t)hdrlen+initlen+1;
	    if (total <= initlen) return NULL;

	    sh = s_malloc(total);
	    /* Check the allocation before touching it. */
	    if (sh == NULL) return NULL;
	    if (!init)
	        memset(sh, 0, total);

	    /* s addresses buf, fp addresses the flags byte right before it. */
	    s = (char*)sh+hdrlen;
	    fp = ((unsigned char*)s)-1;
	    switch(type) {

	        /* Fill in len, alloc and flags for the chosen header type. */

	        case SDS_TYPE_5: {
	            /* Type 5 stores the length in the upper bits of flags. */
	            *fp = type | (initlen << SDS_TYPE_BITS);
	            break;
	        }
	        case SDS_TYPE_8: {
	            SDS_HDR_VAR(8,s);
	            sh->len = initlen;
	            sh->alloc = initlen;
	            *fp = type;
	            break;
	        }
	        case SDS_TYPE_16: {
	            SDS_HDR_VAR(16,s);
	            sh->len = initlen;
	            sh->alloc = initlen;
	            *fp = type;
	            break;
	        }
	        case SDS_TYPE_32: {
	            SDS_HDR_VAR(32,s);
	            sh->len = initlen;
	            sh->alloc = initlen;
	            *fp = type;
	            break;
	        }
	        case SDS_TYPE_64: {
	            SDS_HDR_VAR(64,s);
	            sh->len = initlen;
	            sh->alloc = initlen;
	            *fp = type;
	            break;
	        }
	    }

	    /* Copy the caller's bytes into buf. */
	    if (initlen && init)
	        memcpy(s, init, initlen);

	    /* Always terminate buf. */
	    s[initlen] = '\0';
	    return s;
	}

void sdsfree(sds s) 释放给定的sds内存：

// sds.c

	/* Release an sds string. Passing NULL is allowed and does nothing. */
	void sdsfree(sds s) {
	    if (s != NULL) {
	        /* s addresses buf. The flags byte at s[-1] tells sdsHdrSize() how
	         * large the header is, and stepping back by that many bytes gives
	         * the start of the allocation, which is what must be released. */
	        char *block_start = (char*)s - sdsHdrSize(s[-1]);
	        s_free(block_start);
	    }
	}

sds sdscat(sds s, const char *t) 在sds字符串后追加字符串。可以看出，如果剩余空间不足，这个函数会额外分配一部分内存空间作为预留：

// sds.c

	/* Append the null terminated C string 't' to the sds string 's'.
	 *
	 * The sds pointer passed in must not be used after the call; every
	 * reference to it has to be replaced with the returned pointer. */
	sds sdscat(sds s, const char *t) {
	    /* Measure the C string, then delegate to the length-based append. */
	    size_t tlen = strlen(t);
	    return sdscatlen(s, t, tlen);
	}

	/* Append 'len' bytes of binary-safe data starting at 't' to the end of the
	 * sds string 's'.
	 *
	 * The sds pointer passed in must not be used after the call; every
	 * reference to it has to be replaced with the returned pointer. Returns
	 * NULL when sdsMakeRoomFor() fails. */
	sds sdscatlen(sds s, const void *t, size_t len) {
	    /* Length before the append, and the length it will have afterwards. */
	    const size_t oldlen = sdslen(s);
	    const size_t total = oldlen + len;

	    /* Make sure buf can take 'len' more bytes; this may move the string. */
	    s = sdsMakeRoomFor(s, len);
	    if (!s) return NULL;

	    /* Copy the new bytes right after the existing content. */
	    memcpy(s + oldlen, t, len);
	    /* Record the new length and re-terminate. */
	    sdssetlen(s, total);
	    s[total] = '\0';
	    return s;
	}

	/* Enlarge the free space at the end of the sds string so that the caller
	 * is sure that after calling this function can overwrite up to addlen
	 * bytes after the end of the string, plus one more byte for nul term.
	 *
	 * Note: this does not change the *length* of the sds string as returned
	 * by sdslen(), but only the free buffer space we have.
	 *
	 * Returns the (possibly moved) string, or NULL if the required size does
	 * not fit in a size_t or the allocation fails. Every NULL return happens
	 * before the old block is released or modified by this function (assuming
	 * s_realloc follows realloc semantics on failure). */
	sds sdsMakeRoomFor(sds s, size_t addlen) {
	    void *sh, *newsh;

	    /* Free space currently available in s. */
	    size_t avail = sdsavail(s);
	    size_t len, newlen;
	    /* Largest value a size_t can hold; used by the overflow guards. */
	    const size_t size_max = (size_t)-1;
	    /* s[-1] is the flags byte; masking it yields the current header type. */
	    char type, oldtype = s[-1] & SDS_TYPE_MASK;
	    int hdrlen;

	    /* Return ASAP if there is enough space left. */
	    if (avail >= addlen) return s;

	    len = sdslen(s);
	    /* Start of the current allocation (the header). */
	    sh = (char*)s-sdsHdrSize(oldtype);

	    /* len+addlen must not wrap, otherwise the buffer would be allocated
	     * too small for what the caller is about to write. */
	    if (addlen > size_max - len) return NULL;
	    newlen = (len+addlen);

	    /* Preallocate: double small strings, add SDS_MAX_PREALLOC to strings
	     * that have already reached SDS_MAX_PREALLOC. */
	    if (newlen < SDS_MAX_PREALLOC)
	        newlen *= 2;
	    else {
	        if (newlen > size_max - SDS_MAX_PREALLOC) return NULL;
	        newlen += SDS_MAX_PREALLOC;
	    }

	    /* Header type needed for the new capacity. */
	    type = sdsReqType(newlen);

	    /* Don't use type 5: the user is appending to the string and type 5 is
	     * not able to remember empty space, so sdsMakeRoomFor() must be called
	     * at every appending operation. */
	    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

	    /* Size of the header for the new type. */
	    hdrlen = sdsHdrSize(type);

	    /* header + buf + terminator must fit in a size_t as well. */
	    if (newlen > size_max - (size_t)hdrlen - 1) return NULL;

	    if (oldtype==type) {
	        /* Same header type: grow the existing block in place. */
	        newsh = s_realloc(sh, hdrlen+newlen+1);
	        if (newsh == NULL) return NULL;
	        s = (char*)newsh+hdrlen;
	    } else {
	        /* Since the header size changes, need to move the string forward,
	         * and can't use realloc */
	        newsh = s_malloc(hdrlen+newlen+1);
	        if (newsh == NULL) return NULL;
	        /* Copy the content including its terminator into the new buf. */
	        memcpy((char*)newsh+hdrlen, s, len+1);
	        /* Release the old block. */
	        s_free(sh);
	        /* Re-establish the type and length in the new header. */
	        s = (char*)newsh+hdrlen;
	        s[-1] = type;
	        sdssetlen(s, len);
	    }
	    /* Record the new capacity. */
	    sdssetalloc(s, newlen);
	    return s;
	}

SDS与C字符串的区别

总结自《Redis设计与实现》

C字符串	SDS
获取字符串长度的复杂度为O(N)	通过SDS中的len属性，获取字符串长度的复杂度为O(1)
API是不安全的，可能会造成缓冲区溢出	API是安全的，不会造成缓冲区溢出，因为封装的函数都会先检查剩余的内存空间是否足够
修改字符串长度N次必然需要执行N次内存重新分配	修改字符串长度N次最多需要执行N次内存重新分配，因为在字符串拼接等操作中，封装的函数会给SDS分配预留的内存空间，所以下次操作并不一定会引起内存重新分配。
只能保存文本数据	可以保存文本或二进制数据，因为SDS是通过len属性来判断字符串是否结束，而不是通过`'\0'`
可以使用所有`<string.h>`库中的函数	可以使用部分`<string.h>`库中的函数，因为SDS始终会将字符串结束符`'\0'`追加到字符串的末尾

SDS API

参考自《Redis设计与实现》

函数	作用	时间复杂度
`sds sdsnewlen(const void *init, size_t initlen)`	通过C字符串`init`和字符串长度`initlen`创建一个SDS字符串	O(N), N为长度`initlen`
`sds sdsnew(const char *init)`	通过C字符串`init`创建一个SDS字符串，实际上是调用`sdsnewlen()`	O(N), N为字符串`init`长度
`sds sdsempty(void)`	创建一个空的SDS字符串，实际上调用`sdsnewlen("", 0)`	O(1)
`sds sdsdup(const sds s)`	创建给定的SDS字符串`s`的副本(复制)，实际上调用`sdsnewlen(s, sdslen(s))`	O(N)
`void sdsfree(sds s)`	释放给定的SDS	O(N)
`sds sdsgrowzero(sds s, size_t len)`	将sds增长到指定的长度，如果指定的长度小于sds当前长度，则不进行操作	O(N)
`sds sdscatlen(sds s, const void *t, size_t len)`	将指定长度`len`的二进制安全字符`t`追加到sds`s`上	O(N)
`sds sdscat(sds s, const char *t)`	将指定字符串`t`追加到sds`s`上，实际上调用的是`sdscatlen(s, t, strlen(t))`	O(N)
`sds sdscatsds(sds s, const sds t)`	将指定sds字符串`t`追加到sds`s`上，实际上调用的是`sdscatlen(s, t, sdslen(t))`	O(N)
`sds sdscpylen(sds s, const char *t, size_t len)`	破坏性的修改sds`s`，将指定长度`len`的二进制安全字符串`t`赋值给`s`	O(N)
`sds sdscpy(sds s, const char *t)`	将指定的C字符串`t`赋值给sds`s`	O(N)
`sds sdscatvprintf(sds s, const char *fmt, va_list ap)`	将不定参数`ap`按`fmt`格式化为字符串，然后拼接到sds`s`中
`sds sdscatprintf(sds s, const char *fmt, ...)`	将按`fmt`格式化后的字符串附加到sds`s`中
`sds sdscatfmt(sds s, char const *fmt, ...)`	类似于`sdscatprintf()`函数，但速度更快，不依赖于libc实现的`sprintf()`; 支持的格式符：`%s - C String`,`%S - SDS string`,`%i - signed int`,`%I - 64 bit signed integer (long long, int64_t)`,`%u - unsigned int`,`%U - 64 bit unsigned integer (unsigned long long, uint64_t)`,`%% - Verbatim "%" character`
`sds sdstrim(sds s, const char *cset)`	从sds`s`左边和右边删除`cset`中含有的字符，直到遇到非匹配字符时停止。	O(N)
`void sdsrange(sds s, ssize_t start, ssize_t end)`	保留sds`s`给定区间内的数据，不在区间内的数据将会被覆盖或清除	O(N)
`void sdsupdatelen(sds s)`	将sds`s`的长度设置为`strlen(s)`的长度，即遇到第一个`'\0'`字符的长度	O(N)
`void sdsclear(sds s)`	清空sds`s`中的数据，实际操作是将sds中的`len`设为0，并将`s[0]`设为`'\0'`	O(1)
`int sdscmp(const sds s1, const sds s2)`	对比两个sds字符串`s1`，`s2`是否相同，实现上先对比长度，再对比内容	长度相同时O(N)，长度不一样时O(1)
`sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count)`	使用字符串`sep`分割字符串`s`，`len`是`s`的长度，`seplen`是`sep`的长度，`count`用于返回分割后得到的sds字符串个数
`void sdsfreesplitres(sds *tokens, int count)`	释放`sdssplitlen()`返回的结果的内存，`tokens`为返回的结果，`count`为返回的sds字符串个数
`void sdstolower(sds s)`	将sds`s`中的全部字符转为小写	O(N)
`void sdstoupper(sds s)`	将sds`s`中的全部字符转为大写	O(N)
`sds sdsfromlonglong(long long value)`	通过一个`long long`类型的`value`来创建sds字符串，性能比`sdscatprintf(sdsempty(),"%lld\n", value)`要好
`sds sdscatrepr(sds s, const char *p, size_t len)`	将指定长度`len`的字符串`p`附加到sds`s`中，但要检查p中的字符，如果是非打印字符，则要转成`\n\r\a....`或`"\x<hex-number>"`的形式	O(N)
`sds *sdssplitargs(const char *line, int *argc)`	将命令行参数解析成sds数组，`argc`表示数组大小；返回的sds数组要使用`sdsfreesplitres()`函数释放内存
`sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen)`	将`s`中`from`字符集的字符映射成`to`中的对应字符集，`setlen`表示`from`和`to`中字符集的个数，两者必须严格一一对应
`sds sdsjoin(char **argv, int argc, char *sep)`	将C的字符串数组合并为sds字符串，`argv`为字符串数组的首地址，`argc`为数组的长度，`sep`为分隔符
`sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen)`	将sds字符串数组合并为sds字符串，`argv`为字符串数组的首地址，`argc`为数组的长度，`sep`为分隔符，`seplen`为`sep`的长度