NAT唯一五元组选取

时间 2021-02-15

标签前端算法 shell 数组网络 app dom 函数 this 栏目 Unix 繁體版

原文原文链接

使用iptables进行NAT设置时，可使用以下扩展选项：

# SNAT 源地址转换，用在 POSTROUTING、INPUT 链
--to-source [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # 映射到随机端口号
--random-fully  # 映射到随机端口号（PRNG 彻底随机化）
--persistent    # 映射到固定地址

# DNAT 目的地址转换，用在 PREROUTING、OUTPUT 链
--to-destination [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # 映射到随机端口号
--persistent    # 映射到固定地址

在内核中有以下几个标志与上面的选项对应：

/* An IP address range was specified. */
#define NF_NAT_RANGE_MAP_IPS            (1 << 0)
/* An explicit port (protocol) range was specified. */
#define NF_NAT_RANGE_PROTO_SPECIFIED        (1 << 1)
/* Randomised port selection: the starting offset comes from the L3
 * secure_port() helper. Corresponds to --random. */
#define NF_NAT_RANGE_PROTO_RANDOM        (1 << 2)
/* Persistent mapping: the same client keeps being mapped to the same
 * address. Corresponds to --persistent. */
#define NF_NAT_RANGE_PERSISTENT            (1 << 3)
/* Fully random port selection. Corresponds to --random-fully. */
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY        (1 << 4)

/* Some of the flags above may be combined. */

/* Either of the two random-port flags. */
#define NF_NAT_RANGE_PROTO_RANDOM_ALL        \
    (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
/* Union of every range flag defined above. */
#define NF_NAT_RANGE_MASK                    \
    (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |    \
     NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |    \
     NF_NAT_RANGE_PROTO_RANDOM_FULLY)

构建nat信息

netfilter在两个地方会构建NAT信息：一个是在命中NAT规则后构建NAT信息，另一个是在expect函数中为related（关联）连接构建NAT信息。两者都使用函数nf_nat_setup_info进行构建，差别在于range参数：前者由iptables规则设置，后者由helper函数确定。NAT会修改连接跟踪，但仅仅修改应答方向的五元组。

/* 根据提供的nat类型以及范围进行nat五元组修改 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);/* 获取该链接跟踪所在的网络命名空间 */
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    /* 链接已经确认的不在进行构建 */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     * 获取请求方向的五元组
     */
    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
    /* 根据请求方向的五元组获取nat后的请求方向的五元组 */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
    /* 获取的惟一的五元组进行翻转后将会做为链接跟踪的应答方向的五元组。 */
    /* 新的请求方向的五元组与原来的五元组不同，则须要改变应答方向的五元组 */
    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* 根据新的五元组获得应答方向的新的五元组 */
        nf_ct_invert_tuplepr(&reply, &new_tuple);
        /* 替换应答方向的五元组 */
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;
        /* 判断该链接是否存在help，若是存在则必须添加seq-adj扩展功能 */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }
    /* 若是是源nat操做，则将该五元组添加到nf_nat_bysource hash表中 */
    /* 该表将会被用来选取snat的源IP，即相同的client会使用相同的源IP */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }

    /* It's done. nat处理完毕 */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;
}

重点分析get_unique_tuple函数

nf_ct_invert_tuplepr(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 语句求出了curr_tuple。对于首包或者没有经过NAT的连接来说，其值就是请求方向的五元组，二者没有区别；对于已经经过NAT的连接，则与原始请求方向的五元组不同。

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; receives the selected tuple.
 * @orig_tuple: request-direction tuple to start from.
 * @range:      caller-supplied NAT range to map into.
 * @ct:         conntrack the tuple is being selected for.
 * @maniptype:  NAT type (NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST).
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
{
    const struct nf_conntrack_zone *zone;
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    rcu_read_lock();
    l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
    l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                    orig_tuple->dst.protonum);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     */
    if (maniptype == NF_NAT_MANIP_SRC && // case 1: source NAT and neither random flag is set
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* try the original tuple first */
        /* If the original tuple already lies inside the SNAT range and
        ** is not in use, take it as-is: the tuple is left unchanged. */
        if (in_range(l3proto, l4proto, orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                goto out;
            }/* already in use: fall through to the selection steps below */
            
        /* Original tuple is outside the range: try to reuse the mapping of an existing connection from the same source. */    
        } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            /* Use the reused mapping only if the resulting tuple is unique. */
            if (!nf_nat_used_tuple(tuple, ct))
                goto out;
        }
    }

    /* 2) Select the least-used IP/proto combination in the given range */
    /* Reached for DNAT, and for SNAT when step 1 did not yield a tuple:
     * restart from the original tuple and pick the IP address here. */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique */
    /* Only applies when neither random flag is set. */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {// an explicit port range was given
            if (l4proto->in_range(tuple, maniptype,// keep the current port if it is in range AND (the range is a single port OR the tuple is unused)
                          &range->min_proto,
                          &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                goto out;
        } else if (!nf_nat_used_tuple(tuple, ct)) {// no port range given and the tuple is unused: use it directly
            goto out;
        }
    }

    /* Last chance: get protocol to try to obtain unique tuple. */
    /* Let the L4 protocol pick a port that makes the tuple unique. */
    l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
    rcu_read_unlock();
}

find_appropriate_src

/*
 * Look for an existing source mapping that can be reused for @tuple.
 * Only called for SRC manip.
 *
 * Walks the nf_nat_bysource bucket selected by hash_by_src(net, tuple).
 * An entry is a candidate when same_src() accepts it for @tuple, it
 * belongs to @net, and its zone equals @zone in the original direction.
 * For a candidate, @result is filled with the inverse of the entry's
 * reply tuple, its destination part is overwritten with that of @tuple,
 * and the outcome is checked against @range.
 *
 * Returns 1 with @result holding the first candidate that is in range,
 * 0 if the bucket holds none. @result may have been written even when 0
 * is returned.
 */
static int
find_appropriate_src(struct net *net,
             const struct nf_conntrack_zone *zone,
             const struct nf_nat_l3proto *l3proto,
             const struct nf_nat_l4proto *l4proto,
             const struct nf_conntrack_tuple *tuple,
             struct nf_conntrack_tuple *result,
             const struct nf_nat_range *range)
{
    const unsigned int bucket = hash_by_src(net, tuple);
    const struct nf_conn *cand;

    hlist_for_each_entry_rcu(cand, &nf_nat_bysource[bucket], nat_bysource) {
        /* Same source part, same network namespace, same zone. */
        int reusable = same_src(cand, tuple) &&
                   net_eq(net, nf_ct_net(cand)) &&
                   nf_ct_zone_equal(cand, zone, IP_CT_DIR_ORIGINAL);

        if (reusable) {
            /* The inverted reply tuple carries the translated
             * source of the existing connection.
             */
            nf_ct_invert_tuplepr(result,
                         &cand->tuplehash[IP_CT_DIR_REPLY].tuple);
            /* Keep the destination of the tuple being mapped. */
            result->dst = tuple->dst;

            /* Accept only if it fits the requested range;
             * otherwise move on to the next entry.
             */
            if (in_range(l3proto, l4proto, result, range))
                return 1;
        }
    }

    return 0;
}

find_best_ips_proto

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 * 选择一个最少使用的IP/PRO协议组合。这里直接采用hash算法计算一个值。
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
            struct nf_conntrack_tuple *tuple,
            const struct nf_nat_range *range,
            const struct nf_conn *ct,
            enum nf_nat_manip_type maniptype)
{
    union nf_inet_addr *var_ipp;
    unsigned int i, max;
    /* Host order */
    u32 minip, maxip, j, dist;
    bool full_range;

    /* No IP mapping?  Do nothing. 没有设置IP转换标志，退出*/
    if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
        return;

    if (maniptype == NF_NAT_MANIP_SRC)/* 根据nat类型，指向须要修改的ip内存地址 */
        var_ipp = &tuple->src.u3;
    else
        var_ipp = &tuple->dst.u3;

    /* Fast path: only one choice. 若是只有一个IP地址，则就使用该IP地址 */
    if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
        *var_ipp = range->min_addr;
        return;
    }
    //计算IP地址最后四字节在ip数组中的偏移。
    if (nf_ct_l3num(ct) == NFPROTO_IPV4)
        max = sizeof(var_ipp->ip) / sizeof(u32) - 1;//为0
    else
        max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;//为3

    /* Hashing source and destination IPs gives a fairly even
     * spread in practice (if there are a small number of IPs
     * involved, there usually aren't that many connections
     * anyway).  The consistency means that servers see the same
     * client coming from the same IP (some Internet Banking sites
     * like this), even across reboots.
     * 若是设置了NF_NAT_RANGE_PERSISTENT标志的话，则保证同一个客户端
     * 使用相同的hash值，即hash的时候仅仅使用源IP，不使用目的IP。
     */
    j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
           range->flags & NF_NAT_RANGE_PERSISTENT ?
            0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
    //对ip地址的每个四字节进行hash取值，保证在指定的范围内。
    full_range = false;
    for (i = 0; i <= max; i++) {
        /* If first bytes of the address are at the maximum, use the
         * distance. Otherwise use the full range.
         */
        if (!full_range) {
            minip = ntohl((__force __be32)range->min_addr.all[i]);
            maxip = ntohl((__force __be32)range->max_addr.all[i]);
            dist  = maxip - minip + 1;
        } else {
            minip = 0;
            dist  = ~0;
        }

        var_ipp->all[i] = (__force __u32)
            htonl(minip + reciprocal_scale(j, dist));
        if (var_ipp->all[i] != range->max_addr.all[i])
            full_range = true;

        if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
            j ^= (__force u32)tuple->dst.u3.all[i];
    }
}

l4proto->unique_tuple

l4proto->unique_tuple的实现为nf_nat_l4proto_unique_tuple：各协议（如TCP、UDP）的unique_tuple回调在调用它时会额外传入各自的rover参数。

/*
 * Choose a layer-4 port for @tuple, trying to make the tuple unique.
 *
 * Candidate port range:
 *   - NF_NAT_RANGE_PROTO_SPECIFIED set: min_proto..max_proto taken from
 *     @range (the bounds are swapped if given in the wrong order).
 *   - Not set: a destination manip returns without changing the port;
 *     for a source manip the range follows the current port:
 *         port < 512            ->    1..511
 *         512 <= port < 1024    ->  600..1023
 *         port >= 1024          -> 1024..65535
 *
 * Starting offset into that range:
 *   - NF_NAT_RANGE_PROTO_RANDOM:        l3proto->secure_port()
 *   - NF_NAT_RANGE_PROTO_RANDOM_FULLY:  prandom_u32()
 *   - neither:                          *@rover
 *
 * Candidates are then tried one after another (offset + 1 each time)
 * until nf_nat_used_tuple() reports a free tuple.  The last candidate of
 * the range is taken without that check, so the result may still be a
 * duplicate.  When neither random flag is set, the offset finally used is
 * stored back into *@rover.
 */
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
                 struct nf_conntrack_tuple *tuple,
                 const struct nf_nat_range *range,
                 enum nf_nat_manip_type maniptype,
                 const struct nf_conn *ct,
                 u16 *rover)
{
    unsigned int span, lo, hi, tried;
    __be16 *port;
    u_int16_t offset;

    /* Port field that this manip type rewrites. */
    port = (maniptype == NF_NAT_MANIP_SRC) ? &tuple->src.u.all
                           : &tuple->dst.u.all;

    if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
        /* Explicit port range supplied by the caller. */
        lo = ntohs(range->min_proto.all);
        hi = ntohs(range->max_proto.all);
        if (unlikely(hi < lo))
            swap(hi, lo);
        span = hi - lo + 1;
    } else {
        /* If it's dst rewrite, can't change port */
        if (maniptype == NF_NAT_MANIP_DST)
            return;

        if (ntohs(*port) >= 1024) {
            /* Ports from 1024 upwards map into 1024..65535. */
            lo = 1024;
            span = 65535 - 1024 + 1;
        } else if (ntohs(*port) < 512) {
            /* Ports below 512 map into 1..511. */
            lo = 1;
            span = 511 - lo + 1;
        } else {
            /* Loose convention: >> 512 is credential passing.
             * Ports 512..1023 map into 600..1023.
             */
            lo = 600;
            span = 1023 - lo + 1;
        }
    }

    if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
        offset = l3proto->secure_port(tuple,
                          maniptype == NF_NAT_MANIP_SRC
                          ? tuple->dst.u.all
                          : tuple->src.u.all);
    else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)
        offset = prandom_u32();
    else
        offset = *rover;

    for (tried = 0; ; ++offset) {
        *port = htons(lo + offset % span);

        /* Stop when the whole range has been tried (the tuple may
         * then still be in use) or when a free tuple was found.
         */
        if (++tried == span || !nf_nat_used_tuple(tuple, ct)) {
            /* Without randomisation, remember the offset used. */
            if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
                *rover = offset;
            return;
        }
    }
}