icmp协议相比于tcp、udp有其独特性:它介于网络层和传输层之间,没有传输层的源、目的端口,因此在建立连接跟踪时需要进行特殊处理。另外ICMP属于差错报文,并非所有icmp报文都是成对出现的,这些不同造成了icmp的处理与tcp、udp的处理不同。
icmp报文有以下种类:
最多有18种icmp报文,每一种icmp报文可能会有一些子类。只有下面四种icmp报文是成对出现的。
static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; //其成对的关系为 /* Add 1; spaces filled with 0. 这里都给其对应的类型加了1,主要是由于ICMP_ECHO值为0,内核想把0这个值表示没有成对消息,因此在这里进行了加1,最后在构建CT的时候会减掉1。详细能够查看函数icmp_invert_tuple。 */ static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 };
因为只有这四对消息是成对的,所以连接跟踪只会为这四对消息建立连接跟踪。
icmp报文没有源、目的端口,采用什么来填充tuple呢?
从下面代码可以看出,连接跟踪使用一个__be16 id来替代tuple中的源port,这里的id是icmp报文中的标识符,这四类消息都有。其中ping消息一般在其中填充ping程序的pid,所以同一台设备启动两个不同的ping程序ping同一个ip会生成两个会话:
/* The protocol-specific manipulable parts of the tuple: always in
 * network order
 */
union nf_conntrack_man_proto {
	/* Add other protocols here. */
	__be16 all;	/* generic 16-bit view shared by every member below */

	struct {
		__be16 port;
	} tcp;
	struct {
		__be16 port;
	} udp;
	struct {
		/* ICMP has no ports; icmp_pkt_to_tuple stores the header's echo id here. */
		__be16 id;
	} icmp;
	struct {
		__be16 port;
	} dccp;
	struct {
		__be16 port;
	} sctp;
	struct {
		__be16 key;	/* GRE key is 32bit, PPtP only uses 16bit */
	} gre;
};
从下面代码可以看出,连接跟踪使用u_int8_t type, code;来替代tuple中的目的port:
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
	/* Source side of the tuple. */
	struct nf_conntrack_man src;

	/* These are the parts of the tuple which are fixed. */
	struct {
		union nf_inet_addr u3;
		union {
			/* Add other protocols here. */
			__be16 all;

			struct {
				__be16 port;
			} tcp;
			struct {
				__be16 port;
			} udp;
			struct {
				/* ICMP keeps its type and code where other protocols keep a port. */
				u_int8_t type, code;
			} icmp;
			struct {
				__be16 port;
			} dccp;
			struct {
				__be16 port;
			} sctp;
			struct {
				__be16 key;
			} gre;
		} u;

		/* The protocol. */
		u_int8_t protonum;

		/* The direction (for tuplehash) */
		u_int8_t dir;
	} dst;
};
下面我们看一下,icmp如何求一个tuple的反转tuple,是否像tcp、udp那样将源、目的端口调换呢?
/* 反转五元组 */ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判断类型是否超过了最大值,非法 !invmap[orig->dst.u.icmp.type])//判断该类型的icmp消息是不是成对的,使用0表示不成对,不成对则不处理。 return false; //id依然填写到id的位置,没有被调换到type,code位置 tuple->src.u.icmp.id = orig->src.u.icmp.id; //只是替换了type到其对应的type。 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//这里减了1,由于invmap中都加了1 //code不会变。由于这四对消息的code只有一个值0。详细请看前面的图片。 tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; }
从上面可以看出,icmp求反转tuple时,只会将type替换成对应的type(我们这里不涉及IP地址)。
icmp更多的是差错报文,它是用来通知源主机一些错误信息的。它的产生往往是某台设备发送的报文在传输过程中出现了差错,传输路径中的设备或者目标主机检测到了差错,从而生成一个ICMP差错报文通知源主机。差错报文会在icmp报文头后添加一段导致该icmp报文的原始报文头信息。所以说icmp差错报文是一个连接的附属,连接跟踪将差错报文视为一个子连接(不会真实创建CT,而是依附于主连接,设置该报文的状态为IP_CT_RELATED或者IP_CT_RELATED_REPLY)。
连接跟踪处理以下几种差错报文:
ICMP_DEST_UNREACH //目的不可达 ICMP_SOURCE_QUENCH //源抑制,向源主机发送源抑制报文通知源主机减慢发送速度 ICMP_TIME_EXCEEDED //TTL超时, ICMP_PARAMETERPROB //参数问题, ICMP_REDIRECT //重定向,收到该差错的主机须要更新路由的下一跳,或者邻居(直连主机)
对于这几种差错报文,连接跟踪需要将其正确地交给目标主机。处理它们的主要原因是NAT,后续详细说明。
/*
 * Conntrack layer-4 protocol descriptor for ICMP over IPv4 (PF_INET /
 * IPPROTO_ICMP). It wires the icmp_* callbacks into the conntrack core; the
 * netlink-related members are only present when the matching config options
 * are enabled.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
	.l3proto		= PF_INET,
	.l4proto		= IPPROTO_ICMP,
	.pkt_to_tuple		= icmp_pkt_to_tuple,
	.invert_tuple		= icmp_invert_tuple,
	.packet			= icmp_packet,
	.get_timeouts		= icmp_get_timeouts,
	.new			= icmp_new,
	.error			= icmp_error,
	.destroy		= NULL,
	.me			= NULL,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.tuple_to_nlattr	= icmp_tuple_to_nlattr,
	.nlattr_tuple_size	= icmp_nlattr_tuple_size,
	.nlattr_to_tuple	= icmp_nlattr_to_tuple,
	.nla_policy		= icmp_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
	.ctnl_timeout		= {
		.nlattr_to_obj	= icmp_timeout_nlattr_to_obj,
		.obj_to_nlattr	= icmp_timeout_obj_to_nlattr,
		.nlattr_max	= CTA_TIMEOUT_ICMP_MAX,
		/* The timeout object is a single unsigned int. */
		.obj_size	= sizeof(unsigned int),
		.nla_policy	= icmp_timeout_nla_policy,
	},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
	.init_net		= icmp_init_net,
	.get_net_proto		= icmp_get_net_proto,
};
/* Small and modified version of icmp_rcv */ /* 用于在链接跟踪中处理报文错误,tmpl通常为NULL */ static int icmp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? icmp头是否完整 */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ /* 检验校验码 */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&//本机发送的不检查 nf_ip_checksum(skb, hooknum, dataoff, 0)) { icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. * 类型是否非法。 */ if (icmph->type > NR_ICMP_TYPES) { icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } /* Need to track icmp error message? */ /* 非差错报文直接检查经过 */ if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_SOURCE_QUENCH && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB && icmph->type != ICMP_REDIRECT) return NF_ACCEPT; //处理icmp差错报文 return icmp_error_message(net, tmpl, skb, hooknum); } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ /* icmp差错报文处理,主要是根据内层携带的原始报文头找到对应的主链接。 ** 而后设置该报文依附于主连接,是一个RELATE报文 */ static int icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? 
*/ /* 根据内层报文信息获取对应的五元组到origtuple中 */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } /* rcu_read_lock()ed by nf_hook_thresh */ /* 获取内层报文的传输层控制块 */ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. ** 获取内层报文的反向五元组 */ if (!nf_ct_invert_tuple(&innertuple, &origtuple, &nf_conntrack_l3proto_ipv4, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } //设置报文的状态为子链接,这是报文的状态。 ctinfo = IP_CT_RELATED; //根据反向五元组获取对应的主CT。为何是反向呢? //由于icmp报文是对源报文的一个响应,因此应该根据源报文的信息去获取其所属链接。 h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } //若是是应答方向,则设置其状态为 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ /* 将该报文关联到 主CT ,其状态为IP_CT_RELATED or IP_CT_RELATED_REPLY*/ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); /* 修改内部的 */ return NF_ACCEPT; } //差错报文返回NF_ACCEPT后,由于设置了报文的CT,报文的链接跟踪处理就结束了。详细状况nf_conntrack_in函数。
/* 提取icmp的五元组,只有成对报文才会 */ static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->type;/* 类型 */ tuple->src.u.icmp.id = hp->un.echo.id;/* id号,ping报文为进程id */ tuple->dst.u.icmp.code = hp->code;/* 代码,通常为0 */ return true; }
/* 反转五元组 */ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判断类型是否超过了最大值,非法 !invmap[orig->dst.u.icmp.type])//判断该类型的icmp消息是不是成对的,使用0表示不成对,不成对则不处理。 return false; //id依然填写到id的位置,没有被调换到type,code位置 tuple->src.u.icmp.id = orig->src.u.icmp.id; //只是替换了type到其对应的type。 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//这里减了1,由于invmap中都加了1 //code不会变。由于这四对消息的code只有一个值0。详细请看前面的图片。 tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; }
非差错报文的请求方向报文会被该函数处理,主要是进行合法性校验,icmp_error函数已经处理过了,这里多余。
/* Called when a new connection for this protocol found. */ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { static const u_int8_t valid_new[] = {/* 共18个元素,其中只有下面四个icmp请求会进行链接跟踪 */ [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); return false; } return true; }
非差错报文的应答方向报文会被该函数处理,主要是进行超时更新和报文统计。
/* Returns verdict for packet, or -1 for invalid. */
/*
 * Per-packet conntrack work for ICMP: refresh the connection's timeout and
 * accounting, then accept the packet.
 */
static int icmp_packet(struct nf_conn *ct,
		       const struct sk_buff *skb,
		       unsigned int dataoff,
		       enum ip_conntrack_info ctinfo,
		       unsigned int *timeout)
{
	const unsigned int extra_jiffies = *timeout;

	/*
	 * The entry is refreshed rather than deleted after the first
	 * successful reply: this avoids excessive conntrackd traffic and
	 * handles duplicated ICMP echo replies correctly.
	 */
	nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);

	return NF_ACCEPT;
}
icmp 链接跟踪超时时间获取,通常是30秒。
/*
 * Default ICMP conntrack timeout, 30*HZ (30 seconds per the original note).
 * Defined ahead of icmp_init_net so it is declared before it is used.
 */
static const unsigned int nf_ct_icmp_timeout = 30*HZ;

/* Return a pointer to this network namespace's ICMP conntrack timeout. */
static unsigned int *icmp_get_timeouts(struct net *net)
{
	return &icmp_pernet(net)->timeout;
}

/*
 * Per-namespace initialisation: seed the namespace's ICMP timeout with the
 * default and return the result of icmp_kmemdup_sysctl_table.
 */
static int icmp_init_net(struct net *net, u_int16_t proto)
{
	struct nf_icmp_net *in = icmp_pernet(net);
	struct nf_proto_net *pn = &in->pn;

	in->timeout = nf_ct_icmp_timeout;

	return icmp_kmemdup_sysctl_table(pn, in);
}
icmp报文对nat的支持实际上更多的是网络层的支持,对于icmp报文本身来说只有一个标识符可以改变,不过很少有场景需要改变标识符。下面就代码简单地分析一下。
/*
 * NAT layer-4 protocol descriptor for ICMP: wires the icmp_* NAT callbacks
 * for IPPROTO_ICMP. The netlink range parser is only present when
 * CONFIG_NF_CT_NETLINK is enabled.
 */
const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
	.l4proto		= IPPROTO_ICMP,
	.manip_pkt		= icmp_manip_pkt,
	.in_range		= icmp_in_range,
	.unique_tuple		= icmp_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
#endif
};
判断icmp的标识符是否在指定的范围中。
static bool icmp_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); }
分配一个标识符,使得五元组唯一。
static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { static u_int16_t id; unsigned int range_size; unsigned int i; range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; /* If no range specified... 没有指定范围,则设置方位为0xffff */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) range_size = 0xFFFF; for (i = 0; ; ++id) { tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + (id % range_size)); if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) return; } return; }
将选择的标识符替换掉原来的标识符,更新校验码。
static bool icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l3proto *l3proto, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; }
差错报文的内层报文信息来自于产生差错的报文。当一个主机发送的报文经过NAT后,其报文头发生了改变。也就是说,检测到该报文有差错的设备看到的是经过NAT后的报文,所以NAT需要将内层报文还原回原来的报文再转发给源主机。
unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED://对于icmp差错报文,会为这两个状态 case IP_CT_RELATED_REPLY: //icmp报文特殊处理,这种状态的报文是一个icmp差错报文。 //根据其所属的原始报文决定其所属的ct。对icmp携带的原始报文部分进行 //相应操做。 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { //既会对内层报文进行nat,也会对外层报文进行nat,这里处理完毕后就返回了。 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } ... 
} int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) return 0; //获取icmp报文头起始地址 inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) {//重定向差错报恩。 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ /* 应答方向进行求反 */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; //若是主链接没有该nat操做,退出。 if (!(ct->status & statusbit)) return 1; //获取内层报文的传输层操做控制块 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); //进行内层报文nat处理。包括传输层和网络层 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), l4proto, &ct->tuplehash[!dir].tuple, !manip)) return 0; //更新icmp校验码 if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ //进行外层报文的nat处理 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) return 0; return 1; }