Changes to NAT Processing in the 2.6.1* Linux Kernel
The copyleft of this document belongs to yfydz and it is released under the GPL. It may be freely copied and reposted; please keep the document intact when reposting. Use for any commercial purpose is strictly prohibited.
msn: yfydz_no1@hotmail.com
Source: http://yfydz.cublog.cn
1. Introduction

After kernel 2.6.10 the NAT part of netfilter underwent some fairly large changes, while the connection-tracking part changed only slightly. This document gives a rough overview of the NAT changes. The kernel code quoted below is from version 2.6.17.11.

2. What changed

2.1 Tracking and NAT for multi-connection protocols

For multi-connection protocols such as FTP, netfilter's handling changed substantially:

1) multi-connection tracking is now handled by its own nf_hook_ops registrations;
2) NAT of the payload data is called directly from the tracking (helper) function instead of being done by the NAT module.

The connection helper runs at the INPUT and POSTROUTING hooks, with a priority just ahead of SEQ_ADJUST and confirm (it runs immediately before them), so by the time a packet reaches it the addresses and ports have already been NATed.

/* net/ipv4/netfilter/ip_conntrack_standalone.c */
......
    {
        .hook       = ip_conntrack_help,
        .owner      = THIS_MODULE,
        .pf         = PF_INET,
        .hooknum    = NF_IP_POST_ROUTING,
        .priority   = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook       = ip_conntrack_help,
        .owner      = THIS_MODULE,
        .pf         = PF_INET,
        .hooknum    = NF_IP_LOCAL_IN,
        .priority   = NF_IP_PRI_CONNTRACK_HELPER,
    },
......

static unsigned int ip_conntrack_help(unsigned int hooknum,
                                      struct sk_buff **pskb,
                                      const struct net_device *in,
                                      const struct net_device *out,
                                      int (*okfn)(struct sk_buff *))
{
    struct ip_conntrack *ct;
    enum ip_conntrack_info ctinfo;

    /* This is where we call the helper: as the packet goes out. */
    ct = ip_conntrack_get(*pskb, &ctinfo);
    if (ct && ct->helper) {
        unsigned int ret;
        ret = ct->helper->help(pskb, ct, ctinfo);
        if (ret != NF_ACCEPT)
            return ret;
    }
    return NF_ACCEPT;
}

The helper structure for multi-connection tracking, struct ip_conntrack_helper, has not changed much, but the old NAT helper structure struct ip_nat_helper has been removed; the payload modification is now called directly from the tracking function. Taking FTP as an example:

/* net/ipv4/netfilter/ip_conntrack_ftp.c */

static int help(struct sk_buff **pskb,
                struct ip_conntrack *ct,
                enum ip_conntrack_info ctinfo)
{
    unsigned int dataoff, datalen;
    struct tcphdr _tcph, *th;
    char *fb_ptr;
    int ret;
    u32 seq, array[6] = { 0 };
    int dir = CTINFO2DIR(ctinfo);
    unsigned int matchlen, matchoff;
    struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
    struct ip_conntrack_expect *exp;
    unsigned int i;
    int found = 0, ends_in_nl;

    /* Until there's been traffic both ways, don't look in packets. */
    if (ctinfo != IP_CT_ESTABLISHED
        && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
        DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
        return NF_ACCEPT;
    }

    /* TCP header; may be copied into the local buffer */
    th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
                            sizeof(_tcph), &_tcph);
    if (th == NULL)
        return NF_ACCEPT;

    dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
    /* No data? */
    if (dataoff >= (*pskb)->len) {
        DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
        return NF_ACCEPT;
    }
    datalen = (*pskb)->len - dataoff;

    spin_lock_bh(&ip_ftp_lock);
    /* Fetch the FTP payload; it may be copied into a separate buffer rather
     * than worked on in place. This is probably related to 2.6 being
     * preemptible: it would be awkward if another kernel path grabbed the
     * skb while we were still operating on it. */
    fb_ptr = skb_header_pointer(*pskb, dataoff,
                                (*pskb)->len - dataoff, ftp_buffer);
    BUG_ON(fb_ptr == NULL);

    ends_in_nl = (fb_ptr[datalen - 1] == '\n');
    seq = ntohl(th->seq) + datalen;

    /* Look up to see if we're just after a \n. */
    /* Check that the sequence number sits right after a newline, guarding
     * against the sequence-number trick described in Phrack 63. */
    if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
        /* Now if this ends in \n, update ftp info. */
        DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
               ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET) ",
               ct_ftp_info->seq_aft_nl[dir][0],
               ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET) ",
               ct_ftp_info->seq_aft_nl[dir][1]);
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

    /* Initialize IP array to expected address (it's not mentioned
       in EPSV responses) */
    /* The expected address; if NAT has happened, this is already the
     * post-NAT address. */
    array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
    array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
    array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
    array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;

    /* Pattern search */
    for (i = 0; i < ARRAY_SIZE(search); i++) {
        if (search[i].dir != dir)
            continue;

        found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
                             search[i].pattern,
                             search[i].plen,
                             search[i].skip,
                             search[i].term,
                             &matchoff, &matchlen,
                             array,
                             search[i].getnum);
        if (found)
            break;
    }
    if (found == -1) {
        /* We don't usually drop packets.  After all, this is
           connection tracking, not packet filtering.
           However, it is necessary for accurate tracking in
           this case. */
        if (net_ratelimit())
            printk("conntrack_ftp: partial %s %u+%u\n",
                   search[i].pattern,
                   ntohl(th->seq), datalen);
        ret = NF_DROP;
        goto out;
    } else if (found == 0) {
        /* No match */
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

    DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
           fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);

    /* Allocate expectation which will be inserted */
    /* The expectation is now allocated dynamically; earlier versions used a
     * static one here that only became dynamic when it was committed. */
    exp = ip_conntrack_expect_alloc(ct);
    if (exp == NULL) {
        ret = NF_DROP;
        goto out;
    }

    /* We refer to the reverse direction ("!dir") tuples here,
     * because we're expecting something in the other direction.
     * Doesn't matter unless NAT is happening. */
    exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;

    if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
        != ct->tuplehash[dir].tuple.src.ip) {
        /* Enrico Scholz's passive FTP to partially RNAT'd ftp
           server: it really wants us to connect to a
           different IP address.  Simply don't record it for
           NAT. */
        DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
               array[0], array[1], array[2], array[3],
               NIPQUAD(ct->tuplehash[dir].tuple.src.ip));

        /* Thanks to Cristiano Lincoln Mattos
           <lincoln@cesar.org.br> for reporting this potential
           problem (DMZ machines opening holes to internal
           networks, or the packet filter itself). */
        if (!loose) {
            ret = NF_ACCEPT;
            goto out_put_expect;
        }
        exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
                                  | (array[2] << 8) | array[3]);
    }

    exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
    exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
    exp->tuple.src.u.tcp.port = 0; /* Don't care. */
    exp->tuple.dst.protonum = IPPROTO_TCP;
    exp->mask = ((struct ip_conntrack_tuple)
        { { 0xFFFFFFFF, { 0 } },
          { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});

    exp->expectfn = NULL;
    exp->flags = 0;

    /* Now, NAT might want to mangle the packet, and register the
     * (possibly changed) expectation itself. */
    /* If a NAT mangling function has been registered, it is called directly
     * here; the hook is defined in ip_nat_ftp.c. */
    if (ip_nat_ftp_hook)
        ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
                              matchoff, matchlen, exp, &seq);
    /* One open question: if the kernel includes ip_nat_ftp, then
     * ip_nat_ftp_hook is defined and gets called even when the box runs in
     * pure routing mode and performs no NAT at all. It is not obvious how
     * this distinguishes whether a NAT operation is really needed, which
     * looks slightly questionable. */
    else {
        /* Otherwise register the expectation directly. In 2.6.1*,
         * ip_conntrack_expect_related() only needs to be called once
         * (ignoring the port-in-use case), whereas earlier versions called
         * it once during tracking and again after NAT. */
        /* Can't expect this?  Best to drop packet now. */
        if (ip_conntrack_expect_related(exp) != 0)
            ret = NF_DROP;
        else
            ret = NF_ACCEPT;
    }

out_put_expect:
    ip_conntrack_expect_put(exp);

out_update_nl:
    /* Now if this ends in \n, update ftp info.  Seq may have been
     * adjusted by NAT code. */
    if (ends_in_nl)
        update_nl_seq(seq, ct_ftp_info, dir, *pskb);
out:
    spin_unlock_bh(&ip_ftp_lock);
    return ret;
}
The FTP NAT source file now does little more than define the ip_nat_ftp_hook function. Unlike before, it no longer performs the rewrite inside the NAT operation and additionally adjusts the expected child connection; it is now purely a payload rewrite, and for the expected connection it only has to deal with whether the new port is already in use. Compared with the old NAT handling it is therefore much simpler:

/* net/ipv4/netfilter/ip_nat_ftp.c */

static int __init ip_nat_ftp_init(void)
{
    BUG_ON(ip_nat_ftp_hook);
    ip_nat_ftp_hook = ip_nat_ftp;
    return 0;
}

static unsigned int ip_nat_ftp(struct sk_buff **pskb,
                               enum ip_conntrack_info ctinfo,
                               enum ip_ct_ftp_type type,
                               unsigned int matchoff,
                               unsigned int matchlen,
                               struct ip_conntrack_expect *exp,
                               u32 *seq)
{
    u_int32_t newip;
    u_int16_t port;
    int dir = CTINFO2DIR(ctinfo);
    struct ip_conntrack *ct = exp->master;

    DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);

    /* Connection will come from wherever this packet goes, hence !dir */
    newip = ct->tuplehash[!dir].tuple.dst.ip;

    exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
    exp->dir = !dir;

    /* When you see the packet, we need to NAT it the same as this one. */
    /* ip_nat_follow_master() is called from init_conntrack() when the
     * expected connection appears, to set up NAT state for the child. */
    exp->expectfn = ip_nat_follow_master;

    /* Try to get same port: if not, try to change it. */
    for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
        exp->tuple.dst.u.tcp.port = htons(port);
        /* Check whether this port can be used in place of the original one */
        if (ip_conntrack_expect_related(exp) == 0)
            break;
    }

    /* No free port: drop the packet */
    if (port == 0)
        return NF_DROP;

    if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, seq)) {
        /* Mangling failed: drop the packet */
        ip_conntrack_unexpect_related(exp);
        return NF_DROP;
    }
    return NF_ACCEPT;
}

2.2 Obtaining connection information from an sk_buff

An nfctinfo field has been added directly to struct sk_buff, so the conntrack state of a packet can be read straight from the skb. Previously the state was encoded by which array slot inside struct ip_conntrack the skb's nfct pointer pointed at; now nfct points directly at the connection itself. This shrinks struct ip_conntrack, and because nfctinfo is stored in spare bits next to existing fields, sk_buff does not grow either.

/* include/linux/skbuff.h */
struct sk_buff {
......
    __u8            local_df:1,
                    cloned:1,
                    ip_summed:2,
                    nohdr:1,
                    nfctinfo:3;
......
    struct nf_conntrack *nfct;
......
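Reading the connection and its state is therefore a trivial lookup. ip_conntrack_help() above (and ip_nat_adjust() below) use ip_conntrack_get() for this, which in this kernel series is essentially the following inline; it is quoted from memory of include/linux/netfilter_ipv4/ip_conntrack.h, so treat it as a sketch rather than an exact excerpt:

/* Sketch of ip_conntrack_get(): both the conntrack pointer and the
 * per-packet state come straight from the sk_buff. */
static inline struct ip_conntrack *
ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
    *ctinfo = skb->nfctinfo;
    return (struct ip_conntrack *)skb->nfct;
}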
2.3 Implicit hook points

PREROUTING:  DEFRAG, CONNTRACK
OUTPUT:      DEFRAG, CONNTRACK
POSTROUTING: conn_help, SEQ_ADJUST, confirm
INPUT:       conn_help, SEQ_ADJUST, confirm

The priority values are defined as:

/* include/linux/netfilter_ipv4.h */
enum nf_ip_hook_priorities {
    NF_IP_PRI_FIRST = INT_MIN,
    NF_IP_PRI_CONNTRACK_DEFRAG = -400,
    NF_IP_PRI_RAW = -300,
    NF_IP_PRI_SELINUX_FIRST = -225,
    NF_IP_PRI_CONNTRACK = -200,
    NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD = -175,
    NF_IP_PRI_MANGLE = -150,
    NF_IP_PRI_NAT_DST = -100,
    NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT = -50,
    NF_IP_PRI_FILTER = 0,
    NF_IP_PRI_NAT_SRC = 100,
    NF_IP_PRI_SELINUX_LAST = 225,
    NF_IP_PRI_CONNTRACK_HELPER = INT_MAX - 2,
    NF_IP_PRI_NAT_SEQ_ADJUST = INT_MAX - 1,
    NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
    NF_IP_PRI_LAST = INT_MAX,
};

Netfilter control for bridged traffic is now officially part of the kernel, so it no longer needs a separate patch as it used to. The TCP sequence-number adjustment required after payload NAT also uses its own nf_hook_ops entries, attached at the INPUT and POSTROUTING hooks; previously this, too, was done inside the NAT module:

/* net/ipv4/netfilter/ip_nat_standalone.c */
......
    {
        .hook       = ip_nat_adjust,
        .owner      = THIS_MODULE,
        .pf         = PF_INET,
        .hooknum    = NF_IP_POST_ROUTING,
        .priority   = NF_IP_PRI_NAT_SEQ_ADJUST,
    },
......
    {
        .hook       = ip_nat_adjust,
        .owner      = THIS_MODULE,
        .pf         = PF_INET,
        .hooknum    = NF_IP_LOCAL_IN,
        .priority   = NF_IP_PRI_NAT_SEQ_ADJUST,
    },
......

static unsigned int
ip_nat_adjust(unsigned int hooknum,
              struct sk_buff **pskb,
              const struct net_device *in,
              const struct net_device *out,
              int (*okfn)(struct sk_buff *))
{
    struct ip_conntrack *ct;
    enum ip_conntrack_info ctinfo;

    ct = ip_conntrack_get(*pskb, &ctinfo);
    if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
        DEBUGP("ip_nat_standalone: adjusting sequence number\n");
        if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
            return NF_DROP;
    }
    return NF_ACCEPT;
}

The IPS_SEQ_ADJUST_BIT flag is set in ip_nat_mangle_tcp_packet() when it finds that the length of the rewritten data differs from the original length.

2.4 NAT for ordinary packets

NAT itself now only touches the TCP/IP headers, that is, the addresses and the transport-layer ports; it no longer rewrites application-layer data. For packets of an already ESTABLISHED connection whose NAT mapping has been set up, the handling function has changed from the old do_binding() to ip_nat_packet():

/* net/ipv4/netfilter/ip_nat_core.c */

/* Do packet manipulations according to ip_nat_setup_info. */
unsigned int ip_nat_packet(struct ip_conntrack *ct,
                           enum ip_conntrack_info ctinfo,
                           unsigned int hooknum,
                           struct sk_buff **pskb)
{
    enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
    unsigned long statusbit;
    enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);

    /* Decide which kind of manipulation this hook performs */
    if (mtype == IP_NAT_MANIP_SRC)
        statusbit = IPS_SRC_NAT;
    else
        statusbit = IPS_DST_NAT;

    /* Invert if this is reply dir. */
    if (dir == IP_CT_DIR_REPLY)
        statusbit ^= IPS_NAT_MASK;

    /* Non-atomic: these bits don't change. */
    /* The NAT type bits in ct->status are set by ip_nat_setup_info(), which
     * establishes the NAT mapping. */
    if (ct->status & statusbit) {
        struct ip_conntrack_tuple target;

        /* We are aiming to look like inverse of other direction. */
        invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

        /* Rewrite the packet */
        if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
            return NF_DROP;
    }
    return NF_ACCEPT;
}

As can be seen, ip_nat_packet() is considerably more concise than the old do_binding(). (A small standalone illustration of its status-bit logic follows the status-bit definitions in the next subsection.)

2.5 Connection status bits

/* include/linux/netfilter/nf_conntrack_common.h */

/* Bitset representing status of connection. */
enum ip_conntrack_status {
    /* It's an expected connection: bit 0 set.  This bit never changed */
    IPS_EXPECTED_BIT = 0,
    IPS_EXPECTED = (1 << IPS_EXPECTED_BIT),

    /* We've seen packets both ways: bit 1 set.  Can be set, not unset. */
    IPS_SEEN_REPLY_BIT = 1,
    IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT),

    /* Conntrack should never be early-expired. */
    IPS_ASSURED_BIT = 2,
    IPS_ASSURED = (1 << IPS_ASSURED_BIT),

    /* Connection is confirmed: originating packet has left box */
    IPS_CONFIRMED_BIT = 3,
    IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT),

    /* Connection needs src nat in orig dir.  This bit never changed. */
    IPS_SRC_NAT_BIT = 4,
    IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT),

    /* Connection needs dst nat in orig dir.  This bit never changed. */
    IPS_DST_NAT_BIT = 5,
    IPS_DST_NAT = (1 << IPS_DST_NAT_BIT),

    /* Both together. */
    IPS_NAT_MASK = (IPS_DST_NAT | IPS_SRC_NAT),

    /* Connection needs TCP sequence adjusted. */
    IPS_SEQ_ADJUST_BIT = 6,
    IPS_SEQ_ADJUST = (1 << IPS_SEQ_ADJUST_BIT),

    /* NAT initialization bits. */
    IPS_SRC_NAT_DONE_BIT = 7,
    IPS_SRC_NAT_DONE = (1 << IPS_SRC_NAT_DONE_BIT),

    IPS_DST_NAT_DONE_BIT = 8,
    IPS_DST_NAT_DONE = (1 << IPS_DST_NAT_DONE_BIT),

    /* Both together */
    IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE),

    /* Connection is dying (removed from lists), can not be unset. */
    IPS_DYING_BIT = 9,
    IPS_DYING = (1 << IPS_DYING_BIT),
};

The status field of struct ip_conntrack describes the state of the connection using the bits of this enum; compared with 2.4 it has gained several new bits related to NAT.
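To make the direction handling in ip_nat_packet() concrete, here is a small standalone illustration (ordinary user-space C, not kernel code) for a connection that was source-NATed in its original direction. It shows why XORing the status bit with IPS_NAT_MASK makes the same conntrack entry rewrite the source of outgoing packets at POSTROUTING and restore the destination of reply packets at PREROUTING, while leaving other combinations untouched; the IPS_* values are copied from the enum above.

/* Standalone illustration of the statusbit logic in ip_nat_packet() for a
 * connection that needs SNAT in its original direction. Not kernel code. */
#include <stdio.h>

#define IPS_SRC_NAT  (1UL << 4)
#define IPS_DST_NAT  (1UL << 5)
#define IPS_NAT_MASK (IPS_DST_NAT | IPS_SRC_NAT)

enum manip { MANIP_SRC, MANIP_DST };    /* what HOOK2MANIP() yields */
enum dir   { DIR_ORIGINAL, DIR_REPLY };

/* Mirrors the decision made inside ip_nat_packet(). */
static int needs_manip(unsigned long ct_status, enum manip mtype, enum dir d)
{
    unsigned long statusbit = (mtype == MANIP_SRC) ? IPS_SRC_NAT : IPS_DST_NAT;

    if (d == DIR_REPLY)
        statusbit ^= IPS_NAT_MASK;      /* invert for reply-direction packets */

    return (ct_status & statusbit) != 0;
}

int main(void)
{
    unsigned long ct_status = IPS_SRC_NAT;  /* as set by ip_nat_setup_info() */

    /* POSTROUTING maps to source manipulation, PREROUTING to destination
     * manipulation. */
    printf("orig  @POSTROUTING (src manip): %d\n",
           needs_manip(ct_status, MANIP_SRC, DIR_ORIGINAL));   /* 1: rewrite src */
    printf("reply @PREROUTING  (dst manip): %d\n",
           needs_manip(ct_status, MANIP_DST, DIR_REPLY));      /* 1: restore dst */
    printf("reply @POSTROUTING (src manip): %d\n",
           needs_manip(ct_status, MANIP_SRC, DIR_REPLY));      /* 0: untouched */
    return 0;
}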
3. Conclusion

The NAT code in 2.6.1* and later kernels has changed considerably. NAT now concentrates solely on rewriting IP-layer and transport-layer parameters; payload-level rewriting has been removed from it and is done instead by the connection-tracking helper modules. This simplifies the system, and obtaining connection information from an sk_buff has also become simpler, so in theory the system should be somewhat more efficient.

Published: 2006-08-28, last modified: 2006-08-28 08:30

Comments

rha030 (2007-11-23 09:55:14):
I have been studying netfilter recently and found your articles online; they have helped me a lot, so first of all, thank you! There is one thing I cannot figure out: how does NAT use conntrack->tuplehash, and what exactly do the ORIG and REPLY tuples refer to? Does the former mean the tuple before NAT translation? Also, if SNAT or DNAT is configured, the other direction does not need a corresponding rule; presumably that works because the tuplehash stores the necessary information? Once the first question is clear, this one probably is too. Below is a fragment from one of your blog articles that I could not follow... please advise! Thanks! My email is wudx05@gmail.com

... orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
    /* Obtain the connection's original-direction tuple (i.e. the pre-NAT
     * tuple) into curr_tuple by inverting the reply-direction tuple */
    invert_tuplepr(&curr_tuple,
                   &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

    /* Compute the translated tuple into new_tuple, including the
     * transport-layer translation */
    get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);

    /* Check whether the tuple changed; new_tuple is the new
     * original-direction tuple after NAT */
    if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        /* They differ, so perform NAT */
        struct ip_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Compute the reply-direction tuple of the translated connection
         * into reply */
        invert_tuplepr(&reply, &new_tuple);

        /* Update the connection's reply-direction tuple, i.e.
         * conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = reply */
        ip_conntrack_alter_reply(conntrack, &reply);

yfydz (2007-11-23 11:02:02):
ORIG is the network connection as seen by the initiator; REPLY is the connection as seen by the responder. For example, an internal machine opens 10.1.1.1:tcp:1024 -> 1.1.1.1:tcp:80; that is ORIG. After SNAT the source becomes 2.2.2.2 and the post-SNAT port is 1025, so the REPLY connection is 1.1.1.1:tcp:80 -> 2.2.2.2:tcp:1025. ORIG's source is REPLY's destination, and ORIG's destination is REPLY's source; when the two do not simply mirror each other, the NAT operation is carried out.
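To make that example concrete, the following is a small standalone sketch (plain user-space C; the tuple struct and invert() are simplified, hypothetical stand-ins for ip_conntrack_tuple and invert_tuplepr(), not kernel code). It shows that the stored REPLY tuple of a SNATed connection is not simply the inverse of ORIG, which is exactly how the conntrack entry knows a translation has to be applied:

/* Standalone illustration of the ORIG/REPLY example above. */
#include <stdio.h>

struct tuple {
    const char *src_ip; unsigned src_port;
    const char *dst_ip; unsigned dst_port;
};

/* A plain inversion just swaps source and destination. */
static struct tuple invert(struct tuple t)
{
    struct tuple r = { t.dst_ip, t.dst_port, t.src_ip, t.src_port };
    return r;
}

static void show(const char *name, struct tuple t)
{
    printf("%-14s %s:%u -> %s:%u\n", name, t.src_ip, t.src_port,
           t.dst_ip, t.dst_port);
}

int main(void)
{
    /* tuplehash[IP_CT_DIR_ORIGINAL]: the connection as the initiator sends it. */
    struct tuple orig  = { "10.1.1.1", 1024, "1.1.1.1", 80 };

    /* tuplehash[IP_CT_DIR_REPLY] after SNAT to 2.2.2.2:1025: return traffic
     * is addressed to the mapped address/port, not to 10.1.1.1. */
    struct tuple reply = { "1.1.1.1", 80, "2.2.2.2", 1025 };

    show("ORIG:", orig);
    show("REPLY:", reply);
    show("invert(ORIG):", invert(orig));

    /* Because REPLY != invert(ORIG), the conntrack entry knows this
     * connection is NATed and can rewrite packets in both directions. */
    return 0;
}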