ip_queue的实现分析

ip_queue的实现分析本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整
ip_queue的实现分析
本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整性，严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源：http://yfydz.cublog.cn
1. 前言

ip_queue是netfilter提供的将网络数据包从内核传递到用户空间的方法，内核中要提供ip_queue支持，在用户层空间打开一个netlink的socket后就可以接收内核通过ip_queue所传递来的网络数据包，具体数据包类型可由iptables命令来确定，只要将规则动作设置为“-j QUEUE”即可。

之所以要命名为ip_queue，是因为这是一个队列处理过程，iptables规则把指定的包发给QUEUE是一个数据进入队列的过程，而用户空间程序通过netlink socket获取数据包进行裁定，结果返回内核，进行出队列的操作。

在iptables代码中，提供了libipq库，封装了对ipq的一些操作，用户层程序可以直接使用libipq库函数处理数据。

2. 用户层接口：libipq

libipq主要是在iptables-<version>/libipq/libipq.c中实现，提供了以下函数：

// 建立ipq的handle:
struct ipq_handle *ipq_create_handle(u_int32_t flags, u_int32_t protocol);

// 释放ipq handle
int ipq_destroy_handle(struct ipq_handle *h);

// 读取数据到buf中
ssize_t ipq_read(const struct ipq_handle *h,
                unsigned char *buf, size_t len, int timeout);

// 设置ipq拷贝模式
int ipq_set_mode(const struct ipq_handle *h, u_int8_t mode, size_t len);

// 从buf中解析数据包结构
ipq_packet_msg_t *ipq_get_packet(const unsigned char *buf);

// 返回包的类型
int ipq_message_type(const unsigned char *buf);

// 设置对数据包的裁决
int ipq_set_verdict(const struct ipq_handle *h,
                    ipq_id_t id,
                    unsigned int verdict,
                    size_t data_len,
                    unsigned char *buf);

有了libipq，用户层程序就很简单了，libipq.3中提供了一个实例，比较简单，只列出，不再赘述。

/*
 * This code is GPL.
 */
#include <linux/netfilter.h>
#include <libipq.h>
#include <stdio.h>

#define BUFSIZE 2048

static void die(struct ipq_handle *h)
{
    ipq_perror("passer");
    ipq_destroy_handle(h);
    exit(1);
}

int main(int argc, char **argv)
{
    int status;
    unsigned char buf[BUFSIZE];
    struct ipq_handle *h;

    h = ipq_create_handle(0, PF_INET);
    if (!h)
        die(h);

    status = ipq_set_mode(h, IPQ_COPY_PACKET, BUFSIZE);
    if (status < 0)
        die(h);

    do{
        status = ipq_read(h, buf, BUFSIZE, 0);
        if (status < 0)
            die(h);

        switch (ipq_message_type(buf)) {
            case NLMSG_ERROR:
                fprintf(stderr, "Received error message %d\n",
                        ipq_get_msgerr(buf));
                break;

            case IPQM_PACKET: {
                ipq_packet_msg_t *m = ipq_get_packet(buf);

                status = ipq_set_verdict(h, m->packet_id,
                                         NF_ACCEPT, 0, NULL);
                if (status < 0)
                    die(h);
                break;
            }

            default:
                fprintf(stderr, "Unknown message type!\n");
                break;
        }
    } while (1);

    ipq_destroy_handle(h);
    return 0;
}

3. 内核：数据包进入队列

以下内核代码版本为2.4.26。

在net/core/netfilter.c中，对于要进行NF_QUEUE动作的数据，其处理流程为：

nf_hook_slow()->nf_queue->queue_handler[pf].outfn

如果ip_queue模块有效，这个queue_handler[pf].outfn函数实际上是对应ipq_enqueue_packet()函数(net/ipv4/netfilter/ip_queue.c)，这是通过下面的函数进行登记的：

/* net/ipv4/netfilter/ip_queue.c */
...
    status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
...
ipq_enqueue_packet()函数：

static int
ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
{
    int status = -EINVAL;
    struct sk_buff *nskb;
    struct ipq_queue_entry *entry;

    // copy_mode是一个全局变量，IPQ_COPY_NONE表示还没初始化，数据包会被丢弃
    // 通常要初始化为IPQ_COPY_META(只拷贝META信息到用户层)或
    // IPQ_COPY_PACKET(拷贝全部信息到用户层)
    if (copy_mode == IPQ_COPY_NONE)
        return -EAGAIN;

    // 记录数据包的相关信息，包括其路由信息
    entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
    if (entry == NULL) {
        printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
        return -ENOMEM;
    }

    entry->info = info;
    entry->skb = skb;

    if (entry->info->hook == NF_IP_LOCAL_OUT) {
        // 在OUTPUT点进行QUEUE时记录相关路由信息：TOS，源、目的IP
        struct iphdr *iph = skb->nh.iph;

        entry->rt_info.tos = iph->tos;
        entry->rt_info.daddr = iph->daddr;
        entry->rt_info.saddr = iph->saddr;
    }

    // 生成一个新的skb包，该包中保存关于entry的信息，其数据部分是准备传递给用户
    // 空间的数据结构也就是libipq所读取的数据内容，如果拷贝模式是IPQ_COPY_META，
    // 只包含ipq数据头信息；如果是IPQ_COPY_PACKET，在ipq数据头后拷贝整个skb包
    // IP数据信息
    nskb = ipq_build_packet_message(entry, &status);
    if (nskb == NULL)
        goto err_out_free;

    write_lock_bh(&queue_lock);

    if (!peer_pid)
        goto err_out_free_nskb;

    /* netlink_unicast will either free the nskb or attach it to a socket */
    // 将该skb附加到用户层打开的netlink socket上，放到其等待队列中，如果不成功则丢弃该包
    // ipqnl是ip_queue对应的netlink sock
    // peer_pid是用户空间程序的pid
    status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
    if (status < 0)
        goto err_out_unlock;

    // 将entry信息入QUEUE队列，等待用户层的处理结果，如果队列满则丢弃该包
    status = __ipq_enqueue_entry(entry);
    if (status < 0)
        goto err_out_unlock;

    write_unlock_bh(&queue_lock);
    return status;

err_out_free_nskb:
    kfree_skb(nskb);

err_out_unlock:
    write_unlock_bh(&queue_lock);

err_out_free:
    kfree(entry);
    return status;
}

所附加的META数据是这样一个结构:

/* include/linux/netlink.h */
struct nlmsghdr
{
    __u32  nlmsg_len; /* Length of message including header */
    __u16  nlmsg_type; /* Message content */
    __u16  nlmsg_flags; /* Additional flags */
    __u32  nlmsg_seq; /* Sequence number */
    __u32  nlmsg_pid; /* Sending process PID */
};

一旦数据进入了netlink sock的输入队列中，用户层对数据的读取就由netlink 
sock来处理了，ip_queue就不再管了，ip_queue只需要处理从用户层发来的数据，从用户层看是对netlink socket的写，从内核的ip_queue看是对用户层数据的读取过程。

4. 内核：读取用户层数据

ip_queue要读取netlink socket中返回的处理数据结果，函数流程为：

ipq_rcv_sk()
   |
   V
ipq_rcv_skb()
   |
   V
ipq_receive_peer()
   |
   |------------------------------+
   V                              V
ipq_set_verdict()             ipq_set_mode()
   |                              |
   V                              V
ipq_find_dequeue_entry()      __ipq_set_mode()
ipq_issue_verdict()
   |
   V
nf_reinject()

在模块初始化时建立netlink sock：

    ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);

其接收数据函数为ipq_rcv_sk()：

static void
ipq_rcv_sk(struct sock *sk, int len)
{
    do {
        struct sk_buff *skb;

        if (down_trylock(&ipqnl_sem))
            return;

        // 从sock的等待队列中取出skb
        while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
            // 接收skb内容，skb中的数据格式和发送skb到ipq的格式是一样的，前面是ipq的
            // 控制头，即META部分, 后面才是真正的skb中的数据
            ipq_rcv_skb(skb);
            // 丢弃skb包，这个skb本来就不是正常的网络skb，而是ipq通信的skb
            kfree_skb(skb);
        }

        up(&ipqnl_sem);

    } while (ipqnl && ipqnl->receive_queue.qlen);
}

ipq_rcv_skb()函数本身都是在为ipq_receive_peer()函数作准备，忽略；ipq_receive_peer函数：

static int
ipq_receive_peer(struct ipq_peer_msg *pmsg,
                 unsigned char type, unsigned int len)
{
    int status = 0;

    if (len < sizeof(*pmsg))
        return -EINVAL;

    switch (type) {
    // 设置IPQ的拷贝模式：IPQ_COPY_META or IPQ_COPY_PACKET
    case IPQM_MODE:
        status = ipq_set_mode(pmsg->msg.mode.value,
                              pmsg->msg.mode.range);
        break;

    // 处理数据包的裁决
    case IPQM_VERDICT:
        if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
            status = -EINVAL;
        else
            status = ipq_set_verdict(&pmsg->msg.verdict,
                                     len - sizeof(*pmsg));
        break;

    default:
        status = -EINVAL;
    }
    return status;
}

ipq_set_verdict()函数：

static int
ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
{
    struct ipq_queue_entry *entry;

    if (vmsg->value > NF_MAX_VERDICT)
        return -EINVAL;

    // 根据包的ID找出以前放入QUEUE队列中的ipq_queue_entry结构，该结构保存
    // 最初的skb包的地址
    entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
    if (entry == NULL)
        return 
            -ENOENT;
    else {
        int verdict = vmsg->value;

        if (vmsg->data_len && vmsg->data_len == len)
            // 如果数据被用户层修改，将修改后的信息替换skb中原来的信息
            if (ipq_mangle_ipv4(vmsg, entry) < 0)
                verdict = NF_DROP;

        // 最终进行裁定
        ipq_issue_verdict(entry, verdict);
        return 0;
    }
}

ipq_issue_verdict()函数：

static void
ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
{
    // 所有QUEUE的包都要由该函数返回netfilter，在net/core/netfilter.c中定义
    nf_reinject(entry->skb, entry->info, verdict);
    kfree(entry);
}

5. 结论

ip_queue工具的提供使得很多在内核里不太容易实现的功能可以放到用户层空间内实现，处理安全性高，毕竟内核中的错误会导致系统崩溃，而用户层程序的出错不影响系统的整体运行，当然这是以性能降低为代价的。

ip_queue队列实现是使用queue_handler的，queue_handler对于每个协议族只支持一个队列，所以如果有两个需要使用queue功能的应用就会发生冲突，如实现QoS的IMQ也使用这个队列，因此两者在内核中是不能共存的。