Linux内核中netlink协议族的实现(下)
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严
禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
5.3 连接 连接通常是针对客户端连接服务器 static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags){ int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr; if (addr->sa_family == AF_UNSPEC) {// 目的地址协议族为AF_UNSPEC(未指定), 简单返回成功 sk->sk_state = NETLINK_UNCONNECTED; nlk->dst_pid = 0; nlk->dst_group = 0; return 0; }// 限制目的地址协议族类型为AF_NETLINK if (addr->sa_family != AF_NETLINK) return -EINVAL; /* Only superuser is allowed to send multicasts */// 只有ROOT权限才能多播 if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM;// 没指定pid的话自动绑定一个pid if (!nlk->pid) err = netlink_autobind(sock); if (err == 0) {// 已经指定了pid或者自动绑定成功时设置sock的对方参数, 状态为连接成功 sk->sk_state = NETLINK_CONNECTED; nlk->dst_pid = nladdr->nl_pid; nlk->dst_group = ffs(nladdr->nl_groups); } return err;} 5.4 获取sock名称// 填充sockaddr_nl结构中的数据 static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer){ struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr;// 协议族 nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; *addr_len = sizeof(*nladdr); if (peer) {// 对方sock的pid和groups nladdr->nl_pid = nlk->dst_pid; nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else {// 自己sock的pid和groups nladdr->nl_pid = nlk->pid; nladdr->nl_groups = nlk->groups ? 
nlk->groups[0] : 0; } return 0;} 5.5 poll poll是用poll(2)或select(2)系统调用选择套接口数据是否准备好时的处理函数,netlink用的是通用的数据报的poll处理函数datagram_poll(), 说明略。 5.6 setsockopt 设置netlink sock的各种控制参数: static int netlink_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen){ struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); int val = 0, err;// sock层次要为SOL_NETLINK if (level != SOL_NETLINK) return -ENOPROTOOPT;// 读取用户空间的设置信息 if (optlen >= sizeof(int) && get_user(val, (int __user *)optval)) return -EFAULT; switch (optname) { case NETLINK_PKTINFO:// 处理NETLINK_RECV_PKTINFO标志, 非0设置, 0为清除 if (val) nlk->flags |= NETLINK_RECV_PKTINFO; else nlk->flags &= ~NETLINK_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: {// 加入或退出多播组 unsigned int subscriptions; int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;// 检查权限 if (!netlink_capable(sock, NL_NONROOT_RECV)) return -EPERM;// 如果当前sock的多播组为空时分配空间 if (nlk->groups == NULL) { err = netlink_alloc_groups(sk); if (err) return err; }// 检查数据范围 if (!val || val - 1 >= nlk->ngroups) return -EINVAL; netlink_table_grab();// 原来的状态标志 old = test_bit(val - 1, nlk->groups);// 如果old=1, new=0, subscriptions-1// 如果old=0, new=1, subscriptions+1 subscriptions = nlk->subscriptions - old + new;// 设置或清除相应状态标志 if (new) __set_bit(val - 1, nlk->groups); else __clear_bit(val - 1, nlk->groups);// 更新sock参数 netlink_update_subscriptions(sk, subscriptions); netlink_update_listeners(sk); netlink_table_ungrab(); err = 0; break; } default: err = -ENOPROTOOPT; } return err;}// 分配netlink sock的多播组空间static int netlink_alloc_groups(struct sock *sk){ struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; int err = 0; netlink_lock_table();// 组的数量是内核初始化时固定的, 最小值32, 尽量是8的倍数 groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) err = -ENOENT; netlink_unlock_table(); if (err) return err;// NLGRPSZ(groups)进行8字节对齐 nlk->groups = kzalloc(NLGRPSZ(groups), GFP_KERNEL); if (nlk->groups 
== NULL) return -ENOMEM; nlk->ngroups = groups; return 0;}5.7 getsockopt获取netlink sock的各种控制参数:static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen){ struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); int len, val, err;// sock层次要为SOL_NETLINK if (level != SOL_NETLINK) return -ENOPROTOOPT;// 读取用户空间的查询信息 if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO:// 只提供一种选项信息PKTINFO if (len < sizeof(int)) return -EINVAL; len = sizeof(int);// 看sock标志是否有NETLINK_RECV_PKTINFO返回1或0 val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; default: err = -ENOPROTOOPT; } return err;}5.8 发送消息从用户层发送数据到内核, 内核的sock是接收方static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len){// sock的IO控制块 struct sock_iocb *siocb = kiocb_to_siocb(kiocb);// socket -> sock struct sock *sk = sock->sk;// sock -> netlink sock struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *addr=msg->msg_name; u32 dst_pid; u32 dst_group; struct sk_buff *skb; int err;// scm: Socket level control messages processing struct scm_cookie scm;// 设置了OOB(out of band)标志, 在TCP中支持,netlink不支持 if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; if (NULL == siocb->scm) siocb->scm = &scm;// scm这些处理是干什么的以后再看 err = scm_send(sock, msg, siocb->scm); if (err < 0) return err;// 确定目的pid和组 if (msg->msg_namelen) { if (addr->nl_family != AF_NETLINK) return -EINVAL; dst_pid = addr->nl_pid; dst_group = ffs(addr->nl_groups); if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM; } else { dst_pid = nlk->dst_pid; dst_group = nlk->dst_group; }// 如果sock的pid为0, 自动绑定一个pid if (!nlk->pid) { err = netlink_autobind(sock); if (err) goto out; } err = -EMSGSIZE;// 消息长度太大 if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS;// 新生成一个skb数据包 skb = nlmsg_new(len, GFP_KERNEL); if (skb==NULL) goto 
out;// 设置该skb的netlink控制块参数 NETLINK_CB(skb).pid = nlk->pid; NETLINK_CB(skb).dst_pid = dst_pid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); selinux_get_task_sid(current, &(NETLINK_CB(skb).sid)); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); /* What can I do? Netlink is asynchronous, so that we will have to save current capabilities to check them, when this message will be delivered to corresponding kernel module. --ANK (980802) */ err = -EFAULT;// 将发送的信息拷贝到skb的存储区 if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { kfree_skb(skb); goto out; }/* @netlink_send: * Save security information for a netlink message so that permission * checking can be performed when the message is processed. The security * information can be saved using the eff_cap field of the * netlink_skb_parms structure. Also may be used to provide fine * grained control over message transmission. * @sk associated sock of task sending the message., * @skb contains the sk_buff structure for the netlink message. * Return 0 if the information was successfully saved and message * is allowed to be transmitted. 
*/ err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; }// 如果是多播的,先进行广播发送 if (dst_group) {// 增加使用者计数, 使skb不会真正释放 atomic_inc(&skb->users); netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); }// 单播发送 err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);out: return err;}// netlink广播, 发送到组内的全部sockint netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, u32 group, gfp_t allocation){// netlink广播数据结构信息 struct netlink_broadcast_data info; struct hlist_node *node; struct sock *sk;// 调整skb空间 skb = netlink_trim(skb, allocation);// 填充info结构基本参数 info.exclude_sk = ssk; info.pid = pid; info.group = group; info.failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table();// 遍历多播链表, 分别对每个sock进行单播 sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info);// 释放skb, 其实没有立即释放, 要先减少使用者数 kfree_skb(skb); netlink_unlock_table();// 如果分配了skb2,释放之 if (info.skb2) kfree_skb(info.skb2); if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT)) yield(); return 0; } if (info.failure) return -ENOBUFS; return -ESRCH;}// 单一广播static inline int do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p){ struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) goto out;// 检查pid和组是否合法 if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->failure) { netlink_overrun(sk); goto out; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) {// 克隆skb p->skb2 = skb_clone(p->skb, p->allocation); } else {// 此时skb2不会为NULL的 p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) {// 如果还是为NULL必然是克隆失败 netlink_overrun(sk); /* Clone failed. Notify ALL listeners. 
*/ p->failure = 1;// 否则发送skb2 } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); } else {// 数据正常发送 p->congested |= val; p->delivered = 1; p->skb2 = NULL; } sock_put(sk);out: return 0;}static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb){ struct netlink_sock *nlk = nlk_sk(sk);// 接收缓冲中要有足够空间 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { skb_set_owner_r(skb, sk);// 添加到接收队列尾, 由于是本机内部通信, 可以自己找到要发送的目的方,// 所以直接将数据扔给目的方, 所以是接收队列 skb_queue_tail(&sk->sk_receive_queue, skb);// 调用netlink sock的sk_data_ready函数处理, 由此进入内核中netlink各协议// 的回调处理 sk->sk_data_ready(sk, skb->len); return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; } return -1;}// netlink单播 int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock){ struct sock *sk; int err; long timeo;// 调整skb大小 skb = netlink_trim(skb, gfp_any());// 获取超时时间 timeo = sock_sndtimeo(ssk, nonblock);retry:// ssk是服务器端的sock, 然后根据pid找到客户端的sock sk = netlink_getsockbypid(ssk, pid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); }// 将数据包附着在客户端sock上 err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; if (err) return err;// 发送netlink数据包 return netlink_sendskb(sk, skb, ssk->sk_protocol);}/* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. 
*/// 注意这个是内核全局函数, 非static int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo, struct sock *ssk){ struct netlink_sock *nlk; nlk = nlk_sk(sk);// 检查接收缓存大小是否足够, 不够的话阻塞等待直到出错或条件满足 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) {// 声明当前进程的等待队列 DECLARE_WAITQUEUE(wait, current); if (!timeo) { if (!ssk || nlk_sk(ssk)->pid == 0) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; }// 设置当前进程状态为可中断的 __set_current_state(TASK_INTERRUPTIBLE);// 将当前进程挂接到sock的等待队列 add_wait_queue(&nlk->wait, &wait);// 空间不够的话阻塞, timeo为阻塞超时 if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) timeo = schedule_timeout(timeo);// 进程状态运行 __set_current_state(TASK_RUNNING);// 删除等待队列 remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) {// 当前进程有待处理的信号(阻塞被信号打断), 而不是空间条件符合解开, 属于错误状态 kfree_skb(skb); return sock_intr_errno(timeo); }// 返回1, 重新选sock return 1; }// 条件满足, 直接将skb的所有者设为该netlink sock skb_set_owner_r(skb, sk); return 0;}// 注意这个是内核全局函数, 非static int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol){ int len = skb->len;// 将skb添加到接收队列末尾 skb_queue_tail(&sk->sk_receive_queue, skb);// 调用netlink sock的sk_data_ready函数处理 sk->sk_data_ready(sk, len); sock_put(sk); return len;} 5.9 接收消息数据是内核传向用户空间的static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len, int flags){// sock的IO控制块 struct sock_iocb *siocb = kiocb_to_siocb(kiocb);// scm struct scm_cookie scm;// socket -> sock struct sock *sk = sock->sk;// sock -> netlink sock struct netlink_sock *nlk = nlk_sk(sk);// 是否是非阻塞的 int noblock = flags&MSG_DONTWAIT; size_t copied; struct sk_buff *skb; int err;// 不能带OOB标志 if (flags&MSG_OOB) return -EOPNOTSUPP; copied = 0;// 接收一个数据包 skb = skb_recv_datagram(sk,flags,noblock,&err); if (skb==NULL) goto out; msg->msg_namelen = 0;// 收到的实际数据长度 copied = skb->len;// 接收缓冲小于数据长度, 设置数据裁剪标志 if (len < copied) { msg->msg_flags |= 
MSG_TRUNC; copied = len; } skb->h.raw = skb->data;// 将skb的数据拷贝到接收缓冲区 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (msg->msg_name) {// sock有效, 填写nl sock的数据 struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name; addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).pid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); }// 接收数据包信息标志, 将消息头拷贝到用户空间 if (nlk->flags & NETLINK_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); if (NULL == siocb->scm) { memset(&scm, 0, sizeof(scm)); siocb->scm = &scm; } siocb->scm->creds = *NETLINK_CREDS(skb); skb_free_datagram(sk, skb); if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) netlink_dump(sk); scm_recv(sock, msg, siocb->scm, flags);out:// 接收唤醒 netlink_rcv_wake(sk); return err ? : copied;} 6. 结论netlink处理代码不是很好懂, 毕竟和其他协议不同之处是内核中同时存在服务器和客户端的sock, 因此接收发送数据要注意数据的流向。不过在实际使用中感觉不是很稳定, 流量大时会发生各种奇异的死机现象。