
ip_vs Implementation Analysis (8)

2012-09-13 

ip_vs Implementation Analysis (8)
The Copyleft of this document belongs to yfydz and it is released under the GPL. It may be freely copied and reproduced; please keep the document intact when reproducing it. Any commercial use is strictly forbidden.
msn: yfydz_no1@hotmail.com
Source: http://yfydz.cublog.cn

10. IPVS synchronization

IPVS supports synchronization of connection state. Two IPVS machines can run as MASTER and BACKUP respectively: the MASTER process replicates connection information to the BACKUP machine, so that when the primary machine dies the secondary can take over seamlessly. A machine may also run a MASTER and a BACKUP process at the same time, so that two IPVS machines back each other up and the load can be balanced between them. The synchronization code lives in net/ipv4/ipvs/ip_vs_sync.c.

10.0 Data structures

A sync message has the following format: a 4-byte header followed by zero or more IPVS connection sync entries, so the message size is not fixed:

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (1)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                            .                                  |
      |                            .                                  |
      |                            .                                  |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (n)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

The message header:

#define SYNC_MESG_HEADER_LEN    4
struct ip_vs_sync_mesg {
    // number of connection entries
    __u8                    nr_conns;
    // synchronization ID
    __u8                    syncid;
    // total data length
    __u16                   size;
    /* ip_vs_sync_conn entries start here */
};

The connection sync entry:

struct ip_vs_sync_conn {
    __u8                    reserved;
    // basic connection information
    /* Protocol, addresses and port numbers */
    __u8                    protocol;       /* Which protocol (TCP/UDP) */
    __u16                   cport;
    __u16                   vport;
    __u16                   dport;
    __u32                   caddr;          /* client address */
    __u32                   vaddr;          /* virtual address */
    __u32                   daddr;          /* destination address */
    // connection state and flags
    /* Flags and state transition */
    __u16                   flags;          /* status flags */
    __u16                   state;          /* state info */
    // the connection options (the TCP sequence and acknowledgment
    // numbers) may follow here
    /* The sequence options start here */
};

The connection sync options structure, which carries the TCP sequence-number information for the incoming and outgoing directions:

struct ip_vs_sync_conn_options {
    struct ip_vs_seq        in_seq;         /* incoming seq. struct */
    struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};
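To make the on-wire sizes concrete, here is a minimal user-space sketch; it is my own illustration, not kernel code. The structures are re-declared locally with fixed-width types, and the two size macros are assumed to match the sizeof-based definitions in ip_vs_sync.c:

/* Sketch: re-declare the on-wire structures and print the sizes
 * that bound how many entries fit into one sync message.
 * Assumes sizeof-based SIMPLE_CONN_SIZE/FULL_CONN_SIZE as in
 * ip_vs_sync.c. */
#include <stdio.h>
#include <stdint.h>

struct ip_vs_seq { uint32_t init_seq, delta, previous_delta; };

struct ip_vs_sync_mesg {
    uint8_t  nr_conns;
    uint8_t  syncid;
    uint16_t size;
};

struct ip_vs_sync_conn {
    uint8_t  reserved, protocol;
    uint16_t cport, vport, dport;
    uint32_t caddr, vaddr, daddr;
    uint16_t flags, state;
};

struct ip_vs_sync_conn_options {
    struct ip_vs_seq in_seq, out_seq;
};

#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE   (SIMPLE_CONN_SIZE + sizeof(struct ip_vs_sync_conn_options))

int main(void)
{
    printf("header:       %zu bytes\n", sizeof(struct ip_vs_sync_mesg)); /* 4 */
    printf("simple entry: %zu bytes\n", SIMPLE_CONN_SIZE);               /* 24 */
    printf("full entry:   %zu bytes\n", FULL_CONN_SIZE);                 /* 48 */
    return 0;
}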
The sync buffer control block:

struct ip_vs_sync_buff {
    // linked into the send queue
    struct list_head        list;
    unsigned long           firstuse;
    /* pointers for the message data */
    // the actual sync message
    struct ip_vs_sync_mesg  *mesg;
    // head of the free space in the buffer
    unsigned char           *head;
    // end of the buffer
    unsigned char           *end;
};

10.1 Starting the process

The IPVS sync process is a kernel thread, started on command by ipvsadm:

int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
    DECLARE_COMPLETION(startup);
    pid_t pid;

    // check whether the process already exists
    if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
        (state == IP_VS_STATE_BACKUP && sync_backup_pid))
        return -EEXIST;

    IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
    IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
              sizeof(struct ip_vs_sync_conn));

    ip_vs_sync_state |= state;
    if (state == IP_VS_STATE_MASTER) {
        // MASTER process: the interface used for sync traffic
        strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
        // synchronization ID
        ip_vs_master_syncid = syncid;
    } else {
        // BACKUP process: the interface used for sync traffic
        strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
        // synchronization ID
        ip_vs_backup_syncid = syncid;
    }

  repeat:
    // start the kernel thread
    if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
        IP_VS_ERR("could not create fork_sync_thread due to %d... "
                  "retrying.\n", pid);
        ssleep(1);
        goto repeat;
    }
    wait_for_completion(&startup);
    return 0;
}

fork_sync_thread() in turn forks another thread, so that the sync thread becomes a daemon:

static int fork_sync_thread(void *startup)
{
    pid_t pid;

    /* fork the sync thread here, then the parent process of the
       sync thread is the init process after this thread exits. */
  repeat:
    if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
        IP_VS_ERR("could not create sync_thread due to %d... "
                  "retrying.\n", pid);
        ssleep(1);
        goto repeat;
    }
    return 0;
}

static int sync_thread(void *startup)
{
    DECLARE_WAITQUEUE(wait, current);
    mm_segment_t oldmm;
    int state;
    const char *name;

    /* increase the module use count */
    ip_vs_use_count_inc();

    // pick the state and the process name
    if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
        state = IP_VS_STATE_MASTER;
        name = "ipvs_syncmaster";
    } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
        state = IP_VS_STATE_BACKUP;
        name = "ipvs_syncbackup";
    } else {
        IP_VS_BUG();
        ip_vs_use_count_dec();
        return -EINVAL;
    }

    // daemonize the process
    daemonize(name);

    oldmm = get_fs();
    set_fs(KERNEL_DS);

    /* Block all signals */
    // this process accepts no signals at all
    spin_lock_irq(&current->sighand->siglock);
    siginitsetinv(&current->blocked, 0);
    recalc_sigpending();
    spin_unlock_irq(&current->sighand->siglock);

    /* set the maximum length of sync message */
    set_sync_mesg_maxlen(state);

    /* set up multicast address */
    // parameters of the UDP multicast sock
    mcast_addr.sin_family = AF_INET;
    // port 8848
    mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
    // multicast group 224.0.0.81
    mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);

    // join the wait queue: data must be sent and received in the top
    // half; the bottom half only signals the wait queue to tell the
    // top half that it may go ahead and send or receive
    add_wait_queue(&sync_wait, &wait);

    // record the pid of the current process
    set_sync_pid(state, current->pid);
    complete((struct completion *)startup);

    /* processing master/backup loop here */
    // enter the MASTER or BACKUP loop
    if (state == IP_VS_STATE_MASTER)
        sync_master_loop();
    else if (state == IP_VS_STATE_BACKUP)
        sync_backup_loop();
    else
        IP_VS_BUG();

    // the loop has exited; leave the wait queue
    remove_wait_queue(&sync_wait, &wait);

    /* thread exits */
    set_sync_pid(state, 0);
    IP_VS_INFO("sync thread stopped!\n");

    set_fs(oldmm);

    /* decrease the module use count */
    ip_vs_use_count_dec();

    // reset the stop-sync flag to 0
    set_stop_sync(state, 0);
    wake_up(&stop_sync_wait);
    return 0;
}
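For orientation, the group and port set up above (224.0.0.81, UDP 8848) are ordinary multicast socket parameters. The sketch below is a hypothetical user-space analogue of what the BACKUP side's make_receive_sock() later does in-kernel, namely bind the sync port and join the sync group; the kernel uses in-kernel sock calls rather than these syscalls:

/* User-space analogue of the BACKUP receive setup: bind a UDP
 * socket to the IPVS sync port and join the sync multicast group.
 * Interface selection via INADDR_ANY is a simplification; the
 * kernel binds the group to the configured mcast_ifn instead. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in addr;
    struct ip_mreq mreq;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family      = AF_INET;
    addr.sin_port        = htons(8848);            /* IP_VS_SYNC_PORT */
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        return 1;
    }

    mreq.imr_multiaddr.s_addr = inet_addr("224.0.0.81"); /* IP_VS_SYNC_GROUP */
    mreq.imr_interface.s_addr = htonl(INADDR_ANY);
    if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) {
        perror("IP_ADD_MEMBERSHIP");
        return 1;
    }

    char buf[4096];
    ssize_t n = recv(fd, buf, sizeof(buf), 0);  /* would block for a sync message */
    printf("received %zd bytes\n", n);
    close(fd);
    return 0;
}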
10.2 Stopping the process

The sync process is likewise stopped through the ipvsadm command:

int stop_sync_thread(int state)
{
    DECLARE_WAITQUEUE(wait, current);

    // check whether the process is actually running
    if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
        (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
        return -ESRCH;

    IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
    IP_VS_INFO("stopping sync thread %d ...\n",
               (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);

    __set_current_state(TASK_UNINTERRUPTIBLE);
    // join the stop wait queue
    add_wait_queue(&stop_sync_wait, &wait);
    // set the stop flag to 1 so that the sync process will terminate
    set_stop_sync(state, 1);
    ip_vs_sync_state -= state;
    wake_up(&sync_wait);
    // reschedule and wait for the sync process to finish
    schedule();
    // by the time this process runs again the sync process should have exited
    __set_current_state(TASK_RUNNING);
    // leave the wait queue
    remove_wait_queue(&stop_sync_wait, &wait);

    /* Note: no need to reap the sync thread, because its parent
       process is the init process */
    // sanity check: the state and stop flags should be consistent now
    if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
        (state == IP_VS_STATE_BACKUP && stop_backup_sync))
        IP_VS_BUG();
    return 0;
}

10.3 MASTER loop

static void sync_master_loop(void)
{
    struct socket *sock;
    struct ip_vs_sync_buff *sb;

    /* create the sending multicast socket */
    // sync messages are sent out through this multicast sock
    sock = make_send_sock();
    if (!sock)
        return;

    IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
               "syncid = %d\n",
               ip_vs_master_mcast_ifn, ip_vs_master_syncid);

    // enter the endless loop
    for (;;) {
        // take sync buffers off the send queue
        while ((sb = sb_dequeue())) {
            // send the sync buffer
            ip_vs_send_sync_msg(sock, sb->mesg);
            // release the buffer
            ip_vs_sync_buff_release(sb);
        }

        /* check if entries stay in curr_sb for 2 seconds */
        // if the current buffer has not filled up within 2 seconds,
        // send it out as it is; in the worst case it contains no
        // connection entries, only the header, which then serves as a
        // keepalive telling the BACKUP that the MASTER is still alive
        if ((sb = get_curr_sync_buff(2*HZ))) {
            ip_vs_send_sync_msg(sock, sb->mesg);
            ip_vs_sync_buff_release(sb);
        }

        // break out of the loop when the MASTER stop flag is set
        if (stop_master_sync)
            break;

        // sleep for one second
        ssleep(1);
    }

    // loop exited: drain the send queue (note that the remaining
    // buffers are only released here, not sent)
    /* clean up the sync_buff queue */
    while ((sb = sb_dequeue())) {
        ip_vs_sync_buff_release(sb);
    }

    /* clean up the current sync_buff */
    // fetch the current buffer immediately (the one still being built
    // and not yet queued) and release it
    if ((sb = get_curr_sync_buff(0))) {
        ip_vs_sync_buff_release(sb);
    }

    /* release the sending multicast socket */
    // release the UDP multicast sock
    sock_release(sock);
}

Taking a sync buffer off the queue:

static inline struct ip_vs_sync_buff * sb_dequeue(void)
{
    struct ip_vs_sync_buff *sb;

    spin_lock_bh(&ip_vs_sync_lock);
    // check whether the queue is empty
    if (list_empty(&ip_vs_sync_queue)) {
        sb = NULL;
    } else {
        // take the first node of the queue
        sb = list_entry(ip_vs_sync_queue.next,
                        struct ip_vs_sync_buff,
                        list);
        // unlink it from the list
        list_del(&sb->list);
    }
    spin_unlock_bh(&ip_vs_sync_lock);
    return sb;
}

Sending a sync message:

static void
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
    int msize;

    msize = msg->size;

    /* Put size in network byte order */
    // lengths on the wire must be in network byte order
    msg->size = htons(msg->size);

    // send the message as an ordinary data buffer
    if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
        IP_VS_ERR("ip_vs_send_async error\n");
}

Sending the buffer just calls kernel_sendmsg():

static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
    struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
    struct kvec iov;
    int len;

    EnterFunction(7);
    iov.iov_base = (void *)buffer;
    iov.iov_len  = length;
    len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
    LeaveFunction(7);
    return len;
}

Fetching the current sync buffer:

static inline struct ip_vs_sync_buff *
get_curr_sync_buff(unsigned long time)
{
    struct ip_vs_sync_buff *sb;

    spin_lock_bh(&curr_sb_lock);
    // return the current buffer if it exists and either an immediate
    // return was requested (time == 0) or the buffer has been held
    // for the given number of jiffies
    if (curr_sb && (time == 0 ||
                    time_before(jiffies - curr_sb->firstuse, time))) {
        sb = curr_sb;
        curr_sb = NULL;
    } else
        sb = NULL;
    spin_unlock_bh(&curr_sb_lock);
    return sb;
}
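A small sketch of the header-only keepalive case described above. This is an illustration, not the kernel code: the struct and function names are local, and the layout mirrors struct ip_vs_sync_mesg with the size field converted to network byte order exactly as ip_vs_send_sync_msg() does before sending:

/* Sketch: what the MASTER sends when a buffer times out with no
 * connection entries -- a 4-byte header-only message meaning
 * "I'm still alive". */
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

struct sync_mesg_hdr {       /* mirrors struct ip_vs_sync_mesg */
    uint8_t  nr_conns;
    uint8_t  syncid;
    uint16_t size;
};

/* Serialize a header-only sync message into buf; returns bytes used. */
static size_t build_heartbeat(uint8_t syncid, unsigned char *buf)
{
    struct sync_mesg_hdr h;

    h.nr_conns = 0;                 /* no connection entries */
    h.syncid   = syncid;
    h.size     = htons(sizeof(h));  /* size travels in network byte order */
    memcpy(buf, &h, sizeof(h));
    return sizeof(h);
}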
10.4 BACKUP loop

static void sync_backup_loop(void)
{
    struct socket *sock;
    char *buf;
    int len;

    // allocate the receive buffer
    if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
        IP_VS_ERR("sync_backup_loop: kmalloc error\n");
        return;
    }

    /* create the receiving multicast socket */
    // create the UDP receive sock
    sock = make_receive_sock();
    if (!sock)
        goto out;

    IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
               "syncid = %d\n",
               ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);

    // enter the receive loop
    for (;;) {
        /* do you have data now? */
        // while the receive queue is not empty
        while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
            // copy the data into the buffer
            if ((len = ip_vs_receive(sock, buf,
                                     sync_recv_mesg_maxlen)) <= 0) {
                IP_VS_ERR("receiving message error\n");
                break;
            }
            /* disable bottom half, because it accessed the data
               shared by softirq while getting/creating conns */
            // no bottom half may run while the data is being processed
            local_bh_disable();
            // process the received data
            ip_vs_process_message(buf, len);
            local_bh_enable();
        }

        // check whether the stop flag has been set
        if (stop_backup_sync)
            break;

        // sleep for one second
        ssleep(1);
    }

    /* release the sending multicast socket */
    // release the UDP sock
    sock_release(sock);

  out:
    // free the receive buffer
    kfree(buf);
}

The receive function is simple; it directly calls the kernel's kernel_recvmsg():

static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
    struct msghdr msg = {NULL,};
    struct kvec iov;
    int len;

    EnterFunction(7);

    /* Receive a packet */
    iov.iov_base = buffer;
    iov.iov_len  = (size_t)buflen;
    len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
    if (len < 0)
        return -1;

    LeaveFunction(7);
    return len;
}

Processing the received data:

/*
 *      Process received multicast message and create the corresponding
 *      ip_vs_conn entries.
 */
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
    struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
    struct ip_vs_sync_conn *s;
    struct ip_vs_sync_conn_options *opt;
    struct ip_vs_conn *cp;
    char *p;
    int i;

    /* Convert size back to host byte order */
    m->size = ntohs(m->size);

    // check that the received length is correct
    if (buflen != m->size) {
        IP_VS_ERR("bogus message\n");
        return;
    }

    /* SyncID sanity check */
    // check that the sync ID matches
    if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
        IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
                  m->syncid);
        return;
    }

    // the connection entries follow right after the message header;
    // p now points to the first entry
    p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
    for (i = 0; i < m->nr_conns; i++) {
        // read the connection entries out of the buffer one by one
        unsigned flags;

        s = (struct ip_vs_sync_conn *)p;
        flags = ntohs(s->flags);
        // look up the connection described by the entry
        if (!(flags & IP_VS_CONN_F_TEMPLATE))
            cp = ip_vs_conn_in_get(s->protocol,
                                   s->caddr, s->cport,
                                   s->vaddr, s->vport);
        else
            cp = ip_vs_ct_in_get(s->protocol,
                                 s->caddr, s->cport,
                                 s->vaddr, s->vport);
        if (!cp) {
            // no such connection: a connection newly created on the
            // MASTER has been synced over; create it with dest set to
            // NULL, marking it as produced by synchronization rather
            // than by the BACKUP itself
            cp = ip_vs_conn_new(s->protocol,
                                s->caddr, s->cport,
                                s->vaddr, s->vport,
                                s->daddr, s->dport,
                                flags, NULL);
            if (!cp) {
                IP_VS_ERR("ip_vs_conn_new failed\n");
                return;
            }
            // set the connection state
            cp->state = ntohs(s->state);
        } else if (!cp->dest) {
            // the connection exists but has no dest pointer, so it was
            // created by synchronization, not by the BACKUP itself
            /* it is an entry created by the synchronization */
            cp->state = ntohs(s->state);
            cp->flags = flags | IP_VS_CONN_F_HASHED;
        }   /* Note that we don't touch its state and flags
               if it is a normal entry. */

        if (flags & IP_VS_CONN_F_SEQ_MASK) {
            // copy the connection options
            opt = (struct ip_vs_sync_conn_options *)&s[1];
            memcpy(&cp->in_seq, opt, sizeof(*opt));
            // advance past the entry plus its options
            p += FULL_CONN_SIZE;
        } else
            // advance past the plain entry
            p += SIMPLE_CONN_SIZE;

        // reset the packet counter to a fixed value
        atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
        // timeout
        cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
        // drop the reference to the connection
        ip_vs_conn_put(cp);

        // make sure the buffer pointer has not run past the end
        if (p > buffer+buflen) {
            IP_VS_ERR("bogus message\n");
            return;
        }
    }
}
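A standalone sketch of the receiver's walk over one message, mirroring ip_vs_process_message()'s pointer arithmetic without the connection-table calls. The structures are re-declared locally and the flag value is the IP_VS_CONN_F_SEQ_MASK definition from ip_vs.h; treat the whole thing as an illustration under those assumptions:

/* Sketch: step through the entries of one received sync message,
 * consuming the optional TCP-sequence block when the entry's
 * flags have the seq bits set. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define IP_VS_CONN_F_SEQ_MASK 0x0600   /* in-seq | out-seq, from ip_vs.h */

struct sync_hdr  { uint8_t nr_conns, syncid; uint16_t size; };
struct sync_conn {
    uint8_t  reserved, protocol;
    uint16_t cport, vport, dport;
    uint32_t caddr, vaddr, daddr;
    uint16_t flags, state;
};
struct sync_opts { uint32_t seq[6]; };  /* in_seq + out_seq, 24 bytes */

/* Walk the entries of one message; returns 0 on success, -1 on a
 * bogus message (same two checks the kernel makes). */
static int walk_message(unsigned char *buf, size_t buflen)
{
    struct sync_hdr *m = (struct sync_hdr *)buf;
    unsigned char *p = buf + sizeof(*m);
    int i;

    if (ntohs(m->size) != buflen)
        return -1;                          /* length mismatch */
    for (i = 0; i < m->nr_conns; i++) {
        struct sync_conn *s = (struct sync_conn *)p;
        uint16_t flags = ntohs(s->flags);

        printf("conn %d: proto %u state %u\n",
               i, s->protocol, ntohs(s->state));
        p += sizeof(*s);                    /* SIMPLE_CONN_SIZE */
        if (flags & IP_VS_CONN_F_SEQ_MASK)
            p += sizeof(struct sync_opts);  /* rest of FULL_CONN_SIZE */
        if (p > buf + buflen)
            return -1;                      /* entry ran past the buffer */
    }
    return 0;
}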
10.5 Connection synchronization

The sync function ip_vs_sync_conn() is called from ip_vs_in():

    ......
    // in MASTER state
    if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
        // sync only non-TCP connections, or TCP connections that have
        // reached the ESTABLISHED state
        (cp->protocol != IPPROTO_TCP ||
         cp->state == IP_VS_TCP_S_ESTABLISHED) &&
        // sync only after a certain number of packets has accumulated,
        // not on every single packet
        (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
         == sysctl_ip_vs_sync_threshold[0]))
        ip_vs_sync_conn(cp);
    ......

/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
 */
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
    struct ip_vs_sync_mesg *m;
    struct ip_vs_sync_conn *s;
    int len;

    spin_lock(&curr_sb_lock);
    if (!curr_sb) {
        // no current buffer: allocate a new one
        if (!(curr_sb = ip_vs_sync_buff_create())) {
            spin_unlock(&curr_sb_lock);
            IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
            return;
        }
    }

    // check whether the entry length must include the options
    len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
            SIMPLE_CONN_SIZE;
    m = curr_sb->mesg;
    // the head of the free space becomes the next connection entry
    s = (struct ip_vs_sync_conn *)curr_sb->head;

    /* copy members */
    // basic connection information
    s->protocol = cp->protocol;
    s->cport = cp->cport;
    s->vport = cp->vport;
    s->dport = cp->dport;
    s->caddr = cp->caddr;
    s->vaddr = cp->vaddr;
    s->daddr = cp->daddr;
    // connection flags and state
    s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
    s->state = htons(cp->state);
    if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
        // append the options, i.e. the TCP sequence numbers
        struct ip_vs_sync_conn_options *opt =
            (struct ip_vs_sync_conn_options *)&s[1];
        memcpy(opt, &cp->in_seq, sizeof(*opt));
    }

    // one more connection in this sync buffer
    m->nr_conns++;
    // the payload grows accordingly
    m->size += len;
    // advance the free-space pointer
    curr_sb->head += len;

    /* check if there is a space for next one */
    // check whether another full entry would still fit
    if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
        // not enough room: queue the buffer on the send list
        // (the dumpling is wrapped)
        sb_queue_tail(curr_sb);
        // and reset the current-buffer pointer
        // (time to take a fresh wrapper)
        curr_sb = NULL;
    }
    spin_unlock(&curr_sb_lock);

    /* synchronize its controller if it has */
    // if the connection has a controlling (master) connection,
    // recurse to sync that one as well
    if (cp->control)
        ip_vs_sync_conn(cp->control);
}

Allocating a new sync buffer:

static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
{
    struct ip_vs_sync_buff *sb;

    // allocate the control block
    if (!(sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
        return NULL;

    // allocate the message buffer itself
    if (!(sb->mesg = kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
        kfree(sb);
        return NULL;
    }
    // no connection entries yet
    sb->mesg->nr_conns = 0;
    // synchronization ID
    sb->mesg->syncid = ip_vs_master_syncid;
    // so far the payload is just the 4-byte message header
    sb->mesg->size = 4;
    // head of the free space
    sb->head = (unsigned char *)sb->mesg + 4;
    // end of the buffer
    sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
    // creation time of the buffer
    sb->firstuse = jiffies;
    return sb;
}
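To see the effect of the modulo test in the ip_vs_in() snippet above: assuming the usual defaults sysctl_ip_vs_sync_threshold = {3, 50} (check the running sysctl; the values are tunable), a connection is first synced on its 3rd packet and then once every 50 packets. A tiny sketch:

/* Sketch of the sync-rate condition: with threshold = {3, 50} a
 * connection syncs at packets 3, 53, 103, ... The defaults here
 * are an assumption about the configuration, not a constant. */
#include <stdio.h>

int main(void)
{
    int threshold[2] = { 3, 50 };   /* sysctl_ip_vs_sync_threshold */
    int in_pkts;

    for (in_pkts = 1; in_pkts <= 120; in_pkts++)
        if (in_pkts % threshold[1] == threshold[0])
            printf("sync at packet %d\n", in_pkts);  /* 3, 53, 103 */
    return 0;
}

......to be continued......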
