net/ipv4/ipvs/ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37
38 #include <net/ip.h>
39 #include <net/route.h>
40 #include <net/sock.h>
41
42 #include <asm/uaccess.h>
43
44 #include <net/ip_vs.h>
45
46 /* semaphore for IPVS sockopts; the [gs]etsockopt handlers may sleep */
47 static DECLARE_MUTEX(__ip_vs_mutex);
48
49 /* lock for service table */
50 static DEFINE_RWLOCK(__ip_vs_svc_lock);
51
52 /* lock for table with the real services */
53 static DEFINE_RWLOCK(__ip_vs_rs_lock);
54
55 /* lock for state and timeout tables */
56 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
57
58 /* lock for drop entry handling */
59 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
60
61 /* lock for drop packet handling */
62 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
63
64 /* 1/rate drop and drop-entry variables */
65 int ip_vs_drop_rate = 0;
66 int ip_vs_drop_counter = 0;
67 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
68
69 /* number of virtual services */
70 static int ip_vs_num_services = 0;
71
72 /* sysctl variables */
73 static int sysctl_ip_vs_drop_entry = 0;
74 static int sysctl_ip_vs_drop_packet = 0;
75 static int sysctl_ip_vs_secure_tcp = 0;
76 static int sysctl_ip_vs_amemthresh = 1024;
77 static int sysctl_ip_vs_am_droprate = 10;
78 int sysctl_ip_vs_cache_bypass = 0;
79 int sysctl_ip_vs_expire_nodest_conn = 0;
80 int sysctl_ip_vs_expire_quiescent_template = 0;
81 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
82 int sysctl_ip_vs_nat_icmp_send = 0;
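/*
 * Illustrative note: the knobs above are exported through the vs_vars[]
 * ctl_table further down in this file, so (assuming sysctl and procfs
 * support are compiled in) they can be tuned from user space, e.g.:
 *
 *     echo 1    > /proc/sys/net/ipv4/vs/drop_entry
 *     echo 2048 > /proc/sys/net/ipv4/vs/amemthresh
 */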
83
84
85 #ifdef CONFIG_IP_VS_DEBUG
86 static int sysctl_ip_vs_debug_level = 0;
87
88 int ip_vs_get_debug_level(void)
89 {
90         return sysctl_ip_vs_debug_level;
91 }
92 #endif
93
94 /*
95  *      update_defense_level is called from keventd and from sysctl,
96  *      so it needs to protect itself from softirqs
97  */
98 static void update_defense_level(void)
99 {
100         struct sysinfo i;
101         static int old_secure_tcp = 0;
102         int availmem;
103         int nomem;
104         int to_change = -1;
105
106         /* we only count free and buffered memory (in pages) */
107         si_meminfo(&i);
108         availmem = i.freeram + i.bufferram;
109         /* however, in Linux 2.5 i.bufferram is the total page cache size,
110            so we would need to adjust it */
111         /* si_swapinfo(&i); */
112         /* availmem = availmem - (i.totalswap - i.freeswap); */
113
114         nomem = (availmem < sysctl_ip_vs_amemthresh);
115
116         local_bh_disable();
117
118         /* drop_entry */
119         spin_lock(&__ip_vs_dropentry_lock);
120         switch (sysctl_ip_vs_drop_entry) {
121         case 0:
122                 atomic_set(&ip_vs_dropentry, 0);
123                 break;
124         case 1:
125                 if (nomem) {
126                         atomic_set(&ip_vs_dropentry, 1);
127                         sysctl_ip_vs_drop_entry = 2;
128                 } else {
129                         atomic_set(&ip_vs_dropentry, 0);
130                 }
131                 break;
132         case 2:
133                 if (nomem) {
134                         atomic_set(&ip_vs_dropentry, 1);
135                 } else {
136                         atomic_set(&ip_vs_dropentry, 0);
137                         sysctl_ip_vs_drop_entry = 1;
138                 }
139                 break;
140         case 3:
141                 atomic_set(&ip_vs_dropentry, 1);
142                 break;
143         }
144         spin_unlock(&__ip_vs_dropentry_lock);
145
146         /* drop_packet */
147         spin_lock(&__ip_vs_droppacket_lock);
148         switch (sysctl_ip_vs_drop_packet) {
149         case 0:
150                 ip_vs_drop_rate = 0;
151                 break;
152         case 1:
153                 if (nomem) {
154                         ip_vs_drop_rate = ip_vs_drop_counter
155                                 = sysctl_ip_vs_amemthresh /
156                                 (sysctl_ip_vs_amemthresh-availmem);
157                         sysctl_ip_vs_drop_packet = 2;
158                 } else {
159                         ip_vs_drop_rate = 0;
160                 }
161                 break;
162         case 2:
163                 if (nomem) {
164                         ip_vs_drop_rate = ip_vs_drop_counter
165                                 = sysctl_ip_vs_amemthresh /
166                                 (sysctl_ip_vs_amemthresh-availmem);
167                 } else {
168                         ip_vs_drop_rate = 0;
169                         sysctl_ip_vs_drop_packet = 1;
170                 }
171                 break;
172         case 3:
173                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
174                 break;
175         }
176         spin_unlock(&__ip_vs_droppacket_lock);
177
178         /* secure_tcp */
179         write_lock(&__ip_vs_securetcp_lock);
180         switch (sysctl_ip_vs_secure_tcp) {
181         case 0:
182                 if (old_secure_tcp >= 2)
183                         to_change = 0;
184                 break;
185         case 1:
186                 if (nomem) {
187                         if (old_secure_tcp < 2)
188                                 to_change = 1;
189                         sysctl_ip_vs_secure_tcp = 2;
190                 } else {
191                         if (old_secure_tcp >= 2)
192                                 to_change = 0;
193                 }
194                 break;
195         case 2:
196                 if (nomem) {
197                         if (old_secure_tcp < 2)
198                                 to_change = 1;
199                 } else {
200                         if (old_secure_tcp >= 2)
201                                 to_change = 0;
202                         sysctl_ip_vs_secure_tcp = 1;
203                 }
204                 break;
205         case 3:
206                 if (old_secure_tcp < 2)
207                         to_change = 1;
208                 break;
209         }
210         old_secure_tcp = sysctl_ip_vs_secure_tcp;
211         if (to_change >= 0)
212                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
213         write_unlock(&__ip_vs_securetcp_lock);
214
215         local_bh_enable();
216 }
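/*
 * Summary of the defense sysctl convention used above: for drop_entry,
 * drop_packet and secure_tcp the value 0 means "never activate", 3 means
 * "always activate", and 1/2 form an automatic pair -- the value is bumped
 * from 1 to 2 while available memory is below amemthresh and dropped back
 * to 1 once memory recovers.
 */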
217
218
219 /*
220  *      Timer for checking the defense
221  */
222 #define DEFENSE_TIMER_PERIOD    1*HZ
223 static void defense_work_handler(void *data);
224 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
225
226 static void defense_work_handler(void *data)
227 {
228         update_defense_level();
229         if (atomic_read(&ip_vs_dropentry))
230                 ip_vs_random_dropentry();
231
232         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
233 }
234
235 int
236 ip_vs_use_count_inc(void)
237 {
238         return try_module_get(THIS_MODULE);
239 }
240
241 void
242 ip_vs_use_count_dec(void)
243 {
244         module_put(THIS_MODULE);
245 }
246
247
248 /*
249  *      Hash table: for virtual service lookups
250  */
251 #define IP_VS_SVC_TAB_BITS 8
252 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
253 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
254
255 /* the service table hashed by <protocol, addr, port> */
256 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
257 /* the service table hashed by fwmark */
258 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
259
260 /*
261  *      Hash table: for real service lookups
262  */
263 #define IP_VS_RTAB_BITS 4
264 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
265 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
266
267 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
268
269 /*
270  *      Trash for destinations
271  */
272 static LIST_HEAD(ip_vs_dest_trash);
273
274 /*
275  *      FTP & NULL virtual service counters
276  */
277 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
278 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
279
280
281 /*
282  *      Returns hash value for virtual service
283  */
284 static __inline__ unsigned
285 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
286 {
287         register unsigned porth = ntohs(port);
288
289         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
290                 & IP_VS_SVC_TAB_MASK;
291 }
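/*
 * Worked example (illustrative only): for TCP (protocol 6), virtual
 * address 192.168.1.1 and port 80, ntohl(addr) = 0xC0A80101 and
 * porth = 80 = 0x50, so the key is
 *     (6 ^ 0xC0A80101 ^ (0x50 >> 8) ^ 0x50) & 0xFF = 0x57, i.e. bucket 87.
 */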
292
293 /*
294  *      Returns hash value of fwmark for virtual service lookup
295  */
296 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
297 {
298         return fwmark & IP_VS_SVC_TAB_MASK;
299 }
300
301 /*
302  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
303  *      or in the ip_vs_svc_fwm_table by fwmark.
304  *      Should be called with locked tables.
305  */
306 static int ip_vs_svc_hash(struct ip_vs_service *svc)
307 {
308         unsigned hash;
309
310         if (svc->flags & IP_VS_SVC_F_HASHED) {
311                 IP_VS_ERR("ip_vs_svc_hash(): request to hash an already hashed service, "
312                           "called from %p\n", __builtin_return_address(0));
313                 return 0;
314         }
315
316         if (svc->fwmark == 0) {
317                 /*
318                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
319                  */
320                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
321                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
322         } else {
323                 /*
324                  *  Hash it by fwmark in ip_vs_svc_fwm_table
325                  */
326                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
327                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
328         }
329
330         svc->flags |= IP_VS_SVC_F_HASHED;
331         /* increase its refcnt because it is referenced by the svc table */
332         atomic_inc(&svc->refcnt);
333         return 1;
334 }
335
336
337 /*
338  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
339  *      Should be called with locked tables.
340  */
341 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
342 {
343         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
344                 IP_VS_ERR("ip_vs_svc_unhash(): request to unhash an unhashed service, "
345                           "called from %p\n", __builtin_return_address(0));
346                 return 0;
347         }
348
349         if (svc->fwmark == 0) {
350                 /* Remove it from the ip_vs_svc_table table */
351                 list_del(&svc->s_list);
352         } else {
353                 /* Remove it from the ip_vs_svc_fwm_table table */
354                 list_del(&svc->f_list);
355         }
356
357         svc->flags &= ~IP_VS_SVC_F_HASHED;
358         atomic_dec(&svc->refcnt);
359         return 1;
360 }
361
362
363 /*
364  *      Get service by {proto,addr,port} in the service table.
365  */
366 static __inline__ struct ip_vs_service *
367 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
368 {
369         unsigned hash;
370         struct ip_vs_service *svc;
371
372         /* Check for "full" addressed entries */
373         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
374
375         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
376                 if ((svc->addr == vaddr)
377                     && (svc->port == vport)
378                     && (svc->protocol == protocol)) {
379                         /* HIT */
380                         atomic_inc(&svc->usecnt);
381                         return svc;
382                 }
383         }
384
385         return NULL;
386 }
387
388
389 /*
390  *      Get service by {fwmark} in the service table.
391  */
392 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
393 {
394         unsigned hash;
395         struct ip_vs_service *svc;
396
397         /* Check for fwmark addressed entries */
398         hash = ip_vs_svc_fwm_hashkey(fwmark);
399
400         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
401                 if (svc->fwmark == fwmark) {
402                         /* HIT */
403                         atomic_inc(&svc->usecnt);
404                         return svc;
405                 }
406         }
407
408         return NULL;
409 }
410
411 struct ip_vs_service *
412 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
413 {
414         struct ip_vs_service *svc;
415
416         read_lock(&__ip_vs_svc_lock);
417
418         /*
419          *      Check the table hashed by fwmark first
420          */
421         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
422                 goto out;
423
424         /*
425          *      Check the table hashed by <protocol,addr,port>
426          *      for "full" addressed entries
427          */
428         svc = __ip_vs_service_get(protocol, vaddr, vport);
429
430         if (svc == NULL
431             && protocol == IPPROTO_TCP
432             && atomic_read(&ip_vs_ftpsvc_counter)
433             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
434                 /*
435                  * Check if ftp service entry exists, the packet
436                  * might belong to FTP data connections.
437                  */
438                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
439         }
440
441         if (svc == NULL
442             && atomic_read(&ip_vs_nullsvc_counter)) {
443                 /*
444                  * Check if the catch-all port (port zero) exists
445                  */
446                 svc = __ip_vs_service_get(protocol, vaddr, 0);
447         }
448
449   out:
450         read_unlock(&__ip_vs_svc_lock);
451
452         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
453                   fwmark, ip_vs_proto_name(protocol),
454                   NIPQUAD(vaddr), ntohs(vport),
455                   svc?"hit":"not hit");
456
457         return svc;
458 }
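/*
 * Note on lookup order in ip_vs_service_get(): the fwmark table is tried
 * first (when fwmark is non-zero), then the exact <protocol,addr,port>
 * entry; failing that, TCP packets on FTPDATA or non-privileged ports
 * fall back to an FTP service on FTPPORT (if any FTP services exist),
 * and finally the port-zero catch-all service is tried (if any exist).
 */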
459
460
461 static inline void
462 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
463 {
464         atomic_inc(&svc->refcnt);
465         dest->svc = svc;
466 }
467
468 static inline void
469 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
470 {
471         struct ip_vs_service *svc = dest->svc;
472
473         dest->svc = NULL;
474         if (atomic_dec_and_test(&svc->refcnt))
475                 kfree(svc);
476 }
477
478
479 /*
480  *      Returns hash value for real service
481  */
482 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
483 {
484         register unsigned porth = ntohs(port);
485
486         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
487                 & IP_VS_RTAB_MASK;
488 }
489
490 /*
491  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
492  *      should be called with locked tables.
493  */
494 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
495 {
496         unsigned hash;
497
498         if (!list_empty(&dest->d_list)) {
499                 return 0;
500         }
501
502         /*
503          *      Hash by proto,addr,port,
504          *      which are the parameters of the real service.
505          */
506         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
507         list_add(&dest->d_list, &ip_vs_rtable[hash]);
508
509         return 1;
510 }
511
512 /*
513  *      UNhashes ip_vs_dest from ip_vs_rtable.
514  *      should be called with locked tables.
515  */
516 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
517 {
518         /*
519          * Remove it from the ip_vs_rtable table.
520          */
521         if (!list_empty(&dest->d_list)) {
522                 list_del(&dest->d_list);
523                 INIT_LIST_HEAD(&dest->d_list);
524         }
525
526         return 1;
527 }
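/*
 * Note: dest->d_list doubles as the "is it hashed in ip_vs_rtable?" flag:
 * ip_vs_rs_hash() refuses to hash when the list entry is non-empty, and
 * ip_vs_rs_unhash() re-initialises it after unlinking, so both calls are
 * safe to repeat.
 */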
528
529 /*
530  *      Lookup real service by <proto,addr,port> in the real service table.
531  */
532 struct ip_vs_dest *
533 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
534 {
535         unsigned hash;
536         struct ip_vs_dest *dest;
537
538         /*
539          *      Check for "full" addressed entries
540          *      Return the first found entry
541          */
542         hash = ip_vs_rs_hashkey(daddr, dport);
543
544         read_lock(&__ip_vs_rs_lock);
545         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
546                 if ((dest->addr == daddr)
547                     && (dest->port == dport)
548                     && ((dest->protocol == protocol) ||
549                         dest->vfwmark)) {
550                         /* HIT */
551                         read_unlock(&__ip_vs_rs_lock);
552                         return dest;
553                 }
554         }
555         read_unlock(&__ip_vs_rs_lock);
556
557         return NULL;
558 }
559
560 /*
561  *      Lookup destination by {addr,port} in the given service
562  */
563 static struct ip_vs_dest *
564 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
565 {
566         struct ip_vs_dest *dest;
567
568         /*
569          * Find the destination for the given service
570          */
571         list_for_each_entry(dest, &svc->destinations, n_list) {
572                 if ((dest->addr == daddr) && (dest->port == dport)) {
573                         /* HIT */
574                         return dest;
575                 }
576         }
577
578         return NULL;
579 }
580
581
582 /*
583  *  Lookup dest by {svc,addr,port} in the destination trash.
584  *  The destination trash is used to hold the destinations that are removed
585  *  from the service table but are still referenced by some conn entries.
586  *  The reason for the destination trash is that when a dest is temporarily
587  *  taken down (either by the administrator or by a monitor program), it can be
588  *  picked back from the trash, the remaining connections to the dest can
589  *  continue, and the counting information of the dest is also useful for
590  *  scheduling.
591  */
592 static struct ip_vs_dest *
593 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
594 {
595         struct ip_vs_dest *dest, *nxt;
596
597         /*
598          * Find the destination in trash
599          */
600         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
601                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
602                           "dest->refcnt=%d\n",
603                           dest->vfwmark,
604                           NIPQUAD(dest->addr), ntohs(dest->port),
605                           atomic_read(&dest->refcnt));
606                 if (dest->addr == daddr &&
607                     dest->port == dport &&
608                     dest->vfwmark == svc->fwmark &&
609                     dest->protocol == svc->protocol &&
610                     (svc->fwmark ||
611                      (dest->vaddr == svc->addr &&
612                       dest->vport == svc->port))) {
613                         /* HIT */
614                         return dest;
615                 }
616
617                 /*
618                  * Try to purge the destination from trash if not referenced
619                  */
620                 if (atomic_read(&dest->refcnt) == 1) {
621                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
622                                   "from trash\n",
623                                   dest->vfwmark,
624                                   NIPQUAD(dest->addr), ntohs(dest->port));
625                         list_del(&dest->n_list);
626                         ip_vs_dst_reset(dest);
627                         __ip_vs_unbind_svc(dest);
628                         kfree(dest);
629                 }
630         }
631
632         return NULL;
633 }
634
635
636 /*
637  *  Clean up all the destinations in the trash
638  *  Called by the ip_vs_control_cleanup()
639  *
640  *  When ip_vs_control_cleanup is invoked at ipvs module exit,
641  *  the service tables must have been flushed and all the connections
642  *  are expired, and the refcnt of each destination in the trash must
643  *  be 1, so we simply release them here.
644  */
645 static void ip_vs_trash_cleanup(void)
646 {
647         struct ip_vs_dest *dest, *nxt;
648
649         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
650                 list_del(&dest->n_list);
651                 ip_vs_dst_reset(dest);
652                 __ip_vs_unbind_svc(dest);
653                 kfree(dest);
654         }
655 }
656
657
658 static void
659 ip_vs_zero_stats(struct ip_vs_stats *stats)
660 {
661         spin_lock_bh(&stats->lock);
662         memset(stats, 0, (char *)&stats->lock - (char *)stats);
663         spin_unlock_bh(&stats->lock);
664         ip_vs_zero_estimator(stats);
665 }
666
667 /*
668  *      Update a destination in the given service
669  */
670 static void
671 __ip_vs_update_dest(struct ip_vs_service *svc,
672                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
673 {
674         int conn_flags;
675
676         /* set the weight and the flags */
677         atomic_set(&dest->weight, udest->weight);
678         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
679
680         /* check if local node and update the flags */
681         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
682                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
683                         | IP_VS_CONN_F_LOCALNODE;
684         }
685
686         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
687         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
688                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
689         } else {
690                 /*
691                  *    Put the real service in ip_vs_rtable if not present.
692                  *    For now only for NAT!
693                  */
694                 write_lock_bh(&__ip_vs_rs_lock);
695                 ip_vs_rs_hash(dest);
696                 write_unlock_bh(&__ip_vs_rs_lock);
697         }
698         atomic_set(&dest->conn_flags, conn_flags);
699
700         /* bind the service */
701         if (!dest->svc) {
702                 __ip_vs_bind_svc(dest, svc);
703         } else {
704                 if (dest->svc != svc) {
705                         __ip_vs_unbind_svc(dest);
706                         ip_vs_zero_stats(&dest->stats);
707                         __ip_vs_bind_svc(dest, svc);
708                 }
709         }
710
711         /* set the dest status flags */
712         dest->flags |= IP_VS_DEST_F_AVAILABLE;
713
714         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
715                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
716         dest->u_threshold = udest->u_threshold;
717         dest->l_threshold = udest->l_threshold;
718 }
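/*
 * Note: only destinations using masquerading/NAT (forwarding-method bits
 * zero) are hashed into ip_vs_rtable above; tunnel, direct-routing and
 * local-node destinations instead get IP_VS_CONN_F_NOOUTPUT set.  A
 * destination whose address is local to this host is forced to the
 * LOCALNODE forwarding method beforehand.
 */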
719
720
721 /*
722  *      Create a destination for the given service
723  */
724 static int
725 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
726                struct ip_vs_dest **dest_p)
727 {
728         struct ip_vs_dest *dest;
729         unsigned atype;
730
731         EnterFunction(2);
732
733         atype = inet_addr_type(udest->addr);
734         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
735                 return -EINVAL;
736
737         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
738         if (dest == NULL) {
739                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
740                 return -ENOMEM;
741         }
742         memset(dest, 0, sizeof(struct ip_vs_dest));
743
744         dest->protocol = svc->protocol;
745         dest->vaddr = svc->addr;
746         dest->vport = svc->port;
747         dest->vfwmark = svc->fwmark;
748         dest->addr = udest->addr;
749         dest->port = udest->port;
750
751         atomic_set(&dest->activeconns, 0);
752         atomic_set(&dest->inactconns, 0);
753         atomic_set(&dest->persistconns, 0);
754         atomic_set(&dest->refcnt, 0);
755
756         INIT_LIST_HEAD(&dest->d_list);
757         spin_lock_init(&dest->dst_lock);
758         spin_lock_init(&dest->stats.lock);
759         __ip_vs_update_dest(svc, dest, udest);
760         ip_vs_new_estimator(&dest->stats);
761
762         *dest_p = dest;
763
764         LeaveFunction(2);
765         return 0;
766 }
767
768
769 /*
770  *      Add a destination into an existing service
771  */
772 static int
773 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
774 {
775         struct ip_vs_dest *dest;
776         __u32 daddr = udest->addr;
777         __u16 dport = udest->port;
778         int ret;
779
780         EnterFunction(2);
781
782         if (udest->weight < 0) {
783                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
784                 return -ERANGE;
785         }
786
787         if (udest->l_threshold > udest->u_threshold) {
788                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
789                           "upper threshold\n");
790                 return -ERANGE;
791         }
792
793         /*
794          * Check if the dest already exists in the list
795          */
796         dest = ip_vs_lookup_dest(svc, daddr, dport);
797         if (dest != NULL) {
798                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
799                 return -EEXIST;
800         }
801
802         /*
803          * Check if the dest already exists in the trash and
804          * is from the same service
805          */
806         dest = ip_vs_trash_get_dest(svc, daddr, dport);
807         if (dest != NULL) {
808                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
809                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
810                           NIPQUAD(daddr), ntohs(dport),
811                           atomic_read(&dest->refcnt),
812                           dest->vfwmark,
813                           NIPQUAD(dest->vaddr),
814                           ntohs(dest->vport));
815                 __ip_vs_update_dest(svc, dest, udest);
816
817                 /*
818                  * Get the destination from the trash
819                  */
820                 list_del(&dest->n_list);
821
822                 ip_vs_new_estimator(&dest->stats);
823
824                 write_lock_bh(&__ip_vs_svc_lock);
825
826                 /*
827                  * Wait until all other svc users go away.
828                  */
829                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
830
831                 list_add(&dest->n_list, &svc->destinations);
832                 svc->num_dests++;
833
834                 /* call the update_service function of its scheduler */
835                 svc->scheduler->update_service(svc);
836
837                 write_unlock_bh(&__ip_vs_svc_lock);
838                 return 0;
839         }
840
841         /*
842          * Allocate and initialize the dest structure
843          */
844         ret = ip_vs_new_dest(svc, udest, &dest);
845         if (ret) {
846                 return ret;
847         }
848
849         /*
850          * Add the dest entry into the list
851          */
852         atomic_inc(&dest->refcnt);
853
854         write_lock_bh(&__ip_vs_svc_lock);
855
856         /*
857          * Wait until all other svc users go away.
858          */
859         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
860
861         list_add(&dest->n_list, &svc->destinations);
862         svc->num_dests++;
863
864         /* call the update_service function of its scheduler */
865         svc->scheduler->update_service(svc);
866
867         write_unlock_bh(&__ip_vs_svc_lock);
868
869         LeaveFunction(2);
870
871         return 0;
872 }
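/*
 * Note: ip_vs_add_dest() has two paths -- either a matching destination is
 * resurrected from the trash (keeping its counters and any connections that
 * still reference it) or a fresh one is allocated via ip_vs_new_dest().  In
 * both cases the destination is linked into svc->destinations only after
 * every other user of the service has gone away, and the scheduler's
 * update_service() hook is then called under the same write lock.
 */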
873
874
875 /*
876  *      Edit a destination in the given service
877  */
878 static int
879 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
880 {
881         struct ip_vs_dest *dest;
882         __u32 daddr = udest->addr;
883         __u16 dport = udest->port;
884
885         EnterFunction(2);
886
887         if (udest->weight < 0) {
888                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
889                 return -ERANGE;
890         }
891
892         if (udest->l_threshold > udest->u_threshold) {
893                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
894                           "upper threshold\n");
895                 return -ERANGE;
896         }
897
898         /*
899          *  Lookup the destination list
900          */
901         dest = ip_vs_lookup_dest(svc, daddr, dport);
902         if (dest == NULL) {
903                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
904                 return -ENOENT;
905         }
906
907         __ip_vs_update_dest(svc, dest, udest);
908
909         write_lock_bh(&__ip_vs_svc_lock);
910
911         /* Wait until all other svc users go away */
912         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
913
914         /* call the update_service, because server weight may be changed */
915         svc->scheduler->update_service(svc);
916
917         write_unlock_bh(&__ip_vs_svc_lock);
918
919         LeaveFunction(2);
920
921         return 0;
922 }
923
924
925 /*
926  *      Delete a destination (must be already unlinked from the service)
927  */
928 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
929 {
930         ip_vs_kill_estimator(&dest->stats);
931
932         /*
933          *  Remove it from the d-linked list with the real services.
934          */
935         write_lock_bh(&__ip_vs_rs_lock);
936         ip_vs_rs_unhash(dest);
937         write_unlock_bh(&__ip_vs_rs_lock);
938
939         /*
940          *  Decrease the refcnt of the dest, and free the dest
941          *  if nobody refers to it (refcnt=0). Otherwise, throw
942          *  the destination into the trash.
943          */
944         if (atomic_dec_and_test(&dest->refcnt)) {
945                 ip_vs_dst_reset(dest);
946                 /* simply decrease svc->refcnt here, let the caller check
947                    and release the service if nobody refers to it.
948                    Only user context can release destination and service,
949                    and only one user context can update virtual service at a
950                    time, so the operation here is OK */
951                 atomic_dec(&dest->svc->refcnt);
952                 kfree(dest);
953         } else {
954                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
955                           "dest->refcnt=%d\n",
956                           NIPQUAD(dest->addr), ntohs(dest->port),
957                           atomic_read(&dest->refcnt));
958                 list_add(&dest->n_list, &ip_vs_dest_trash);
959                 atomic_inc(&dest->refcnt);
960         }
961 }
962
963
964 /*
965  *      Unlink a destination from the given service
966  */
967 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
968                                 struct ip_vs_dest *dest,
969                                 int svcupd)
970 {
971         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
972
973         /*
974          *  Remove it from the d-linked destination list.
975          */
976         list_del(&dest->n_list);
977         svc->num_dests--;
978         if (svcupd) {
979                 /*
980                  *  Call the update_service function of its scheduler
981                  */
982                 svc->scheduler->update_service(svc);
983         }
984 }
985
986
987 /*
988  *      Delete a destination server in the given service
989  */
990 static int
991 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
992 {
993         struct ip_vs_dest *dest;
994         __u32 daddr = udest->addr;
995         __u16 dport = udest->port;
996
997         EnterFunction(2);
998
999         dest = ip_vs_lookup_dest(svc, daddr, dport);
1000         if (dest == NULL) {
1001                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1002                 return -ENOENT;
1003         }
1004
1005         write_lock_bh(&__ip_vs_svc_lock);
1006
1007         /*
1008          *      Wait until all other svc users go away.
1009          */
1010         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1011
1012         /*
1013          *      Unlink dest from the service
1014          */
1015         __ip_vs_unlink_dest(svc, dest, 1);
1016
1017         write_unlock_bh(&__ip_vs_svc_lock);
1018
1019         /*
1020          *      Delete the destination
1021          */
1022         __ip_vs_del_dest(dest);
1023
1024         LeaveFunction(2);
1025
1026         return 0;
1027 }
1028
1029
1030 /*
1031  *      Add a service into the service hash table
1032  */
1033 static int
1034 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1035 {
1036         int ret = 0;
1037         struct ip_vs_scheduler *sched = NULL;
1038         struct ip_vs_service *svc = NULL;
1039
1040         /* increase the module use count */
1041         ip_vs_use_count_inc();
1042
1043         /* Lookup the scheduler by 'u->sched_name' */
1044         sched = ip_vs_scheduler_get(u->sched_name);
1045         if (sched == NULL) {
1046                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1047                            u->sched_name);
1048                 ret = -ENOENT;
1049                 goto out_mod_dec;
1050         }
1051
1052         svc = (struct ip_vs_service *)
1053                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1054         if (svc == NULL) {
1055                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1056                 ret = -ENOMEM;
1057                 goto out_err;
1058         }
1059         memset(svc, 0, sizeof(struct ip_vs_service));
1060
1061         /* I'm the first user of the service */
1062         atomic_set(&svc->usecnt, 1);
1063         atomic_set(&svc->refcnt, 0);
1064
1065         svc->protocol = u->protocol;
1066         svc->addr = u->addr;
1067         svc->port = u->port;
1068         svc->fwmark = u->fwmark;
1069         svc->flags = u->flags;
1070         svc->timeout = u->timeout * HZ;
1071         svc->netmask = u->netmask;
1072
1073         INIT_LIST_HEAD(&svc->destinations);
1074         rwlock_init(&svc->sched_lock);
1075         spin_lock_init(&svc->stats.lock);
1076
1077         /* Bind the scheduler */
1078         ret = ip_vs_bind_scheduler(svc, sched);
1079         if (ret)
1080                 goto out_err;
1081         sched = NULL;
1082
1083         /* Update the virtual service counters */
1084         if (svc->port == FTPPORT)
1085                 atomic_inc(&ip_vs_ftpsvc_counter);
1086         else if (svc->port == 0)
1087                 atomic_inc(&ip_vs_nullsvc_counter);
1088
1089         ip_vs_new_estimator(&svc->stats);
1090         ip_vs_num_services++;
1091
1092         /* Hash the service into the service table */
1093         write_lock_bh(&__ip_vs_svc_lock);
1094         ip_vs_svc_hash(svc);
1095         write_unlock_bh(&__ip_vs_svc_lock);
1096
1097         *svc_p = svc;
1098         return 0;
1099
1100   out_err:
1101         if (svc != NULL) {
1102                 if (svc->scheduler)
1103                         ip_vs_unbind_scheduler(svc);
1104                 if (svc->inc) {
1105                         local_bh_disable();
1106                         ip_vs_app_inc_put(svc->inc);
1107                         local_bh_enable();
1108                 }
1109                 kfree(svc);
1110         }
1111         ip_vs_scheduler_put(sched);
1112
1113   out_mod_dec:
1114         /* decrease the module use count */
1115         ip_vs_use_count_dec();
1116
1117         return ret;
1118 }
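/*
 * Typical usage (illustrative, assuming the standard ipvsadm tool and the
 * sockopt interface handled elsewhere in this file): a command such as
 *
 *     ipvsadm -A -t 10.0.0.1:80 -s rr
 *
 * would reach this function with u->protocol = IPPROTO_TCP, u->addr and
 * u->port set to the virtual address and port, and u->sched_name = "rr".
 */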
1119
1120
1121 /*
1122  *      Edit a service and bind it with a new scheduler
1123  */
1124 static int
1125 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1126 {
1127         struct ip_vs_scheduler *sched, *old_sched;
1128         int ret = 0;
1129
1130         /*
1131          * Lookup the scheduler, by 'u->sched_name'
1132          */
1133         sched = ip_vs_scheduler_get(u->sched_name);
1134         if (sched == NULL) {
1135                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1136                            u->sched_name);
1137                 return -ENOENT;
1138         }
1139         old_sched = sched;
1140
1141         write_lock_bh(&__ip_vs_svc_lock);
1142
1143         /*
1144          * Wait until all other svc users go away.
1145          */
1146         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1147
1148         /*
1149          * Set the flags and timeout value
1150          */
1151         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1152         svc->timeout = u->timeout * HZ;
1153         svc->netmask = u->netmask;
1154
1155         old_sched = svc->scheduler;
1156         if (sched != old_sched) {
1157                 /*
1158                  * Unbind the old scheduler
1159                  */
1160                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1161                         old_sched = sched;
1162                         goto out;
1163                 }
1164
1165                 /*
1166                  * Bind the new scheduler
1167                  */
1168                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1169                         /*
1170                          * If ip_vs_bind_scheduler fails, restore the old
1171                          * scheduler.
1172                          * The main reason for failure is lack of memory.
1173                          *
1174                          * The question is whether the old scheduler can
1175                          * always be restored. TODO: if it cannot be
1176                          * restored in some case, we must delete the service,
1177                          * otherwise the system may crash.
1178                          */
1179                         ip_vs_bind_scheduler(svc, old_sched);
1180                         old_sched = sched;
1181                         goto out;
1182                 }
1183         }
1184
1185   out:
1186         write_unlock_bh(&__ip_vs_svc_lock);
1187
1188         if (old_sched)
1189                 ip_vs_scheduler_put(old_sched);
1190
1191         return ret;
1192 }
1193
1194
1195 /*
1196  *      Delete a service from the service list
1197  *      - The service must be unlinked, unlocked and not referenced!
1198  *      - We are called under _bh lock
1199  */
1200 static void __ip_vs_del_service(struct ip_vs_service *svc)
1201 {
1202         struct ip_vs_dest *dest, *nxt;
1203         struct ip_vs_scheduler *old_sched;
1204
1205         ip_vs_num_services--;
1206         ip_vs_kill_estimator(&svc->stats);
1207
1208         /* Unbind scheduler */
1209         old_sched = svc->scheduler;
1210         ip_vs_unbind_scheduler(svc);
1211         if (old_sched)
1212                 ip_vs_scheduler_put(old_sched);
1213
1214         /* Unbind app inc */
1215         if (svc->inc) {
1216                 ip_vs_app_inc_put(svc->inc);
1217                 svc->inc = NULL;
1218         }
1219
1220         /*
1221          *    Unlink the whole destination list
1222          */
1223         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1224                 __ip_vs_unlink_dest(svc, dest, 0);
1225                 __ip_vs_del_dest(dest);
1226         }
1227
1228         /*
1229          *    Update the virtual service counters
1230          */
1231         if (svc->port == FTPPORT)
1232                 atomic_dec(&ip_vs_ftpsvc_counter);
1233         else if (svc->port == 0)
1234                 atomic_dec(&ip_vs_nullsvc_counter);
1235
1236         /*
1237          *    Free the service if nobody refers to it
1238          */
1239         if (atomic_read(&svc->refcnt) == 0)
1240                 kfree(svc);
1241
1242         /* decrease the module use count */
1243         ip_vs_use_count_dec();
1244 }
1245
1246 /*
1247  *      Delete a service from the service list
1248  */
1249 static int ip_vs_del_service(struct ip_vs_service *svc)
1250 {
1251         if (svc == NULL)
1252                 return -EEXIST;
1253
1254         /*
1255          * Unhash it from the service table
1256          */
1257         write_lock_bh(&__ip_vs_svc_lock);
1258
1259         ip_vs_svc_unhash(svc);
1260
1261         /*
1262          * Wait until all the svc users go away.
1263          */
1264         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1265
1266         __ip_vs_del_service(svc);
1267
1268         write_unlock_bh(&__ip_vs_svc_lock);
1269
1270         return 0;
1271 }
1272
1273
1274 /*
1275  *      Flush all the virtual services
1276  */
1277 static int ip_vs_flush(void)
1278 {
1279         int idx;
1280         struct ip_vs_service *svc, *nxt;
1281
1282         /*
1283          * Flush the service table hashed by <protocol,addr,port>
1284          */
1285         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1286                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1287                         write_lock_bh(&__ip_vs_svc_lock);
1288                         ip_vs_svc_unhash(svc);
1289                         /*
1290                          * Wait until all the svc users go away.
1291                          */
1292                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1293                         __ip_vs_del_service(svc);
1294                         write_unlock_bh(&__ip_vs_svc_lock);
1295                 }
1296         }
1297
1298         /*
1299          * Flush the service table hashed by fwmark
1300          */
1301         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1302                 list_for_each_entry_safe(svc, nxt,
1303                                          &ip_vs_svc_fwm_table[idx], f_list) {
1304                         write_lock_bh(&__ip_vs_svc_lock);
1305                         ip_vs_svc_unhash(svc);
1306                         /*
1307                          * Wait until all the svc users go away.
1308                          */
1309                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1310                         __ip_vs_del_service(svc);
1311                         write_unlock_bh(&__ip_vs_svc_lock);
1312                 }
1313         }
1314
1315         return 0;
1316 }
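/*
 * Note: unlike ip_vs_del_service(), which waits for usecnt > 1 (presumably
 * because its caller holds a usecnt reference of its own), the flush path
 * walks the tables directly without taking a reference, so it waits for
 * usecnt to drop all the way to 0 before deleting each service.
 */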
1317
1318
1319 /*
1320  *      Zero counters in a service or all services
1321  */
1322 static int ip_vs_zero_service(struct ip_vs_service *svc)
1323 {
1324         struct ip_vs_dest *dest;
1325
1326         write_lock_bh(&__ip_vs_svc_lock);
1327         list_for_each_entry(dest, &svc->destinations, n_list) {
1328                 ip_vs_zero_stats(&dest->stats);
1329         }
1330         ip_vs_zero_stats(&svc->stats);
1331         write_unlock_bh(&__ip_vs_svc_lock);
1332         return 0;
1333 }
1334
1335 static int ip_vs_zero_all(void)
1336 {
1337         int idx;
1338         struct ip_vs_service *svc;
1339
1340         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1341                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1342                         ip_vs_zero_service(svc);
1343                 }
1344         }
1345
1346         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1347                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1348                         ip_vs_zero_service(svc);
1349                 }
1350         }
1351
1352         ip_vs_zero_stats(&ip_vs_stats);
1353         return 0;
1354 }
1355
1356
1357 static int
1358 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1359                      void __user *buffer, size_t *lenp, loff_t *ppos)
1360 {
1361         int *valp = table->data;
1362         int val = *valp;
1363         int rc;
1364
1365         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1366         if (write && (*valp != val)) {
1367                 if ((*valp < 0) || (*valp > 3)) {
1368                         /* Restore the correct value */
1369                         *valp = val;
1370                 } else {
1371                         update_defense_level();
1372                 }
1373         }
1374         return rc;
1375 }
1376
1377
1378 static int
1379 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1380                        void __user *buffer, size_t *lenp, loff_t *ppos)
1381 {
1382         int *valp = table->data;
1383         int val[2];
1384         int rc;
1385
1386         /* backup the value first */
1387         memcpy(val, valp, sizeof(val));
1388
1389         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1390         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1391                 /* Restore the correct value */
1392                 memcpy(valp, val, sizeof(val));
1393         }
1394         return rc;
1395 }
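/*
 * Both handlers above follow the same pattern: let proc_dointvec() perform
 * the write first and then validate, restoring the previous value(s) when
 * the result is out of range (0..3 for the defense modes; for
 * sync_threshold the first value must be non-negative and strictly smaller
 * than the second).
 */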
1396
1397
1398 /*
1399  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1400  */
1401
1402 static struct ctl_table vs_vars[] = {
1403         {
1404                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1405                 .procname       = "amemthresh",
1406                 .data           = &sysctl_ip_vs_amemthresh,
1407                 .maxlen         = sizeof(int),
1408                 .mode           = 0644,
1409                 .proc_handler   = &proc_dointvec,
1410         },
1411 #ifdef CONFIG_IP_VS_DEBUG
1412         {
1413                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1414                 .procname       = "debug_level",
1415                 .data           = &sysctl_ip_vs_debug_level,
1416                 .maxlen         = sizeof(int),
1417                 .mode           = 0644,
1418                 .proc_handler   = &proc_dointvec,
1419         },
1420 #endif
1421         {
1422                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1423                 .procname       = "am_droprate",
1424                 .data           = &sysctl_ip_vs_am_droprate,
1425                 .maxlen         = sizeof(int),
1426                 .mode           = 0644,
1427                 .proc_handler   = &proc_dointvec,
1428         },
1429         {
1430                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1431                 .procname       = "drop_entry",
1432                 .data           = &sysctl_ip_vs_drop_entry,
1433                 .maxlen         = sizeof(int),
1434                 .mode           = 0644,
1435                 .proc_handler   = &proc_do_defense_mode,
1436         },
1437         {
1438                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1439                 .procname       = "drop_packet",
1440                 .data           = &sysctl_ip_vs_drop_packet,
1441                 .maxlen         = sizeof(int),
1442                 .mode           = 0644,
1443                 .proc_handler   = &proc_do_defense_mode,
1444         },
1445         {
1446                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1447                 .procname       = "secure_tcp",
1448                 .data           = &sysctl_ip_vs_secure_tcp,
1449                 .maxlen         = sizeof(int),
1450                 .mode           = 0644,
1451                 .proc_handler   = &proc_do_defense_mode,
1452         },
1453 #if 0
1454         {
1455                 .ctl_name       = NET_IPV4_VS_TO_ES,
1456                 .procname       = "timeout_established",
1457                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1458                 .maxlen         = sizeof(int),
1459                 .mode           = 0644,
1460                 .proc_handler   = &proc_dointvec_jiffies,
1461         },
1462         {
1463                 .ctl_name       = NET_IPV4_VS_TO_SS,
1464                 .procname       = "timeout_synsent",
1465                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1466                 .maxlen         = sizeof(int),
1467                 .mode           = 0644,
1468                 .proc_handler   = &proc_dointvec_jiffies,
1469         },
1470         {
1471                 .ctl_name       = NET_IPV4_VS_TO_SR,
1472                 .procname       = "timeout_synrecv",
1473                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1474                 .maxlen         = sizeof(int),
1475                 .mode           = 0644,
1476                 .proc_handler   = &proc_dointvec_jiffies,
1477         },
1478         {
1479                 .ctl_name       = NET_IPV4_VS_TO_FW,
1480                 .procname       = "timeout_finwait",
1481                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1482                 .maxlen         = sizeof(int),
1483                 .mode           = 0644,
1484                 .proc_handler   = &proc_dointvec_jiffies,
1485         },
1486         {
1487                 .ctl_name       = NET_IPV4_VS_TO_TW,
1488                 .procname       = "timeout_timewait",
1489                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1490                 .maxlen         = sizeof(int),
1491                 .mode           = 0644,
1492                 .proc_handler   = &proc_dointvec_jiffies,
1493         },
1494         {
1495                 .ctl_name       = NET_IPV4_VS_TO_CL,
1496                 .procname       = "timeout_close",
1497                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1498                 .maxlen         = sizeof(int),
1499                 .mode           = 0644,
1500                 .proc_handler   = &proc_dointvec_jiffies,
1501         },
1502         {
1503                 .ctl_name       = NET_IPV4_VS_TO_CW,
1504                 .procname       = "timeout_closewait",
1505                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1506                 .maxlen         = sizeof(int),
1507                 .mode           = 0644,
1508                 .proc_handler   = &proc_dointvec_jiffies,
1509         },
1510         {
1511                 .ctl_name       = NET_IPV4_VS_TO_LA,
1512                 .procname       = "timeout_lastack",
1513                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1514                 .maxlen         = sizeof(int),
1515                 .mode           = 0644,
1516                 .proc_handler   = &proc_dointvec_jiffies,
1517         },
1518         {
1519                 .ctl_name       = NET_IPV4_VS_TO_LI,
1520                 .procname       = "timeout_listen",
1521                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1522                 .maxlen         = sizeof(int),
1523                 .mode           = 0644,
1524                 .proc_handler   = &proc_dointvec_jiffies,
1525         },
1526         {
1527                 .ctl_name       = NET_IPV4_VS_TO_SA,
1528                 .procname       = "timeout_synack",
1529                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1530                 .maxlen         = sizeof(int),
1531                 .mode           = 0644,
1532                 .proc_handler   = &proc_dointvec_jiffies,
1533         },
1534         {
1535                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1536                 .procname       = "timeout_udp",
1537                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1538                 .maxlen         = sizeof(int),
1539                 .mode           = 0644,
1540                 .proc_handler   = &proc_dointvec_jiffies,
1541         },
1542         {
1543                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1544                 .procname       = "timeout_icmp",
1545                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1546                 .maxlen         = sizeof(int),
1547                 .mode           = 0644,
1548                 .proc_handler   = &proc_dointvec_jiffies,
1549         },
1550 #endif
1551         {
1552                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1553                 .procname       = "cache_bypass",
1554                 .data           = &sysctl_ip_vs_cache_bypass,
1555                 .maxlen         = sizeof(int),
1556                 .mode           = 0644,
1557                 .proc_handler   = &proc_dointvec,
1558         },
1559         {
1560                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1561                 .procname       = "expire_nodest_conn",
1562                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1563                 .maxlen         = sizeof(int),
1564                 .mode           = 0644,
1565                 .proc_handler   = &proc_dointvec,
1566         },
1567         {
1568                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1569                 .procname       = "expire_quiescent_template",
1570                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1571                 .maxlen         = sizeof(int),
1572                 .mode           = 0644,
1573                 .proc_handler   = &proc_dointvec,
1574         },
1575         {
1576                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1577                 .procname       = "sync_threshold",
1578                 .data           = &sysctl_ip_vs_sync_threshold,
1579                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1580                 .mode           = 0644,
1581                 .proc_handler   = &proc_do_sync_threshold,
1582         },
1583         {
1584                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1585                 .procname       = "nat_icmp_send",
1586                 .data           = &sysctl_ip_vs_nat_icmp_send,
1587                 .maxlen         = sizeof(int),
1588                 .mode           = 0644,
1589                 .proc_handler   = &proc_dointvec,
1590         },
1591         { .ctl_name = 0 }
1592 };
1593
1594 static ctl_table vs_table[] = {
1595         {
1596                 .ctl_name       = NET_IPV4_VS,
1597                 .procname       = "vs",
1598                 .mode           = 0555,
1599                 .child          = vs_vars
1600         },
1601         { .ctl_name = 0 }
1602 };
1603
1604 static ctl_table ipvs_ipv4_table[] = {
1605         {
1606                 .ctl_name       = NET_IPV4,
1607                 .procname       = "ipv4",
1608                 .mode           = 0555,
1609                 .child          = vs_table,
1610         },
1611         { .ctl_name = 0 }
1612 };
1613
1614 static ctl_table vs_root_table[] = {
1615         {
1616                 .ctl_name       = CTL_NET,
1617                 .procname       = "net",
1618                 .mode           = 0555,
1619                 .child          = ipvs_ipv4_table,
1620         },
1621         { .ctl_name = 0 }
1622 };
1623
1624 static struct ctl_table_header * sysctl_header;
1625
1626 #ifdef CONFIG_PROC_FS
1627
1628 struct ip_vs_iter {
1629         struct list_head *table;
1630         int bucket;
1631 };
1632
1633 /*
1634  *      Write the contents of the VS rule table to a PROCfs file.
1635  *      (It is kept just for backward compatibility)
1636  */
1637 static inline const char *ip_vs_fwd_name(unsigned flags)
1638 {
1639         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1640         case IP_VS_CONN_F_LOCALNODE:
1641                 return "Local";
1642         case IP_VS_CONN_F_TUNNEL:
1643                 return "Tunnel";
1644         case IP_VS_CONN_F_DROUTE:
1645                 return "Route";
1646         default:
1647                 return "Masq";
1648         }
1649 }
1650
1651
1652 /* Get the Nth service entry from the two service hash tables */
1653 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1654 {
1655         struct ip_vs_iter *iter = seq->private;
1656         int idx;
1657         struct ip_vs_service *svc;
1658
1659         /* look in the table hashed by <protocol, addr, port> */
1660         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1661                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1662                         if (pos-- == 0) {
1663                                 iter->table = ip_vs_svc_table;
1664                                 iter->bucket = idx;
1665                                 return svc;
1666                         }
1667                 }
1668         }
1669
1670         /* keep looking in the table hashed by fwmark */
1671         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1672                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1673                         if (pos-- == 0) {
1674                                 iter->table = ip_vs_svc_fwm_table;
1675                                 iter->bucket = idx;
1676                                 return svc;
1677                         }
1678                 }
1679         }
1680
1681         return NULL;
1682 }
1683
1684 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1685 {
1686
1687         read_lock_bh(&__ip_vs_svc_lock);
1688         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1689 }
1690
1691
1692 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1693 {
1694         struct list_head *e;
1695         struct ip_vs_iter *iter;
1696         struct ip_vs_service *svc;
1697
1698         ++*pos;
1699         if (v == SEQ_START_TOKEN)
1700                 return ip_vs_info_array(seq, 0);
1701
1702         svc = v;
1703         iter = seq->private;
1704
1705         if (iter->table == ip_vs_svc_table) {
1706                 /* next service in table hashed by protocol */
1707                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1708                         return list_entry(e, struct ip_vs_service, s_list);
1709
1710
1711                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1712                         list_for_each_entry(svc, &ip_vs_svc_table[iter->bucket],
1713                                             s_list) {
1714                                 return svc;
1715                         }
1716                 }
1717
1718                 iter->table = ip_vs_svc_fwm_table;
1719                 iter->bucket = -1;
1720                 goto scan_fwmark;
1721         }
1722
1723         /* next service in the table hashed by fwmark */
1724         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1725                 return list_entry(e, struct ip_vs_service, f_list);
1726
1727  scan_fwmark:
1728         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1729                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1730                                     f_list)
1731                         return svc;
1732         }
1733
1734         return NULL;
1735 }
1736
1737 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1738 {
1739         read_unlock_bh(&__ip_vs_svc_lock);
1740 }
1741
1742
1743 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1744 {
1745         if (v == SEQ_START_TOKEN) {
1746                 seq_printf(seq,
1747                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1748                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1749                 seq_puts(seq,
1750                          "Prot LocalAddress:Port Scheduler Flags\n");
1751                 seq_puts(seq,
1752                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1753         } else {
1754                 const struct ip_vs_service *svc = v;
1755                 const struct ip_vs_iter *iter = seq->private;
1756                 const struct ip_vs_dest *dest;
1757
1758                 if (iter->table == ip_vs_svc_table)
1759                         seq_printf(seq, "%s  %08X:%04X %s ",
1760                                    ip_vs_proto_name(svc->protocol),
1761                                    ntohl(svc->addr),
1762                                    ntohs(svc->port),
1763                                    svc->scheduler->name);
1764                 else
1765                         seq_printf(seq, "FWM  %08X %s ",
1766                                    svc->fwmark, svc->scheduler->name);
1767
1768                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1769                         seq_printf(seq, "persistent %d %08X\n",
1770                                 svc->timeout,
1771                                 ntohl(svc->netmask));
1772                 else
1773                         seq_putc(seq, '\n');
1774
1775                 list_for_each_entry(dest, &svc->destinations, n_list) {
1776                         seq_printf(seq,
1777                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1778                                    ntohl(dest->addr), ntohs(dest->port),
1779                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1780                                    atomic_read(&dest->weight),
1781                                    atomic_read(&dest->activeconns),
1782                                    atomic_read(&dest->inactconns));
1783                 }
1784         }
1785         return 0;
1786 }
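
/*
 * Illustrative /proc/net/ip_vs output produced by the show routine
 * above (addresses and ports are printed in hex; the numbers below
 * are made up):
 *
 *      IP Virtual Server version 1.2.1 (size=4096)
 *      Prot LocalAddress:Port Scheduler Flags
 *        -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *      TCP  C0A80001:0050 wlc
 *        -> C0A80A01:0050      Masq    1      3          10
 *        -> C0A80A02:0050      Masq    1      2          7
 */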
1787
1788 static struct seq_operations ip_vs_info_seq_ops = {
1789         .start = ip_vs_info_seq_start,
1790         .next  = ip_vs_info_seq_next,
1791         .stop  = ip_vs_info_seq_stop,
1792         .show  = ip_vs_info_seq_show,
1793 };
1794
1795 static int ip_vs_info_open(struct inode *inode, struct file *file)
1796 {
1797         struct seq_file *seq;
1798         int rc = -ENOMEM;
1799         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1800
1801         if (!s)
1802                 goto out;
1803
1804         rc = seq_open(file, &ip_vs_info_seq_ops);
1805         if (rc)
1806                 goto out_kfree;
1807
1808         seq          = file->private_data;
1809         seq->private = s;
1810         memset(s, 0, sizeof(*s));
1811 out:
1812         return rc;
1813 out_kfree:
1814         kfree(s);
1815         goto out;
1816 }
1817
1818 static struct file_operations ip_vs_info_fops = {
1819         .owner   = THIS_MODULE,
1820         .open    = ip_vs_info_open,
1821         .read    = seq_read,
1822         .llseek  = seq_lseek,
1823         .release = seq_release_private,
1824 };
1825
1826 #endif
1827
1828 struct ip_vs_stats ip_vs_stats;
1829
1830 #ifdef CONFIG_PROC_FS
1831 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1832 {
1833
1834 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1835         seq_puts(seq,
1836                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1837         seq_puts(seq,
1838                  "   Conns  Packets  Packets            Bytes            Bytes\n");
1839
1840         spin_lock_bh(&ip_vs_stats.lock);
1841         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1842                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1843                    (unsigned long long) ip_vs_stats.inbytes,
1844                    (unsigned long long) ip_vs_stats.outbytes);
1845
1846 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1847         seq_puts(seq,
1848                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1849         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
1850                         ip_vs_stats.cps,
1851                         ip_vs_stats.inpps,
1852                         ip_vs_stats.outpps,
1853                         ip_vs_stats.inbps,
1854                         ip_vs_stats.outbps);
1855         spin_unlock_bh(&ip_vs_stats.lock);
1856
1857         return 0;
1858 }
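
/*
 * The show routine above yields /proc/net/ip_vs_stats: a totals
 * block (connections, packets and bytes in and out, printed in hex)
 * followed by a per-second rates block, each preceded by its two
 * header lines.
 */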
1859
1860 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1861 {
1862         return single_open(file, ip_vs_stats_show, NULL);
1863 }
1864
1865 static struct file_operations ip_vs_stats_fops = {
1866         .owner = THIS_MODULE,
1867         .open = ip_vs_stats_seq_open,
1868         .read = seq_read,
1869         .llseek = seq_lseek,
1870         .release = single_release,
1871 };
1872
1873 #endif
1874
1875 /*
1876  *      Set timeout values for tcp, tcpfin and udp in the timeout_table.
1877  */
1878 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1879 {
1880         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1881                   u->tcp_timeout,
1882                   u->tcp_fin_timeout,
1883                   u->udp_timeout);
1884
1885 #ifdef CONFIG_IP_VS_PROTO_TCP
1886         if (u->tcp_timeout) {
1887                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1888                         = u->tcp_timeout * HZ;
1889         }
1890
1891         if (u->tcp_fin_timeout) {
1892                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1893                         = u->tcp_fin_timeout * HZ;
1894         }
1895 #endif
1896
1897 #ifdef CONFIG_IP_VS_PROTO_UDP
1898         if (u->udp_timeout) {
1899                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1900                         = u->udp_timeout * HZ;
1901         }
1902 #endif
1903         return 0;
1904 }
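
/*
 * The timeouts arrive from user space in seconds and are converted
 * to jiffies above; a zero field leaves that timeout unchanged.
 * For example (illustrative values), "ipvsadm --set 900 120 300"
 * issues IP_VS_SO_SET_TIMEOUT with tcp_timeout = 900,
 * tcp_fin_timeout = 120 and udp_timeout = 300.
 */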
1905
1906
1907 #define SET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
1908 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1909 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1910                                  sizeof(struct ip_vs_dest_user))
1911 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1912 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1913 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1914
1915 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1916         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1917         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1920         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1925         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1926         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1927 };
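
/*
 * set_arglen[] gives the exact option length expected for each
 * IP_VS_SO_SET_* command; do_ip_vs_set_ctl() rejects any other
 * length.  A user-space caller (this is how ipvsadm drives IPVS)
 * would do roughly the following -- a sketch, field setup and error
 * handling omitted:
 *
 *      struct ip_vs_service_user svc;
 *      ...
 *      setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ADD,
 *                 &svc, sizeof(svc));
 *
 * where sockfd is an IPv4 socket and the caller has CAP_NET_ADMIN.
 */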
1928
1929 static int
1930 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1931 {
1932         int ret;
1933         unsigned char arg[MAX_ARG_LEN];
1934         struct ip_vs_service_user *usvc;
1935         struct ip_vs_service *svc;
1936         struct ip_vs_dest_user *udest;
1937
1938         if (!capable(CAP_NET_ADMIN))
1939                 return -EPERM;
1940
1941         if (len != set_arglen[SET_CMDID(cmd)]) {
1942                 IP_VS_ERR("set_ctl: len %u != %u\n",
1943                           len, set_arglen[SET_CMDID(cmd)]);
1944                 return -EINVAL;
1945         }
1946
1947         if (copy_from_user(arg, user, len) != 0)
1948                 return -EFAULT;
1949
1950         /* increase the module use count */
1951         ip_vs_use_count_inc();
1952
1953         if (down_interruptible(&__ip_vs_mutex)) {
1954                 ret = -ERESTARTSYS;
1955                 goto out_dec;
1956         }
1957
1958         if (cmd == IP_VS_SO_SET_FLUSH) {
1959                 /* Flush all virtual services */
1960                 ret = ip_vs_flush();
1961                 goto out_unlock;
1962         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1963                 /* Set timeout values for (tcp, tcpfin, udp) */
1964                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1965                 goto out_unlock;
1966         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1967                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1968                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1969                 goto out_unlock;
1970         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1971                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1972                 ret = stop_sync_thread(dm->state);
1973                 goto out_unlock;
1974         }
1975
1976         usvc = (struct ip_vs_service_user *)arg;
1977         udest = (struct ip_vs_dest_user *)(usvc + 1);
1978
1979         if (cmd == IP_VS_SO_SET_ZERO) {
1980                 /* if no service address is given, zero the counters in all services */
1981                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1982                         ret = ip_vs_zero_all();
1983                         goto out_unlock;
1984                 }
1985         }
1986
1987         /* Check for valid protocol: TCP or UDP, even for fwmark != 0 */
1988         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1989                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1990                           usvc->protocol, NIPQUAD(usvc->addr),
1991                           ntohs(usvc->port), usvc->sched_name);
1992                 ret = -EFAULT;
1993                 goto out_unlock;
1994         }
1995
1996         /* Look up the exact service by <protocol, addr, port> or fwmark */
1997         if (usvc->fwmark == 0)
1998                 svc = __ip_vs_service_get(usvc->protocol,
1999                                           usvc->addr, usvc->port);
2000         else
2001                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2002
2003         if (cmd != IP_VS_SO_SET_ADD
2004             && (svc == NULL || svc->protocol != usvc->protocol)) {
2005                 ret = -ESRCH;
2006                 goto out_unlock;
2007         }
2008
2009         switch (cmd) {
2010         case IP_VS_SO_SET_ADD:
2011                 if (svc != NULL)
2012                         ret = -EEXIST;
2013                 else
2014                         ret = ip_vs_add_service(usvc, &svc);
2015                 break;
2016         case IP_VS_SO_SET_EDIT:
2017                 ret = ip_vs_edit_service(svc, usvc);
2018                 break;
2019         case IP_VS_SO_SET_DEL:
2020                 ret = ip_vs_del_service(svc);
2021                 if (!ret)
2022                         goto out_unlock;
2023                 break;
2024         case IP_VS_SO_SET_ZERO:
2025                 ret = ip_vs_zero_service(svc);
2026                 break;
2027         case IP_VS_SO_SET_ADDDEST:
2028                 ret = ip_vs_add_dest(svc, udest);
2029                 break;
2030         case IP_VS_SO_SET_EDITDEST:
2031                 ret = ip_vs_edit_dest(svc, udest);
2032                 break;
2033         case IP_VS_SO_SET_DELDEST:
2034                 ret = ip_vs_del_dest(svc, udest);
2035                 break;
2036         default:
2037                 ret = -EINVAL;
2038         }
2039
2040         if (svc)
2041                 ip_vs_service_put(svc);
2042
2043   out_unlock:
2044         up(&__ip_vs_mutex);
2045   out_dec:
2046         /* decrease the module use count */
2047         ip_vs_use_count_dec();
2048
2049         return ret;
2050 }
2051
2052
2053 static void
2054 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2055 {
2056         spin_lock_bh(&src->lock);
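        /*
         * Copy only the counters laid out before 'lock' in
         * struct ip_vs_stats; this relies on the leading members of
         * that structure matching struct ip_vs_stats_user.
         */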
2057         memcpy(dst, src, (char *)&src->lock - (char *)src);
2058         spin_unlock_bh(&src->lock);
2059 }
2060
2061 static void
2062 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2063 {
2064         dst->protocol = src->protocol;
2065         dst->addr = src->addr;
2066         dst->port = src->port;
2067         dst->fwmark = src->fwmark;
2068         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2069         dst->flags = src->flags;
2070         dst->timeout = src->timeout / HZ;
2071         dst->netmask = src->netmask;
2072         dst->num_dests = src->num_dests;
2073         ip_vs_copy_stats(&dst->stats, &src->stats);
2074 }
2075
2076 static inline int
2077 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2078                             struct ip_vs_get_services __user *uptr)
2079 {
2080         int idx, count=0;
2081         struct ip_vs_service *svc;
2082         struct ip_vs_service_entry entry;
2083         int ret = 0;
2084
2085         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2086                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2087                         if (count >= get->num_services)
2088                                 goto out;
2089                         memset(&entry, 0, sizeof(entry));
2090                         ip_vs_copy_service(&entry, svc);
2091                         if (copy_to_user(&uptr->entrytable[count],
2092                                          &entry, sizeof(entry))) {
2093                                 ret = -EFAULT;
2094                                 goto out;
2095                         }
2096                         count++;
2097                 }
2098         }
2099
2100         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2101                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2102                         if (count >= get->num_services)
2103                                 goto out;
2104                         memset(&entry, 0, sizeof(entry));
2105                         ip_vs_copy_service(&entry, svc);
2106                         if (copy_to_user(&uptr->entrytable[count],
2107                                          &entry, sizeof(entry))) {
2108                                 ret = -EFAULT;
2109                                 goto out;
2110                         }
2111                         count++;
2112                 }
2113         }
2114   out:
2115         return ret;
2116 }
2117
2118 static inline int
2119 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2120                          struct ip_vs_get_dests __user *uptr)
2121 {
2122         struct ip_vs_service *svc;
2123         int ret = 0;
2124
2125         if (get->fwmark)
2126                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2127         else
2128                 svc = __ip_vs_service_get(get->protocol,
2129                                           get->addr, get->port);
2130         if (svc) {
2131                 int count = 0;
2132                 struct ip_vs_dest *dest;
2133                 struct ip_vs_dest_entry entry;
2134
2135                 list_for_each_entry(dest, &svc->destinations, n_list) {
2136                         if (count >= get->num_dests)
2137                                 break;
2138
2139                         entry.addr = dest->addr;
2140                         entry.port = dest->port;
2141                         entry.conn_flags = atomic_read(&dest->conn_flags);
2142                         entry.weight = atomic_read(&dest->weight);
2143                         entry.u_threshold = dest->u_threshold;
2144                         entry.l_threshold = dest->l_threshold;
2145                         entry.activeconns = atomic_read(&dest->activeconns);
2146                         entry.inactconns = atomic_read(&dest->inactconns);
2147                         entry.persistconns = atomic_read(&dest->persistconns);
2148                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2149                         if (copy_to_user(&uptr->entrytable[count],
2150                                          &entry, sizeof(entry))) {
2151                                 ret = -EFAULT;
2152                                 break;
2153                         }
2154                         count++;
2155                 }
2156                 ip_vs_service_put(svc);
2157         } else
2158                 ret = -ESRCH;
2159         return ret;
2160 }
2161
2162 static inline void
2163 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2164 {
2165 #ifdef CONFIG_IP_VS_PROTO_TCP
2166         u->tcp_timeout =
2167                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2168         u->tcp_fin_timeout =
2169                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2170 #endif
2171 #ifdef CONFIG_IP_VS_PROTO_UDP
2172         u->udp_timeout =
2173                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2174 #endif
2175 }
2176
2177
2178 #define GET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
2179 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2180 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2181 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2182 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2183 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2184 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2185
2186 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2187         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2188         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2191         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2192         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2193         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2194 };
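
/*
 * For the GET commands these are minimum lengths: do_ip_vs_get_ctl()
 * only checks *len >= get_arglen[], and for IP_VS_SO_GET_SERVICES /
 * IP_VS_SO_GET_DESTS the exact total size (header plus num_services
 * or num_dests entries) is verified again inside the handler.
 */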
2195
2196 static int
2197 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2198 {
2199         unsigned char arg[128];
2200         int ret = 0;
2201
2202         if (!capable(CAP_NET_ADMIN))
2203                 return -EPERM;
2204
2205         if (*len < get_arglen[GET_CMDID(cmd)]) {
2206                 IP_VS_ERR("get_ctl: len %u < %u\n",
2207                           *len, get_arglen[GET_CMDID(cmd)]);
2208                 return -EINVAL;
2209         }
2210
2211         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2212                 return -EFAULT;
2213
2214         if (down_interruptible(&__ip_vs_mutex))
2215                 return -ERESTARTSYS;
2216
2217         switch (cmd) {
2218         case IP_VS_SO_GET_VERSION:
2219         {
2220                 char buf[64];
2221
2222                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2223                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2224                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2225                         ret = -EFAULT;
2226                         goto out;
2227                 }
2228                 *len = strlen(buf)+1;
2229         }
2230         break;
2231
2232         case IP_VS_SO_GET_INFO:
2233         {
2234                 struct ip_vs_getinfo info;
2235                 info.version = IP_VS_VERSION_CODE;
2236                 info.size = IP_VS_CONN_TAB_SIZE;
2237                 info.num_services = ip_vs_num_services;
2238                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2239                         ret = -EFAULT;
2240         }
2241         break;
2242
2243         case IP_VS_SO_GET_SERVICES:
2244         {
2245                 struct ip_vs_get_services *get;
2246                 int size;
2247
2248                 get = (struct ip_vs_get_services *)arg;
2249                 size = sizeof(*get) +
2250                         sizeof(struct ip_vs_service_entry) * get->num_services;
2251                 if (*len != size) {
2252                         IP_VS_ERR("length: %u != %u\n", *len, size);
2253                         ret = -EINVAL;
2254                         goto out;
2255                 }
2256                 ret = __ip_vs_get_service_entries(get, user);
2257         }
2258         break;
2259
2260         case IP_VS_SO_GET_SERVICE:
2261         {
2262                 struct ip_vs_service_entry *entry;
2263                 struct ip_vs_service *svc;
2264
2265                 entry = (struct ip_vs_service_entry *)arg;
2266                 if (entry->fwmark)
2267                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2268                 else
2269                         svc = __ip_vs_service_get(entry->protocol,
2270                                                   entry->addr, entry->port);
2271                 if (svc) {
2272                         ip_vs_copy_service(entry, svc);
2273                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2274                                 ret = -EFAULT;
2275                         ip_vs_service_put(svc);
2276                 } else
2277                         ret = -ESRCH;
2278         }
2279         break;
2280
2281         case IP_VS_SO_GET_DESTS:
2282         {
2283                 struct ip_vs_get_dests *get;
2284                 int size;
2285
2286                 get = (struct ip_vs_get_dests *)arg;
2287                 size = sizeof(*get) +
2288                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2289                 if (*len != size) {
2290                         IP_VS_ERR("length: %u != %u\n", *len, size);
2291                         ret = -EINVAL;
2292                         goto out;
2293                 }
2294                 ret = __ip_vs_get_dest_entries(get, user);
2295         }
2296         break;
2297
2298         case IP_VS_SO_GET_TIMEOUT:
2299         {
2300                 struct ip_vs_timeout_user t;
2301
2302                 __ip_vs_get_timeouts(&t);
2303                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2304                         ret = -EFAULT;
2305         }
2306         break;
2307
2308         case IP_VS_SO_GET_DAEMON:
2309         {
2310                 struct ip_vs_daemon_user d[2];
2311
2312                 memset(&d, 0, sizeof(d));
2313                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2314                         d[0].state = IP_VS_STATE_MASTER;
2315                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2316                         d[0].syncid = ip_vs_master_syncid;
2317                 }
2318                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2319                         d[1].state = IP_VS_STATE_BACKUP;
2320                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2321                         d[1].syncid = ip_vs_backup_syncid;
2322                 }
2323                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2324                         ret = -EFAULT;
2325         }
2326         break;
2327
2328         default:
2329                 ret = -EINVAL;
2330         }
2331
2332   out:
2333         up(&__ip_vs_mutex);
2334         return ret;
2335 }
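
/*
 * Illustrative user-space query against the handler above (a sketch,
 * error handling omitted):
 *
 *      struct ip_vs_getinfo info;
 *      socklen_t len = sizeof(info);
 *
 *      getsockopt(sockfd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len);
 *
 * returns the IPVS version code, the connection table size and the
 * current number of virtual services.
 */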
2336
2337
2338 static struct nf_sockopt_ops ip_vs_sockopts = {
2339         .pf             = PF_INET,
2340         .set_optmin     = IP_VS_BASE_CTL,
2341         .set_optmax     = IP_VS_SO_SET_MAX+1,
2342         .set            = do_ip_vs_set_ctl,
2343         .get_optmin     = IP_VS_BASE_CTL,
2344         .get_optmax     = IP_VS_SO_GET_MAX+1,
2345         .get            = do_ip_vs_get_ctl,
2346 };
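
/*
 * nf_register_sockopt() routes [gs]etsockopt calls whose option
 * number lies in [optmin, optmax) to the handlers above, which is
 * why the optmax fields are set to the respective *_MAX value
 * plus one.
 */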
2347
2348
2349 int ip_vs_control_init(void)
2350 {
2351         int ret;
2352         int idx;
2353
2354         EnterFunction(2);
2355
2356         ret = nf_register_sockopt(&ip_vs_sockopts);
2357         if (ret) {
2358                 IP_VS_ERR("cannot register sockopt.\n");
2359                 return ret;
2360         }
2361
2362         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2363         proc_net_fops_create("ip_vs_stats", 0, &ip_vs_stats_fops);
2364
2365         sysctl_header = register_sysctl_table(vs_root_table, 0);
2366
2367         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2368         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2369                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2370                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2371         }
2372         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2373                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2374         }
2375
2376         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2377         spin_lock_init(&ip_vs_stats.lock);
2378         ip_vs_new_estimator(&ip_vs_stats);
2379
2380         /* Hook the defense timer */
2381         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2382
2383         LeaveFunction(2);
2384         return 0;
2385 }
2386
2387
2388 void ip_vs_control_cleanup(void)
2389 {
2390         EnterFunction(2);
2391         ip_vs_trash_cleanup();
2392         cancel_rearming_delayed_work(&defense_work);
2393         ip_vs_kill_estimator(&ip_vs_stats);
2394         unregister_sysctl_table(sysctl_header);
2395         proc_net_remove("ip_vs_stats");
2396         proc_net_remove("ip_vs");
2397         nf_unregister_sockopt(&ip_vs_sockopts);
2398         LeaveFunction(2);
2399 }