GNU Linux-libre 4.4.288-gnu1: net/netfilter/ipvs/ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
55 /* Mutex for IPVS sockopts; [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
57
58 /* sysctl variables */
59
60 #ifdef CONFIG_IP_VS_DEBUG
61 static int sysctl_ip_vs_debug_level = 0;
62
63 int ip_vs_get_debug_level(void)
64 {
65         return sysctl_ip_vs_debug_level;
66 }
67 #endif
68
69
70 /*  Protos */
71 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
72
73
74 #ifdef CONFIG_IP_VS_IPV6
75 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
76 static bool __ip_vs_addr_is_local_v6(struct net *net,
77                                      const struct in6_addr *addr)
78 {
79         struct flowi6 fl6 = {
80                 .daddr = *addr,
81         };
82         struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
83         bool is_local;
84
85         is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
86
87         dst_release(dst);
88         return is_local;
89 }
90 #endif
91
92 #ifdef CONFIG_SYSCTL
93 /*
94  *      update_defense_level is called from keventd and from sysctl,
95  *      so it needs to protect itself from softirqs
96  */
97 static void update_defense_level(struct netns_ipvs *ipvs)
98 {
99         struct sysinfo i;
100         int availmem;
101         int nomem;
102         int to_change = -1;
103
104         /* we only count free and buffered memory (in pages) */
105         si_meminfo(&i);
106         availmem = i.freeram + i.bufferram;
107         /* however, in Linux 2.5 i.bufferram is the total page cache size,
108            so we need to adjust it */
109         /* si_swapinfo(&i); */
110         /* availmem = availmem - (i.totalswap - i.freeswap); */
111
112         nomem = (availmem < ipvs->sysctl_amemthresh);
113
114         local_bh_disable();
115
116         /* drop_entry */
117         spin_lock(&ipvs->dropentry_lock);
118         switch (ipvs->sysctl_drop_entry) {
119         case 0:
120                 atomic_set(&ipvs->dropentry, 0);
121                 break;
122         case 1:
123                 if (nomem) {
124                         atomic_set(&ipvs->dropentry, 1);
125                         ipvs->sysctl_drop_entry = 2;
126                 } else {
127                         atomic_set(&ipvs->dropentry, 0);
128                 }
129                 break;
130         case 2:
131                 if (nomem) {
132                         atomic_set(&ipvs->dropentry, 1);
133                 } else {
134                         atomic_set(&ipvs->dropentry, 0);
135                         ipvs->sysctl_drop_entry = 1;
136                 }
137                 break;
138         case 3:
139                 atomic_set(&ipvs->dropentry, 1);
140                 break;
141         }
142         spin_unlock(&ipvs->dropentry_lock);
143
144         /* drop_packet */
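        /*
         * drop_rate is consumed by ip_vs_todrop(): roughly one of every
         * drop_rate candidate packets gets dropped.  The ratio
         * amemthresh / (amemthresh - availmem) shrinks as available
         * memory falls, so deeper memory pressure drops packets more
         * often.
         */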
145         spin_lock(&ipvs->droppacket_lock);
146         switch (ipvs->sysctl_drop_packet) {
147         case 0:
148                 ipvs->drop_rate = 0;
149                 break;
150         case 1:
151                 if (nomem) {
152                         ipvs->drop_rate = ipvs->drop_counter
153                                 = ipvs->sysctl_amemthresh /
154                                 (ipvs->sysctl_amemthresh-availmem);
155                         ipvs->sysctl_drop_packet = 2;
156                 } else {
157                         ipvs->drop_rate = 0;
158                 }
159                 break;
160         case 2:
161                 if (nomem) {
162                         ipvs->drop_rate = ipvs->drop_counter
163                                 = ipvs->sysctl_amemthresh /
164                                 (ipvs->sysctl_amemthresh-availmem);
165                 } else {
166                         ipvs->drop_rate = 0;
167                         ipvs->sysctl_drop_packet = 1;
168                 }
169                 break;
170         case 3:
171                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
172                 break;
173         }
174         spin_unlock(&ipvs->droppacket_lock);
175
176         /* secure_tcp */
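        /*
         * old_secure_tcp remembers the previous mode; to_change records
         * whether the per-protocol timeout tables must be switched:
         * 1 = enable the "secure" timeouts, 0 = restore the defaults,
         * -1 = leave them as they are (see the call to
         * ip_vs_protocol_timeout_change() below).
         */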
177         spin_lock(&ipvs->securetcp_lock);
178         switch (ipvs->sysctl_secure_tcp) {
179         case 0:
180                 if (ipvs->old_secure_tcp >= 2)
181                         to_change = 0;
182                 break;
183         case 1:
184                 if (nomem) {
185                         if (ipvs->old_secure_tcp < 2)
186                                 to_change = 1;
187                         ipvs->sysctl_secure_tcp = 2;
188                 } else {
189                         if (ipvs->old_secure_tcp >= 2)
190                                 to_change = 0;
191                 }
192                 break;
193         case 2:
194                 if (nomem) {
195                         if (ipvs->old_secure_tcp < 2)
196                                 to_change = 1;
197                 } else {
198                         if (ipvs->old_secure_tcp >= 2)
199                                 to_change = 0;
200                         ipvs->sysctl_secure_tcp = 1;
201                 }
202                 break;
203         case 3:
204                 if (ipvs->old_secure_tcp < 2)
205                         to_change = 1;
206                 break;
207         }
208         ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
209         if (to_change >= 0)
210                 ip_vs_protocol_timeout_change(ipvs,
211                                               ipvs->sysctl_secure_tcp > 1);
212         spin_unlock(&ipvs->securetcp_lock);
213
214         local_bh_enable();
215 }
216
217
218 /*
219  *      Timer for checking the defense
220  */
221 #define DEFENSE_TIMER_PERIOD    1*HZ
222
223 static void defense_work_handler(struct work_struct *work)
224 {
225         struct netns_ipvs *ipvs =
226                 container_of(work, struct netns_ipvs, defense_work.work);
227
228         update_defense_level(ipvs);
229         if (atomic_read(&ipvs->dropentry))
230                 ip_vs_random_dropentry(ipvs);
231         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
232 }
233 #endif
234
235 int
236 ip_vs_use_count_inc(void)
237 {
238         return try_module_get(THIS_MODULE);
239 }
240
241 void
242 ip_vs_use_count_dec(void)
243 {
244         module_put(THIS_MODULE);
245 }
246
247
248 /*
249  *      Hash table: for virtual service lookups
250  */
251 #define IP_VS_SVC_TAB_BITS 8
252 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
253 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
254
255 /* the service table hashed by <protocol, addr, port> */
256 static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
257 /* the service table hashed by fwmark */
258 static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
259
260
261 /*
262  *      Returns hash value for virtual service
263  */
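/*
 * The protocol, the folded address, the port and the netns pointer are all
 * mixed into the key, so services from different namespaces spread across
 * the shared hash tables.
 */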
264 static inline unsigned int
265 ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
266                   const union nf_inet_addr *addr, __be16 port)
267 {
268         register unsigned int porth = ntohs(port);
269         __be32 addr_fold = addr->ip;
270         __u32 ahash;
271
272 #ifdef CONFIG_IP_VS_IPV6
273         if (af == AF_INET6)
274                 addr_fold = addr->ip6[0]^addr->ip6[1]^
275                             addr->ip6[2]^addr->ip6[3];
276 #endif
277         ahash = ntohl(addr_fold);
278         ahash ^= ((size_t) ipvs >> 8);
279
280         return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
281                IP_VS_SVC_TAB_MASK;
282 }
283
284 /*
285  *      Returns hash value of fwmark for virtual service lookup
286  */
287 static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
288 {
289         return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
290 }
291
292 /*
293  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
294  *      or in the ip_vs_svc_fwm_table by fwmark.
295  *      Should be called with locked tables.
296  */
297 static int ip_vs_svc_hash(struct ip_vs_service *svc)
298 {
299         unsigned int hash;
300
301         if (svc->flags & IP_VS_SVC_F_HASHED) {
302                 pr_err("%s(): request for already hashed, called from %pF\n",
303                        __func__, __builtin_return_address(0));
304                 return 0;
305         }
306
307         if (svc->fwmark == 0) {
308                 /*
309                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
310                  */
311                 hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol,
312                                          &svc->addr, svc->port);
313                 hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
314         } else {
315                 /*
316                  *  Hash it by fwmark in svc_fwm_table
317                  */
318                 hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark);
319                 hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
320         }
321
322         svc->flags |= IP_VS_SVC_F_HASHED;
323         /* increase its refcnt because it is referenced by the svc table */
324         atomic_inc(&svc->refcnt);
325         return 1;
326 }
327
328
329 /*
330  *      Unhashes a service from svc_table / svc_fwm_table.
331  *      Should be called with locked tables.
332  */
333 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
334 {
335         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
336                 pr_err("%s(): request for unhash flagged, called from %pF\n",
337                        __func__, __builtin_return_address(0));
338                 return 0;
339         }
340
341         if (svc->fwmark == 0) {
342                 /* Remove it from the svc_table */
343                 hlist_del_rcu(&svc->s_list);
344         } else {
345                 /* Remove it from the svc_fwm_table */
346                 hlist_del_rcu(&svc->f_list);
347         }
348
349         svc->flags &= ~IP_VS_SVC_F_HASHED;
350         atomic_dec(&svc->refcnt);
351         return 1;
352 }
353
354
355 /*
356  *      Get service by {netns, proto,addr,port} in the service table.
357  */
358 static inline struct ip_vs_service *
359 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
360                      const union nf_inet_addr *vaddr, __be16 vport)
361 {
362         unsigned int hash;
363         struct ip_vs_service *svc;
364
365         /* Check for "full" addressed entries */
366         hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);
367
368         hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
369                 if ((svc->af == af)
370                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
371                     && (svc->port == vport)
372                     && (svc->protocol == protocol)
373                     && (svc->ipvs == ipvs)) {
374                         /* HIT */
375                         return svc;
376                 }
377         }
378
379         return NULL;
380 }
381
382
383 /*
384  *      Get service by {fwmark} in the service table.
385  */
386 static inline struct ip_vs_service *
387 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
388 {
389         unsigned int hash;
390         struct ip_vs_service *svc;
391
392         /* Check for fwmark addressed entries */
393         hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark);
394
395         hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
396                 if (svc->fwmark == fwmark && svc->af == af
397                     && (svc->ipvs == ipvs)) {
398                         /* HIT */
399                         return svc;
400                 }
401         }
402
403         return NULL;
404 }
405
406 /* Find service, called under RCU lock */
407 struct ip_vs_service *
408 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
409                    const union nf_inet_addr *vaddr, __be16 vport)
410 {
411         struct ip_vs_service *svc;
412
413         /*
414          *      Check the table hashed by fwmark first
415          */
416         if (fwmark) {
417                 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
418                 if (svc)
419                         goto out;
420         }
421
422         /*
423          *      Check the table hashed by <protocol,addr,port>
424          *      for "full" addressed entries
425          */
426         svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
427
428         if (svc == NULL
429             && protocol == IPPROTO_TCP
430             && atomic_read(&ipvs->ftpsvc_counter)
431             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
432                 /*
433                  * Check if ftp service entry exists, the packet
434                  * might belong to FTP data connections.
435                  */
436                 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
437         }
438
439         if (svc == NULL
440             && atomic_read(&ipvs->nullsvc_counter)) {
441                 /*
442                  * Check if the catch-all port (port zero) exists
443                  */
444                 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
445         }
446
447   out:
448         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
449                       fwmark, ip_vs_proto_name(protocol),
450                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
451                       svc ? "hit" : "not hit");
452
453         return svc;
454 }
455
456
457 static inline void
458 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
459 {
460         atomic_inc(&svc->refcnt);
461         rcu_assign_pointer(dest->svc, svc);
462 }
463
464 static void ip_vs_service_free(struct ip_vs_service *svc)
465 {
466         free_percpu(svc->stats.cpustats);
467         kfree(svc);
468 }
469
470 static void ip_vs_service_rcu_free(struct rcu_head *head)
471 {
472         struct ip_vs_service *svc;
473
474         svc = container_of(head, struct ip_vs_service, rcu_head);
475         ip_vs_service_free(svc);
476 }
477
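/*
 * Drop one reference to a service; on the last put, do_delay chooses
 * between RCU-deferred freeing (call_rcu) and freeing it right away.
 */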
478 static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
479 {
480         if (atomic_dec_and_test(&svc->refcnt)) {
481                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
482                               svc->fwmark,
483                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
484                               ntohs(svc->port));
485                 if (do_delay)
486                         call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
487                 else
488                         ip_vs_service_free(svc);
489         }
490 }
491
492
493 /*
494  *      Returns hash value for real service
495  */
496 static inline unsigned int ip_vs_rs_hashkey(int af,
497                                             const union nf_inet_addr *addr,
498                                             __be16 port)
499 {
500         register unsigned int porth = ntohs(port);
501         __be32 addr_fold = addr->ip;
502
503 #ifdef CONFIG_IP_VS_IPV6
504         if (af == AF_INET6)
505                 addr_fold = addr->ip6[0]^addr->ip6[1]^
506                             addr->ip6[2]^addr->ip6[3];
507 #endif
508
509         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
510                 & IP_VS_RTAB_MASK;
511 }
512
513 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
514 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
515 {
516         unsigned int hash;
517
518         if (dest->in_rs_table)
519                 return;
520
521         /*
522          *      Hash by proto,addr,port,
523          *      which are the parameters of the real service.
524          */
525         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
526
527         hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
528         dest->in_rs_table = 1;
529 }
530
531 /* Unhash ip_vs_dest from rs_table. */
532 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
533 {
534         /*
535          * Remove it from the rs_table.
536          */
537         if (dest->in_rs_table) {
538                 hlist_del_rcu(&dest->d_list);
539                 dest->in_rs_table = 0;
540         }
541 }
542
543 /* Check if real service by <proto,addr,port> is present */
544 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
545                             const union nf_inet_addr *daddr, __be16 dport)
546 {
547         unsigned int hash;
548         struct ip_vs_dest *dest;
549
550         /* Check for "full" addressed entries */
551         hash = ip_vs_rs_hashkey(af, daddr, dport);
552
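        /* Destinations that belong to fwmark services (dest->vfwmark != 0)
         * match regardless of protocol, hence the "|| dest->vfwmark" below.
         */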
553         rcu_read_lock();
554         hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
555                 if (dest->port == dport &&
556                     dest->af == af &&
557                     ip_vs_addr_equal(af, &dest->addr, daddr) &&
558                     (dest->protocol == protocol || dest->vfwmark)) {
559                         /* HIT */
560                         rcu_read_unlock();
561                         return true;
562                 }
563         }
564         rcu_read_unlock();
565
566         return false;
567 }
568
569 /* Lookup destination by {addr,port} in the given service
570  * Called under RCU lock.
571  */
572 static struct ip_vs_dest *
573 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
574                   const union nf_inet_addr *daddr, __be16 dport)
575 {
576         struct ip_vs_dest *dest;
577
578         /*
579          * Find the destination for the given service
580          */
581         list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
582                 if ((dest->af == dest_af) &&
583                     ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
584                     (dest->port == dport)) {
585                         /* HIT */
586                         return dest;
587                 }
588         }
589
590         return NULL;
591 }
592
593 /*
594  * Find destination by {daddr,dport,vaddr,protocol}
595  * Created to be used in ip_vs_process_message() in
596  * the backup synchronization daemon. It finds the
597  * destination to be bound to the received connection
598  * on the backup.
599  * Called under RCU lock, no refcnt is returned.
600  */
601 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
602                                    const union nf_inet_addr *daddr,
603                                    __be16 dport,
604                                    const union nf_inet_addr *vaddr,
605                                    __be16 vport, __u16 protocol, __u32 fwmark,
606                                    __u32 flags)
607 {
608         struct ip_vs_dest *dest;
609         struct ip_vs_service *svc;
610         __be16 port = dport;
611
612         svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
613         if (!svc)
614                 return NULL;
615         if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
616                 port = 0;
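        /* Try the (possibly zeroed) port first; on a miss, port ^ dport
         * flips between dport and 0, so the second lookup covers the
         * other case (the real port or the catch-all port 0).
         */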
617         dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
618         if (!dest)
619                 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
620         return dest;
621 }
622
623 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
624 {
625         struct ip_vs_dest_dst *dest_dst = container_of(head,
626                                                        struct ip_vs_dest_dst,
627                                                        rcu_head);
628
629         dst_release(dest_dst->dst_cache);
630         kfree(dest_dst);
631 }
632
633 /* Release dest_dst and dst_cache for dest in user context */
634 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
635 {
636         struct ip_vs_dest_dst *old;
637
638         old = rcu_dereference_protected(dest->dest_dst, 1);
639         if (old) {
640                 RCU_INIT_POINTER(dest->dest_dst, NULL);
641                 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
642         }
643 }
644
645 /*
646  *  Lookup dest by {svc,addr,port} in the destination trash.
647  *  The destination trash is used to hold the destinations that are removed
648  *  from the service table but are still referenced by some conn entries.
649  *  The reason for the destination trash is that when a dest is temporarily
650  *  taken down (either by the administrator or by a monitor program), it can
651  *  be picked back from the trash, the remaining connections to the dest can
652  *  continue, and the counting information of the dest is also useful for
653  *  scheduling.
654  */
655 static struct ip_vs_dest *
656 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
657                      const union nf_inet_addr *daddr, __be16 dport)
658 {
659         struct ip_vs_dest *dest;
660         struct netns_ipvs *ipvs = svc->ipvs;
661
662         /*
663          * Find the destination in trash
664          */
665         spin_lock_bh(&ipvs->dest_trash_lock);
666         list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
667                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
668                               "dest->refcnt=%d\n",
669                               dest->vfwmark,
670                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
671                               ntohs(dest->port),
672                               atomic_read(&dest->refcnt));
673                 if (dest->af == dest_af &&
674                     ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
675                     dest->port == dport &&
676                     dest->vfwmark == svc->fwmark &&
677                     dest->protocol == svc->protocol &&
678                     (svc->fwmark ||
679                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
680                       dest->vport == svc->port))) {
681                         /* HIT */
682                         list_del(&dest->t_list);
683                         ip_vs_dest_hold(dest);
684                         goto out;
685                 }
686         }
687
688         dest = NULL;
689
690 out:
691         spin_unlock_bh(&ipvs->dest_trash_lock);
692
693         return dest;
694 }
695
696 static void ip_vs_dest_free(struct ip_vs_dest *dest)
697 {
698         struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
699
700         __ip_vs_dst_cache_reset(dest);
701         __ip_vs_svc_put(svc, false);
702         free_percpu(dest->stats.cpustats);
703         ip_vs_dest_put_and_free(dest);
704 }
705
706 /*
707  *  Clean up all the destinations in the trash
708  *  Called by the ip_vs_control_cleanup()
709  *
710  *  When ip_vs_control_cleanup() is activated by ipvs module exit,
711  *  the service tables must have been flushed and all the connections
712  *  are expired, and the refcnt of each destination in the trash must
713  *  be 0, so we simply release them here.
714  */
715 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
716 {
717         struct ip_vs_dest *dest, *nxt;
718
719         del_timer_sync(&ipvs->dest_trash_timer);
720         /* No need to use dest_trash_lock */
721         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
722                 list_del(&dest->t_list);
723                 ip_vs_dest_free(dest);
724         }
725 }
726
727 static void
728 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
729 {
730 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
731
732         spin_lock_bh(&src->lock);
733
734         IP_VS_SHOW_STATS_COUNTER(conns);
735         IP_VS_SHOW_STATS_COUNTER(inpkts);
736         IP_VS_SHOW_STATS_COUNTER(outpkts);
737         IP_VS_SHOW_STATS_COUNTER(inbytes);
738         IP_VS_SHOW_STATS_COUNTER(outbytes);
739
740         ip_vs_read_estimator(dst, src);
741
742         spin_unlock_bh(&src->lock);
743 }
744
745 static void
746 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
747 {
748         dst->conns = (u32)src->conns;
749         dst->inpkts = (u32)src->inpkts;
750         dst->outpkts = (u32)src->outpkts;
751         dst->inbytes = src->inbytes;
752         dst->outbytes = src->outbytes;
753         dst->cps = (u32)src->cps;
754         dst->inpps = (u32)src->inpps;
755         dst->outpps = (u32)src->outpps;
756         dst->inbps = (u32)src->inbps;
757         dst->outbps = (u32)src->outbps;
758 }
759
760 static void
761 ip_vs_zero_stats(struct ip_vs_stats *stats)
762 {
763         spin_lock_bh(&stats->lock);
764
765         /* get current counters as zero point, rates are zeroed */
766
767 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
768
769         IP_VS_ZERO_STATS_COUNTER(conns);
770         IP_VS_ZERO_STATS_COUNTER(inpkts);
771         IP_VS_ZERO_STATS_COUNTER(outpkts);
772         IP_VS_ZERO_STATS_COUNTER(inbytes);
773         IP_VS_ZERO_STATS_COUNTER(outbytes);
774
775         ip_vs_zero_estimator(stats);
776
777         spin_unlock_bh(&stats->lock);
778 }
779
780 /*
781  *      Update a destination in the given service
782  */
783 static void
784 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
785                     struct ip_vs_dest_user_kern *udest, int add)
786 {
787         struct netns_ipvs *ipvs = svc->ipvs;
788         struct ip_vs_service *old_svc;
789         struct ip_vs_scheduler *sched;
790         int conn_flags;
791
792         /* We cannot modify an address and change the address family */
793         BUG_ON(!add && udest->af != dest->af);
794
795         if (add && udest->af != svc->af)
796                 ipvs->mixed_address_family_dests++;
797
798         /* set the weight and the flags */
799         atomic_set(&dest->weight, udest->weight);
800         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
801         conn_flags |= IP_VS_CONN_F_INACTIVE;
802
803         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
804         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
805                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
806         } else {
807                 /*
808                  *    Put the real service in rs_table if not present.
809                  *    For now only for NAT!
810                  */
811                 ip_vs_rs_hash(ipvs, dest);
812         }
813         atomic_set(&dest->conn_flags, conn_flags);
814
815         /* bind the service */
816         old_svc = rcu_dereference_protected(dest->svc, 1);
817         if (!old_svc) {
818                 __ip_vs_bind_svc(dest, svc);
819         } else {
820                 if (old_svc != svc) {
821                         ip_vs_zero_stats(&dest->stats);
822                         __ip_vs_bind_svc(dest, svc);
823                         __ip_vs_svc_put(old_svc, true);
824                 }
825         }
826
827         /* set the dest status flags */
828         dest->flags |= IP_VS_DEST_F_AVAILABLE;
829
830         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
831                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
832         dest->u_threshold = udest->u_threshold;
833         dest->l_threshold = udest->l_threshold;
834
835         dest->af = udest->af;
836
837         spin_lock_bh(&dest->dst_lock);
838         __ip_vs_dst_cache_reset(dest);
839         spin_unlock_bh(&dest->dst_lock);
840
841         if (add) {
842                 ip_vs_start_estimator(svc->ipvs, &dest->stats);
843                 list_add_rcu(&dest->n_list, &svc->destinations);
844                 svc->num_dests++;
845                 sched = rcu_dereference_protected(svc->scheduler, 1);
846                 if (sched && sched->add_dest)
847                         sched->add_dest(svc, dest);
848         } else {
849                 sched = rcu_dereference_protected(svc->scheduler, 1);
850                 if (sched && sched->upd_dest)
851                         sched->upd_dest(svc, dest);
852         }
853 }
854
855
856 /*
857  *      Create a destination for the given service
858  */
859 static int
860 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
861                struct ip_vs_dest **dest_p)
862 {
863         struct ip_vs_dest *dest;
864         unsigned int atype, i;
865
866         EnterFunction(2);
867
868 #ifdef CONFIG_IP_VS_IPV6
869         if (udest->af == AF_INET6) {
870                 atype = ipv6_addr_type(&udest->addr.in6);
871                 if ((!(atype & IPV6_ADDR_UNICAST) ||
872                         atype & IPV6_ADDR_LINKLOCAL) &&
873                         !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
874                         return -EINVAL;
875         } else
876 #endif
877         {
878                 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
879                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
880                         return -EINVAL;
881         }
882
883         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
884         if (dest == NULL)
885                 return -ENOMEM;
886
887         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
888         if (!dest->stats.cpustats)
889                 goto err_alloc;
890
891         for_each_possible_cpu(i) {
892                 struct ip_vs_cpu_stats *ip_vs_dest_stats;
893                 ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
894                 u64_stats_init(&ip_vs_dest_stats->syncp);
895         }
896
897         dest->af = udest->af;
898         dest->protocol = svc->protocol;
899         dest->vaddr = svc->addr;
900         dest->vport = svc->port;
901         dest->vfwmark = svc->fwmark;
902         ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
903         dest->port = udest->port;
904
905         atomic_set(&dest->activeconns, 0);
906         atomic_set(&dest->inactconns, 0);
907         atomic_set(&dest->persistconns, 0);
908         atomic_set(&dest->refcnt, 1);
909
910         INIT_HLIST_NODE(&dest->d_list);
911         spin_lock_init(&dest->dst_lock);
912         spin_lock_init(&dest->stats.lock);
913         __ip_vs_update_dest(svc, dest, udest, 1);
914
915         *dest_p = dest;
916
917         LeaveFunction(2);
918         return 0;
919
920 err_alloc:
921         kfree(dest);
922         return -ENOMEM;
923 }
924
925
926 /*
927  *      Add a destination into an existing service
928  */
929 static int
930 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
931 {
932         struct ip_vs_dest *dest;
933         union nf_inet_addr daddr;
934         __be16 dport = udest->port;
935         int ret;
936
937         EnterFunction(2);
938
939         if (udest->weight < 0) {
940                 pr_err("%s(): server weight less than zero\n", __func__);
941                 return -ERANGE;
942         }
943
944         if (udest->l_threshold > udest->u_threshold) {
945                 pr_err("%s(): lower threshold is higher than upper threshold\n",
946                         __func__);
947                 return -ERANGE;
948         }
949
950         ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
951
952         /* We use function that requires RCU lock */
953         rcu_read_lock();
954         dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
955         rcu_read_unlock();
956
957         if (dest != NULL) {
958                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
959                 return -EEXIST;
960         }
961
962         /*
963          * Check if the dest already exists in the trash and
964          * is from the same service
965          */
966         dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
967
968         if (dest != NULL) {
969                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
970                               "dest->refcnt=%d, service %u/%s:%u\n",
971                               IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
972                               atomic_read(&dest->refcnt),
973                               dest->vfwmark,
974                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
975                               ntohs(dest->vport));
976
977                 __ip_vs_update_dest(svc, dest, udest, 1);
978                 ret = 0;
979         } else {
980                 /*
981                  * Allocate and initialize the dest structure
982                  */
983                 ret = ip_vs_new_dest(svc, udest, &dest);
984         }
985         LeaveFunction(2);
986
987         return ret;
988 }
989
990
991 /*
992  *      Edit a destination in the given service
993  */
994 static int
995 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
996 {
997         struct ip_vs_dest *dest;
998         union nf_inet_addr daddr;
999         __be16 dport = udest->port;
1000
1001         EnterFunction(2);
1002
1003         if (udest->weight < 0) {
1004                 pr_err("%s(): server weight less than zero\n", __func__);
1005                 return -ERANGE;
1006         }
1007
1008         if (udest->l_threshold > udest->u_threshold) {
1009                 pr_err("%s(): lower threshold is higher than upper threshold\n",
1010                         __func__);
1011                 return -ERANGE;
1012         }
1013
1014         ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1015
1016         /* We use function that requires RCU lock */
1017         rcu_read_lock();
1018         dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1019         rcu_read_unlock();
1020
1021         if (dest == NULL) {
1022                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1023                 return -ENOENT;
1024         }
1025
1026         __ip_vs_update_dest(svc, dest, udest, 0);
1027         LeaveFunction(2);
1028
1029         return 0;
1030 }
1031
1032 /*
1033  *      Delete a destination (must be already unlinked from the service)
1034  */
1035 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1036                              bool cleanup)
1037 {
1038         ip_vs_stop_estimator(ipvs, &dest->stats);
1039
1040         /*
1041          *  Remove it from the d-linked list with the real services.
1042          */
1043         ip_vs_rs_unhash(dest);
1044
1045         spin_lock_bh(&ipvs->dest_trash_lock);
1046         IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1047                       IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1048                       atomic_read(&dest->refcnt));
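        /* Arm the trash expire timer only when the first entry goes in
         * (and not during netns cleanup); ip_vs_dest_trash_expire()
         * re-arms it for as long as the trash stays non-empty.
         */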
1049         if (list_empty(&ipvs->dest_trash) && !cleanup)
1050                 mod_timer(&ipvs->dest_trash_timer,
1051                           jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1052         /* dest lives in trash without reference */
1053         list_add(&dest->t_list, &ipvs->dest_trash);
1054         dest->idle_start = 0;
1055         spin_unlock_bh(&ipvs->dest_trash_lock);
1056         ip_vs_dest_put(dest);
1057 }
1058
1059
1060 /*
1061  *      Unlink a destination from the given service
1062  */
1063 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1064                                 struct ip_vs_dest *dest,
1065                                 int svcupd)
1066 {
1067         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1068
1069         /*
1070          *  Remove it from the d-linked destination list.
1071          */
1072         list_del_rcu(&dest->n_list);
1073         svc->num_dests--;
1074
1075         if (dest->af != svc->af)
1076                 svc->ipvs->mixed_address_family_dests--;
1077
1078         if (svcupd) {
1079                 struct ip_vs_scheduler *sched;
1080
1081                 sched = rcu_dereference_protected(svc->scheduler, 1);
1082                 if (sched && sched->del_dest)
1083                         sched->del_dest(svc, dest);
1084         }
1085 }
1086
1087
1088 /*
1089  *      Delete a destination server in the given service
1090  */
1091 static int
1092 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1093 {
1094         struct ip_vs_dest *dest;
1095         __be16 dport = udest->port;
1096
1097         EnterFunction(2);
1098
1099         /* We use function that requires RCU lock */
1100         rcu_read_lock();
1101         dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1102         rcu_read_unlock();
1103
1104         if (dest == NULL) {
1105                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1106                 return -ENOENT;
1107         }
1108
1109         /*
1110          *      Unlink dest from the service
1111          */
1112         __ip_vs_unlink_dest(svc, dest, 1);
1113
1114         /*
1115          *      Delete the destination
1116          */
1117         __ip_vs_del_dest(svc->ipvs, dest, false);
1118
1119         LeaveFunction(2);
1120
1121         return 0;
1122 }
1123
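/*
 * Trash expire timer: destinations that are still referenced are skipped;
 * the first pass that finds refcnt 0 only stamps idle_start, and an entry
 * is freed once it has stayed unreferenced for IP_VS_DEST_TRASH_PERIOD.
 */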
1124 static void ip_vs_dest_trash_expire(unsigned long data)
1125 {
1126         struct netns_ipvs *ipvs = (struct netns_ipvs *)data;
1127         struct ip_vs_dest *dest, *next;
1128         unsigned long now = jiffies;
1129
1130         spin_lock(&ipvs->dest_trash_lock);
1131         list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1132                 if (atomic_read(&dest->refcnt) > 0)
1133                         continue;
1134                 if (dest->idle_start) {
1135                         if (time_before(now, dest->idle_start +
1136                                              IP_VS_DEST_TRASH_PERIOD))
1137                                 continue;
1138                 } else {
1139                         dest->idle_start = max(1UL, now);
1140                         continue;
1141                 }
1142                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1143                               dest->vfwmark,
1144                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1145                               ntohs(dest->port));
1146                 list_del(&dest->t_list);
1147                 ip_vs_dest_free(dest);
1148         }
1149         if (!list_empty(&ipvs->dest_trash))
1150                 mod_timer(&ipvs->dest_trash_timer,
1151                           jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1152         spin_unlock(&ipvs->dest_trash_lock);
1153 }
1154
1155 /*
1156  *      Add a service into the service hash table
1157  */
1158 static int
1159 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
1160                   struct ip_vs_service **svc_p)
1161 {
1162         int ret = 0, i;
1163         struct ip_vs_scheduler *sched = NULL;
1164         struct ip_vs_pe *pe = NULL;
1165         struct ip_vs_service *svc = NULL;
1166
1167         /* increase the module use count */
1168         ip_vs_use_count_inc();
1169
1170         /* Lookup the scheduler by 'u->sched_name' */
1171         if (strcmp(u->sched_name, "none")) {
1172                 sched = ip_vs_scheduler_get(u->sched_name);
1173                 if (!sched) {
1174                         pr_info("Scheduler module ip_vs_%s not found\n",
1175                                 u->sched_name);
1176                         ret = -ENOENT;
1177                         goto out_err;
1178                 }
1179         }
1180
1181         if (u->pe_name && *u->pe_name) {
1182                 pe = ip_vs_pe_getbyname(u->pe_name);
1183                 if (pe == NULL) {
1184                         pr_info("persistence engine module ip_vs_pe_%s "
1185                                 "not found\n", u->pe_name);
1186                         ret = -ENOENT;
1187                         goto out_err;
1188                 }
1189         }
1190
1191 #ifdef CONFIG_IP_VS_IPV6
1192         if (u->af == AF_INET6) {
1193                 __u32 plen = (__force __u32) u->netmask;
1194
1195                 if (plen < 1 || plen > 128) {
1196                         ret = -EINVAL;
1197                         goto out_err;
1198                 }
1199         }
1200 #endif
1201
1202         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1203         if (svc == NULL) {
1204                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1205                 ret = -ENOMEM;
1206                 goto out_err;
1207         }
1208         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1209         if (!svc->stats.cpustats) {
1210                 ret = -ENOMEM;
1211                 goto out_err;
1212         }
1213
1214         for_each_possible_cpu(i) {
1215                 struct ip_vs_cpu_stats *ip_vs_stats;
1216                 ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
1217                 u64_stats_init(&ip_vs_stats->syncp);
1218         }
1219
1220
1221         /* I'm the first user of the service */
1222         atomic_set(&svc->refcnt, 0);
1223
1224         svc->af = u->af;
1225         svc->protocol = u->protocol;
1226         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1227         svc->port = u->port;
1228         svc->fwmark = u->fwmark;
1229         svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
1230         svc->timeout = u->timeout * HZ;
1231         svc->netmask = u->netmask;
1232         svc->ipvs = ipvs;
1233
1234         INIT_LIST_HEAD(&svc->destinations);
1235         spin_lock_init(&svc->sched_lock);
1236         spin_lock_init(&svc->stats.lock);
1237
1238         /* Bind the scheduler */
1239         if (sched) {
1240                 ret = ip_vs_bind_scheduler(svc, sched);
1241                 if (ret)
1242                         goto out_err;
1243                 sched = NULL;
1244         }
1245
1246         /* Bind the ct retriever */
1247         RCU_INIT_POINTER(svc->pe, pe);
1248         pe = NULL;
1249
1250         /* Update the virtual service counters */
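        /* ftpsvc_counter and nullsvc_counter let ip_vs_service_find()
         * skip the FTP and port-zero fallback lookups when no such
         * services are configured.
         */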
1251         if (svc->port == FTPPORT)
1252                 atomic_inc(&ipvs->ftpsvc_counter);
1253         else if (svc->port == 0)
1254                 atomic_inc(&ipvs->nullsvc_counter);
1255
1256         ip_vs_start_estimator(ipvs, &svc->stats);
1257
1258         /* Count only IPv4 services for old get/setsockopt interface */
1259         if (svc->af == AF_INET)
1260                 ipvs->num_services++;
1261
1262         /* Hash the service into the service table */
1263         ip_vs_svc_hash(svc);
1264
1265         *svc_p = svc;
1266         /* Now there is a service - full throttle */
1267         ipvs->enable = 1;
1268         return 0;
1269
1270
1271  out_err:
1272         if (svc != NULL) {
1273                 ip_vs_unbind_scheduler(svc, sched);
1274                 ip_vs_service_free(svc);
1275         }
1276         ip_vs_scheduler_put(sched);
1277         ip_vs_pe_put(pe);
1278
1279         /* decrease the module use count */
1280         ip_vs_use_count_dec();
1281
1282         return ret;
1283 }
1284
1285
1286 /*
1287  *      Edit a service and bind it with a new scheduler
1288  */
1289 static int
1290 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1291 {
1292         struct ip_vs_scheduler *sched = NULL, *old_sched;
1293         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1294         int ret = 0;
1295
1296         /*
1297          * Lookup the scheduler, by 'u->sched_name'
1298          */
1299         if (strcmp(u->sched_name, "none")) {
1300                 sched = ip_vs_scheduler_get(u->sched_name);
1301                 if (!sched) {
1302                         pr_info("Scheduler module ip_vs_%s not found\n",
1303                                 u->sched_name);
1304                         return -ENOENT;
1305                 }
1306         }
1307         old_sched = sched;
1308
1309         if (u->pe_name && *u->pe_name) {
1310                 pe = ip_vs_pe_getbyname(u->pe_name);
1311                 if (pe == NULL) {
1312                         pr_info("persistence engine module ip_vs_pe_%s "
1313                                 "not found\n", u->pe_name);
1314                         ret = -ENOENT;
1315                         goto out;
1316                 }
1317                 old_pe = pe;
1318         }
1319
1320 #ifdef CONFIG_IP_VS_IPV6
1321         if (u->af == AF_INET6) {
1322                 __u32 plen = (__force __u32) u->netmask;
1323
1324                 if (plen < 1 || plen > 128) {
1325                         ret = -EINVAL;
1326                         goto out;
1327                 }
1328         }
1329 #endif
1330
1331         old_sched = rcu_dereference_protected(svc->scheduler, 1);
1332         if (sched != old_sched) {
1333                 if (old_sched) {
1334                         ip_vs_unbind_scheduler(svc, old_sched);
1335                         RCU_INIT_POINTER(svc->scheduler, NULL);
1336                         /* Wait all svc->sched_data users */
1337                         synchronize_rcu();
1338                 }
1339                 /* Bind the new scheduler */
1340                 if (sched) {
1341                         ret = ip_vs_bind_scheduler(svc, sched);
1342                         if (ret) {
1343                                 ip_vs_scheduler_put(sched);
1344                                 goto out;
1345                         }
1346                 }
1347         }
1348
1349         /*
1350          * Set the flags and timeout value
1351          */
1352         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1353         svc->timeout = u->timeout * HZ;
1354         svc->netmask = u->netmask;
1355
1356         old_pe = rcu_dereference_protected(svc->pe, 1);
1357         if (pe != old_pe)
1358                 rcu_assign_pointer(svc->pe, pe);
1359
1360 out:
1361         ip_vs_scheduler_put(old_sched);
1362         ip_vs_pe_put(old_pe);
1363         return ret;
1364 }
1365
1366 /*
1367  *      Delete a service from the service list
1368  *      - The service must be unlinked, unlocked and not referenced!
1369  *      - We are called under _bh lock
1370  */
1371 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1372 {
1373         struct ip_vs_dest *dest, *nxt;
1374         struct ip_vs_scheduler *old_sched;
1375         struct ip_vs_pe *old_pe;
1376         struct netns_ipvs *ipvs = svc->ipvs;
1377
1378         pr_info("%s: enter\n", __func__);
1379
1380         /* Count only IPv4 services for old get/setsockopt interface */
1381         if (svc->af == AF_INET)
1382                 ipvs->num_services--;
1383
1384         ip_vs_stop_estimator(svc->ipvs, &svc->stats);
1385
1386         /* Unbind scheduler */
1387         old_sched = rcu_dereference_protected(svc->scheduler, 1);
1388         ip_vs_unbind_scheduler(svc, old_sched);
1389         ip_vs_scheduler_put(old_sched);
1390
1391         /* Unbind persistence engine, keep svc->pe */
1392         old_pe = rcu_dereference_protected(svc->pe, 1);
1393         ip_vs_pe_put(old_pe);
1394
1395         /*
1396          *    Unlink the whole destination list
1397          */
1398         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1399                 __ip_vs_unlink_dest(svc, dest, 0);
1400                 __ip_vs_del_dest(svc->ipvs, dest, cleanup);
1401         }
1402
1403         /*
1404          *    Update the virtual service counters
1405          */
1406         if (svc->port == FTPPORT)
1407                 atomic_dec(&ipvs->ftpsvc_counter);
1408         else if (svc->port == 0)
1409                 atomic_dec(&ipvs->nullsvc_counter);
1410
1411         /*
1412          *    Free the service if nobody refers to it
1413          */
1414         __ip_vs_svc_put(svc, true);
1415
1416         /* decrease the module use count */
1417         ip_vs_use_count_dec();
1418 }
1419
1420 /*
1421  * Unlink a service from list and try to delete it if its refcnt reached 0
1422  */
1423 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
1424 {
1425         /* Hold svc to avoid double release from dest_trash */
1426         atomic_inc(&svc->refcnt);
1427         /*
1428          * Unhash it from the service table
1429          */
1430         ip_vs_svc_unhash(svc);
1431
1432         __ip_vs_del_service(svc, cleanup);
1433 }
1434
1435 /*
1436  *      Delete a service from the service list
1437  */
1438 static int ip_vs_del_service(struct ip_vs_service *svc)
1439 {
1440         if (svc == NULL)
1441                 return -EEXIST;
1442         ip_vs_unlink_service(svc, false);
1443
1444         return 0;
1445 }
1446
1447
1448 /*
1449  *      Flush all the virtual services
1450  */
1451 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
1452 {
1453         int idx;
1454         struct ip_vs_service *svc;
1455         struct hlist_node *n;
1456
1457         /*
1458          * Flush the service table hashed by <netns,protocol,addr,port>
1459          */
1460         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1461                 hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
1462                                           s_list) {
1463                         if (svc->ipvs == ipvs)
1464                                 ip_vs_unlink_service(svc, cleanup);
1465                 }
1466         }
1467
1468         /*
1469          * Flush the service table hashed by fwmark
1470          */
1471         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1472                 hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
1473                                           f_list) {
1474                         if (svc->ipvs == ipvs)
1475                                 ip_vs_unlink_service(svc, cleanup);
1476                 }
1477         }
1478
1479         return 0;
1480 }
1481
1482 /*
1483  *      Delete service by {netns} in the service table.
1484  *      Called by __ip_vs_cleanup()
1485  */
1486 void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
1487 {
1488         EnterFunction(2);
1489         /* Flush all services configured in this netns */
1490         mutex_lock(&__ip_vs_mutex);
1491         ip_vs_flush(ipvs, true);
1492         mutex_unlock(&__ip_vs_mutex);
1493         LeaveFunction(2);
1494 }
1495
1496 /* Put all references for device (dst_cache) */
1497 static inline void
1498 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1499 {
1500         struct ip_vs_dest_dst *dest_dst;
1501
1502         spin_lock_bh(&dest->dst_lock);
1503         dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
1504         if (dest_dst && dest_dst->dst_cache->dev == dev) {
1505                 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1506                               dev->name,
1507                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1508                               ntohs(dest->port),
1509                               atomic_read(&dest->refcnt));
1510                 __ip_vs_dst_cache_reset(dest);
1511         }
1512         spin_unlock_bh(&dest->dst_lock);
1513
1514 }
1515 /* Netdev event receiver
1516  * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1517  */
1518 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1519                            void *ptr)
1520 {
1521         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1522         struct net *net = dev_net(dev);
1523         struct netns_ipvs *ipvs = net_ipvs(net);
1524         struct ip_vs_service *svc;
1525         struct ip_vs_dest *dest;
1526         unsigned int idx;
1527
1528         if (event != NETDEV_DOWN || !ipvs)
1529                 return NOTIFY_DONE;
1530         IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1531         EnterFunction(2);
1532         mutex_lock(&__ip_vs_mutex);
1533         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1534                 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1535                         if (svc->ipvs == ipvs) {
1536                                 list_for_each_entry(dest, &svc->destinations,
1537                                                     n_list) {
1538                                         ip_vs_forget_dev(dest, dev);
1539                                 }
1540                         }
1541                 }
1542
1543                 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1544                         if (svc->ipvs == ipvs) {
1545                                 list_for_each_entry(dest, &svc->destinations,
1546                                                     n_list) {
1547                                         ip_vs_forget_dev(dest, dev);
1548                                 }
1549                         }
1550
1551                 }
1552         }
1553
1554         spin_lock_bh(&ipvs->dest_trash_lock);
1555         list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1556                 ip_vs_forget_dev(dest, dev);
1557         }
1558         spin_unlock_bh(&ipvs->dest_trash_lock);
1559         mutex_unlock(&__ip_vs_mutex);
1560         LeaveFunction(2);
1561         return NOTIFY_DONE;
1562 }
1563
1564 /*
1565  *      Zero counters in a service or all services
1566  */
1567 static int ip_vs_zero_service(struct ip_vs_service *svc)
1568 {
1569         struct ip_vs_dest *dest;
1570
1571         list_for_each_entry(dest, &svc->destinations, n_list) {
1572                 ip_vs_zero_stats(&dest->stats);
1573         }
1574         ip_vs_zero_stats(&svc->stats);
1575         return 0;
1576 }
1577
1578 static int ip_vs_zero_all(struct netns_ipvs *ipvs)
1579 {
1580         int idx;
1581         struct ip_vs_service *svc;
1582
1583         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1584                 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1585                         if (svc->ipvs == ipvs)
1586                                 ip_vs_zero_service(svc);
1587                 }
1588         }
1589
1590         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1591                 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1592                         if (svc->ipvs == ipvs)
1593                                 ip_vs_zero_service(svc);
1594                 }
1595         }
1596
1597         ip_vs_zero_stats(&ipvs->tot_stats);
1598         return 0;
1599 }
1600
1601 #ifdef CONFIG_SYSCTL
1602
1603 static int zero;
1604 static int three = 3;
1605
1606 static int
1607 proc_do_defense_mode(struct ctl_table *table, int write,
1608                      void __user *buffer, size_t *lenp, loff_t *ppos)
1609 {
1610         struct netns_ipvs *ipvs = table->extra2;
1611         int *valp = table->data;
1612         int val = *valp;
1613         int rc;
1614
1615         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1616         if (write && (*valp != val)) {
1617                 if ((*valp < 0) || (*valp > 3)) {
1618                         /* Restore the correct value */
1619                         *valp = val;
1620                 } else {
1621                         update_defense_level(ipvs);
1622                 }
1623         }
1624         return rc;
1625 }
1626
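     /* Handler for sync_threshold, which is a (threshold, period) pair:
      * reject writes where either value is negative or where
      * threshold >= period while the period is non-zero.
      */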
1627 static int
1628 proc_do_sync_threshold(struct ctl_table *table, int write,
1629                        void __user *buffer, size_t *lenp, loff_t *ppos)
1630 {
1631         int *valp = table->data;
1632         int val[2];
1633         int rc;
1634
1635         /* backup the value first */
1636         memcpy(val, valp, sizeof(val));
1637
1638         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1639         if (write && (valp[0] < 0 || valp[1] < 0 ||
1640             (valp[0] >= valp[1] && valp[1]))) {
1641                 /* Restore the correct value */
1642                 memcpy(valp, val, sizeof(val));
1643         }
1644         return rc;
1645 }
1646
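     /* Handler for sync_version: only the values 0 and 1 are accepted,
      * anything else restores the previous setting.
      */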
1647 static int
1648 proc_do_sync_mode(struct ctl_table *table, int write,
1649                      void __user *buffer, size_t *lenp, loff_t *ppos)
1650 {
1651         int *valp = table->data;
1652         int val = *valp;
1653         int rc;
1654
1655         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1656         if (write && (*valp != val)) {
1657                 if ((*valp < 0) || (*valp > 1)) {
1658                         /* Restore the correct value */
1659                         *valp = val;
1660                 }
1661         }
1662         return rc;
1663 }
1664
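     /* Handler for sync_ports: the value must be a power of two and at
      * least 1, otherwise the previous setting is restored.
      */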
1665 static int
1666 proc_do_sync_ports(struct ctl_table *table, int write,
1667                    void __user *buffer, size_t *lenp, loff_t *ppos)
1668 {
1669         int *valp = table->data;
1670         int val = *valp;
1671         int rc;
1672
1673         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1674         if (write && (*valp != val)) {
1675                 if (*valp < 1 || !is_power_of_2(*valp)) {
1676                         /* Restore the correct value */
1677                         *valp = val;
1678                 }
1679         }
1680         return rc;
1681 }
1682
1683 /*
1684  *      IPVS sysctl table (under /proc/sys/net/ipv4/vs/).
1685  *      Do not change the order or insert new entries without
1686  *      aligning with the netns init in ip_vs_control_net_init().
1687  */
1688
1689 static struct ctl_table vs_vars[] = {
1690         {
1691                 .procname       = "amemthresh",
1692                 .maxlen         = sizeof(int),
1693                 .mode           = 0644,
1694                 .proc_handler   = proc_dointvec,
1695         },
1696         {
1697                 .procname       = "am_droprate",
1698                 .maxlen         = sizeof(int),
1699                 .mode           = 0644,
1700                 .proc_handler   = proc_dointvec,
1701         },
1702         {
1703                 .procname       = "drop_entry",
1704                 .maxlen         = sizeof(int),
1705                 .mode           = 0644,
1706                 .proc_handler   = proc_do_defense_mode,
1707         },
1708         {
1709                 .procname       = "drop_packet",
1710                 .maxlen         = sizeof(int),
1711                 .mode           = 0644,
1712                 .proc_handler   = proc_do_defense_mode,
1713         },
1714 #ifdef CONFIG_IP_VS_NFCT
1715         {
1716                 .procname       = "conntrack",
1717                 .maxlen         = sizeof(int),
1718                 .mode           = 0644,
1719                 .proc_handler   = &proc_dointvec,
1720         },
1721 #endif
1722         {
1723                 .procname       = "secure_tcp",
1724                 .maxlen         = sizeof(int),
1725                 .mode           = 0644,
1726                 .proc_handler   = proc_do_defense_mode,
1727         },
1728         {
1729                 .procname       = "snat_reroute",
1730                 .maxlen         = sizeof(int),
1731                 .mode           = 0644,
1732                 .proc_handler   = &proc_dointvec,
1733         },
1734         {
1735                 .procname       = "sync_version",
1736                 .maxlen         = sizeof(int),
1737                 .mode           = 0644,
1738                 .proc_handler   = &proc_do_sync_mode,
1739         },
1740         {
1741                 .procname       = "sync_ports",
1742                 .maxlen         = sizeof(int),
1743                 .mode           = 0644,
1744                 .proc_handler   = &proc_do_sync_ports,
1745         },
1746         {
1747                 .procname       = "sync_persist_mode",
1748                 .maxlen         = sizeof(int),
1749                 .mode           = 0644,
1750                 .proc_handler   = proc_dointvec,
1751         },
1752         {
1753                 .procname       = "sync_qlen_max",
1754                 .maxlen         = sizeof(unsigned long),
1755                 .mode           = 0644,
1756                 .proc_handler   = proc_doulongvec_minmax,
1757         },
1758         {
1759                 .procname       = "sync_sock_size",
1760                 .maxlen         = sizeof(int),
1761                 .mode           = 0644,
1762                 .proc_handler   = proc_dointvec,
1763         },
1764         {
1765                 .procname       = "cache_bypass",
1766                 .maxlen         = sizeof(int),
1767                 .mode           = 0644,
1768                 .proc_handler   = proc_dointvec,
1769         },
1770         {
1771                 .procname       = "expire_nodest_conn",
1772                 .maxlen         = sizeof(int),
1773                 .mode           = 0644,
1774                 .proc_handler   = proc_dointvec,
1775         },
1776         {
1777                 .procname       = "sloppy_tcp",
1778                 .maxlen         = sizeof(int),
1779                 .mode           = 0644,
1780                 .proc_handler   = proc_dointvec,
1781         },
1782         {
1783                 .procname       = "sloppy_sctp",
1784                 .maxlen         = sizeof(int),
1785                 .mode           = 0644,
1786                 .proc_handler   = proc_dointvec,
1787         },
1788         {
1789                 .procname       = "expire_quiescent_template",
1790                 .maxlen         = sizeof(int),
1791                 .mode           = 0644,
1792                 .proc_handler   = proc_dointvec,
1793         },
1794         {
1795                 .procname       = "sync_threshold",
1796                 .maxlen         =
1797                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1798                 .mode           = 0644,
1799                 .proc_handler   = proc_do_sync_threshold,
1800         },
1801         {
1802                 .procname       = "sync_refresh_period",
1803                 .maxlen         = sizeof(int),
1804                 .mode           = 0644,
1805                 .proc_handler   = proc_dointvec_jiffies,
1806         },
1807         {
1808                 .procname       = "sync_retries",
1809                 .maxlen         = sizeof(int),
1810                 .mode           = 0644,
1811                 .proc_handler   = proc_dointvec_minmax,
1812                 .extra1         = &zero,
1813                 .extra2         = &three,
1814         },
1815         {
1816                 .procname       = "nat_icmp_send",
1817                 .maxlen         = sizeof(int),
1818                 .mode           = 0644,
1819                 .proc_handler   = proc_dointvec,
1820         },
1821         {
1822                 .procname       = "pmtu_disc",
1823                 .maxlen         = sizeof(int),
1824                 .mode           = 0644,
1825                 .proc_handler   = proc_dointvec,
1826         },
1827         {
1828                 .procname       = "backup_only",
1829                 .maxlen         = sizeof(int),
1830                 .mode           = 0644,
1831                 .proc_handler   = proc_dointvec,
1832         },
1833         {
1834                 .procname       = "conn_reuse_mode",
1835                 .maxlen         = sizeof(int),
1836                 .mode           = 0644,
1837                 .proc_handler   = proc_dointvec,
1838         },
1839         {
1840                 .procname       = "schedule_icmp",
1841                 .maxlen         = sizeof(int),
1842                 .mode           = 0644,
1843                 .proc_handler   = proc_dointvec,
1844         },
1845         {
1846                 .procname       = "ignore_tunneled",
1847                 .maxlen         = sizeof(int),
1848                 .mode           = 0644,
1849                 .proc_handler   = proc_dointvec,
1850         },
1851 #ifdef CONFIG_IP_VS_DEBUG
1852         {
1853                 .procname       = "debug_level",
1854                 .data           = &sysctl_ip_vs_debug_level,
1855                 .maxlen         = sizeof(int),
1856                 .mode           = 0644,
1857                 .proc_handler   = proc_dointvec,
1858         },
1859 #endif
1860         { }
1861 };
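     /* These knobs are tuned at run time through procfs, e.g.:
      *
      *   cat /proc/sys/net/ipv4/vs/sync_version
      *   echo 1 > /proc/sys/net/ipv4/vs/expire_nodest_conn
      */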
1862
1863 #endif
1864
1865 #ifdef CONFIG_PROC_FS
1866
1867 struct ip_vs_iter {
1868         struct seq_net_private p;  /* Do not move this, netns depends upon it */
1869         struct hlist_head *table;
1870         int bucket;
1871 };
1872
1873 /*
1874  *      Write the contents of the VS rule table to a PROCfs file.
1875  *      (It is kept just for backward compatibility)
1876  */
1877 static inline const char *ip_vs_fwd_name(unsigned int flags)
1878 {
1879         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1880         case IP_VS_CONN_F_LOCALNODE:
1881                 return "Local";
1882         case IP_VS_CONN_F_TUNNEL:
1883                 return "Tunnel";
1884         case IP_VS_CONN_F_DROUTE:
1885                 return "Route";
1886         default:
1887                 return "Masq";
1888         }
1889 }
1890
1891
1892 /* Get the Nth entry in the two lists */
1893 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1894 {
1895         struct net *net = seq_file_net(seq);
1896         struct netns_ipvs *ipvs = net_ipvs(net);
1897         struct ip_vs_iter *iter = seq->private;
1898         int idx;
1899         struct ip_vs_service *svc;
1900
1901         /* look in the table hashed by protocol */
1902         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1903                 hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
1904                         if ((svc->ipvs == ipvs) && pos-- == 0) {
1905                                 iter->table = ip_vs_svc_table;
1906                                 iter->bucket = idx;
1907                                 return svc;
1908                         }
1909                 }
1910         }
1911
1912         /* keep looking in the fwmark table */
1913         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1914                 hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
1915                                          f_list) {
1916                         if ((svc->ipvs == ipvs) && pos-- == 0) {
1917                                 iter->table = ip_vs_svc_fwm_table;
1918                                 iter->bucket = idx;
1919                                 return svc;
1920                         }
1921                 }
1922         }
1923
1924         return NULL;
1925 }
1926
1927 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1928         __acquires(RCU)
1929 {
1930         rcu_read_lock();
1931         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1932 }
1933
1934
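     /* Advance the iterator: finish walking the table hashed by protocol,
      * then fall through and continue in the table hashed by fwmark.
      */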
1935 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1936 {
1937         struct hlist_node *e;
1938         struct ip_vs_iter *iter;
1939         struct ip_vs_service *svc;
1940
1941         ++*pos;
1942         if (v == SEQ_START_TOKEN)
1943                 return ip_vs_info_array(seq,0);
1944
1945         svc = v;
1946         iter = seq->private;
1947
1948         if (iter->table == ip_vs_svc_table) {
1949                 /* next service in table hashed by protocol */
1950                 e = rcu_dereference(hlist_next_rcu(&svc->s_list));
1951                 if (e)
1952                         return hlist_entry(e, struct ip_vs_service, s_list);
1953
1954                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1955                         hlist_for_each_entry_rcu(svc,
1956                                                  &ip_vs_svc_table[iter->bucket],
1957                                                  s_list) {
1958                                 return svc;
1959                         }
1960                 }
1961
1962                 iter->table = ip_vs_svc_fwm_table;
1963                 iter->bucket = -1;
1964                 goto scan_fwmark;
1965         }
1966
1967         /* next service in table hashed by fwmark */
1968         e = rcu_dereference(hlist_next_rcu(&svc->f_list));
1969         if (e)
1970                 return hlist_entry(e, struct ip_vs_service, f_list);
1971
1972  scan_fwmark:
1973         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1974                 hlist_for_each_entry_rcu(svc,
1975                                          &ip_vs_svc_fwm_table[iter->bucket],
1976                                          f_list)
1977                         return svc;
1978         }
1979
1980         return NULL;
1981 }
1982
1983 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1984         __releases(RCU)
1985 {
1986         rcu_read_unlock();
1987 }
1988
1989
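     /* Print one service and all of its destinations in the legacy
      * /proc/net/ip_vs format (addresses and ports in hex).
      */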
1990 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1991 {
1992         if (v == SEQ_START_TOKEN) {
1993                 seq_printf(seq,
1994                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1995                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1996                 seq_puts(seq,
1997                          "Prot LocalAddress:Port Scheduler Flags\n");
1998                 seq_puts(seq,
1999                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2000         } else {
2001                 struct net *net = seq_file_net(seq);
2002                 struct netns_ipvs *ipvs = net_ipvs(net);
2003                 const struct ip_vs_service *svc = v;
2004                 const struct ip_vs_iter *iter = seq->private;
2005                 const struct ip_vs_dest *dest;
2006                 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
2007                 char *sched_name = sched ? sched->name : "none";
2008
2009                 if (svc->ipvs != ipvs)
2010                         return 0;
2011                 if (iter->table == ip_vs_svc_table) {
2012 #ifdef CONFIG_IP_VS_IPV6
2013                         if (svc->af == AF_INET6)
2014                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
2015                                            ip_vs_proto_name(svc->protocol),
2016                                            &svc->addr.in6,
2017                                            ntohs(svc->port),
2018                                            sched_name);
2019                         else
2020 #endif
2021                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
2022                                            ip_vs_proto_name(svc->protocol),
2023                                            ntohl(svc->addr.ip),
2024                                            ntohs(svc->port),
2025                                            sched_name,
2026                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2027                 } else {
2028                         seq_printf(seq, "FWM  %08X %s %s",
2029                                    svc->fwmark, sched_name,
2030                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2031                 }
2032
2033                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2034                         seq_printf(seq, "persistent %d %08X\n",
2035                                 svc->timeout,
2036                                 ntohl(svc->netmask));
2037                 else
2038                         seq_putc(seq, '\n');
2039
2040                 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
2041 #ifdef CONFIG_IP_VS_IPV6
2042                         if (dest->af == AF_INET6)
2043                                 seq_printf(seq,
2044                                            "  -> [%pI6]:%04X"
2045                                            "      %-7s %-6d %-10d %-10d\n",
2046                                            &dest->addr.in6,
2047                                            ntohs(dest->port),
2048                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2049                                            atomic_read(&dest->weight),
2050                                            atomic_read(&dest->activeconns),
2051                                            atomic_read(&dest->inactconns));
2052                         else
2053 #endif
2054                                 seq_printf(seq,
2055                                            "  -> %08X:%04X      "
2056                                            "%-7s %-6d %-10d %-10d\n",
2057                                            ntohl(dest->addr.ip),
2058                                            ntohs(dest->port),
2059                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2060                                            atomic_read(&dest->weight),
2061                                            atomic_read(&dest->activeconns),
2062                                            atomic_read(&dest->inactconns));
2063
2064                 }
2065         }
2066         return 0;
2067 }
2068
2069 static const struct seq_operations ip_vs_info_seq_ops = {
2070         .start = ip_vs_info_seq_start,
2071         .next  = ip_vs_info_seq_next,
2072         .stop  = ip_vs_info_seq_stop,
2073         .show  = ip_vs_info_seq_show,
2074 };
2075
2076 static int ip_vs_info_open(struct inode *inode, struct file *file)
2077 {
2078         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2079                         sizeof(struct ip_vs_iter));
2080 }
2081
2082 static const struct file_operations ip_vs_info_fops = {
2083         .owner   = THIS_MODULE,
2084         .open    = ip_vs_info_open,
2085         .read    = seq_read,
2086         .llseek  = seq_lseek,
2087         .release = seq_release_net,
2088 };
2089
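     /* Show the netns-wide totals and estimated rates as hex columns;
      * this backs the ip_vs_stats procfs file.
      */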
2090 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2091 {
2092         struct net *net = seq_file_single_net(seq);
2093         struct ip_vs_kstats show;
2094
2095 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2096         seq_puts(seq,
2097                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
2098         seq_printf(seq,
2099                    "   Conns  Packets  Packets            Bytes            Bytes\n");
2100
2101         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2102         seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
2103                    (unsigned long long)show.conns,
2104                    (unsigned long long)show.inpkts,
2105                    (unsigned long long)show.outpkts,
2106                    (unsigned long long)show.inbytes,
2107                    (unsigned long long)show.outbytes);
2108
2109 /*                01234567 01234567 01234567 0123456701234567 0123456701234567*/
2110         seq_puts(seq,
2111                  " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2112         seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
2113                    (unsigned long long)show.cps,
2114                    (unsigned long long)show.inpps,
2115                    (unsigned long long)show.outpps,
2116                    (unsigned long long)show.inbps,
2117                    (unsigned long long)show.outbps);
2118
2119         return 0;
2120 }
2121
2122 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2123 {
2124         return single_open_net(inode, file, ip_vs_stats_show);
2125 }
2126
2127 static const struct file_operations ip_vs_stats_fops = {
2128         .owner = THIS_MODULE,
2129         .open = ip_vs_stats_seq_open,
2130         .read = seq_read,
2131         .llseek = seq_lseek,
2132         .release = single_release_net,
2133 };
2134
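     /* Like ip_vs_stats_show() but with one line per possible CPU; each
      * CPU's counters are sampled under its u64_stats seqcount so a
      * consistent snapshot is printed.
      */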
2135 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2136 {
2137         struct net *net = seq_file_single_net(seq);
2138         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2139         struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
2140         struct ip_vs_kstats kstats;
2141         int i;
2142
2143 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2144         seq_puts(seq,
2145                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2146         seq_printf(seq,
2147                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2148
2149         for_each_possible_cpu(i) {
2150                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2151                 unsigned int start;
2152                 u64 conns, inpkts, outpkts, inbytes, outbytes;
2153
2154                 do {
2155                         start = u64_stats_fetch_begin_irq(&u->syncp);
2156                         conns = u->cnt.conns;
2157                         inpkts = u->cnt.inpkts;
2158                         outpkts = u->cnt.outpkts;
2159                         inbytes = u->cnt.inbytes;
2160                         outbytes = u->cnt.outbytes;
2161                 } while (u64_stats_fetch_retry_irq(&u->syncp, start));
2162
2163                 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
2164                            i, (u64)conns, (u64)inpkts,
2165                            (u64)outpkts, (u64)inbytes,
2166                            (u64)outbytes);
2167         }
2168
2169         ip_vs_copy_stats(&kstats, tot_stats);
2170
2171         seq_printf(seq, "  ~ %8LX %8LX %8LX %16LX %16LX\n\n",
2172                    (unsigned long long)kstats.conns,
2173                    (unsigned long long)kstats.inpkts,
2174                    (unsigned long long)kstats.outpkts,
2175                    (unsigned long long)kstats.inbytes,
2176                    (unsigned long long)kstats.outbytes);
2177
2178 /*                ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2179         seq_puts(seq,
2180                  "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2181         seq_printf(seq, "    %8LX %8LX %8LX %16LX %16LX\n",
2182                    kstats.cps,
2183                    kstats.inpps,
2184                    kstats.outpps,
2185                    kstats.inbps,
2186                    kstats.outbps);
2187
2188         return 0;
2189 }
2190
2191 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2192 {
2193         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2194 }
2195
2196 static const struct file_operations ip_vs_stats_percpu_fops = {
2197         .owner = THIS_MODULE,
2198         .open = ip_vs_stats_percpu_seq_open,
2199         .read = seq_read,
2200         .llseek = seq_lseek,
2201         .release = single_release_net,
2202 };
2203 #endif
2204
2205 /*
2206  *      Set timeout values for tcp, tcpfin and udp in the timeout_table.
2207  */
2208 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
2209 {
2210 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2211         struct ip_vs_proto_data *pd;
2212 #endif
2213
2214         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2215                   u->tcp_timeout,
2216                   u->tcp_fin_timeout,
2217                   u->udp_timeout);
2218
2219 #ifdef CONFIG_IP_VS_PROTO_TCP
2220         if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
2221             u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
2222                 return -EINVAL;
2223         }
2224 #endif
2225
2226 #ifdef CONFIG_IP_VS_PROTO_UDP
2227         if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
2228                 return -EINVAL;
2229 #endif
2230
2231 #ifdef CONFIG_IP_VS_PROTO_TCP
2232         if (u->tcp_timeout) {
2233                 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2234                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2235                         = u->tcp_timeout * HZ;
2236         }
2237
2238         if (u->tcp_fin_timeout) {
2239                 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2240                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2241                         = u->tcp_fin_timeout * HZ;
2242         }
2243 #endif
2244
2245 #ifdef CONFIG_IP_VS_PROTO_UDP
2246         if (u->udp_timeout) {
2247                 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
2248                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2249                         = u->udp_timeout * HZ;
2250         }
2251 #endif
2252         return 0;
2253 }
2254
2255 #define CMDID(cmd)              (cmd - IP_VS_BASE_CTL)
2256
2257 struct ip_vs_svcdest_user {
2258         struct ip_vs_service_user       s;
2259         struct ip_vs_dest_user          d;
2260 };
2261
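     /* Expected argument length for each IP_VS_SO_SET_* command;
      * do_ip_vs_set_ctl() rejects requests whose length does not match.
      */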
2262 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
2263         [CMDID(IP_VS_SO_SET_ADD)]         = sizeof(struct ip_vs_service_user),
2264         [CMDID(IP_VS_SO_SET_EDIT)]        = sizeof(struct ip_vs_service_user),
2265         [CMDID(IP_VS_SO_SET_DEL)]         = sizeof(struct ip_vs_service_user),
2266         [CMDID(IP_VS_SO_SET_ADDDEST)]     = sizeof(struct ip_vs_svcdest_user),
2267         [CMDID(IP_VS_SO_SET_DELDEST)]     = sizeof(struct ip_vs_svcdest_user),
2268         [CMDID(IP_VS_SO_SET_EDITDEST)]    = sizeof(struct ip_vs_svcdest_user),
2269         [CMDID(IP_VS_SO_SET_TIMEOUT)]     = sizeof(struct ip_vs_timeout_user),
2270         [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
2271         [CMDID(IP_VS_SO_SET_STOPDAEMON)]  = sizeof(struct ip_vs_daemon_user),
2272         [CMDID(IP_VS_SO_SET_ZERO)]        = sizeof(struct ip_vs_service_user),
2273 };
2274
2275 union ip_vs_set_arglen {
2276         struct ip_vs_service_user       field_IP_VS_SO_SET_ADD;
2277         struct ip_vs_service_user       field_IP_VS_SO_SET_EDIT;
2278         struct ip_vs_service_user       field_IP_VS_SO_SET_DEL;
2279         struct ip_vs_svcdest_user       field_IP_VS_SO_SET_ADDDEST;
2280         struct ip_vs_svcdest_user       field_IP_VS_SO_SET_DELDEST;
2281         struct ip_vs_svcdest_user       field_IP_VS_SO_SET_EDITDEST;
2282         struct ip_vs_timeout_user       field_IP_VS_SO_SET_TIMEOUT;
2283         struct ip_vs_daemon_user        field_IP_VS_SO_SET_STARTDAEMON;
2284         struct ip_vs_daemon_user        field_IP_VS_SO_SET_STOPDAEMON;
2285         struct ip_vs_service_user       field_IP_VS_SO_SET_ZERO;
2286 };
2287
2288 #define MAX_SET_ARGLEN  sizeof(union ip_vs_set_arglen)
2289
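     /* Convert the legacy IPv4-only sockopt structures into the
      * address-family aware representations used internally.
      */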
2290 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2291                                   struct ip_vs_service_user *usvc_compat)
2292 {
2293         memset(usvc, 0, sizeof(*usvc));
2294
2295         usvc->af                = AF_INET;
2296         usvc->protocol          = usvc_compat->protocol;
2297         usvc->addr.ip           = usvc_compat->addr;
2298         usvc->port              = usvc_compat->port;
2299         usvc->fwmark            = usvc_compat->fwmark;
2300
2301         /* Deep copy of sched_name is not needed here */
2302         usvc->sched_name        = usvc_compat->sched_name;
2303
2304         usvc->flags             = usvc_compat->flags;
2305         usvc->timeout           = usvc_compat->timeout;
2306         usvc->netmask           = usvc_compat->netmask;
2307 }
2308
2309 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2310                                    struct ip_vs_dest_user *udest_compat)
2311 {
2312         memset(udest, 0, sizeof(*udest));
2313
2314         udest->addr.ip          = udest_compat->addr;
2315         udest->port             = udest_compat->port;
2316         udest->conn_flags       = udest_compat->conn_flags;
2317         udest->weight           = udest_compat->weight;
2318         udest->u_threshold      = udest_compat->u_threshold;
2319         udest->l_threshold      = udest_compat->l_threshold;
2320         udest->af               = AF_INET;
2321 }
2322
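     /* setsockopt() backend for the legacy IP_VS_SO_SET_* commands.
      * Needs CAP_NET_ADMIN; daemon start/stop uses its own locking,
      * all other commands run under __ip_vs_mutex.
      */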
2323 static int
2324 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2325 {
2326         struct net *net = sock_net(sk);
2327         int ret;
2328         unsigned char arg[MAX_SET_ARGLEN];
2329         struct ip_vs_service_user *usvc_compat;
2330         struct ip_vs_service_user_kern usvc;
2331         struct ip_vs_service *svc;
2332         struct ip_vs_dest_user *udest_compat;
2333         struct ip_vs_dest_user_kern udest;
2334         struct netns_ipvs *ipvs = net_ipvs(net);
2335
2336         BUILD_BUG_ON(sizeof(arg) > 255);
2337         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2338                 return -EPERM;
2339
2340         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2341                 return -EINVAL;
2342         if (len != set_arglen[CMDID(cmd)]) {
2343                 IP_VS_DBG(1, "set_ctl: len %u != %u\n",
2344                           len, set_arglen[CMDID(cmd)]);
2345                 return -EINVAL;
2346         }
2347
2348         if (copy_from_user(arg, user, len) != 0)
2349                 return -EFAULT;
2350
2351         /* increase the module use count */
2352         ip_vs_use_count_inc();
2353
2354         /* Handle daemons since they have another lock */
2355         if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2356             cmd == IP_VS_SO_SET_STOPDAEMON) {
2357                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2358
2359                 if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2360                         struct ipvs_sync_daemon_cfg cfg;
2361
2362                         memset(&cfg, 0, sizeof(cfg));
2363                         ret = -EINVAL;
2364                         if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
2365                                     sizeof(cfg.mcast_ifn)) <= 0)
2366                                 goto out_dec;
2367                         cfg.syncid = dm->syncid;
2368                         ret = start_sync_thread(ipvs, &cfg, dm->state);
2369                 } else {
2370                         mutex_lock(&ipvs->sync_mutex);
2371                         ret = stop_sync_thread(ipvs, dm->state);
2372                         mutex_unlock(&ipvs->sync_mutex);
2373                 }
2374                 goto out_dec;
2375         }
2376
2377         mutex_lock(&__ip_vs_mutex);
2378         if (cmd == IP_VS_SO_SET_FLUSH) {
2379                 /* Flush the virtual service */
2380                 ret = ip_vs_flush(ipvs, false);
2381                 goto out_unlock;
2382         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2383                 /* Set timeout values for (tcp tcpfin udp) */
2384                 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
2385                 goto out_unlock;
2386         } else if (!len) {
2387                 /* No more commands with len == 0 below */
2388                 ret = -EINVAL;
2389                 goto out_unlock;
2390         }
2391
2392         usvc_compat = (struct ip_vs_service_user *)arg;
2393         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2394
2395         /* We only use the new structs internally, so copy userspace compat
2396          * structs to extended internal versions */
2397         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2398         ip_vs_copy_udest_compat(&udest, udest_compat);
2399
2400         if (cmd == IP_VS_SO_SET_ZERO) {
2401                 /* if no service address is set, zero counters in all services */
2402                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2403                         ret = ip_vs_zero_all(ipvs);
2404                         goto out_unlock;
2405                 }
2406         }
2407
2408         if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
2409             strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
2410             IP_VS_SCHEDNAME_MAXLEN) {
2411                 ret = -EINVAL;
2412                 goto out_unlock;
2413         }
2414
2415         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2416         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2417             usvc.protocol != IPPROTO_SCTP) {
2418                 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
2419                        usvc.protocol, &usvc.addr.ip,
2420                        ntohs(usvc.port));
2421                 ret = -EFAULT;
2422                 goto out_unlock;
2423         }
2424
2425         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2426         rcu_read_lock();
2427         if (usvc.fwmark == 0)
2428                 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
2429                                            &usvc.addr, usvc.port);
2430         else
2431                 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
2432         rcu_read_unlock();
2433
2434         if (cmd != IP_VS_SO_SET_ADD
2435             && (svc == NULL || svc->protocol != usvc.protocol)) {
2436                 ret = -ESRCH;
2437                 goto out_unlock;
2438         }
2439
2440         switch (cmd) {
2441         case IP_VS_SO_SET_ADD:
2442                 if (svc != NULL)
2443                         ret = -EEXIST;
2444                 else
2445                         ret = ip_vs_add_service(ipvs, &usvc, &svc);
2446                 break;
2447         case IP_VS_SO_SET_EDIT:
2448                 ret = ip_vs_edit_service(svc, &usvc);
2449                 break;
2450         case IP_VS_SO_SET_DEL:
2451                 ret = ip_vs_del_service(svc);
2452                 if (!ret)
2453                         goto out_unlock;
2454                 break;
2455         case IP_VS_SO_SET_ZERO:
2456                 ret = ip_vs_zero_service(svc);
2457                 break;
2458         case IP_VS_SO_SET_ADDDEST:
2459                 ret = ip_vs_add_dest(svc, &udest);
2460                 break;
2461         case IP_VS_SO_SET_EDITDEST:
2462                 ret = ip_vs_edit_dest(svc, &udest);
2463                 break;
2464         case IP_VS_SO_SET_DELDEST:
2465                 ret = ip_vs_del_dest(svc, &udest);
2466         }
2467
2468   out_unlock:
2469         mutex_unlock(&__ip_vs_mutex);
2470   out_dec:
2471         /* decrease the module use count */
2472         ip_vs_use_count_dec();
2473
2474         return ret;
2475 }
2476
2477
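     /* Export one service into the legacy struct ip_vs_service_entry
      * (timeout converted from jiffies to seconds, stats copied out).
      */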
2478 static void
2479 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2480 {
2481         struct ip_vs_scheduler *sched;
2482         struct ip_vs_kstats kstats;
2483         char *sched_name;
2484
2485         sched = rcu_dereference_protected(src->scheduler, 1);
2486         sched_name = sched ? sched->name : "none";
2487         dst->protocol = src->protocol;
2488         dst->addr = src->addr.ip;
2489         dst->port = src->port;
2490         dst->fwmark = src->fwmark;
2491         strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
2492         dst->flags = src->flags;
2493         dst->timeout = src->timeout / HZ;
2494         dst->netmask = src->netmask;
2495         dst->num_dests = src->num_dests;
2496         ip_vs_copy_stats(&kstats, &src->stats);
2497         ip_vs_export_stats_user(&dst->stats, &kstats);
2498 }
2499
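     /* Copy up to get->num_services IPv4 services from both hash tables
      * into the user-space buffer.
      */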
2500 static inline int
2501 __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
2502                             const struct ip_vs_get_services *get,
2503                             struct ip_vs_get_services __user *uptr)
2504 {
2505         int idx, count=0;
2506         struct ip_vs_service *svc;
2507         struct ip_vs_service_entry entry;
2508         int ret = 0;
2509
2510         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2511                 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2512                         /* Only expose IPv4 entries to old interface */
2513                         if (svc->af != AF_INET || (svc->ipvs != ipvs))
2514                                 continue;
2515
2516                         if (count >= get->num_services)
2517                                 goto out;
2518                         memset(&entry, 0, sizeof(entry));
2519                         ip_vs_copy_service(&entry, svc);
2520                         if (copy_to_user(&uptr->entrytable[count],
2521                                          &entry, sizeof(entry))) {
2522                                 ret = -EFAULT;
2523                                 goto out;
2524                         }
2525                         count++;
2526                 }
2527         }
2528
2529         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2530                 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2531                         /* Only expose IPv4 entries to old interface */
2532                         if (svc->af != AF_INET || (svc->ipvs != ipvs))
2533                                 continue;
2534
2535                         if (count >= get->num_services)
2536                                 goto out;
2537                         memset(&entry, 0, sizeof(entry));
2538                         ip_vs_copy_service(&entry, svc);
2539                         if (copy_to_user(&uptr->entrytable[count],
2540                                          &entry, sizeof(entry))) {
2541                                 ret = -EFAULT;
2542                                 goto out;
2543                         }
2544                         count++;
2545                 }
2546         }
2547 out:
2548         return ret;
2549 }
2550
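     /* Look up one service and copy up to get->num_dests of its real
      * servers to user space; destinations whose address family differs
      * from the service's cannot be represented and are skipped.
      */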
2551 static inline int
2552 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
2553                          struct ip_vs_get_dests __user *uptr)
2554 {
2555         struct ip_vs_service *svc;
2556         union nf_inet_addr addr = { .ip = get->addr };
2557         int ret = 0;
2558
2559         rcu_read_lock();
2560         if (get->fwmark)
2561                 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
2562         else
2563                 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
2564                                            get->port);
2565         rcu_read_unlock();
2566
2567         if (svc) {
2568                 int count = 0;
2569                 struct ip_vs_dest *dest;
2570                 struct ip_vs_dest_entry entry;
2571                 struct ip_vs_kstats kstats;
2572
2573                 memset(&entry, 0, sizeof(entry));
2574                 list_for_each_entry(dest, &svc->destinations, n_list) {
2575                         if (count >= get->num_dests)
2576                                 break;
2577
2578                         /* Cannot expose heterogeneous members via sockopt
2579                          * interface
2580                          */
2581                         if (dest->af != svc->af)
2582                                 continue;
2583
2584                         entry.addr = dest->addr.ip;
2585                         entry.port = dest->port;
2586                         entry.conn_flags = atomic_read(&dest->conn_flags);
2587                         entry.weight = atomic_read(&dest->weight);
2588                         entry.u_threshold = dest->u_threshold;
2589                         entry.l_threshold = dest->l_threshold;
2590                         entry.activeconns = atomic_read(&dest->activeconns);
2591                         entry.inactconns = atomic_read(&dest->inactconns);
2592                         entry.persistconns = atomic_read(&dest->persistconns);
2593                         ip_vs_copy_stats(&kstats, &dest->stats);
2594                         ip_vs_export_stats_user(&entry.stats, &kstats);
2595                         if (copy_to_user(&uptr->entrytable[count],
2596                                          &entry, sizeof(entry))) {
2597                                 ret = -EFAULT;
2598                                 break;
2599                         }
2600                         count++;
2601                 }
2602         } else
2603                 ret = -ESRCH;
2604         return ret;
2605 }
2606
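     /* Report the current TCP, TCP-FIN and UDP timeouts in seconds. */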
2607 static inline void
2608 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
2609 {
2610 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2611         struct ip_vs_proto_data *pd;
2612 #endif
2613
2614         memset(u, 0, sizeof (*u));
2615
2616 #ifdef CONFIG_IP_VS_PROTO_TCP
2617         pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2618         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2619         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2620 #endif
2621 #ifdef CONFIG_IP_VS_PROTO_UDP
2622         pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
2623         u->udp_timeout =
2624                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2625 #endif
2626 }
2627
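     /* Minimum argument length for each IP_VS_SO_GET_* command. */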
2628 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
2629         [CMDID(IP_VS_SO_GET_VERSION)]  = 64,
2630         [CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
2631         [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
2632         [CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
2633         [CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
2634         [CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
2635         [CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
2636 };
2637
2638 union ip_vs_get_arglen {
2639         char                            field_IP_VS_SO_GET_VERSION[64];
2640         struct ip_vs_getinfo            field_IP_VS_SO_GET_INFO;
2641         struct ip_vs_get_services       field_IP_VS_SO_GET_SERVICES;
2642         struct ip_vs_service_entry      field_IP_VS_SO_GET_SERVICE;
2643         struct ip_vs_get_dests          field_IP_VS_SO_GET_DESTS;
2644         struct ip_vs_timeout_user       field_IP_VS_SO_GET_TIMEOUT;
2645         struct ip_vs_daemon_user        field_IP_VS_SO_GET_DAEMON[2];
2646 };
2647
2648 #define MAX_GET_ARGLEN  sizeof(union ip_vs_get_arglen)
2649
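     /* getsockopt() backend for the legacy IP_VS_SO_GET_* commands;
      * like the set path it requires CAP_NET_ADMIN.
      */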
2650 static int
2651 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2652 {
2653         unsigned char arg[MAX_GET_ARGLEN];
2654         int ret = 0;
2655         unsigned int copylen;
2656         struct net *net = sock_net(sk);
2657         struct netns_ipvs *ipvs = net_ipvs(net);
2658
2659         BUG_ON(!net);
2660         BUILD_BUG_ON(sizeof(arg) > 255);
2661         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2662                 return -EPERM;
2663
2664         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2665                 return -EINVAL;
2666
2667         copylen = get_arglen[CMDID(cmd)];
2668         if (*len < (int) copylen) {
2669                 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
2670                 return -EINVAL;
2671         }
2672
2673         if (copy_from_user(arg, user, copylen) != 0)
2674                 return -EFAULT;
2675         /*
2676          * Handle daemon commands first since they use their own locking
2677          */
2678         if (cmd == IP_VS_SO_GET_DAEMON) {
2679                 struct ip_vs_daemon_user d[2];
2680
2681                 memset(&d, 0, sizeof(d));
2682                 mutex_lock(&ipvs->sync_mutex);
2683                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2684                         d[0].state = IP_VS_STATE_MASTER;
2685                         strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
2686                                 sizeof(d[0].mcast_ifn));
2687                         d[0].syncid = ipvs->mcfg.syncid;
2688                 }
2689                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2690                         d[1].state = IP_VS_STATE_BACKUP;
2691                         strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
2692                                 sizeof(d[1].mcast_ifn));
2693                         d[1].syncid = ipvs->bcfg.syncid;
2694                 }
2695                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2696                         ret = -EFAULT;
2697                 mutex_unlock(&ipvs->sync_mutex);
2698                 return ret;
2699         }
2700
2701         mutex_lock(&__ip_vs_mutex);
2702         switch (cmd) {
2703         case IP_VS_SO_GET_VERSION:
2704         {
2705                 char buf[64];
2706
2707                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2708                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2709                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2710                         ret = -EFAULT;
2711                         goto out;
2712                 }
2713                 *len = strlen(buf)+1;
2714         }
2715         break;
2716
2717         case IP_VS_SO_GET_INFO:
2718         {
2719                 struct ip_vs_getinfo info;
2720                 info.version = IP_VS_VERSION_CODE;
2721                 info.size = ip_vs_conn_tab_size;
2722                 info.num_services = ipvs->num_services;
2723                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2724                         ret = -EFAULT;
2725         }
2726         break;
2727
2728         case IP_VS_SO_GET_SERVICES:
2729         {
2730                 struct ip_vs_get_services *get;
2731                 int size;
2732
2733                 get = (struct ip_vs_get_services *)arg;
2734                 size = sizeof(*get) +
2735                         sizeof(struct ip_vs_service_entry) * get->num_services;
2736                 if (*len != size) {
2737                         pr_err("length: %u != %u\n", *len, size);
2738                         ret = -EINVAL;
2739                         goto out;
2740                 }
2741                 ret = __ip_vs_get_service_entries(ipvs, get, user);
2742         }
2743         break;
2744
2745         case IP_VS_SO_GET_SERVICE:
2746         {
2747                 struct ip_vs_service_entry *entry;
2748                 struct ip_vs_service *svc;
2749                 union nf_inet_addr addr;
2750
2751                 entry = (struct ip_vs_service_entry *)arg;
2752                 addr.ip = entry->addr;
2753                 rcu_read_lock();
2754                 if (entry->fwmark)
2755                         svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
2756                 else
2757                         svc = __ip_vs_service_find(ipvs, AF_INET,
2758                                                    entry->protocol, &addr,
2759                                                    entry->port);
2760                 rcu_read_unlock();
2761                 if (svc) {
2762                         ip_vs_copy_service(entry, svc);
2763                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2764                                 ret = -EFAULT;
2765                 } else
2766                         ret = -ESRCH;
2767         }
2768         break;
2769
2770         case IP_VS_SO_GET_DESTS:
2771         {
2772                 struct ip_vs_get_dests *get;
2773                 int size;
2774
2775                 get = (struct ip_vs_get_dests *)arg;
2776                 size = sizeof(*get) +
2777                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2778                 if (*len != size) {
2779                         pr_err("length: %u != %u\n", *len, size);
2780                         ret = -EINVAL;
2781                         goto out;
2782                 }
2783                 ret = __ip_vs_get_dest_entries(ipvs, get, user);
2784         }
2785         break;
2786
2787         case IP_VS_SO_GET_TIMEOUT:
2788         {
2789                 struct ip_vs_timeout_user t;
2790
2791                 __ip_vs_get_timeouts(ipvs, &t);
2792                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2793                         ret = -EFAULT;
2794         }
2795         break;
2796
2797         default:
2798                 ret = -EINVAL;
2799         }
2800
2801 out:
2802         mutex_unlock(&__ip_vs_mutex);
2803         return ret;
2804 }
2805
2806
2807 static struct nf_sockopt_ops ip_vs_sockopts = {
2808         .pf             = PF_INET,
2809         .set_optmin     = IP_VS_BASE_CTL,
2810         .set_optmax     = IP_VS_SO_SET_MAX+1,
2811         .set            = do_ip_vs_set_ctl,
2812         .get_optmin     = IP_VS_BASE_CTL,
2813         .get_optmax     = IP_VS_SO_GET_MAX+1,
2814         .get            = do_ip_vs_get_ctl,
2815         .owner          = THIS_MODULE,
2816 };
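     /* Minimal user-space sketch of this sockopt interface (assumes
      * CAP_NET_ADMIN, error handling omitted):
      *
      *   int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
      *   char buf[64];
      *   socklen_t len = sizeof(buf);
      *
      *   getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len);
      */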
2817
2818 /*
2819  * Generic Netlink interface
2820  */
2821
2822 /* IPVS genetlink family */
2823 static struct genl_family ip_vs_genl_family = {
2824         .id             = GENL_ID_GENERATE,
2825         .hdrsize        = 0,
2826         .name           = IPVS_GENL_NAME,
2827         .version        = IPVS_GENL_VERSION,
2828         .maxattr        = IPVS_CMD_MAX,
2829         .netnsok        = true,         /* Make ipvsadm work with netns */
2830 };
2831
2832 /* Policy used for first-level command attributes */
2833 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2834         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2835         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2836         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2837         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2838         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2839         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2840 };
2841
2842 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2843 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2844         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2845         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2846                                             .len = IP_VS_IFNAME_MAXLEN - 1 },
2847         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2848         [IPVS_DAEMON_ATTR_SYNC_MAXLEN]  = { .type = NLA_U16 },
2849         [IPVS_DAEMON_ATTR_MCAST_GROUP]  = { .type = NLA_U32 },
2850         [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
2851         [IPVS_DAEMON_ATTR_MCAST_PORT]   = { .type = NLA_U16 },
2852         [IPVS_DAEMON_ATTR_MCAST_TTL]    = { .type = NLA_U8 },
2853 };
2854
2855 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2856 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2857         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2858         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2859         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2860                                             .len = sizeof(union nf_inet_addr) },
2861         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2862         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2863         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2864                                             .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
2865         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2866                                             .len = IP_VS_PENAME_MAXLEN },
2867         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2868                                             .len = sizeof(struct ip_vs_flags) },
2869         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2870         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2871         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2872 };
2873
2874 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2875 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2876         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2877                                             .len = sizeof(union nf_inet_addr) },
2878         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2879         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2880         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2881         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2882         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2883         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2884         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2885         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2886         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2887         [IPVS_DEST_ATTR_ADDR_FAMILY]    = { .type = NLA_U16 },
2888 };
2889
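     /* Put the exported counters into a nested stats attribute.  This
      * legacy variant truncates most values to u32; the *64 variant
      * below carries full 64-bit values.
      */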
2890 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2891                                  struct ip_vs_kstats *kstats)
2892 {
2893         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2894
2895         if (!nl_stats)
2896                 return -EMSGSIZE;
2897
2898         if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
2899             nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
2900             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
2901             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
2902             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
2903             nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
2904             nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
2905             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
2906             nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
2907             nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
2908                 goto nla_put_failure;
2909         nla_nest_end(skb, nl_stats);
2910
2911         return 0;
2912
2913 nla_put_failure:
2914         nla_nest_cancel(skb, nl_stats);
2915         return -EMSGSIZE;
2916 }
2917
2918 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
2919                                    struct ip_vs_kstats *kstats)
2920 {
2921         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2922
2923         if (!nl_stats)
2924                 return -EMSGSIZE;
2925
2926         if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
2927             nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
2928             nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
2929             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
2930             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
2931             nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
2932             nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
2933             nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
2934             nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
2935             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
2936                 goto nla_put_failure;
2937         nla_nest_end(skb, nl_stats);
2938
2939         return 0;
2940
2941 nla_put_failure:
2942         nla_nest_cancel(skb, nl_stats);
2943         return -EMSGSIZE;
2944 }
2945
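     /* Fill one IPVS_CMD_ATTR_SERVICE nest: either the fwmark or the
      * protocol/addr/port triple, plus scheduler, pe, flags, timeout,
      * netmask and both stats containers.
      */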
2946 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2947                                    struct ip_vs_service *svc)
2948 {
2949         struct ip_vs_scheduler *sched;
2950         struct ip_vs_pe *pe;
2951         struct nlattr *nl_service;
2952         struct ip_vs_flags flags = { .flags = svc->flags,
2953                                      .mask = ~0 };
2954         struct ip_vs_kstats kstats;
2955         char *sched_name;
2956
2957         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2958         if (!nl_service)
2959                 return -EMSGSIZE;
2960
2961         if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2962                 goto nla_put_failure;
2963         if (svc->fwmark) {
2964                 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2965                         goto nla_put_failure;
2966         } else {
2967                 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2968                     nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2969                     nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2970                         goto nla_put_failure;
2971         }
2972
2973         sched = rcu_dereference_protected(svc->scheduler, 1);
2974         sched_name = sched ? sched->name : "none";
2975         pe = rcu_dereference_protected(svc->pe, 1);
2976         if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
2977             (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
2978             nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2979             nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2980             nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2981                 goto nla_put_failure;
2982         ip_vs_copy_stats(&kstats, &svc->stats);
2983         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
2984                 goto nla_put_failure;
2985         if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
2986                 goto nla_put_failure;
2987
2988         nla_nest_end(skb, nl_service);
2989
2990         return 0;
2991
2992 nla_put_failure:
2993         nla_nest_cancel(skb, nl_service);
2994         return -EMSGSIZE;
2995 }
2996
2997 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2998                                    struct ip_vs_service *svc,
2999                                    struct netlink_callback *cb)
3000 {
3001         void *hdr;
3002
3003         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3004                           &ip_vs_genl_family, NLM_F_MULTI,
3005                           IPVS_CMD_NEW_SERVICE);
3006         if (!hdr)
3007                 return -EMSGSIZE;
3008
3009         if (ip_vs_genl_fill_service(skb, svc) < 0)
3010                 goto nla_put_failure;
3011
3012         genlmsg_end(skb, hdr);
3013         return 0;
3014
3015 nla_put_failure:
3016         genlmsg_cancel(skb, hdr);
3017         return -EMSGSIZE;
3018 }
3019
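     /* Walk both service hash tables (addr/port and fwmark keyed) and dump
      * the services of the requesting netns, resuming from cb->args[0].
      */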
3020 static int ip_vs_genl_dump_services(struct sk_buff *skb,
3021                                     struct netlink_callback *cb)
3022 {
3023         int idx = 0, i;
3024         int start = cb->args[0];
3025         struct ip_vs_service *svc;
3026         struct net *net = sock_net(skb->sk);
3027         struct netns_ipvs *ipvs = net_ipvs(net);
3028
3029         mutex_lock(&__ip_vs_mutex);
3030         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3031                 hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
3032                         if (++idx <= start || (svc->ipvs != ipvs))
3033                                 continue;
3034                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3035                                 idx--;
3036                                 goto nla_put_failure;
3037                         }
3038                 }
3039         }
3040
3041         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3042                 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3043                         if (++idx <= start || (svc->ipvs != ipvs))
3044                                 continue;
3045                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3046                                 idx--;
3047                                 goto nla_put_failure;
3048                         }
3049                 }
3050         }
3051
3052 nla_put_failure:
3053         mutex_unlock(&__ip_vs_mutex);
3054         cb->args[0] = idx;
3055
3056         return skb->len;
3057 }
3058
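     /* Parse a nested service attribute into *usvc and look up any existing
      * service into *ret_svc; with full_entry, the scheduler, flags, timeout
      * and netmask attributes are also required.
      */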
3059 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
3060                                     struct ip_vs_service_user_kern *usvc,
3061                                     struct nlattr *nla, int full_entry,
3062                                     struct ip_vs_service **ret_svc)
3063 {
3064         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3065         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3066         struct ip_vs_service *svc;
3067
3068         /* Parse mandatory identifying service fields first */
3069         if (nla == NULL ||
3070             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3071                 return -EINVAL;
3072
3073         nla_af          = attrs[IPVS_SVC_ATTR_AF];
3074         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
3075         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
3076         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
3077         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
3078
3079         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3080                 return -EINVAL;
3081
3082         memset(usvc, 0, sizeof(*usvc));
3083
3084         usvc->af = nla_get_u16(nla_af);
3085 #ifdef CONFIG_IP_VS_IPV6
3086         if (usvc->af != AF_INET && usvc->af != AF_INET6)
3087 #else
3088         if (usvc->af != AF_INET)
3089 #endif
3090                 return -EAFNOSUPPORT;
3091
3092         if (nla_fwmark) {
3093                 usvc->protocol = IPPROTO_TCP;
3094                 usvc->fwmark = nla_get_u32(nla_fwmark);
3095         } else {
3096                 usvc->protocol = nla_get_u16(nla_protocol);
3097                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3098                 usvc->port = nla_get_be16(nla_port);
3099                 usvc->fwmark = 0;
3100         }
3101
3102         rcu_read_lock();
3103         if (usvc->fwmark)
3104                 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
3105         else
3106                 svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
3107                                            &usvc->addr, usvc->port);
3108         rcu_read_unlock();
3109         *ret_svc = svc;
3110
3111         /* If a full entry was requested, check for the additional fields */
3112         if (full_entry) {
3113                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3114                               *nla_netmask;
3115                 struct ip_vs_flags flags;
3116
3117                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3118                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3119                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3120                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3121                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3122
3123                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3124                         return -EINVAL;
3125
3126                 nla_memcpy(&flags, nla_flags, sizeof(flags));
3127
3128                 /* prefill flags from service if it already exists */
3129                 if (svc)
3130                         usvc->flags = svc->flags;
3131
3132                 /* set new flags from userland */
3133                 usvc->flags = (usvc->flags & ~flags.mask) |
3134                               (flags.flags & flags.mask);
3135                 usvc->sched_name = nla_data(nla_sched);
3136                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3137                 usvc->timeout = nla_get_u32(nla_timeout);
3138                 usvc->netmask = nla_get_be32(nla_netmask);
3139         }
3140
3141         return 0;
3142 }
3143
3144 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
3145                                                      struct nlattr *nla)
3146 {
3147         struct ip_vs_service_user_kern usvc;
3148         struct ip_vs_service *svc;
3149         int ret;
3150
3151         ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc);
3152         return ret ? ERR_PTR(ret) : svc;
3153 }
3154
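     /* Dump one real server as a nested IPVS_CMD_ATTR_DEST attribute */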
3155 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3156 {
3157         struct nlattr *nl_dest;
3158         struct ip_vs_kstats kstats;
3159
3160         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3161         if (!nl_dest)
3162                 return -EMSGSIZE;
3163
3164         if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3165             nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3166             nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3167                         (atomic_read(&dest->conn_flags) &
3168                          IP_VS_CONN_F_FWD_MASK)) ||
3169             nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3170                         atomic_read(&dest->weight)) ||
3171             nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3172             nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3173             nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3174                         atomic_read(&dest->activeconns)) ||
3175             nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3176                         atomic_read(&dest->inactconns)) ||
3177             nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3178                         atomic_read(&dest->persistconns)) ||
3179             nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
3180                 goto nla_put_failure;
3181         ip_vs_copy_stats(&kstats, &dest->stats);
3182         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
3183                 goto nla_put_failure;
3184         if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
3185                 goto nla_put_failure;
3186
3187         nla_nest_end(skb, nl_dest);
3188
3189         return 0;
3190
3191 nla_put_failure:
3192         nla_nest_cancel(skb, nl_dest);
3193         return -EMSGSIZE;
3194 }
3195
3196 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3197                                 struct netlink_callback *cb)
3198 {
3199         void *hdr;
3200
3201         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3202                           &ip_vs_genl_family, NLM_F_MULTI,
3203                           IPVS_CMD_NEW_DEST);
3204         if (!hdr)
3205                 return -EMSGSIZE;
3206
3207         if (ip_vs_genl_fill_dest(skb, dest) < 0)
3208                 goto nla_put_failure;
3209
3210         genlmsg_end(skb, hdr);
3211         return 0;
3212
3213 nla_put_failure:
3214         genlmsg_cancel(skb, hdr);
3215         return -EMSGSIZE;
3216 }
3217
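     /* Dump the destinations of the service given by IPVS_CMD_ATTR_SERVICE */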
3218 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3219                                  struct netlink_callback *cb)
3220 {
3221         int idx = 0;
3222         int start = cb->args[0];
3223         struct ip_vs_service *svc;
3224         struct ip_vs_dest *dest;
3225         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3226         struct net *net = sock_net(skb->sk);
3227         struct netns_ipvs *ipvs = net_ipvs(net);
3228
3229         mutex_lock(&__ip_vs_mutex);
3230
3231         /* Try to find the service for which to dump destinations */
3232         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3233                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3234                 goto out_err;
3235
3236
3237         svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
3238         if (IS_ERR(svc) || svc == NULL)
3239                 goto out_err;
3240
3241         /* Dump the destinations */
3242         list_for_each_entry(dest, &svc->destinations, n_list) {
3243                 if (++idx <= start)
3244                         continue;
3245                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3246                         idx--;
3247                         goto nla_put_failure;
3248                 }
3249         }
3250
3251 nla_put_failure:
3252         cb->args[0] = idx;
3253
3254 out_err:
3255         mutex_unlock(&__ip_vs_mutex);
3256
3257         return skb->len;
3258 }
3259
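     /* Parse a nested destination attribute into *udest; with full_entry,
      * the forwarding method, weight and thresholds are also required.
      */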
3260 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3261                                  struct nlattr *nla, int full_entry)
3262 {
3263         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3264         struct nlattr *nla_addr, *nla_port;
3265         struct nlattr *nla_addr_family;
3266
3267         /* Parse mandatory identifying destination fields first */
3268         if (nla == NULL ||
3269             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3270                 return -EINVAL;
3271
3272         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3273         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3274         nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
3275
3276         if (!(nla_addr && nla_port))
3277                 return -EINVAL;
3278
3279         memset(udest, 0, sizeof(*udest));
3280
3281         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3282         udest->port = nla_get_be16(nla_port);
3283
3284         if (nla_addr_family)
3285                 udest->af = nla_get_u16(nla_addr_family);
3286         else
3287                 udest->af = 0;
3288
3289         /* If a full entry was requested, check for the additional fields */
3290         if (full_entry) {
3291                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3292                               *nla_l_thresh;
3293
3294                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3295                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3296                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3297                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3298
3299                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3300                         return -EINVAL;
3301
3302                 udest->conn_flags = nla_get_u32(nla_fwd)
3303                                     & IP_VS_CONN_F_FWD_MASK;
3304                 udest->weight = nla_get_u32(nla_weight);
3305                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3306                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3307         }
3308
3309         return 0;
3310 }
3311
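     /* Dump one sync daemon state and configuration as a nested
      * IPVS_CMD_ATTR_DAEMON attribute.
      */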
3312 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
3313                                   struct ipvs_sync_daemon_cfg *c)
3314 {
3315         struct nlattr *nl_daemon;
3316
3317         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3318         if (!nl_daemon)
3319                 return -EMSGSIZE;
3320
3321         if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3322             nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
3323             nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
3324             nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
3325             nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
3326             nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
3327                 goto nla_put_failure;
3328 #ifdef CONFIG_IP_VS_IPV6
3329         if (c->mcast_af == AF_INET6) {
3330                 if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
3331                                      &c->mcast_group.in6))
3332                         goto nla_put_failure;
3333         } else
3334 #endif
3335                 if (c->mcast_af == AF_INET &&
3336                     nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
3337                                     c->mcast_group.ip))
3338                         goto nla_put_failure;
3339         nla_nest_end(skb, nl_daemon);
3340
3341         return 0;
3342
3343 nla_put_failure:
3344         nla_nest_cancel(skb, nl_daemon);
3345         return -EMSGSIZE;
3346 }
3347
3348 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
3349                                   struct ipvs_sync_daemon_cfg *c,
3350                                   struct netlink_callback *cb)
3351 {
3352         void *hdr;
3353         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3354                           &ip_vs_genl_family, NLM_F_MULTI,
3355                           IPVS_CMD_NEW_DAEMON);
3356         if (!hdr)
3357                 return -EMSGSIZE;
3358
3359         if (ip_vs_genl_fill_daemon(skb, state, c))
3360                 goto nla_put_failure;
3361
3362         genlmsg_end(skb, hdr);
3363         return 0;
3364
3365 nla_put_failure:
3366         genlmsg_cancel(skb, hdr);
3367         return -EMSGSIZE;
3368 }
3369
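     /* Dump the state and configuration of the running master and backup
      * sync daemons.
      */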
3370 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3371                                    struct netlink_callback *cb)
3372 {
3373         struct net *net = sock_net(skb->sk);
3374         struct netns_ipvs *ipvs = net_ipvs(net);
3375
3376         mutex_lock(&ipvs->sync_mutex);
3377         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3378                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3379                                            &ipvs->mcfg, cb) < 0)
3380                         goto nla_put_failure;
3381
3382                 cb->args[0] = 1;
3383         }
3384
3385         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3386                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3387                                            &ipvs->bcfg, cb) < 0)
3388                         goto nla_put_failure;
3389
3390                 cb->args[1] = 1;
3391         }
3392
3393 nla_put_failure:
3394         mutex_unlock(&ipvs->sync_mutex);
3395
3396         return skb->len;
3397 }
3398
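     /* Validate the daemon attributes and start a master or backup sync thread */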
3399 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
3400 {
3401         struct ipvs_sync_daemon_cfg c;
3402         struct nlattr *a;
3403         int ret;
3404
3405         memset(&c, 0, sizeof(c));
3406         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3407               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3408               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3409                 return -EINVAL;
3410         strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3411                 sizeof(c.mcast_ifn));
3412         c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
3413
3414         a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
3415         if (a)
3416                 c.sync_maxlen = nla_get_u16(a);
3417
3418         a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
3419         if (a) {
3420                 c.mcast_af = AF_INET;
3421                 c.mcast_group.ip = nla_get_in_addr(a);
3422                 if (!ipv4_is_multicast(c.mcast_group.ip))
3423                         return -EINVAL;
3424         } else {
3425                 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
3426                 if (a) {
3427 #ifdef CONFIG_IP_VS_IPV6
3428                         int addr_type;
3429
3430                         c.mcast_af = AF_INET6;
3431                         c.mcast_group.in6 = nla_get_in6_addr(a);
3432                         addr_type = ipv6_addr_type(&c.mcast_group.in6);
3433                         if (!(addr_type & IPV6_ADDR_MULTICAST))
3434                                 return -EINVAL;
3435 #else
3436                         return -EAFNOSUPPORT;
3437 #endif
3438                 }
3439         }
3440
3441         a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
3442         if (a)
3443                 c.mcast_port = nla_get_u16(a);
3444
3445         a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
3446         if (a)
3447                 c.mcast_ttl = nla_get_u8(a);
3448
3449         /* The synchronization protocol is incompatible with mixed family
3450          * services
3451          */
3452         if (ipvs->mixed_address_family_dests > 0)
3453                 return -EINVAL;
3454
3455         ret = start_sync_thread(ipvs, &c,
3456                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3457         return ret;
3458 }
3459
3460 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
3461 {
3462         int ret;
3463
3464         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3465                 return -EINVAL;
3466
3467         mutex_lock(&ipvs->sync_mutex);
3468         ret = stop_sync_thread(ipvs,
3469                                nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3470         mutex_unlock(&ipvs->sync_mutex);
3471         return ret;
3472 }
3473
3474 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
3475 {
3476         struct ip_vs_timeout_user t;
3477
3478         __ip_vs_get_timeouts(ipvs, &t);
3479
3480         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3481                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3482
3483         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3484                 t.tcp_fin_timeout =
3485                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3486
3487         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3488                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3489
3490         return ip_vs_set_timeout(ipvs, &t);
3491 }
3492
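     /* doit handler for IPVS_CMD_NEW_DAEMON and IPVS_CMD_DEL_DAEMON */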
3493 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3494 {
3495         int ret = -EINVAL, cmd;
3496         struct net *net = sock_net(skb->sk);
3497         struct netns_ipvs *ipvs = net_ipvs(net);
3498
3499         cmd = info->genlhdr->cmd;
3500
3501         if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3502                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3503
3504                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3505                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3506                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3507                                      ip_vs_daemon_policy))
3508                         goto out;
3509
3510                 if (cmd == IPVS_CMD_NEW_DAEMON)
3511                         ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
3512                 else
3513                         ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
3514         }
3515
3516 out:
3517         return ret;
3518 }
3519
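     /* doit handler for all state-changing commands: flush, set config,
      * zero counters and service/destination add, edit and delete.
      */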
3520 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3521 {
3522         struct ip_vs_service *svc = NULL;
3523         struct ip_vs_service_user_kern usvc;
3524         struct ip_vs_dest_user_kern udest;
3525         int ret = 0, cmd;
3526         int need_full_svc = 0, need_full_dest = 0;
3527         struct net *net = sock_net(skb->sk);
3528         struct netns_ipvs *ipvs = net_ipvs(net);
3529
3530         cmd = info->genlhdr->cmd;
3531
3532         mutex_lock(&__ip_vs_mutex);
3533
3534         if (cmd == IPVS_CMD_FLUSH) {
3535                 ret = ip_vs_flush(ipvs, false);
3536                 goto out;
3537         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3538                 ret = ip_vs_genl_set_config(ipvs, info->attrs);
3539                 goto out;
3540         } else if (cmd == IPVS_CMD_ZERO &&
3541                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3542                 ret = ip_vs_zero_all(ipvs);
3543                 goto out;
3544         }
3545
3546         /* All following commands require a service argument, so check if we
3547          * received a valid one. We need a full service specification when
3548          * adding / editing a service, and only the identifying members otherwise. */
3549         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3550                 need_full_svc = 1;
3551
3552         ret = ip_vs_genl_parse_service(ipvs, &usvc,
3553                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3554                                        need_full_svc, &svc);
3555         if (ret)
3556                 goto out;
3557
3558         /* Unless we're adding a new service, the service must already exist */
3559         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3560                 ret = -ESRCH;
3561                 goto out;
3562         }
3563
3564         /* Destination commands require a valid destination argument. For
3565          * adding / editing a destination, we need a full destination
3566          * specification. */
3567         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3568             cmd == IPVS_CMD_DEL_DEST) {
3569                 if (cmd != IPVS_CMD_DEL_DEST)
3570                         need_full_dest = 1;
3571
3572                 ret = ip_vs_genl_parse_dest(&udest,
3573                                             info->attrs[IPVS_CMD_ATTR_DEST],
3574                                             need_full_dest);
3575                 if (ret)
3576                         goto out;
3577
3578                 /* Old protocols did not allow the user to specify address
3579                  * family, so we set it to zero instead.  We also didn't
3580                  * allow heterogeneous pools in the old code, so it's safe
3581                  * to assume that this will have the same address family as
3582                  * the service.
3583                  */
3584                 if (udest.af == 0)
3585                         udest.af = svc->af;
3586
3587                 if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
3588                         /* The synchronization protocol is incompatible
3589                          * with mixed family services
3590                          */
3591                         if (ipvs->sync_state) {
3592                                 ret = -EINVAL;
3593                                 goto out;
3594                         }
3595
3596                         /* Which connection types do we support? */
3597                         switch (udest.conn_flags) {
3598                         case IP_VS_CONN_F_TUNNEL:
3599                                 /* We are able to forward this */
3600                                 break;
3601                         default:
3602                                 ret = -EINVAL;
3603                                 goto out;
3604                         }
3605                 }
3606         }
3607
3608         switch (cmd) {
3609         case IPVS_CMD_NEW_SERVICE:
3610                 if (svc == NULL)
3611                         ret = ip_vs_add_service(ipvs, &usvc, &svc);
3612                 else
3613                         ret = -EEXIST;
3614                 break;
3615         case IPVS_CMD_SET_SERVICE:
3616                 ret = ip_vs_edit_service(svc, &usvc);
3617                 break;
3618         case IPVS_CMD_DEL_SERVICE:
3619                 ret = ip_vs_del_service(svc);
3620                 /* do not use svc, it can be freed */
3621                 break;
3622         case IPVS_CMD_NEW_DEST:
3623                 ret = ip_vs_add_dest(svc, &udest);
3624                 break;
3625         case IPVS_CMD_SET_DEST:
3626                 ret = ip_vs_edit_dest(svc, &udest);
3627                 break;
3628         case IPVS_CMD_DEL_DEST:
3629                 ret = ip_vs_del_dest(svc, &udest);
3630                 break;
3631         case IPVS_CMD_ZERO:
3632                 ret = ip_vs_zero_service(svc);
3633                 break;
3634         default:
3635                 ret = -EINVAL;
3636         }
3637
3638 out:
3639         mutex_unlock(&__ip_vs_mutex);
3640
3641         return ret;
3642 }
3643
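     /* doit handler that builds replies for IPVS_CMD_GET_SERVICE,
      * IPVS_CMD_GET_INFO and IPVS_CMD_GET_CONFIG.
      */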
3644 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3645 {
3646         struct sk_buff *msg;
3647         void *reply;
3648         int ret, cmd, reply_cmd;
3649         struct net *net = sock_net(skb->sk);
3650         struct netns_ipvs *ipvs = net_ipvs(net);
3651
3652         cmd = info->genlhdr->cmd;
3653
3654         if (cmd == IPVS_CMD_GET_SERVICE)
3655                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3656         else if (cmd == IPVS_CMD_GET_INFO)
3657                 reply_cmd = IPVS_CMD_SET_INFO;
3658         else if (cmd == IPVS_CMD_GET_CONFIG)
3659                 reply_cmd = IPVS_CMD_SET_CONFIG;
3660         else {
3661                 pr_err("unknown Generic Netlink command\n");
3662                 return -EINVAL;
3663         }
3664
3665         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3666         if (!msg)
3667                 return -ENOMEM;
3668
3669         mutex_lock(&__ip_vs_mutex);
3670
3671         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3672         if (reply == NULL)
3673                 goto nla_put_failure;
3674
3675         switch (cmd) {
3676         case IPVS_CMD_GET_SERVICE:
3677         {
3678                 struct ip_vs_service *svc;
3679
3680                 svc = ip_vs_genl_find_service(ipvs,
3681                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3682                 if (IS_ERR(svc)) {
3683                         ret = PTR_ERR(svc);
3684                         goto out_err;
3685                 } else if (svc) {
3686                         ret = ip_vs_genl_fill_service(msg, svc);
3687                         if (ret)
3688                                 goto nla_put_failure;
3689                 } else {
3690                         ret = -ESRCH;
3691                         goto out_err;
3692                 }
3693
3694                 break;
3695         }
3696
3697         case IPVS_CMD_GET_CONFIG:
3698         {
3699                 struct ip_vs_timeout_user t;
3700
3701                 __ip_vs_get_timeouts(ipvs, &t);
3702 #ifdef CONFIG_IP_VS_PROTO_TCP
3703                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3704                                 t.tcp_timeout) ||
3705                     nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3706                                 t.tcp_fin_timeout))
3707                         goto nla_put_failure;
3708 #endif
3709 #ifdef CONFIG_IP_VS_PROTO_UDP
3710                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3711                         goto nla_put_failure;
3712 #endif
3713
3714                 break;
3715         }
3716
3717         case IPVS_CMD_GET_INFO:
3718                 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3719                                 IP_VS_VERSION_CODE) ||
3720                     nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3721                                 ip_vs_conn_tab_size))
3722                         goto nla_put_failure;
3723                 break;
3724         }
3725
3726         genlmsg_end(msg, reply);
3727         ret = genlmsg_reply(msg, info);
3728         goto out;
3729
3730 nla_put_failure:
3731         pr_err("not enough space in Netlink message\n");
3732         ret = -EMSGSIZE;
3733
3734 out_err:
3735         nlmsg_free(msg);
3736 out:
3737         mutex_unlock(&__ip_vs_mutex);
3738
3739         return ret;
3740 }
3741
3742
3743 static const struct genl_ops ip_vs_genl_ops[] = {
3744         {
3745                 .cmd    = IPVS_CMD_NEW_SERVICE,
3746                 .flags  = GENL_ADMIN_PERM,
3747                 .policy = ip_vs_cmd_policy,
3748                 .doit   = ip_vs_genl_set_cmd,
3749         },
3750         {
3751                 .cmd    = IPVS_CMD_SET_SERVICE,
3752                 .flags  = GENL_ADMIN_PERM,
3753                 .policy = ip_vs_cmd_policy,
3754                 .doit   = ip_vs_genl_set_cmd,
3755         },
3756         {
3757                 .cmd    = IPVS_CMD_DEL_SERVICE,
3758                 .flags  = GENL_ADMIN_PERM,
3759                 .policy = ip_vs_cmd_policy,
3760                 .doit   = ip_vs_genl_set_cmd,
3761         },
3762         {
3763                 .cmd    = IPVS_CMD_GET_SERVICE,
3764                 .flags  = GENL_ADMIN_PERM,
3765                 .doit   = ip_vs_genl_get_cmd,
3766                 .dumpit = ip_vs_genl_dump_services,
3767                 .policy = ip_vs_cmd_policy,
3768         },
3769         {
3770                 .cmd    = IPVS_CMD_NEW_DEST,
3771                 .flags  = GENL_ADMIN_PERM,
3772                 .policy = ip_vs_cmd_policy,
3773                 .doit   = ip_vs_genl_set_cmd,
3774         },
3775         {
3776                 .cmd    = IPVS_CMD_SET_DEST,
3777                 .flags  = GENL_ADMIN_PERM,
3778                 .policy = ip_vs_cmd_policy,
3779                 .doit   = ip_vs_genl_set_cmd,
3780         },
3781         {
3782                 .cmd    = IPVS_CMD_DEL_DEST,
3783                 .flags  = GENL_ADMIN_PERM,
3784                 .policy = ip_vs_cmd_policy,
3785                 .doit   = ip_vs_genl_set_cmd,
3786         },
3787         {
3788                 .cmd    = IPVS_CMD_GET_DEST,
3789                 .flags  = GENL_ADMIN_PERM,
3790                 .policy = ip_vs_cmd_policy,
3791                 .dumpit = ip_vs_genl_dump_dests,
3792         },
3793         {
3794                 .cmd    = IPVS_CMD_NEW_DAEMON,
3795                 .flags  = GENL_ADMIN_PERM,
3796                 .policy = ip_vs_cmd_policy,
3797                 .doit   = ip_vs_genl_set_daemon,
3798         },
3799         {
3800                 .cmd    = IPVS_CMD_DEL_DAEMON,
3801                 .flags  = GENL_ADMIN_PERM,
3802                 .policy = ip_vs_cmd_policy,
3803                 .doit   = ip_vs_genl_set_daemon,
3804         },
3805         {
3806                 .cmd    = IPVS_CMD_GET_DAEMON,
3807                 .flags  = GENL_ADMIN_PERM,
3808                 .dumpit = ip_vs_genl_dump_daemons,
3809         },
3810         {
3811                 .cmd    = IPVS_CMD_SET_CONFIG,
3812                 .flags  = GENL_ADMIN_PERM,
3813                 .policy = ip_vs_cmd_policy,
3814                 .doit   = ip_vs_genl_set_cmd,
3815         },
3816         {
3817                 .cmd    = IPVS_CMD_GET_CONFIG,
3818                 .flags  = GENL_ADMIN_PERM,
3819                 .doit   = ip_vs_genl_get_cmd,
3820         },
3821         {
3822                 .cmd    = IPVS_CMD_GET_INFO,
3823                 .flags  = GENL_ADMIN_PERM,
3824                 .doit   = ip_vs_genl_get_cmd,
3825         },
3826         {
3827                 .cmd    = IPVS_CMD_ZERO,
3828                 .flags  = GENL_ADMIN_PERM,
3829                 .policy = ip_vs_cmd_policy,
3830                 .doit   = ip_vs_genl_set_cmd,
3831         },
3832         {
3833                 .cmd    = IPVS_CMD_FLUSH,
3834                 .flags  = GENL_ADMIN_PERM,
3835                 .doit   = ip_vs_genl_set_cmd,
3836         },
3837 };
3838
3839 static int __init ip_vs_genl_register(void)
3840 {
3841         return genl_register_family_with_ops(&ip_vs_genl_family,
3842                                              ip_vs_genl_ops);
3843 }
3844
3845 static void ip_vs_genl_unregister(void)
3846 {
3847         genl_unregister_family(&ip_vs_genl_family);
3848 }
3849
3850 /* End of Generic Netlink interface definitions */
3851
3852 /*
3853  * per netns init/exit func.
3854  */
3855 #ifdef CONFIG_SYSCTL
3856 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
3857 {
3858         struct net *net = ipvs->net;
3859         int idx;
3860         struct ctl_table *tbl;
3861
3862         atomic_set(&ipvs->dropentry, 0);
3863         spin_lock_init(&ipvs->dropentry_lock);
3864         spin_lock_init(&ipvs->droppacket_lock);
3865         spin_lock_init(&ipvs->securetcp_lock);
3866
3867         if (!net_eq(net, &init_net)) {
3868                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3869                 if (tbl == NULL)
3870                         return -ENOMEM;
3871
3872                 /* Don't export sysctls to unprivileged users */
3873                 if (net->user_ns != &init_user_ns)
3874                         tbl[0].procname = NULL;
3875         } else
3876                 tbl = vs_vars;
3877         /* Initialize sysctl defaults */
3878         for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
3879                 if (tbl[idx].proc_handler == proc_do_defense_mode)
3880                         tbl[idx].extra2 = ipvs;
3881         }
3882         idx = 0;
3883         ipvs->sysctl_amemthresh = 1024;
3884         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3885         ipvs->sysctl_am_droprate = 10;
3886         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3887         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3888         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3889 #ifdef CONFIG_IP_VS_NFCT
3890         tbl[idx++].data = &ipvs->sysctl_conntrack;
3891 #endif
3892         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3893         ipvs->sysctl_snat_reroute = 1;
3894         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3895         ipvs->sysctl_sync_ver = 1;
3896         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3897         ipvs->sysctl_sync_ports = 1;
3898         tbl[idx++].data = &ipvs->sysctl_sync_ports;
3899         tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
3900         ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3901         tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3902         ipvs->sysctl_sync_sock_size = 0;
3903         tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3904         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3905         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3906         tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
3907         tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
3908         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3909         ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3910         ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3911         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3912         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3913         ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3914         tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3915         ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3916         tbl[idx++].data = &ipvs->sysctl_sync_retries;
3917         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3918         ipvs->sysctl_pmtu_disc = 1;
3919         tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3920         tbl[idx++].data = &ipvs->sysctl_backup_only;
3921         ipvs->sysctl_conn_reuse_mode = 1;
3922         tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
3923         tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
3924         tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
3925
3926         ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3927         if (ipvs->sysctl_hdr == NULL) {
3928                 if (!net_eq(net, &init_net))
3929                         kfree(tbl);
3930                 return -ENOMEM;
3931         }
3932         ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
3933         ipvs->sysctl_tbl = tbl;
3934         /* Schedule defense work */
3935         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3936         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3937
3938         return 0;
3939 }
3940
3941 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
3942 {
3943         struct net *net = ipvs->net;
3944
3945         cancel_delayed_work_sync(&ipvs->defense_work);
3946         cancel_work_sync(&ipvs->defense_work.work);
3947         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3948         ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
3949
3950         if (!net_eq(net, &init_net))
3951                 kfree(ipvs->sysctl_tbl);
3952 }
3953
3954 #else
3955
3956 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
3957 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
3958
3959 #endif
3960
3961 static struct notifier_block ip_vs_dst_notifier = {
3962         .notifier_call = ip_vs_dst_event,
3963 };
3964
3965 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
3966 {
3967         int i, idx;
3968
3969         /* Initialize rs_table */
3970         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3971                 INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
3972
3973         INIT_LIST_HEAD(&ipvs->dest_trash);
3974         spin_lock_init(&ipvs->dest_trash_lock);
3975         setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
3976                     (unsigned long) ipvs);
3977         atomic_set(&ipvs->ftpsvc_counter, 0);
3978         atomic_set(&ipvs->nullsvc_counter, 0);
3979
3980         /* procfs stats */
3981         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3982         if (!ipvs->tot_stats.cpustats)
3983                 return -ENOMEM;
3984
3985         for_each_possible_cpu(i) {
3986                 struct ip_vs_cpu_stats *ipvs_tot_stats;
3987                 ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
3988                 u64_stats_init(&ipvs_tot_stats->syncp);
3989         }
3990
3991         spin_lock_init(&ipvs->tot_stats.lock);
3992
3993         proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops);
3994         proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops);
3995         proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
3996                     &ip_vs_stats_percpu_fops);
3997
3998         if (ip_vs_control_net_init_sysctl(ipvs))
3999                 goto err;
4000
4001         return 0;
4002
4003 err:
4004         free_percpu(ipvs->tot_stats.cpustats);
4005         return -ENOMEM;
4006 }
4007
4008 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
4009 {
4010         ip_vs_trash_cleanup(ipvs);
4011         ip_vs_control_net_cleanup_sysctl(ipvs);
4012         remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
4013         remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
4014         remove_proc_entry("ip_vs", ipvs->net->proc_net);
4015         free_percpu(ipvs->tot_stats.cpustats);
4016 }
4017
4018 int __init ip_vs_register_nl_ioctl(void)
4019 {
4020         int ret;
4021
4022         ret = nf_register_sockopt(&ip_vs_sockopts);
4023         if (ret) {
4024                 pr_err("cannot register sockopt.\n");
4025                 goto err_sock;
4026         }
4027
4028         ret = ip_vs_genl_register();
4029         if (ret) {
4030                 pr_err("cannot register Generic Netlink interface.\n");
4031                 goto err_genl;
4032         }
4033         return 0;
4034
4035 err_genl:
4036         nf_unregister_sockopt(&ip_vs_sockopts);
4037 err_sock:
4038         return ret;
4039 }
4040
4041 void ip_vs_unregister_nl_ioctl(void)
4042 {
4043         ip_vs_genl_unregister();
4044         nf_unregister_sockopt(&ip_vs_sockopts);
4045 }
4046
4047 int __init ip_vs_control_init(void)
4048 {
4049         int idx;
4050         int ret;
4051
4052         EnterFunction(2);
4053
4054         /* Initialize svc_table, ip_vs_svc_fwm_table */
4055         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4056                 INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
4057                 INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
4058         }
4059
4060         smp_wmb();      /* Do we really need it now ? */
4061
4062         ret = register_netdevice_notifier(&ip_vs_dst_notifier);
4063         if (ret < 0)
4064                 return ret;
4065
4066         LeaveFunction(2);
4067         return 0;
4068 }
4069
4070
4071 void ip_vs_control_cleanup(void)
4072 {
4073         EnterFunction(2);
4074         unregister_netdevice_notifier(&ip_vs_dst_notifier);
4075         LeaveFunction(2);
4076 }