drivers/misc/sgi-xp/xpc_main.c
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
 * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 *      XPC provides a message passing capability that crosses partition
 *      boundaries. This module is made up of two parts:
 *
 *          partition   This part detects the presence/absence of other
 *                      partitions. It provides a heartbeat and monitors
 *                      the heartbeats of other partitions.
 *
 *          channel     This part manages the channels and sends/receives
 *                      messages across them to/from other partitions.
 *
 *      There are a couple of additional functions residing in XP, which
 *      provide an interface to XPC for its users.
 *
 *
 *      Caveats:
 *
 *        . Currently on sn2, we have no way to determine which nasid an IRQ
 *          came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
 *          followed by an IPI. The amo indicates where data is to be pulled
 *          from, so after the IPI arrives, the remote partition checks the amo
 *          word. The IPI can actually arrive before the amo however, so other
 *          code must periodically check for this case. Also, remote amo
 *          operations do not reliably time out. Thus we do a remote PIO read
 *          solely to know whether the remote partition is down and whether we
 *          should stop sending IPIs to it. This remote PIO read operation is
 *          set up in a special nofault region so SAL knows to ignore (and
 *          cleanup) any errors due to the remote amo write, PIO read, and/or
 *          PIO write operations.
 *
 *          If/when new hardware solves this IPI problem, we should abandon
 *          the current approach.
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/kdebug.h>
#include <linux/kthread.h>
#include "xpc.h"

#ifdef CONFIG_X86_64
#include <asm/traps.h>
#endif

/* define two XPC debug device structures to be used with dev_dbg() et al */

static struct device_driver xpc_dbg_name = {
        .name = "xpc"
};

static struct device xpc_part_dbg_subname = {
        .init_name = "",        /* set to "part" at xpc_init() time */
        .driver = &xpc_dbg_name
};

static struct device xpc_chan_dbg_subname = {
        .init_name = "",        /* set to "chan" at xpc_init() time */
        .driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;

static int xpc_kdebug_ignore;

/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
static int xpc_disengage_min_timelimit; /* = 0 */
static int xpc_disengage_max_timelimit = 120;

static struct ctl_table xpc_sys_xpc_hb_dir[] = {
        {
         .procname = "hb_interval",
         .data = &xpc_hb_interval,
         .maxlen = sizeof(int),
         .mode = 0644,
         .proc_handler = proc_dointvec_minmax,
         .extra1 = &xpc_hb_min_interval,
         .extra2 = &xpc_hb_max_interval},
        {
         .procname = "hb_check_interval",
         .data = &xpc_hb_check_interval,
         .maxlen = sizeof(int),
         .mode = 0644,
         .proc_handler = proc_dointvec_minmax,
         .extra1 = &xpc_hb_check_min_interval,
         .extra2 = &xpc_hb_check_max_interval},
        {}
};
static struct ctl_table xpc_sys_xpc_dir[] = {
        {
         .procname = "hb",
         .mode = 0555,
         .child = xpc_sys_xpc_hb_dir},
        {
         .procname = "disengage_timelimit",
         .data = &xpc_disengage_timelimit,
         .maxlen = sizeof(int),
         .mode = 0644,
         .proc_handler = proc_dointvec_minmax,
         .extra1 = &xpc_disengage_min_timelimit,
         .extra2 = &xpc_disengage_max_timelimit},
        {}
};
static struct ctl_table xpc_sys_dir[] = {
        {
         .procname = "xpc",
         .mode = 0555,
         .child = xpc_sys_xpc_dir},
        {}
};
static struct ctl_table_header *xpc_sysctl;

/* non-zero if any remote partition disengage timed out */
int xpc_disengage_timedout;

/* #of activate IRQs received and not yet processed */
int xpc_activate_IRQ_rcvd;
DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);

/* IRQ handler notifies this wait queue on receipt of an IRQ */
DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_COMPLETION(xpc_discovery_exited);

static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);

static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
        .notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
        .notifier_call = xpc_system_die,
};

struct xpc_arch_operations xpc_arch_ops;

/*
 * Timer function to enforce the timelimit on the partition disengage.
 */
static void
xpc_timeout_partition_disengage(struct timer_list *t)
{
        struct xpc_partition *part = from_timer(part, t, disengage_timer);

        DBUG_ON(time_is_after_jiffies(part->disengage_timeout));

        xpc_partition_disengaged_from_timer(part);

        DBUG_ON(part->disengage_timeout != 0);
        DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
}

/*
 * Timer to produce the heartbeat.  The timer structure's function is
 * already set when this is initially called.  A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(struct timer_list *unused)
{
        xpc_arch_ops.increment_heartbeat();

        if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
                wake_up_interruptible(&xpc_activate_IRQ_wq);

        xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
        add_timer(&xpc_hb_timer);
}

static void
xpc_start_hb_beater(void)
{
        xpc_arch_ops.heartbeat_init();
        timer_setup(&xpc_hb_timer, xpc_hb_beater, 0);
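        /*
         * Note: calling xpc_hb_beater() directly here produces the first
         * heartbeat immediately; the function then rearms xpc_hb_timer
         * for the next interval.
         */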
        xpc_hb_beater(NULL);
}

static void
xpc_stop_hb_beater(void)
{
        del_timer_sync(&xpc_hb_timer);
        xpc_arch_ops.heartbeat_exit();
}

/*
 * At periodic intervals, scan through all active partitions and ensure
 * their heartbeat is still active.  If not, the partition is deactivated.
 */
static void
xpc_check_remote_hb(void)
{
        struct xpc_partition *part;
        short partid;
        enum xp_retval ret;

        for (partid = 0; partid < xp_max_npartitions; partid++) {

                if (xpc_exiting)
                        break;

                if (partid == xp_partition_id)
                        continue;

                part = &xpc_partitions[partid];

                if (part->act_state == XPC_P_AS_INACTIVE ||
                    part->act_state == XPC_P_AS_DEACTIVATING) {
                        continue;
                }

                ret = xpc_arch_ops.get_remote_heartbeat(part);
                if (ret != xpSuccess)
                        XPC_DEACTIVATE_PARTITION(part, ret);
        }
}

/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
        int force_IRQ = 0;

        /* this thread was marked active by xpc_hb_init() */

        set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));

        /* set our heartbeating to other partitions into motion */
        xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
        xpc_start_hb_beater();

        while (!xpc_exiting) {

                dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
                        "been received\n",
                        (int)(xpc_hb_check_timeout - jiffies),
                        xpc_activate_IRQ_rcvd);

                /* checking of remote heartbeats is skewed by IRQ handling */
                if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
                        xpc_hb_check_timeout = jiffies +
                            (xpc_hb_check_interval * HZ);

                        dev_dbg(xpc_part, "checking remote heartbeats\n");
                        xpc_check_remote_hb();
                }

                /* check for outstanding IRQs */
                if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
                        force_IRQ = 0;
                        dev_dbg(xpc_part, "processing activate IRQs "
                                "received\n");
                        xpc_arch_ops.process_activate_IRQ_rcvd();
                }

                /* wait for IRQ or timeout */
                (void)wait_event_interruptible(xpc_activate_IRQ_wq,
                                               (time_is_before_eq_jiffies(
                                                xpc_hb_check_timeout) ||
                                                xpc_activate_IRQ_rcvd > 0 ||
                                                xpc_exiting));
        }

        xpc_stop_hb_beater();

        dev_dbg(xpc_part, "heartbeat checker is exiting\n");

        /* mark this thread as having exited */
        complete(&xpc_hb_checker_exited);
        return 0;
}

/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL. This new thread is short-lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
        xpc_discovery();

        dev_dbg(xpc_part, "discovery thread is exiting\n");

        /* mark this thread as having exited */
        complete(&xpc_discovery_exited);
        return 0;
}

/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
 * that kthread until the partition is brought down, at which time that
 * kthread returns to XPC HB. (The return of that kthread will signify to
 * XPC HB that XPC has dismantled all communication infrastructure for the
 * associated partition.) This kthread becomes the channel manager for that
 * partition.
 *
 * Each active partition has a channel manager which, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
        while (part->act_state != XPC_P_AS_DEACTIVATING ||
               atomic_read(&part->nchannels_active) > 0 ||
               !xpc_partition_disengaged(part)) {

                xpc_process_sent_chctl_flags(part);

                /*
                 * Wait until we've been requested to activate kthreads or
                 * all of the channel's message queues have been torn down or
                 * a signal is pending.
                 *
                 * The channel_mgr_requests count is set to 1 after being
                 * awakened. This is done to prevent the channel mgr from
                 * making one pass through the loop for each request, since
                 * it will be servicing all the requests in one pass. The
                 * reason it's set to 1 instead of 0 is so that other
                 * kthreads will know that the channel mgr is running and
                 * won't bother trying to wake it up.
                 */
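                /*
                 * Note that the wait_event condition below mirrors the
                 * while-loop guard above, so the channel manager also wakes
                 * to make its final pass once deactivation has completed.
                 */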
                atomic_dec(&part->channel_mgr_requests);
                (void)wait_event_interruptible(part->channel_mgr_wq,
                                (atomic_read(&part->channel_mgr_requests) > 0 ||
                                 part->chctl.all_flags != 0 ||
                                 (part->act_state == XPC_P_AS_DEACTIVATING &&
                                 atomic_read(&part->nchannels_active) == 0 &&
                                 xpc_partition_disengaged(part))));
                atomic_set(&part->channel_mgr_requests, 1);
        }
}

/*
 * Guarantee that the kzalloc'd memory is cacheline aligned.
 */
void *
xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
        /* see if kzalloc will give us cacheline aligned memory by default */
        *base = kzalloc(size, flags);
        if (*base == NULL)
                return NULL;

        if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
                return *base;

        kfree(*base);

        /* nope, we'll have to do it ourselves */
        *base = kzalloc(size + L1_CACHE_BYTES, flags);
        if (*base == NULL)
                return NULL;

        return (void *)L1_CACHE_ALIGN((u64)*base);
}
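
/*
 * Illustrative usage sketch (hypothetical caller; 'struct foo' is not part
 * of this driver): use the returned pointer normally, but keep the cookie
 * returned through @base and pass that to kfree() when done, since the
 * aligned pointer may not be the start of the allocation:
 *
 *      void *base;
 *      struct foo *f = xpc_kzalloc_cacheline_aligned(sizeof(*f),
 *                                                    GFP_KERNEL, &base);
 *      ...
 *      kfree(base);
 */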

/*
 * Set up the channel structures necessary to support XPartition Communication
 * between the specified remote partition and the local one.
 */
static enum xp_retval
xpc_setup_ch_structures(struct xpc_partition *part)
{
        enum xp_retval ret;
        int ch_number;
        struct xpc_channel *ch;
        short partid = XPC_PARTID(part);

        /*
         * Allocate all of the channel structures as a contiguous chunk of
         * memory.
         */
        DBUG_ON(part->channels != NULL);
        part->channels = kcalloc(XPC_MAX_NCHANNELS,
                                 sizeof(struct xpc_channel),
                                 GFP_KERNEL);
        if (part->channels == NULL) {
                dev_err(xpc_chan, "can't get memory for channels\n");
                return xpNoMemory;
        }

        /* allocate the remote open and close args */

        part->remote_openclose_args =
            xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
                                          GFP_KERNEL, &part->
                                          remote_openclose_args_base);
        if (part->remote_openclose_args == NULL) {
                dev_err(xpc_chan, "can't get memory for remote connect args\n");
                ret = xpNoMemory;
                goto out_1;
        }

        part->chctl.all_flags = 0;
        spin_lock_init(&part->chctl_lock);

        atomic_set(&part->channel_mgr_requests, 1);
        init_waitqueue_head(&part->channel_mgr_wq);

        part->nchannels = XPC_MAX_NCHANNELS;

        atomic_set(&part->nchannels_active, 0);
        atomic_set(&part->nchannels_engaged, 0);

        for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
                ch = &part->channels[ch_number];

                ch->partid = partid;
                ch->number = ch_number;
                ch->flags = XPC_C_DISCONNECTED;

                atomic_set(&ch->kthreads_assigned, 0);
                atomic_set(&ch->kthreads_idle, 0);
                atomic_set(&ch->kthreads_active, 0);

                atomic_set(&ch->references, 0);
                atomic_set(&ch->n_to_notify, 0);

                spin_lock_init(&ch->lock);
                init_completion(&ch->wdisconnect_wait);

                atomic_set(&ch->n_on_msg_allocate_wq, 0);
                init_waitqueue_head(&ch->msg_allocate_wq);
                init_waitqueue_head(&ch->idle_wq);
        }

        ret = xpc_arch_ops.setup_ch_structures(part);
        if (ret != xpSuccess)
                goto out_2;

        /*
         * With the setting of the partition setup_state to XPC_P_SS_SETUP,
         * we're declaring that this partition is ready to go.
         */
        part->setup_state = XPC_P_SS_SETUP;

        return xpSuccess;

        /* setup of ch structures failed */
out_2:
        kfree(part->remote_openclose_args_base);
        part->remote_openclose_args = NULL;
out_1:
        kfree(part->channels);
        part->channels = NULL;
        return ret;
}

/*
 * Tear down the channel structures necessary to support XPartition
 * Communication between the specified remote partition and the local one.
 */
static void
xpc_teardown_ch_structures(struct xpc_partition *part)
{
        DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
        DBUG_ON(atomic_read(&part->nchannels_active) != 0);

        /*
         * Make this partition inaccessible to local processes by marking it
         * as no longer setup. Then wait before proceeding with the teardown
         * until all existing references cease.
         */
        DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
        part->setup_state = XPC_P_SS_WTEARDOWN;

        wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));

        /* now we can begin tearing down the infrastructure */

        xpc_arch_ops.teardown_ch_structures(part);

        kfree(part->remote_openclose_args_base);
        part->remote_openclose_args = NULL;
        kfree(part->channels);
        part->channels = NULL;

        part->setup_state = XPC_P_SS_TORNDOWN;
}

/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition, becoming the channel
 * manager for that partition, until the partition is deactivating, at which
 * time the kthread will tear down the XPC infrastructure and then exit.
 */
static int
xpc_activating(void *__partid)
{
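        /*
         * The partid rides in the void * argument itself (packed by the
         * (void *)((u64)partid) cast in xpc_activate_partition()); recover
         * it with the reverse cast rather than dereferencing the pointer.
         */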
        short partid = (u64)__partid;
        struct xpc_partition *part = &xpc_partitions[partid];
        unsigned long irq_flags;

        DBUG_ON(partid < 0 || partid >= xp_max_npartitions);

        spin_lock_irqsave(&part->act_lock, irq_flags);

        if (part->act_state == XPC_P_AS_DEACTIVATING) {
                part->act_state = XPC_P_AS_INACTIVE;
                spin_unlock_irqrestore(&part->act_lock, irq_flags);
                part->remote_rp_pa = 0;
                return 0;
        }

        /* indicate the thread is activating */
        DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
        part->act_state = XPC_P_AS_ACTIVATING;

        XPC_SET_REASON(part, 0, 0);
        spin_unlock_irqrestore(&part->act_lock, irq_flags);

        dev_dbg(xpc_part, "activating partition %d\n", partid);

        xpc_arch_ops.allow_hb(partid);

        if (xpc_setup_ch_structures(part) == xpSuccess) {
                (void)xpc_part_ref(part);       /* this will always succeed */

                if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
                        xpc_mark_partition_active(part);
                        xpc_channel_mgr(part);
                        /* won't return until partition is deactivating */
                }

                xpc_part_deref(part);
                xpc_teardown_ch_structures(part);
        }

        xpc_arch_ops.disallow_hb(partid);
        xpc_mark_partition_inactive(part);

        if (part->reason == xpReactivating) {
                /* interrupting ourselves results in activating partition */
                xpc_arch_ops.request_partition_reactivation(part);
        }

        return 0;
}

void
xpc_activate_partition(struct xpc_partition *part)
{
        short partid = XPC_PARTID(part);
        unsigned long irq_flags;
        struct task_struct *kthread;

        spin_lock_irqsave(&part->act_lock, irq_flags);

        DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);

        part->act_state = XPC_P_AS_ACTIVATION_REQ;
        XPC_SET_REASON(part, xpCloneKThread, __LINE__);

        spin_unlock_irqrestore(&part->act_lock, irq_flags);

        kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
                              partid);
        if (IS_ERR(kthread)) {
                spin_lock_irqsave(&part->act_lock, irq_flags);
                part->act_state = XPC_P_AS_INACTIVE;
                XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
                spin_unlock_irqrestore(&part->act_lock, irq_flags);
        }
}

void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
        int idle = atomic_read(&ch->kthreads_idle);
        int assigned = atomic_read(&ch->kthreads_assigned);
        int wakeup;

        DBUG_ON(needed <= 0);

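        /*
         * First wake as many idle kthreads as can be put to work, then
         * create new ones for any remainder, subject to
         * ch->kthreads_assigned_limit. For example (numbers illustrative
         * only): needed=6 with idle=4 wakes 4 kthreads and creates 2 more.
         */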
        if (idle > 0) {
                wakeup = (needed > idle) ? idle : needed;
                needed -= wakeup;

                dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
                        "channel=%d\n", wakeup, ch->partid, ch->number);

                /* only wakeup the requested number of kthreads */
                wake_up_nr(&ch->idle_wq, wakeup);
        }

        if (needed <= 0)
                return;

        if (needed + assigned > ch->kthreads_assigned_limit) {
                needed = ch->kthreads_assigned_limit - assigned;
                if (needed <= 0)
                        return;
        }

        dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
                needed, ch->partid, ch->number);

        xpc_create_kthreads(ch, needed, 0);
}

/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
        int (*n_of_deliverable_payloads) (struct xpc_channel *) =
                xpc_arch_ops.n_of_deliverable_payloads;

        do {
                /* deliver messages to their intended recipients */

                while (n_of_deliverable_payloads(ch) > 0 &&
                       !(ch->flags & XPC_C_DISCONNECTING)) {
                        xpc_deliver_payload(ch);
                }

                if (atomic_inc_return(&ch->kthreads_idle) >
                    ch->kthreads_idle_limit) {
                        /* too many idle kthreads on this channel */
                        atomic_dec(&ch->kthreads_idle);
                        break;
                }

                dev_dbg(xpc_chan, "idle kthread calling "
                        "wait_event_interruptible_exclusive()\n");

                (void)wait_event_interruptible_exclusive(ch->idle_wq,
                                (n_of_deliverable_payloads(ch) > 0 ||
                                 (ch->flags & XPC_C_DISCONNECTING)));

                atomic_dec(&ch->kthreads_idle);

        } while (!(ch->flags & XPC_C_DISCONNECTING));
}

static int
xpc_kthread_start(void *args)
{
        short partid = XPC_UNPACK_ARG1(args);
        u16 ch_number = XPC_UNPACK_ARG2(args);
        struct xpc_partition *part = &xpc_partitions[partid];
        struct xpc_channel *ch;
        int n_needed;
        unsigned long irq_flags;
        int (*n_of_deliverable_payloads) (struct xpc_channel *) =
                xpc_arch_ops.n_of_deliverable_payloads;

        dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
                partid, ch_number);

        ch = &part->channels[ch_number];

        if (!(ch->flags & XPC_C_DISCONNECTING)) {

                /* let registerer know that connection has been established */

                spin_lock_irqsave(&ch->lock, irq_flags);
                if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
                        ch->flags |= XPC_C_CONNECTEDCALLOUT;
                        spin_unlock_irqrestore(&ch->lock, irq_flags);

                        xpc_connected_callout(ch);

                        spin_lock_irqsave(&ch->lock, irq_flags);
                        ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
                        spin_unlock_irqrestore(&ch->lock, irq_flags);

                        /*
                         * It is possible that, while the callout was being
                         * made, the remote partition sent some messages. If
                         * that is the case, we may need to activate
                         * additional kthreads to help deliver them. We only
                         * need one less than the total #of messages to
                         * deliver.
                         */
                        n_needed = n_of_deliverable_payloads(ch) - 1;
                        if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
                                xpc_activate_kthreads(ch, n_needed);

                } else {
                        spin_unlock_irqrestore(&ch->lock, irq_flags);
                }

                xpc_kthread_waitmsgs(part, ch);
        }

        /* let registerer know that connection is disconnecting */

        spin_lock_irqsave(&ch->lock, irq_flags);
        if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
            !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
                ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
                spin_unlock_irqrestore(&ch->lock, irq_flags);

                xpc_disconnect_callout(ch, xpDisconnecting);

                spin_lock_irqsave(&ch->lock, irq_flags);
                ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
        }
        spin_unlock_irqrestore(&ch->lock, irq_flags);

        if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
            atomic_dec_return(&part->nchannels_engaged) == 0) {
                xpc_arch_ops.indicate_partition_disengaged(part);
        }

        xpc_msgqueue_deref(ch);

        dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
                partid, ch_number);

        xpc_part_deref(part);
        return 0;
}

/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed,
                    int ignore_disconnecting)
{
        unsigned long irq_flags;
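        /*
         * partid and ch_number travel to the new kthread packed into a
         * single u64, since kthread_run() passes only one void * argument;
         * see XPC_UNPACK_ARG1()/XPC_UNPACK_ARG2() in xpc_kthread_start().
         */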
        u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
        struct xpc_partition *part = &xpc_partitions[ch->partid];
        struct task_struct *kthread;
        void (*indicate_partition_disengaged) (struct xpc_partition *) =
                xpc_arch_ops.indicate_partition_disengaged;

        while (needed-- > 0) {

                /*
                 * The following is done on behalf of the newly created
                 * kthread. That kthread is responsible for doing the
                 * counterpart to the following before it exits.
                 */
                if (ignore_disconnecting) {
                        if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
                                /* kthreads assigned had gone to zero */
                                BUG_ON(!(ch->flags &
                                         XPC_C_DISCONNECTINGCALLOUT_MADE));
                                break;
                        }

                } else if (ch->flags & XPC_C_DISCONNECTING) {
                        break;

                } else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
                           atomic_inc_return(&part->nchannels_engaged) == 1) {
                        xpc_arch_ops.indicate_partition_engaged(part);
                }
                (void)xpc_part_ref(part);
                xpc_msgqueue_ref(ch);

                kthread = kthread_run(xpc_kthread_start, (void *)args,
                                      "xpc%02dc%d", ch->partid, ch->number);
                if (IS_ERR(kthread)) {
                        /* the fork failed */

                        /*
                         * NOTE: if (ignore_disconnecting &&
                         * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
                         * then we'll deadlock if all other kthreads assigned
                         * to this channel are blocked in the channel's
                         * registerer, because the only thing that will unblock
                         * them is the xpDisconnecting callout that this
                         * failed kthread_run() would have made.
                         */

                        if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
                            atomic_dec_return(&part->nchannels_engaged) == 0) {
                                indicate_partition_disengaged(part);
                        }
                        xpc_msgqueue_deref(ch);
                        xpc_part_deref(part);

                        if (atomic_read(&ch->kthreads_assigned) <
                            ch->kthreads_idle_limit) {
                                /*
                                 * Flag this as an error only if we have an
                                 * insufficient #of kthreads for the channel
                                 * to function.
                                 */
                                spin_lock_irqsave(&ch->lock, irq_flags);
                                XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
                                                       &irq_flags);
                                spin_unlock_irqrestore(&ch->lock, irq_flags);
                        }
                        break;
                }
        }
}

void
xpc_disconnect_wait(int ch_number)
{
        unsigned long irq_flags;
        short partid;
        struct xpc_partition *part;
        struct xpc_channel *ch;
        int wakeup_channel_mgr;

        /* now wait for all callouts to the caller's function to cease */
        for (partid = 0; partid < xp_max_npartitions; partid++) {
                part = &xpc_partitions[partid];

                if (!xpc_part_ref(part))
                        continue;

                ch = &part->channels[ch_number];

                if (!(ch->flags & XPC_C_WDISCONNECT)) {
                        xpc_part_deref(part);
                        continue;
                }

                wait_for_completion(&ch->wdisconnect_wait);

                spin_lock_irqsave(&ch->lock, irq_flags);
                DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
                wakeup_channel_mgr = 0;

                if (ch->delayed_chctl_flags) {
                        if (part->act_state != XPC_P_AS_DEACTIVATING) {
                                spin_lock(&part->chctl_lock);
                                part->chctl.flags[ch->number] |=
                                    ch->delayed_chctl_flags;
                                spin_unlock(&part->chctl_lock);
                                wakeup_channel_mgr = 1;
                        }
                        ch->delayed_chctl_flags = 0;
                }

                ch->flags &= ~XPC_C_WDISCONNECT;
                spin_unlock_irqrestore(&ch->lock, irq_flags);

                if (wakeup_channel_mgr)
                        xpc_wakeup_channel_mgr(part);

                xpc_part_deref(part);
        }
}

static int
xpc_setup_partitions(void)
{
        short partid;
        struct xpc_partition *part;

        xpc_partitions = kcalloc(xp_max_npartitions,
                                 sizeof(struct xpc_partition),
                                 GFP_KERNEL);
        if (xpc_partitions == NULL) {
                dev_err(xpc_part, "can't get memory for partition structure\n");
                return -ENOMEM;
        }

        /*
         * The first few fields of each entry of xpc_partitions[] need to
         * be initialized now so that calls to xpc_connect() and
         * xpc_disconnect() can be made prior to the activation of any remote
         * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
         * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
         * PARTITION HAS BEEN ACTIVATED.
         */
        for (partid = 0; partid < xp_max_npartitions; partid++) {
                part = &xpc_partitions[partid];

                DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));

                part->activate_IRQ_rcvd = 0;
                spin_lock_init(&part->act_lock);
                part->act_state = XPC_P_AS_INACTIVE;
                XPC_SET_REASON(part, 0, 0);

                timer_setup(&part->disengage_timer,
                            xpc_timeout_partition_disengage, 0);

                part->setup_state = XPC_P_SS_UNSET;
                init_waitqueue_head(&part->teardown_wq);
                atomic_set(&part->references, 0);
        }

        return xpc_arch_ops.setup_partitions();
}

static void
xpc_teardown_partitions(void)
{
        xpc_arch_ops.teardown_partitions();
        kfree(xpc_partitions);
}

static void
xpc_do_exit(enum xp_retval reason)
{
        short partid;
        int active_part_count, printed_waiting_msg = 0;
        struct xpc_partition *part;
        unsigned long printmsg_time, disengage_timeout = 0;

        /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
        DBUG_ON(xpc_exiting == 1);

        /*
         * Let the heartbeat checker thread and the discovery thread
         * (if one is running) know that they should exit. Also wake up
         * the heartbeat checker thread in case it's sleeping.
         */
        xpc_exiting = 1;
        wake_up_interruptible(&xpc_activate_IRQ_wq);

        /* wait for the discovery thread to exit */
        wait_for_completion(&xpc_discovery_exited);

        /* wait for the heartbeat checker thread to exit */
        wait_for_completion(&xpc_hb_checker_exited);

        /* sleep for 1/3 of a second or so */
        (void)msleep_interruptible(300);

        /* wait for all partitions to become inactive */

        printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
        xpc_disengage_timedout = 0;

        do {
                active_part_count = 0;

                for (partid = 0; partid < xp_max_npartitions; partid++) {
                        part = &xpc_partitions[partid];

                        if (xpc_partition_disengaged(part) &&
                            part->act_state == XPC_P_AS_INACTIVE) {
                                continue;
                        }

                        active_part_count++;

                        XPC_DEACTIVATE_PARTITION(part, reason);

                        if (part->disengage_timeout > disengage_timeout)
                                disengage_timeout = part->disengage_timeout;
                }

                if (xpc_arch_ops.any_partition_engaged()) {
                        if (time_is_before_jiffies(printmsg_time)) {
                                dev_info(xpc_part, "waiting for remote "
                                         "partitions to deactivate, timeout in "
                                         "%ld seconds\n", (disengage_timeout -
                                         jiffies) / HZ);
                                printmsg_time = jiffies +
                                    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
                                printed_waiting_msg = 1;
                        }

                } else if (active_part_count > 0) {
                        if (printed_waiting_msg) {
                                dev_info(xpc_part, "waiting for local partition"
                                         " to deactivate\n");
                                printed_waiting_msg = 0;
                        }

                } else {
                        if (!xpc_disengage_timedout) {
                                dev_info(xpc_part, "all partitions have "
                                         "deactivated\n");
                        }
                        break;
                }

                /* sleep for 1/3 of a second or so */
                (void)msleep_interruptible(300);

        } while (1);

        DBUG_ON(xpc_arch_ops.any_partition_engaged());

        xpc_teardown_rsvd_page();

        if (reason == xpUnloading) {
                (void)unregister_die_notifier(&xpc_die_notifier);
                (void)unregister_reboot_notifier(&xpc_reboot_notifier);
        }

        /* clear the interface to XPC's functions */
        xpc_clear_interface();

        if (xpc_sysctl)
                unregister_sysctl_table(xpc_sysctl);

        xpc_teardown_partitions();

        if (is_uv_system())
                xpc_exit_uv();
}

/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
        enum xp_retval reason;

        switch (event) {
        case SYS_RESTART:
                reason = xpSystemReboot;
                break;
        case SYS_HALT:
                reason = xpSystemHalt;
                break;
        case SYS_POWER_OFF:
                reason = xpSystemPoweroff;
                break;
        default:
                reason = xpSystemGoingDown;
        }

        xpc_do_exit(reason);
        return NOTIFY_DONE;
}

/* Used to only allow one cpu to complete disconnect */
static unsigned int xpc_die_disconnecting;

/*
 * Notify other partitions to deactivate from us by first disengaging from all
 * references to our memory.
 */
static void
xpc_die_deactivate(void)
{
        struct xpc_partition *part;
        short partid;
        int any_engaged;
        long keep_waiting;
        long wait_to_print;

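        /*
         * cmpxchg() lets only the first caller proceed with the disconnect;
         * racing callers on other cpus return immediately.
         */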
        if (cmpxchg(&xpc_die_disconnecting, 0, 1))
                return;

        /* keep xpc_hb_checker thread from doing anything (just in case) */
        xpc_exiting = 1;

        xpc_arch_ops.disallow_all_hbs();        /* indicate we're deactivated */

        for (partid = 0; partid < xp_max_npartitions; partid++) {
                part = &xpc_partitions[partid];

                if (xpc_arch_ops.partition_engaged(partid) ||
                    part->act_state != XPC_P_AS_INACTIVE) {
                        xpc_arch_ops.request_partition_deactivation(part);
                        xpc_arch_ops.indicate_partition_disengaged(part);
                }
        }

        /*
         * Though we requested that all other partitions deactivate from us,
         * we only wait until they've all disengaged or we've reached the
         * defined timelimit.
         *
         * Given that one iteration through the following while-loop takes
         * approximately 200 microseconds, calculate the #of loops to take
         * before bailing and the #of loops before printing a waiting message.
         */
        keep_waiting = xpc_disengage_timelimit * 1000 * 5;
        wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
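        /*
         * Worked example (numbers illustrative): a 60 second disengage
         * timelimit yields keep_waiting = 60 * 1000 * 5 = 300000 iterations,
         * i.e. roughly 60 seconds at ~200 microseconds per loop.
         */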

        while (1) {
                any_engaged = xpc_arch_ops.any_partition_engaged();
                if (!any_engaged) {
                        dev_info(xpc_part, "all partitions have deactivated\n");
                        break;
                }

                if (!keep_waiting--) {
                        for (partid = 0; partid < xp_max_npartitions;
                             partid++) {
                                if (xpc_arch_ops.partition_engaged(partid)) {
                                        dev_info(xpc_part, "deactivate from "
                                                 "remote partition %d timed "
                                                 "out\n", partid);
                                }
                        }
                        break;
                }

                if (!wait_to_print--) {
                        dev_info(xpc_part, "waiting for remote partitions to "
                                 "deactivate, timeout in %ld seconds\n",
                                 keep_waiting / (1000 * 5));
                        wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
                            1000 * 5;
                }

                udelay(200);
        }
}

/*
 * This function is called when the system is being restarted or halted due
 * to some sort of system failure. If this is the case, we need to notify the
 * other partitions to disengage from all references to our memory.
 * This function can also be called when our heartbeater may be offlined
 * for a time. In that case we need to notify the other partitions not to
 * worry about the lack of a heartbeat.
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
{
#ifdef CONFIG_IA64              /* !!! temporary kludge */
        switch (event) {
        case DIE_MACHINE_RESTART:
        case DIE_MACHINE_HALT:
                xpc_die_deactivate();
                break;

        case DIE_KDEBUG_ENTER:
                /* Should lack of heartbeat be ignored by other partitions? */
                if (!xpc_kdebug_ignore)
                        break;

                fallthrough;
        case DIE_MCA_MONARCH_ENTER:
        case DIE_INIT_MONARCH_ENTER:
                xpc_arch_ops.offline_heartbeat();
                break;

        case DIE_KDEBUG_LEAVE:
                /* Is lack of heartbeat being ignored by other partitions? */
                if (!xpc_kdebug_ignore)
                        break;

                fallthrough;
        case DIE_MCA_MONARCH_LEAVE:
        case DIE_INIT_MONARCH_LEAVE:
                xpc_arch_ops.online_heartbeat();
                break;
        }
#else
        struct die_args *die_args = _die_args;

        switch (event) {
        case DIE_TRAP:
                if (die_args->trapnr == X86_TRAP_DF)
                        xpc_die_deactivate();

                if (((die_args->trapnr == X86_TRAP_MF) ||
                     (die_args->trapnr == X86_TRAP_XF)) &&
                    !user_mode(die_args->regs))
                        xpc_die_deactivate();

                break;
        case DIE_INT3:
        case DIE_DEBUG:
                break;
        case DIE_OOPS:
        case DIE_GPF:
        default:
                xpc_die_deactivate();
        }
#endif

        return NOTIFY_DONE;
}

static int __init
xpc_init(void)
{
        int ret;
        struct task_struct *kthread;

        dev_set_name(xpc_part, "part");
        dev_set_name(xpc_chan, "chan");

        if (is_uv_system()) {
                ret = xpc_init_uv();

        } else {
                ret = -ENODEV;
        }

        if (ret != 0)
                return ret;

        ret = xpc_setup_partitions();
        if (ret != 0) {
                dev_err(xpc_part, "can't get memory for partition structure\n");
                goto out_1;
        }

        xpc_sysctl = register_sysctl_table(xpc_sys_dir);

        /*
         * Fill the partition reserved page with the information needed by
         * other partitions to discover we are alive and establish initial
         * communications.
         */
        ret = xpc_setup_rsvd_page();
        if (ret != 0) {
                dev_err(xpc_part, "can't setup our reserved page\n");
                goto out_2;
        }

        /* add ourselves to the reboot_notifier_list */
        ret = register_reboot_notifier(&xpc_reboot_notifier);
        if (ret != 0)
                dev_warn(xpc_part, "can't register reboot notifier\n");

        /* add ourselves to the die_notifier list */
        ret = register_die_notifier(&xpc_die_notifier);
        if (ret != 0)
                dev_warn(xpc_part, "can't register die notifier\n");

        /*
         * The real work-horse behind xpc.  This processes incoming
         * interrupts and monitors remote heartbeats.
         */
        kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
        if (IS_ERR(kthread)) {
                dev_err(xpc_part, "failed while forking hb check thread\n");
                ret = -EBUSY;
                goto out_3;
        }

        /*
         * Start up a thread that will attempt to discover other partitions
         * to activate based on info provided by SAL. This new thread is
         * short-lived and will exit once discovery is complete.
         */
        kthread = kthread_run(xpc_initiate_discovery, NULL,
                              XPC_DISCOVERY_THREAD_NAME);
        if (IS_ERR(kthread)) {
                dev_err(xpc_part, "failed while forking discovery thread\n");

                /* mark this new thread as a non-starter */
                complete(&xpc_discovery_exited);

                xpc_do_exit(xpUnloading);
                return -EBUSY;
        }

        /* set the interface to point at XPC's functions */
        xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
                          xpc_initiate_send, xpc_initiate_send_notify,
                          xpc_initiate_received, xpc_initiate_partid_to_nasids);

        return 0;

        /* initialization was not successful */
out_3:
        xpc_teardown_rsvd_page();

        (void)unregister_die_notifier(&xpc_die_notifier);
        (void)unregister_reboot_notifier(&xpc_reboot_notifier);
out_2:
        if (xpc_sysctl)
                unregister_sysctl_table(xpc_sysctl);

        xpc_teardown_partitions();
out_1:
        if (is_uv_system())
                xpc_exit_uv();
        return ret;
}

module_init(xpc_init);

static void __exit
xpc_exit(void)
{
        xpc_do_exit(xpUnloading);
}

module_exit(xpc_exit);

MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
                 "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
                 "heartbeat checks.");

module_param(xpc_disengage_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
                 "for disengage to complete.");

module_param(xpc_kdebug_ignore, int, 0);
MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
                 "other partitions when dropping into kdebug.");