drivers/hv/vmbus_drv.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2009, Microsoft Corporation.
4  *
5  * Authors:
6  *   Haiyang Zhang <haiyangz@microsoft.com>
7  *   Hank Janssen  <hjanssen@microsoft.com>
8  *   K. Y. Srinivasan <kys@microsoft.com>
9  */
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/device.h>
15 #include <linux/interrupt.h>
16 #include <linux/sysctl.h>
17 #include <linux/slab.h>
18 #include <linux/acpi.h>
19 #include <linux/completion.h>
20 #include <linux/hyperv.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/clockchips.h>
23 #include <linux/cpu.h>
24 #include <linux/sched/task_stack.h>
25
26 #include <linux/delay.h>
27 #include <linux/notifier.h>
28 #include <linux/ptrace.h>
29 #include <linux/screen_info.h>
30 #include <linux/kdebug.h>
31 #include <linux/efi.h>
32 #include <linux/random.h>
33 #include <linux/kernel.h>
34 #include <linux/syscore_ops.h>
35 #include <clocksource/hyperv_timer.h>
36 #include "hyperv_vmbus.h"
37
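/* A device ID added to a driver at runtime via the driver's 'new_id' sysfs attribute. */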
38 struct vmbus_dynid {
39         struct list_head node;
40         struct hv_vmbus_device_id id;
41 };
42
43 static struct acpi_device  *hv_acpi_dev;
44
45 static struct completion probe_event;
46
47 static int hyperv_cpuhp_online;
48
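/* Page handed to Hyper-V with the panic kmsg data; see hv_kmsg_dump(). */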
49 static void *hv_panic_page;
50
51 /*
52  * Boolean to control whether to report panic messages over Hyper-V.
53  *
54  * It can be set via /proc/sys/kernel/hyperv_record_panic_msg
55  */
56 static int sysctl_record_panic_msg = 1;
57
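/*
 * True when the die/panic notifiers should report through hyperv_report_panic():
 * kmsg reporting is disabled via the sysctl, or no panic page was allocated, so
 * no hyperv_report_panic_msg() call will follow.
 */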
58 static int hyperv_report_reg(void)
59 {
60         return !sysctl_record_panic_msg || !hv_panic_page;
61 }
62
63 static int hyperv_panic_event(struct notifier_block *nb, unsigned long val,
64                               void *args)
65 {
66         struct pt_regs *regs;
67
68         vmbus_initiate_unload(true);
69
70         /*
71          * Hyper-V should be notified only once about a panic.  If we are going
72          * to report the kmsg data later via hyperv_report_panic_msg(), don't do
73          * the notification here.
74          */
75         if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
76             && hyperv_report_reg()) {
77                 regs = current_pt_regs();
78                 hyperv_report_panic(regs, val, false);
79         }
80         return NOTIFY_DONE;
81 }
82
83 static int hyperv_die_event(struct notifier_block *nb, unsigned long val,
84                             void *args)
85 {
86         struct die_args *die = (struct die_args *)args;
87         struct pt_regs *regs = die->regs;
88
89         /*
90          * Hyper-V should be notified only once about a panic.  If we are going
91          * to report the kmsg data later via hyperv_report_panic_msg(), don't do
92          * the notification here.
93          */
94         if (hyperv_report_reg())
95                 hyperv_report_panic(regs, val, true);
96         return NOTIFY_DONE;
97 }
98
99 static struct notifier_block hyperv_die_block = {
100         .notifier_call = hyperv_die_event,
101 };
102 static struct notifier_block hyperv_panic_block = {
103         .notifier_call = hyperv_panic_event,
104 };
105
106 static const char *fb_mmio_name = "fb_range";
107 static struct resource *fb_mmio;
108 static struct resource *hyperv_mmio;
109 static DEFINE_MUTEX(hyperv_mmio_lock);
110
111 static int vmbus_exists(void)
112 {
113         if (hv_acpi_dev == NULL)
114                 return -ENODEV;
115
116         return 0;
117 }
118
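/*
 * The monitor page tracks monitor IDs in groups of 32: derive the trigger-group
 * index and the slot offset within that group from a channel's monitor ID.
 */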
119 static u8 channel_monitor_group(const struct vmbus_channel *channel)
120 {
121         return (u8)channel->offermsg.monitorid / 32;
122 }
123
124 static u8 channel_monitor_offset(const struct vmbus_channel *channel)
125 {
126         return (u8)channel->offermsg.monitorid % 32;
127 }
128
129 static u32 channel_pending(const struct vmbus_channel *channel,
130                            const struct hv_monitor_page *monitor_page)
131 {
132         u8 monitor_group = channel_monitor_group(channel);
133
134         return monitor_page->trigger_group[monitor_group].pending;
135 }
136
137 static u32 channel_latency(const struct vmbus_channel *channel,
138                            const struct hv_monitor_page *monitor_page)
139 {
140         u8 monitor_group = channel_monitor_group(channel);
141         u8 monitor_offset = channel_monitor_offset(channel);
142
143         return monitor_page->latency[monitor_group][monitor_offset];
144 }
145
146 static u32 channel_conn_id(struct vmbus_channel *channel,
147                            struct hv_monitor_page *monitor_page)
148 {
149         u8 monitor_group = channel_monitor_group(channel);
150         u8 monitor_offset = channel_monitor_offset(channel);
151         return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id;
152 }
153
154 static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr,
155                        char *buf)
156 {
157         struct hv_device *hv_dev = device_to_hv_device(dev);
158
159         if (!hv_dev->channel)
160                 return -ENODEV;
161         return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid);
162 }
163 static DEVICE_ATTR_RO(id);
164
165 static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr,
166                           char *buf)
167 {
168         struct hv_device *hv_dev = device_to_hv_device(dev);
169
170         if (!hv_dev->channel)
171                 return -ENODEV;
172         return sprintf(buf, "%d\n", hv_dev->channel->state);
173 }
174 static DEVICE_ATTR_RO(state);
175
176 static ssize_t monitor_id_show(struct device *dev,
177                                struct device_attribute *dev_attr, char *buf)
178 {
179         struct hv_device *hv_dev = device_to_hv_device(dev);
180
181         if (!hv_dev->channel)
182                 return -ENODEV;
183         return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid);
184 }
185 static DEVICE_ATTR_RO(monitor_id);
186
187 static ssize_t class_id_show(struct device *dev,
188                                struct device_attribute *dev_attr, char *buf)
189 {
190         struct hv_device *hv_dev = device_to_hv_device(dev);
191
192         if (!hv_dev->channel)
193                 return -ENODEV;
194         return sprintf(buf, "{%pUl}\n",
195                        &hv_dev->channel->offermsg.offer.if_type);
196 }
197 static DEVICE_ATTR_RO(class_id);
198
199 static ssize_t device_id_show(struct device *dev,
200                               struct device_attribute *dev_attr, char *buf)
201 {
202         struct hv_device *hv_dev = device_to_hv_device(dev);
203
204         if (!hv_dev->channel)
205                 return -ENODEV;
206         return sprintf(buf, "{%pUl}\n",
207                        &hv_dev->channel->offermsg.offer.if_instance);
208 }
209 static DEVICE_ATTR_RO(device_id);
210
211 static ssize_t modalias_show(struct device *dev,
212                              struct device_attribute *dev_attr, char *buf)
213 {
214         struct hv_device *hv_dev = device_to_hv_device(dev);
215
216         return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type);
217 }
218 static DEVICE_ATTR_RO(modalias);
219
220 #ifdef CONFIG_NUMA
221 static ssize_t numa_node_show(struct device *dev,
222                               struct device_attribute *attr, char *buf)
223 {
224         struct hv_device *hv_dev = device_to_hv_device(dev);
225
226         if (!hv_dev->channel)
227                 return -ENODEV;
228
229         return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu));
230 }
231 static DEVICE_ATTR_RO(numa_node);
232 #endif
233
234 static ssize_t server_monitor_pending_show(struct device *dev,
235                                            struct device_attribute *dev_attr,
236                                            char *buf)
237 {
238         struct hv_device *hv_dev = device_to_hv_device(dev);
239
240         if (!hv_dev->channel)
241                 return -ENODEV;
242         return sprintf(buf, "%d\n",
243                        channel_pending(hv_dev->channel,
244                                        vmbus_connection.monitor_pages[0]));
245 }
246 static DEVICE_ATTR_RO(server_monitor_pending);
247
248 static ssize_t client_monitor_pending_show(struct device *dev,
249                                            struct device_attribute *dev_attr,
250                                            char *buf)
251 {
252         struct hv_device *hv_dev = device_to_hv_device(dev);
253
254         if (!hv_dev->channel)
255                 return -ENODEV;
256         return sprintf(buf, "%d\n",
257                        channel_pending(hv_dev->channel,
258                                        vmbus_connection.monitor_pages[1]));
259 }
260 static DEVICE_ATTR_RO(client_monitor_pending);
261
262 static ssize_t server_monitor_latency_show(struct device *dev,
263                                            struct device_attribute *dev_attr,
264                                            char *buf)
265 {
266         struct hv_device *hv_dev = device_to_hv_device(dev);
267
268         if (!hv_dev->channel)
269                 return -ENODEV;
270         return sprintf(buf, "%d\n",
271                        channel_latency(hv_dev->channel,
272                                        vmbus_connection.monitor_pages[0]));
273 }
274 static DEVICE_ATTR_RO(server_monitor_latency);
275
276 static ssize_t client_monitor_latency_show(struct device *dev,
277                                            struct device_attribute *dev_attr,
278                                            char *buf)
279 {
280         struct hv_device *hv_dev = device_to_hv_device(dev);
281
282         if (!hv_dev->channel)
283                 return -ENODEV;
284         return sprintf(buf, "%d\n",
285                        channel_latency(hv_dev->channel,
286                                        vmbus_connection.monitor_pages[1]));
287 }
288 static DEVICE_ATTR_RO(client_monitor_latency);
289
290 static ssize_t server_monitor_conn_id_show(struct device *dev,
291                                            struct device_attribute *dev_attr,
292                                            char *buf)
293 {
294         struct hv_device *hv_dev = device_to_hv_device(dev);
295
296         if (!hv_dev->channel)
297                 return -ENODEV;
298         return sprintf(buf, "%d\n",
299                        channel_conn_id(hv_dev->channel,
300                                        vmbus_connection.monitor_pages[0]));
301 }
302 static DEVICE_ATTR_RO(server_monitor_conn_id);
303
304 static ssize_t client_monitor_conn_id_show(struct device *dev,
305                                            struct device_attribute *dev_attr,
306                                            char *buf)
307 {
308         struct hv_device *hv_dev = device_to_hv_device(dev);
309
310         if (!hv_dev->channel)
311                 return -ENODEV;
312         return sprintf(buf, "%d\n",
313                        channel_conn_id(hv_dev->channel,
314                                        vmbus_connection.monitor_pages[1]));
315 }
316 static DEVICE_ATTR_RO(client_monitor_conn_id);
317
318 static ssize_t out_intr_mask_show(struct device *dev,
319                                   struct device_attribute *dev_attr, char *buf)
320 {
321         struct hv_device *hv_dev = device_to_hv_device(dev);
322         struct hv_ring_buffer_debug_info outbound;
323         int ret;
324
325         if (!hv_dev->channel)
326                 return -ENODEV;
327
328         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
329                                           &outbound);
330         if (ret < 0)
331                 return ret;
332
333         return sprintf(buf, "%d\n", outbound.current_interrupt_mask);
334 }
335 static DEVICE_ATTR_RO(out_intr_mask);
336
337 static ssize_t out_read_index_show(struct device *dev,
338                                    struct device_attribute *dev_attr, char *buf)
339 {
340         struct hv_device *hv_dev = device_to_hv_device(dev);
341         struct hv_ring_buffer_debug_info outbound;
342         int ret;
343
344         if (!hv_dev->channel)
345                 return -ENODEV;
346
347         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
348                                           &outbound);
349         if (ret < 0)
350                 return ret;
351         return sprintf(buf, "%d\n", outbound.current_read_index);
352 }
353 static DEVICE_ATTR_RO(out_read_index);
354
355 static ssize_t out_write_index_show(struct device *dev,
356                                     struct device_attribute *dev_attr,
357                                     char *buf)
358 {
359         struct hv_device *hv_dev = device_to_hv_device(dev);
360         struct hv_ring_buffer_debug_info outbound;
361         int ret;
362
363         if (!hv_dev->channel)
364                 return -ENODEV;
365
366         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
367                                           &outbound);
368         if (ret < 0)
369                 return ret;
370         return sprintf(buf, "%d\n", outbound.current_write_index);
371 }
372 static DEVICE_ATTR_RO(out_write_index);
373
374 static ssize_t out_read_bytes_avail_show(struct device *dev,
375                                          struct device_attribute *dev_attr,
376                                          char *buf)
377 {
378         struct hv_device *hv_dev = device_to_hv_device(dev);
379         struct hv_ring_buffer_debug_info outbound;
380         int ret;
381
382         if (!hv_dev->channel)
383                 return -ENODEV;
384
385         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
386                                           &outbound);
387         if (ret < 0)
388                 return ret;
389         return sprintf(buf, "%d\n", outbound.bytes_avail_toread);
390 }
391 static DEVICE_ATTR_RO(out_read_bytes_avail);
392
393 static ssize_t out_write_bytes_avail_show(struct device *dev,
394                                           struct device_attribute *dev_attr,
395                                           char *buf)
396 {
397         struct hv_device *hv_dev = device_to_hv_device(dev);
398         struct hv_ring_buffer_debug_info outbound;
399         int ret;
400
401         if (!hv_dev->channel)
402                 return -ENODEV;
403
404         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
405                                           &outbound);
406         if (ret < 0)
407                 return ret;
408         return sprintf(buf, "%d\n", outbound.bytes_avail_towrite);
409 }
410 static DEVICE_ATTR_RO(out_write_bytes_avail);
411
412 static ssize_t in_intr_mask_show(struct device *dev,
413                                  struct device_attribute *dev_attr, char *buf)
414 {
415         struct hv_device *hv_dev = device_to_hv_device(dev);
416         struct hv_ring_buffer_debug_info inbound;
417         int ret;
418
419         if (!hv_dev->channel)
420                 return -ENODEV;
421
422         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
423         if (ret < 0)
424                 return ret;
425
426         return sprintf(buf, "%d\n", inbound.current_interrupt_mask);
427 }
428 static DEVICE_ATTR_RO(in_intr_mask);
429
430 static ssize_t in_read_index_show(struct device *dev,
431                                   struct device_attribute *dev_attr, char *buf)
432 {
433         struct hv_device *hv_dev = device_to_hv_device(dev);
434         struct hv_ring_buffer_debug_info inbound;
435         int ret;
436
437         if (!hv_dev->channel)
438                 return -ENODEV;
439
440         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
441         if (ret < 0)
442                 return ret;
443
444         return sprintf(buf, "%d\n", inbound.current_read_index);
445 }
446 static DEVICE_ATTR_RO(in_read_index);
447
448 static ssize_t in_write_index_show(struct device *dev,
449                                    struct device_attribute *dev_attr, char *buf)
450 {
451         struct hv_device *hv_dev = device_to_hv_device(dev);
452         struct hv_ring_buffer_debug_info inbound;
453         int ret;
454
455         if (!hv_dev->channel)
456                 return -ENODEV;
457
458         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
459         if (ret < 0)
460                 return ret;
461
462         return sprintf(buf, "%d\n", inbound.current_write_index);
463 }
464 static DEVICE_ATTR_RO(in_write_index);
465
466 static ssize_t in_read_bytes_avail_show(struct device *dev,
467                                         struct device_attribute *dev_attr,
468                                         char *buf)
469 {
470         struct hv_device *hv_dev = device_to_hv_device(dev);
471         struct hv_ring_buffer_debug_info inbound;
472         int ret;
473
474         if (!hv_dev->channel)
475                 return -ENODEV;
476
477         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
478         if (ret < 0)
479                 return ret;
480
481         return sprintf(buf, "%d\n", inbound.bytes_avail_toread);
482 }
483 static DEVICE_ATTR_RO(in_read_bytes_avail);
484
485 static ssize_t in_write_bytes_avail_show(struct device *dev,
486                                          struct device_attribute *dev_attr,
487                                          char *buf)
488 {
489         struct hv_device *hv_dev = device_to_hv_device(dev);
490         struct hv_ring_buffer_debug_info inbound;
491         int ret;
492
493         if (!hv_dev->channel)
494                 return -ENODEV;
495
496         ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
497         if (ret < 0)
498                 return ret;
499
500         return sprintf(buf, "%d\n", inbound.bytes_avail_towrite);
501 }
502 static DEVICE_ATTR_RO(in_write_bytes_avail);
503
504 static ssize_t channel_vp_mapping_show(struct device *dev,
505                                        struct device_attribute *dev_attr,
506                                        char *buf)
507 {
508         struct hv_device *hv_dev = device_to_hv_device(dev);
509         struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
510         int buf_size = PAGE_SIZE, n_written, tot_written;
511         struct list_head *cur;
512
513         if (!channel)
514                 return -ENODEV;
515
516         mutex_lock(&vmbus_connection.channel_mutex);
517
518         tot_written = snprintf(buf, buf_size, "%u:%u\n",
519                 channel->offermsg.child_relid, channel->target_cpu);
520
521         list_for_each(cur, &channel->sc_list) {
522                 if (tot_written >= buf_size - 1)
523                         break;
524
525                 cur_sc = list_entry(cur, struct vmbus_channel, sc_list);
526                 n_written = scnprintf(buf + tot_written,
527                                      buf_size - tot_written,
528                                      "%u:%u\n",
529                                      cur_sc->offermsg.child_relid,
530                                      cur_sc->target_cpu);
531                 tot_written += n_written;
532         }
533
534         mutex_unlock(&vmbus_connection.channel_mutex);
535
536         return tot_written;
537 }
538 static DEVICE_ATTR_RO(channel_vp_mapping);
539
540 static ssize_t vendor_show(struct device *dev,
541                            struct device_attribute *dev_attr,
542                            char *buf)
543 {
544         struct hv_device *hv_dev = device_to_hv_device(dev);
545         return sprintf(buf, "0x%x\n", hv_dev->vendor_id);
546 }
547 static DEVICE_ATTR_RO(vendor);
548
549 static ssize_t device_show(struct device *dev,
550                            struct device_attribute *dev_attr,
551                            char *buf)
552 {
553         struct hv_device *hv_dev = device_to_hv_device(dev);
554         return sprintf(buf, "0x%x\n", hv_dev->device_id);
555 }
556 static DEVICE_ATTR_RO(device);
557
558 static ssize_t driver_override_store(struct device *dev,
559                                      struct device_attribute *attr,
560                                      const char *buf, size_t count)
561 {
562         struct hv_device *hv_dev = device_to_hv_device(dev);
563         char *driver_override, *old, *cp;
564
565         /* We need to keep extra room for a newline */
566         if (count >= (PAGE_SIZE - 1))
567                 return -EINVAL;
568
569         driver_override = kstrndup(buf, count, GFP_KERNEL);
570         if (!driver_override)
571                 return -ENOMEM;
572
573         cp = strchr(driver_override, '\n');
574         if (cp)
575                 *cp = '\0';
576
577         device_lock(dev);
578         old = hv_dev->driver_override;
579         if (strlen(driver_override)) {
580                 hv_dev->driver_override = driver_override;
581         } else {
582                 kfree(driver_override);
583                 hv_dev->driver_override = NULL;
584         }
585         device_unlock(dev);
586
587         kfree(old);
588
589         return count;
590 }
591
592 static ssize_t driver_override_show(struct device *dev,
593                                     struct device_attribute *attr, char *buf)
594 {
595         struct hv_device *hv_dev = device_to_hv_device(dev);
596         ssize_t len;
597
598         device_lock(dev);
599         len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override);
600         device_unlock(dev);
601
602         return len;
603 }
604 static DEVICE_ATTR_RW(driver_override);
605
606 /* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */
607 static struct attribute *vmbus_dev_attrs[] = {
608         &dev_attr_id.attr,
609         &dev_attr_state.attr,
610         &dev_attr_monitor_id.attr,
611         &dev_attr_class_id.attr,
612         &dev_attr_device_id.attr,
613         &dev_attr_modalias.attr,
614 #ifdef CONFIG_NUMA
615         &dev_attr_numa_node.attr,
616 #endif
617         &dev_attr_server_monitor_pending.attr,
618         &dev_attr_client_monitor_pending.attr,
619         &dev_attr_server_monitor_latency.attr,
620         &dev_attr_client_monitor_latency.attr,
621         &dev_attr_server_monitor_conn_id.attr,
622         &dev_attr_client_monitor_conn_id.attr,
623         &dev_attr_out_intr_mask.attr,
624         &dev_attr_out_read_index.attr,
625         &dev_attr_out_write_index.attr,
626         &dev_attr_out_read_bytes_avail.attr,
627         &dev_attr_out_write_bytes_avail.attr,
628         &dev_attr_in_intr_mask.attr,
629         &dev_attr_in_read_index.attr,
630         &dev_attr_in_write_index.attr,
631         &dev_attr_in_read_bytes_avail.attr,
632         &dev_attr_in_write_bytes_avail.attr,
633         &dev_attr_channel_vp_mapping.attr,
634         &dev_attr_vendor.attr,
635         &dev_attr_device.attr,
636         &dev_attr_driver_override.attr,
637         NULL,
638 };
639
640 /*
641  * Device-level attribute_group callback function. Returns the permission for
642  * each attribute, and returns 0 if an attribute is not visible.
643  */
644 static umode_t vmbus_dev_attr_is_visible(struct kobject *kobj,
645                                          struct attribute *attr, int idx)
646 {
647         struct device *dev = kobj_to_dev(kobj);
648         const struct hv_device *hv_dev = device_to_hv_device(dev);
649
650         /* Hide the monitor attributes if the monitor mechanism is not used. */
651         if (!hv_dev->channel->offermsg.monitor_allocated &&
652             (attr == &dev_attr_monitor_id.attr ||
653              attr == &dev_attr_server_monitor_pending.attr ||
654              attr == &dev_attr_client_monitor_pending.attr ||
655              attr == &dev_attr_server_monitor_latency.attr ||
656              attr == &dev_attr_client_monitor_latency.attr ||
657              attr == &dev_attr_server_monitor_conn_id.attr ||
658              attr == &dev_attr_client_monitor_conn_id.attr))
659                 return 0;
660
661         return attr->mode;
662 }
663
664 static const struct attribute_group vmbus_dev_group = {
665         .attrs = vmbus_dev_attrs,
666         .is_visible = vmbus_dev_attr_is_visible
667 };
668 __ATTRIBUTE_GROUPS(vmbus_dev);
669
670 /*
671  * vmbus_uevent - add uevent for our device
672  *
673  * This routine is invoked when a device is added or removed on the vmbus to
674  * generate a uevent for udev in userspace. udev will then look at its rules
675  * and the uevent generated here to load the appropriate driver.
676  *
677  * The alias string will be of the form vmbus:guid, where guid is the string
678  * representation of the device guid (each byte of the guid will be
679  * represented with two hex characters).
680  */
681 static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env)
682 {
683         struct hv_device *dev = device_to_hv_device(device);
684         const char *format = "MODALIAS=vmbus:%*phN";
685
686         return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type);
687 }
688
689 static const struct hv_vmbus_device_id *
690 hv_vmbus_dev_match(const struct hv_vmbus_device_id *id, const guid_t *guid)
691 {
692         if (id == NULL)
693                 return NULL; /* empty device table */
694
695         for (; !guid_is_null(&id->guid); id++)
696                 if (guid_equal(&id->guid, guid))
697                         return id;
698
699         return NULL;
700 }
701
702 static const struct hv_vmbus_device_id *
703 hv_vmbus_dynid_match(struct hv_driver *drv, const guid_t *guid)
704 {
705         const struct hv_vmbus_device_id *id = NULL;
706         struct vmbus_dynid *dynid;
707
708         spin_lock(&drv->dynids.lock);
709         list_for_each_entry(dynid, &drv->dynids.list, node) {
710                 if (guid_equal(&dynid->id.guid, guid)) {
711                         id = &dynid->id;
712                         break;
713                 }
714         }
715         spin_unlock(&drv->dynids.lock);
716
717         return id;
718 }
719
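/* All-zero ID returned by hv_vmbus_get_id() as the dummy match when driver_override is set. */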
720 static const struct hv_vmbus_device_id vmbus_device_null;
721
722 /*
723  * Return a matching hv_vmbus_device_id pointer.
724  * If there is no match, return NULL.
725  */
726 static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv,
727                                                         struct hv_device *dev)
728 {
729         const guid_t *guid = &dev->dev_type;
730         const struct hv_vmbus_device_id *id;
731
732         /* When driver_override is set, only bind to the matching driver */
733         if (dev->driver_override && strcmp(dev->driver_override, drv->name))
734                 return NULL;
735
736         /* Look at the dynamic ids first, before the static ones */
737         id = hv_vmbus_dynid_match(drv, guid);
738         if (!id)
739                 id = hv_vmbus_dev_match(drv->id_table, guid);
740
741         /* driver_override will always match, send a dummy id */
742         if (!id && dev->driver_override)
743                 id = &vmbus_device_null;
744
745         return id;
746 }
747
748 /* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */
749 static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid)
750 {
751         struct vmbus_dynid *dynid;
752
753         dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
754         if (!dynid)
755                 return -ENOMEM;
756
757         dynid->id.guid = *guid;
758
759         spin_lock(&drv->dynids.lock);
760         list_add_tail(&dynid->node, &drv->dynids.list);
761         spin_unlock(&drv->dynids.lock);
762
763         return driver_attach(&drv->driver);
764 }
765
766 static void vmbus_free_dynids(struct hv_driver *drv)
767 {
768         struct vmbus_dynid *dynid, *n;
769
770         spin_lock(&drv->dynids.lock);
771         list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
772                 list_del(&dynid->node);
773                 kfree(dynid);
774         }
775         spin_unlock(&drv->dynids.lock);
776 }
777
778 /*
779  * store_new_id - sysfs frontend to vmbus_add_dynid()
780  *
781  * Allow GUIDs to be added to an existing driver via sysfs.
782  */
783 static ssize_t new_id_store(struct device_driver *driver, const char *buf,
784                             size_t count)
785 {
786         struct hv_driver *drv = drv_to_hv_drv(driver);
787         guid_t guid;
788         ssize_t retval;
789
790         retval = guid_parse(buf, &guid);
791         if (retval)
792                 return retval;
793
794         if (hv_vmbus_dynid_match(drv, &guid))
795                 return -EEXIST;
796
797         retval = vmbus_add_dynid(drv, &guid);
798         if (retval)
799                 return retval;
800         return count;
801 }
802 static DRIVER_ATTR_WO(new_id);
803
804 /*
805  * store_remove_id - remove a device ID from this driver
806  *
807  * Removes a dynamic vmbus device ID from this driver.
808  */
809 static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
810                                size_t count)
811 {
812         struct hv_driver *drv = drv_to_hv_drv(driver);
813         struct vmbus_dynid *dynid, *n;
814         guid_t guid;
815         ssize_t retval;
816
817         retval = guid_parse(buf, &guid);
818         if (retval)
819                 return retval;
820
821         retval = -ENODEV;
822         spin_lock(&drv->dynids.lock);
823         list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
824                 struct hv_vmbus_device_id *id = &dynid->id;
825
826                 if (guid_equal(&id->guid, &guid)) {
827                         list_del(&dynid->node);
828                         kfree(dynid);
829                         retval = count;
830                         break;
831                 }
832         }
833         spin_unlock(&drv->dynids.lock);
834
835         return retval;
836 }
837 static DRIVER_ATTR_WO(remove_id);
838
839 static struct attribute *vmbus_drv_attrs[] = {
840         &driver_attr_new_id.attr,
841         &driver_attr_remove_id.attr,
842         NULL,
843 };
844 ATTRIBUTE_GROUPS(vmbus_drv);
845
846
847 /*
848  * vmbus_match - Attempt to match the specified device to the specified driver
849  */
850 static int vmbus_match(struct device *device, struct device_driver *driver)
851 {
852         struct hv_driver *drv = drv_to_hv_drv(driver);
853         struct hv_device *hv_dev = device_to_hv_device(device);
854
855         /* The hv_sock driver handles all hv_sock offers. */
856         if (is_hvsock_channel(hv_dev->channel))
857                 return drv->hvsock;
858
859         if (hv_vmbus_get_id(drv, hv_dev))
860                 return 1;
861
862         return 0;
863 }
864
865 /*
866  * vmbus_probe - Add the new vmbus child device
867  */
868 static int vmbus_probe(struct device *child_device)
869 {
870         int ret = 0;
871         struct hv_driver *drv =
872                         drv_to_hv_drv(child_device->driver);
873         struct hv_device *dev = device_to_hv_device(child_device);
874         const struct hv_vmbus_device_id *dev_id;
875
876         dev_id = hv_vmbus_get_id(drv, dev);
877         if (drv->probe) {
878                 ret = drv->probe(dev, dev_id);
879                 if (ret != 0)
880                         pr_err("probe failed for device %s (%d)\n",
881                                dev_name(child_device), ret);
882
883         } else {
884                 pr_err("probe not set for driver %s\n",
885                        dev_name(child_device));
886                 ret = -ENODEV;
887         }
888         return ret;
889 }
890
891 /*
892  * vmbus_remove - Remove a vmbus device
893  */
894 static int vmbus_remove(struct device *child_device)
895 {
896         struct hv_driver *drv;
897         struct hv_device *dev = device_to_hv_device(child_device);
898
899         if (child_device->driver) {
900                 drv = drv_to_hv_drv(child_device->driver);
901                 if (drv->remove)
902                         drv->remove(dev);
903         }
904
905         return 0;
906 }
907
908
909 /*
910  * vmbus_shutdown - Shutdown a vmbus device
911  */
912 static void vmbus_shutdown(struct device *child_device)
913 {
914         struct hv_driver *drv;
915         struct hv_device *dev = device_to_hv_device(child_device);
916
917
918         /* The device may not be attached yet */
919         if (!child_device->driver)
920                 return;
921
922         drv = drv_to_hv_drv(child_device->driver);
923
924         if (drv->shutdown)
925                 drv->shutdown(dev);
926 }
927
928 #ifdef CONFIG_PM_SLEEP
929 /*
930  * vmbus_suspend - Suspend a vmbus device
931  */
932 static int vmbus_suspend(struct device *child_device)
933 {
934         struct hv_driver *drv;
935         struct hv_device *dev = device_to_hv_device(child_device);
936
937         /* The device may not be attached yet */
938         if (!child_device->driver)
939                 return 0;
940
941         drv = drv_to_hv_drv(child_device->driver);
942         if (!drv->suspend)
943                 return -EOPNOTSUPP;
944
945         return drv->suspend(dev);
946 }
947
948 /*
949  * vmbus_resume - Resume a vmbus device
950  */
951 static int vmbus_resume(struct device *child_device)
952 {
953         struct hv_driver *drv;
954         struct hv_device *dev = device_to_hv_device(child_device);
955
956         /* The device may not be attached yet */
957         if (!child_device->driver)
958                 return 0;
959
960         drv = drv_to_hv_drv(child_device->driver);
961         if (!drv->resume)
962                 return -EOPNOTSUPP;
963
964         return drv->resume(dev);
965 }
966 #else
967 #define vmbus_suspend NULL
968 #define vmbus_resume NULL
969 #endif /* CONFIG_PM_SLEEP */
970
971 /*
972  * vmbus_device_release - Final callback release of the vmbus child device
973  */
974 static void vmbus_device_release(struct device *device)
975 {
976         struct hv_device *hv_dev = device_to_hv_device(device);
977         struct vmbus_channel *channel = hv_dev->channel;
978
979         hv_debug_rm_dev_dir(hv_dev);
980
981         mutex_lock(&vmbus_connection.channel_mutex);
982         hv_process_channel_removal(channel);
983         mutex_unlock(&vmbus_connection.channel_mutex);
984         kfree(hv_dev);
985 }
986
987 /*
988  * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm.
989  *
990  * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we
991  * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there
992  * is no way to wake up a Generation-2 VM.
993  *
994  * The other 4 ops are for hibernation.
995  */
996
997 static const struct dev_pm_ops vmbus_pm = {
998         .suspend_noirq  = NULL,
999         .resume_noirq   = NULL,
1000         .freeze_noirq   = vmbus_suspend,
1001         .thaw_noirq     = vmbus_resume,
1002         .poweroff_noirq = vmbus_suspend,
1003         .restore_noirq  = vmbus_resume,
1004 };
1005
1006 /* The one and only one */
1007 static struct bus_type  hv_bus = {
1008         .name =         "vmbus",
1009         .match =                vmbus_match,
1010         .shutdown =             vmbus_shutdown,
1011         .remove =               vmbus_remove,
1012         .probe =                vmbus_probe,
1013         .uevent =               vmbus_uevent,
1014         .dev_groups =           vmbus_dev_groups,
1015         .drv_groups =           vmbus_drv_groups,
1016         .pm =                   &vmbus_pm,
1017 };
1018
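/* Work item wrapping a copy of a host message, handed off to vmbus_onmessage_work(). */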
1019 struct onmessage_work_context {
1020         struct work_struct work;
1021         struct {
1022                 struct hv_message_header header;
1023                 u8 payload[];
1024         } msg;
1025 };
1026
1027 static void vmbus_onmessage_work(struct work_struct *work)
1028 {
1029         struct onmessage_work_context *ctx;
1030
1031         /* Do not process messages if we're in DISCONNECTED state */
1032         if (vmbus_connection.conn_state == DISCONNECTED)
1033                 return;
1034
1035         ctx = container_of(work, struct onmessage_work_context,
1036                            work);
1037         vmbus_onmessage((struct vmbus_channel_message_header *)
1038                         &ctx->msg.payload);
1039         kfree(ctx);
1040 }
1041
1042 void vmbus_on_msg_dpc(unsigned long data)
1043 {
1044         struct hv_per_cpu_context *hv_cpu = (void *)data;
1045         void *page_addr = hv_cpu->synic_message_page;
1046         struct hv_message *msg = (struct hv_message *)page_addr +
1047                                   VMBUS_MESSAGE_SINT;
1048         struct vmbus_channel_message_header *hdr;
1049         const struct vmbus_channel_message_table_entry *entry;
1050         struct onmessage_work_context *ctx;
1051         u32 message_type = msg->header.message_type;
1052
1053         /*
1054          * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
1055          * it is used in the 'struct vmbus_channel_message_header' definition,
1056          * which is supposed to match the hypervisor ABI.
1057          */
1058         BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));
1059
1060         if (message_type == HVMSG_NONE)
1061                 /* no msg */
1062                 return;
1063
1064         hdr = (struct vmbus_channel_message_header *)msg->u.payload;
1065
1066         trace_vmbus_on_msg_dpc(hdr);
1067
1068         if (hdr->msgtype >= CHANNELMSG_COUNT) {
1069                 WARN_ONCE(1, "unknown msgtype=%d\n", hdr->msgtype);
1070                 goto msg_handled;
1071         }
1072
1073         if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
1074                 WARN_ONCE(1, "payload size is too large (%d)\n",
1075                           msg->header.payload_size);
1076                 goto msg_handled;
1077         }
1078
1079         entry = &channel_message_table[hdr->msgtype];
1080
1081         if (!entry->message_handler)
1082                 goto msg_handled;
1083
1084         if (msg->header.payload_size < entry->min_payload_len) {
1085                 WARN_ONCE(1, "message too short: msgtype=%d len=%d\n",
1086                           hdr->msgtype, msg->header.payload_size);
1087                 goto msg_handled;
1088         }
1089
1090         if (entry->handler_type == VMHT_BLOCKING) {
1091                 ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size,
1092                               GFP_ATOMIC);
1093                 if (ctx == NULL)
1094                         return;
1095
1096                 INIT_WORK(&ctx->work, vmbus_onmessage_work);
1097                 memcpy(&ctx->msg, msg, sizeof(msg->header) +
1098                        msg->header.payload_size);
1099
1100                 /*
1101                  * The host can generate a rescind message while we
1102                  * may still be handling the original offer. We deal with
1103                  * this condition by relying on the synchronization provided
1104                  * by offer_in_progress and by channel_mutex.  See also the
1105                  * inline comments in vmbus_onoffer_rescind().
1106                  */
1107                 switch (hdr->msgtype) {
1108                 case CHANNELMSG_RESCIND_CHANNELOFFER:
1109                         /*
1110                          * If we are handling the rescind message;
1111                          * If we are handling the rescind message,
1112                          *
1113                          * The OFFER message and the RESCIND message should
1114                          * not be handled by the same serialized work queue,
1115                          * because the OFFER handler may call vmbus_open(),
1116                          * which tries to open the channel by sending an
1117                          * OPEN_CHANNEL message to the host and waits for
1118                          * the host's response; however, if the host has
1119                          * rescinded the channel before it receives the
1120                          * OPEN_CHANNEL message, the host just silently
1121                          * ignores the OPEN_CHANNEL message; as a result,
1122                          * the guest's OFFER handler hangs forever if we
1123                          * handle the RESCIND message in the same serialized
1124                          * work queue: the RESCIND handler can not start to
1125                          * run before the OFFER handler finishes.
1126                          */
1127                         schedule_work(&ctx->work);
1128                         break;
1129
1130                 case CHANNELMSG_OFFERCHANNEL:
1131                         /*
1132                          * The host sends the offer message of a given channel
1133                          * before sending the rescind message of the same
1134                          * channel.  These messages are sent to the guest's
1135                          * connect CPU; the guest then starts processing them
1136                          * in the tasklet handler on this CPU:
1137                          *
1138                          * VMBUS_CONNECT_CPU
1139                          *
1140                          * [vmbus_on_msg_dpc()]
1141                          * atomic_inc()  // CHANNELMSG_OFFERCHANNEL
1142                          * queue_work()
1143                          * ...
1144                          * [vmbus_on_msg_dpc()]
1145                          * schedule_work()  // CHANNELMSG_RESCIND_CHANNELOFFER
1146                          *
1147                          * We rely on the memory-ordering properties of the
1148                          * queue_work() and schedule_work() primitives, which
1149                          * guarantee that the atomic increment will be visible
1150                          * to the CPUs which will execute the offer & rescind
1151                          * works by the time these works will start execution.
1152                          */
1153                         atomic_inc(&vmbus_connection.offer_in_progress);
1154                         fallthrough;
1155
1156                 default:
1157                         queue_work(vmbus_connection.work_queue, &ctx->work);
1158                 }
1159         } else
1160                 entry->message_handler(hdr);
1161
1162 msg_handled:
1163         vmbus_signal_eom(msg, message_type);
1164 }
1165
1166 #ifdef CONFIG_PM_SLEEP
1167 /*
1168  * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
1169  * hibernation, because hv_sock connections can not persist across hibernation.
1170  */
1171 static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
1172 {
1173         struct onmessage_work_context *ctx;
1174         struct vmbus_channel_rescind_offer *rescind;
1175
1176         WARN_ON(!is_hvsock_channel(channel));
1177
1178         /*
1179          * Allocation size is small and the allocation should really not fail,
1180          * otherwise the state of the hv_sock connections ends up in limbo.
1181          */
1182         ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind),
1183                       GFP_KERNEL | __GFP_NOFAIL);
1184
1185         /*
1186          * So far, these are not really used by Linux. Just set them to
1187          * reasonable values conforming to the definitions of the fields.
1188          */
1189         ctx->msg.header.message_type = 1;
1190         ctx->msg.header.payload_size = sizeof(*rescind);
1191
1192         /* These values are actually used by Linux. */
1193         rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload;
1194         rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
1195         rescind->child_relid = channel->offermsg.child_relid;
1196
1197         INIT_WORK(&ctx->work, vmbus_onmessage_work);
1198
1199         queue_work(vmbus_connection.work_queue, &ctx->work);
1200 }
1201 #endif /* CONFIG_PM_SLEEP */
1202
1203 /*
1204  * Schedule all channels with events pending
1205  */
1206 static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
1207 {
1208         unsigned long *recv_int_page;
1209         u32 maxbits, relid;
1210
1211         if (vmbus_proto_version < VERSION_WIN8) {
1212                 maxbits = MAX_NUM_CHANNELS_SUPPORTED;
1213                 recv_int_page = vmbus_connection.recv_int_page;
1214         } else {
1215                 /*
1216                  * When the host is win8 and beyond, the event page
1217                  * can be directly checked to get the id of the channel
1218                  * that has the interrupt pending.
1219                  */
1220                 void *page_addr = hv_cpu->synic_event_page;
1221                 union hv_synic_event_flags *event
1222                         = (union hv_synic_event_flags *)page_addr +
1223                                                  VMBUS_MESSAGE_SINT;
1224
1225                 maxbits = HV_EVENT_FLAGS_COUNT;
1226                 recv_int_page = event->flags;
1227         }
1228
1229         if (unlikely(!recv_int_page))
1230                 return;
1231
1232         for_each_set_bit(relid, recv_int_page, maxbits) {
1233                 void (*callback_fn)(void *context);
1234                 struct vmbus_channel *channel;
1235
1236                 if (!sync_test_and_clear_bit(relid, recv_int_page))
1237                         continue;
1238
1239                 /* Special case - vmbus channel protocol msg */
1240                 if (relid == 0)
1241                         continue;
1242
1243                 /*
1244                  * Pairs with the kfree_rcu() in vmbus_chan_release().
1245                  * Guarantees that the channel data structure doesn't
1246                  * get freed while the channel pointer below is being
1247                  * dereferenced.
1248                  */
1249                 rcu_read_lock();
1250
1251                 /* Find channel based on relid */
1252                 channel = relid2channel(relid);
1253                 if (channel == NULL)
1254                         goto sched_unlock_rcu;
1255
1256                 if (channel->rescind)
1257                         goto sched_unlock_rcu;
1258
1259                 /*
1260                  * Make sure that the ring buffer data structure doesn't get
1261                  * freed while we dereference the ring buffer pointer.  Test
1262                  * for the channel's onchannel_callback being NULL within a
1263                  * sched_lock critical section.  See also the inline comments
1264                  * in vmbus_reset_channel_cb().
1265                  */
1266                 spin_lock(&channel->sched_lock);
1267
1268                 callback_fn = channel->onchannel_callback;
1269                 if (unlikely(callback_fn == NULL))
1270                         goto sched_unlock;
1271
1272                 trace_vmbus_chan_sched(channel);
1273
1274                 ++channel->interrupts;
1275
1276                 switch (channel->callback_mode) {
1277                 case HV_CALL_ISR:
1278                         (*callback_fn)(channel->channel_callback_context);
1279                         break;
1280
1281                 case HV_CALL_BATCHED:
1282                         hv_begin_read(&channel->inbound);
1283                         fallthrough;
1284                 case HV_CALL_DIRECT:
1285                         tasklet_schedule(&channel->callback_event);
1286                 }
1287
1288 sched_unlock:
1289                 spin_unlock(&channel->sched_lock);
1290 sched_unlock_rcu:
1291                 rcu_read_unlock();
1292         }
1293 }
1294
1295 static void vmbus_isr(void)
1296 {
1297         struct hv_per_cpu_context *hv_cpu
1298                 = this_cpu_ptr(hv_context.cpu_context);
1299         void *page_addr = hv_cpu->synic_event_page;
1300         struct hv_message *msg;
1301         union hv_synic_event_flags *event;
1302         bool handled = false;
1303
1304         if (unlikely(page_addr == NULL))
1305                 return;
1306
1307         event = (union hv_synic_event_flags *)page_addr +
1308                                          VMBUS_MESSAGE_SINT;
1309         /*
1310          * Check for events before checking for messages. This is the order
1311          * in which events and messages are checked in Windows guests on
1312          * Hyper-V, and the Windows team suggested we do the same.
1313          */
1314
1315         if ((vmbus_proto_version == VERSION_WS2008) ||
1316                 (vmbus_proto_version == VERSION_WIN7)) {
1317
1318                 /* Since we are a child, we only need to check bit 0 */
1319                 if (sync_test_and_clear_bit(0, event->flags))
1320                         handled = true;
1321         } else {
1322                 /*
1323                  * Our host is win8 or above. The signaling mechanism
1324                  * has changed and we can directly look at the event page.
1325                  * If bit n is set then we have an interrupt on the channel
1326                  * whose id is n.
1327                  */
1328                 handled = true;
1329         }
1330
1331         if (handled)
1332                 vmbus_chan_sched(hv_cpu);
1333
1334         page_addr = hv_cpu->synic_message_page;
1335         msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
1336
1337         /* Check if there are actual msgs to be processed */
1338         if (msg->header.message_type != HVMSG_NONE) {
1339                 if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
1340                         hv_stimer0_isr();
1341                         vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
1342                 } else
1343                         tasklet_schedule(&hv_cpu->msg_dpc);
1344         }
1345
1346         add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
1347 }
1348
1349 /*
1350  * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
1351  * buffer and call into Hyper-V to transfer the data.
1352  */
1353 static void hv_kmsg_dump(struct kmsg_dumper *dumper,
1354                          enum kmsg_dump_reason reason)
1355 {
1356         size_t bytes_written;
1357         phys_addr_t panic_pa;
1358
1359         /* We are only interested in panics. */
1360         if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg))
1361                 return;
1362
1363         panic_pa = virt_to_phys(hv_panic_page);
1364
1365         /*
1366          * Write dump contents to the page. No need to synchronize; panic should
1367          * be single-threaded.
1368          */
1369         kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE,
1370                              &bytes_written);
1371         if (bytes_written)
1372                 hyperv_report_panic_msg(panic_pa, bytes_written);
1373 }
1374
1375 static struct kmsg_dumper hv_kmsg_dumper = {
1376         .dump = hv_kmsg_dump,
1377 };
1378
1379 static struct ctl_table_header *hv_ctl_table_hdr;
1380
1381 /*
1382  * sysctl option to allow the user to control whether kmsg data should be
1383  * reported to Hyper-V on panic.
1384  */
1385 static struct ctl_table hv_ctl_table[] = {
1386         {
1387                 .procname       = "hyperv_record_panic_msg",
1388                 .data           = &sysctl_record_panic_msg,
1389                 .maxlen         = sizeof(int),
1390                 .mode           = 0644,
1391                 .proc_handler   = proc_dointvec_minmax,
1392                 .extra1         = SYSCTL_ZERO,
1393                 .extra2         = SYSCTL_ONE
1394         },
1395         {}
1396 };
1397
1398 static struct ctl_table hv_root_table[] = {
1399         {
1400                 .procname       = "kernel",
1401                 .mode           = 0555,
1402                 .child          = hv_ctl_table
1403         },
1404         {}
1405 };
1406
1407 /*
1408  * vmbus_bus_init - Main vmbus driver initialization routine.
1409  *
1410  * Here, we
1411  *      - initialize the vmbus driver context
1412  *      - invoke the vmbus hv main init routine
1413  *      - retrieve the channel offers
1414  */
1415 static int vmbus_bus_init(void)
1416 {
1417         int ret;
1418
1419         ret = hv_init();
1420         if (ret != 0) {
1421                 pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
1422                 return ret;
1423         }
1424
1425         ret = bus_register(&hv_bus);
1426         if (ret)
1427                 return ret;
1428
1429         hv_setup_vmbus_irq(vmbus_isr);
1430
1431         ret = hv_synic_alloc();
1432         if (ret)
1433                 goto err_alloc;
1434
1435         /*
1436          * Initialize the per-cpu interrupt state and stimer state.
1437          * Then connect to the host.
1438          */
1439         ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
1440                                 hv_synic_init, hv_synic_cleanup);
1441         if (ret < 0)
1442                 goto err_cpuhp;
1443         hyperv_cpuhp_online = ret;
1444
1445         ret = vmbus_connect();
1446         if (ret)
1447                 goto err_connect;
1448
1449         /*
1450          * Only register if the crash MSRs are available
1451          */
1452         if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
1453                 u64 hyperv_crash_ctl;
1454                 /*
1455                  * Sysctl registration is not fatal, since by default
1456                  * reporting is enabled.
1457                  */
1458                 hv_ctl_table_hdr = register_sysctl_table(hv_root_table);
1459                 if (!hv_ctl_table_hdr)
1460                         pr_err("Hyper-V: sysctl table register error\n");
1461
1462                 /*
1463                  * Register for panic kmsg callback only if the right
1464                  * capability is supported by the hypervisor.
1465                  */
1466                 hv_get_crash_ctl(hyperv_crash_ctl);
1467                 if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) {
1468                         hv_panic_page = (void *)hv_alloc_hyperv_zeroed_page();
1469                         if (hv_panic_page) {
1470                                 ret = kmsg_dump_register(&hv_kmsg_dumper);
1471                                 if (ret) {
1472                                         pr_err("Hyper-V: kmsg dump register "
1473                                                 "error 0x%x\n", ret);
1474                                         hv_free_hyperv_page(
1475                                             (unsigned long)hv_panic_page);
1476                                         hv_panic_page = NULL;
1477                                 }
1478                         } else
1479                                 pr_err("Hyper-V: panic message page memory "
1480                                         "allocation failed\n");
1481                 }
1482
1483                 register_die_notifier(&hyperv_die_block);
1484         }
1485
1486         /*
1487          * Always register the panic notifier because we need to unload
1488          * the VMbus channel connection to prevent any VMbus
1489          * activity after the VM panics.
1490          */
1491         atomic_notifier_chain_register(&panic_notifier_list,
1492                                &hyperv_panic_block);
1493
1494         vmbus_request_offers();
1495
1496         return 0;
1497
1498 err_connect:
1499         cpuhp_remove_state(hyperv_cpuhp_online);
1500 err_cpuhp:
1501         hv_synic_free();
1502 err_alloc:
1503         hv_remove_vmbus_irq();
1504
1505         bus_unregister(&hv_bus);
1506         unregister_sysctl_table(hv_ctl_table_hdr);
1507         hv_ctl_table_hdr = NULL;
1508         return ret;
1509 }
1510
1511 /**
1512  * __vmbus_driver_register() - Register a vmbus driver
1513  * @hv_driver: Pointer to driver structure you want to register
1514  * @owner: owner module of the driver
1515  * @mod_name: module name string
1516  *
1517  * Registers the given driver with Linux through the 'driver_register()' call
1518  * and sets up the hyper-v vmbus handling for this driver.
1519  * It will return the state of the 'driver_register()' call.
1520  *
1521  */
1522 int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, const char *mod_name)
1523 {
1524         int ret;
1525
1526         pr_info("registering driver %s\n", hv_driver->name);
1527
1528         ret = vmbus_exists();
1529         if (ret < 0)
1530                 return ret;
1531
1532         hv_driver->driver.name = hv_driver->name;
1533         hv_driver->driver.owner = owner;
1534         hv_driver->driver.mod_name = mod_name;
1535         hv_driver->driver.bus = &hv_bus;
1536
1537         spin_lock_init(&hv_driver->dynids.lock);
1538         INIT_LIST_HEAD(&hv_driver->dynids.list);
1539
1540         ret = driver_register(&hv_driver->driver);
1541
1542         return ret;
1543 }
1544 EXPORT_SYMBOL_GPL(__vmbus_driver_register);
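/*
 * A minimal sketch of how a child driver might use this interface, paired
 * with vmbus_driver_unregister() below.  The "hv_sample" names, callbacks
 * and all-zero GUID are hypothetical placeholders, not part of this file;
 * in-tree drivers normally go through the vmbus_driver_register() wrapper
 * macro, which supplies THIS_MODULE and KBUILD_MODNAME for them.  The block
 * is guarded with #if 0 because it is illustration only.
 */
#if 0
static const struct hv_vmbus_device_id hv_sample_id_table[] = {
        /* Hypothetical class GUID advertised by the host for this device. */
        { .guid = GUID_INIT(0x00000000, 0x0000, 0x0000, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00) },
        { },
};

static int hv_sample_probe(struct hv_device *dev,
                           const struct hv_vmbus_device_id *dev_id)
{
        /* Open the channel, allocate per-device state, etc. */
        return 0;
}

static int hv_sample_remove(struct hv_device *dev)
{
        /* Tear down whatever probe set up. */
        return 0;
}

static struct hv_driver hv_sample_drv = {
        .name     = "hv_sample",
        .id_table = hv_sample_id_table,
        .probe    = hv_sample_probe,
        .remove   = hv_sample_remove,
};

static int __init hv_sample_init(void)
{
        return vmbus_driver_register(&hv_sample_drv);
}

static void __exit hv_sample_exit(void)
{
        vmbus_driver_unregister(&hv_sample_drv);
}

module_init(hv_sample_init);
module_exit(hv_sample_exit);
#endif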
1545
1546 /**
1547  * vmbus_driver_unregister() - Unregister a vmbus driver
1548  * @hv_driver: Pointer to the driver structure you want to
1549  *             unregister
1550  *
1551  * Unregisters the given driver that was previously registered with a call to
1552  * vmbus_driver_register()
1553  */
1554 void vmbus_driver_unregister(struct hv_driver *hv_driver)
1555 {
1556         pr_info("unregistering driver %s\n", hv_driver->name);
1557
1558         if (!vmbus_exists()) {
1559                 driver_unregister(&hv_driver->driver);
1560                 vmbus_free_dynids(hv_driver);
1561         }
1562 }
1563 EXPORT_SYMBOL_GPL(vmbus_driver_unregister);
1564
1565
1566 /*
1567  * Called when last reference to channel is gone.
1568  */
1569 static void vmbus_chan_release(struct kobject *kobj)
1570 {
1571         struct vmbus_channel *channel
1572                 = container_of(kobj, struct vmbus_channel, kobj);
1573
1574         kfree_rcu(channel, rcu);
1575 }
1576
1577 struct vmbus_chan_attribute {
1578         struct attribute attr;
1579         ssize_t (*show)(struct vmbus_channel *chan, char *buf);
1580         ssize_t (*store)(struct vmbus_channel *chan,
1581                          const char *buf, size_t count);
1582 };
1583 #define VMBUS_CHAN_ATTR(_name, _mode, _show, _store) \
1584         struct vmbus_chan_attribute chan_attr_##_name \
1585                 = __ATTR(_name, _mode, _show, _store)
1586 #define VMBUS_CHAN_ATTR_RW(_name) \
1587         struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RW(_name)
1588 #define VMBUS_CHAN_ATTR_RO(_name) \
1589         struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RO(_name)
1590 #define VMBUS_CHAN_ATTR_WO(_name) \
1591         struct vmbus_chan_attribute chan_attr_##_name = __ATTR_WO(_name)
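/*
 * For orientation, a VMBUS_CHAN_ATTR_RO(foo) use further below expands, via
 * the generic __ATTR_RO() helper, to roughly the structure sketched here
 * ("foo" and foo_show are placeholder names, not attributes defined in this
 * file).  Every per-channel sysfs file is therefore backed by a show/store
 * routine that receives the vmbus_channel directly:
 */
#if 0
struct vmbus_chan_attribute chan_attr_foo = {
        .attr = { .name = "foo", .mode = 0444 },
        .show = foo_show,
};
#endif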
1592
1593 static ssize_t vmbus_chan_attr_show(struct kobject *kobj,
1594                                     struct attribute *attr, char *buf)
1595 {
1596         const struct vmbus_chan_attribute *attribute
1597                 = container_of(attr, struct vmbus_chan_attribute, attr);
1598         struct vmbus_channel *chan
1599                 = container_of(kobj, struct vmbus_channel, kobj);
1600
1601         if (!attribute->show)
1602                 return -EIO;
1603
1604         return attribute->show(chan, buf);
1605 }
1606
1607 static ssize_t vmbus_chan_attr_store(struct kobject *kobj,
1608                                      struct attribute *attr, const char *buf,
1609                                      size_t count)
1610 {
1611         const struct vmbus_chan_attribute *attribute
1612                 = container_of(attr, struct vmbus_chan_attribute, attr);
1613         struct vmbus_channel *chan
1614                 = container_of(kobj, struct vmbus_channel, kobj);
1615
1616         if (!attribute->store)
1617                 return -EIO;
1618
1619         return attribute->store(chan, buf, count);
1620 }
1621
1622 static const struct sysfs_ops vmbus_chan_sysfs_ops = {
1623         .show = vmbus_chan_attr_show,
1624         .store = vmbus_chan_attr_store,
1625 };
1626
1627 static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf)
1628 {
1629         struct hv_ring_buffer_info *rbi = &channel->outbound;
1630         ssize_t ret;
1631
1632         mutex_lock(&rbi->ring_buffer_mutex);
1633         if (!rbi->ring_buffer) {
1634                 mutex_unlock(&rbi->ring_buffer_mutex);
1635                 return -EINVAL;
1636         }
1637
1638         ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1639         mutex_unlock(&rbi->ring_buffer_mutex);
1640         return ret;
1641 }
1642 static VMBUS_CHAN_ATTR_RO(out_mask);
1643
1644 static ssize_t in_mask_show(struct vmbus_channel *channel, char *buf)
1645 {
1646         struct hv_ring_buffer_info *rbi = &channel->inbound;
1647         ssize_t ret;
1648
1649         mutex_lock(&rbi->ring_buffer_mutex);
1650         if (!rbi->ring_buffer) {
1651                 mutex_unlock(&rbi->ring_buffer_mutex);
1652                 return -EINVAL;
1653         }
1654
1655         ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1656         mutex_unlock(&rbi->ring_buffer_mutex);
1657         return ret;
1658 }
1659 static VMBUS_CHAN_ATTR_RO(in_mask);
1660
1661 static ssize_t read_avail_show(struct vmbus_channel *channel, char *buf)
1662 {
1663         struct hv_ring_buffer_info *rbi = &channel->inbound;
1664         ssize_t ret;
1665
1666         mutex_lock(&rbi->ring_buffer_mutex);
1667         if (!rbi->ring_buffer) {
1668                 mutex_unlock(&rbi->ring_buffer_mutex);
1669                 return -EINVAL;
1670         }
1671
1672         ret = sprintf(buf, "%u\n", hv_get_bytes_to_read(rbi));
1673         mutex_unlock(&rbi->ring_buffer_mutex);
1674         return ret;
1675 }
1676 static VMBUS_CHAN_ATTR_RO(read_avail);
1677
1678 static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf)
1679 {
1680         struct hv_ring_buffer_info *rbi = &channel->outbound;
1681         ssize_t ret;
1682
1683         mutex_lock(&rbi->ring_buffer_mutex);
1684         if (!rbi->ring_buffer) {
1685                 mutex_unlock(&rbi->ring_buffer_mutex);
1686                 return -EINVAL;
1687         }
1688
1689         ret = sprintf(buf, "%u\n", hv_get_bytes_to_write(rbi));
1690         mutex_unlock(&rbi->ring_buffer_mutex);
1691         return ret;
1692 }
1693 static VMBUS_CHAN_ATTR_RO(write_avail);
1694
1695 static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
1696 {
1697         return sprintf(buf, "%u\n", channel->target_cpu);
1698 }
1699 static ssize_t target_cpu_store(struct vmbus_channel *channel,
1700                                 const char *buf, size_t count)
1701 {
1702         u32 target_cpu, origin_cpu;
1703         ssize_t ret = count;
1704
1705         if (vmbus_proto_version < VERSION_WIN10_V4_1)
1706                 return -EIO;
1707
1708         if (sscanf(buf, "%uu", &target_cpu) != 1)
1709                 return -EIO;
1710
1711         /* Validate target_cpu for the cpumask_test_cpu() operation below. */
1712         if (target_cpu >= nr_cpumask_bits)
1713                 return -EINVAL;
1714
1715         /* No CPUs should come up or down during this. */
1716         cpus_read_lock();
1717
1718         if (!cpu_online(target_cpu)) {
1719                 cpus_read_unlock();
1720                 return -EINVAL;
1721         }
1722
1723         /*
1724          * Synchronizes target_cpu_store() and channel closure:
1725          *
1726          * { Initially: state = CHANNEL_OPENED }
1727          *
1728          * CPU1                         CPU2
1729          *
1730          * [target_cpu_store()]         [vmbus_disconnect_ring()]
1731          *
1732          * LOCK channel_mutex           LOCK channel_mutex
1733          * LOAD r1 = state              LOAD r2 = state
1734          * IF (r1 == CHANNEL_OPENED)    IF (r2 == CHANNEL_OPENED)
1735          *   SEND MODIFYCHANNEL           STORE state = CHANNEL_OPEN
1736          *   [...]                        SEND CLOSECHANNEL
1737          * UNLOCK channel_mutex         UNLOCK channel_mutex
1738          *
1739          * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes
1740          *              CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND
1741          *
1742          * Note.  The host processes the channel messages "sequentially", in
1743          * the order in which they are received on a per-partition basis.
1744          */
1745         mutex_lock(&vmbus_connection.channel_mutex);
1746
1747         /*
1748          * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
1749          * avoid sending the message and fail here for such channels.
1750          */
1751         if (channel->state != CHANNEL_OPENED_STATE) {
1752                 ret = -EIO;
1753                 goto cpu_store_unlock;
1754         }
1755
1756         origin_cpu = channel->target_cpu;
1757         if (target_cpu == origin_cpu)
1758                 goto cpu_store_unlock;
1759
1760         if (vmbus_send_modifychannel(channel->offermsg.child_relid,
1761                                      hv_cpu_number_to_vp_number(target_cpu))) {
1762                 ret = -EIO;
1763                 goto cpu_store_unlock;
1764         }
1765
1766         /*
1767          * Warning.  At this point, there is *no* guarantee that the host will
1768          * have successfully processed the vmbus_send_modifychannel() request.
1769          * See the header comment of vmbus_send_modifychannel() for more info.
1770          *
1771          * Lags in the processing of the above vmbus_send_modifychannel() can
1772          * result in missed interrupts if the "old" target CPU is taken offline
1773          * before Hyper-V starts sending interrupts to the "new" target CPU.
1774          * But apart from this offlining scenario, the code tolerates such
1775          * lags.  It will function correctly even if a channel interrupt comes
1776          * in on a CPU that is different from the channel target_cpu value.
1777          */
1778
1779         channel->target_cpu = target_cpu;
1780
1781         /* See init_vp_index(). */
1782         if (hv_is_perf_channel(channel))
1783                 hv_update_alloced_cpus(origin_cpu, target_cpu);
1784
1785         /* Currently set only for storvsc channels. */
1786         if (channel->change_target_cpu_callback) {
1787                 (*channel->change_target_cpu_callback)(channel,
1788                                 origin_cpu, target_cpu);
1789         }
1790
1791 cpu_store_unlock:
1792         mutex_unlock(&vmbus_connection.channel_mutex);
1793         cpus_read_unlock();
1794         return ret;
1795 }
1796 static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
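/*
 * From user space, the "cpu" attribute registered above can be written to
 * retarget a channel's interrupts.  A minimal user-space sketch follows;
 * the device GUID and relid in the path are made-up placeholders used only
 * for illustration:
 */
#if 0
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/bus/vmbus/devices/"
                "00000000-0000-0000-0000-000000000000/channels/15/cpu";
        FILE *f = fopen(path, "w");

        if (!f)
                return 1;
        /* Ask for the channel interrupt to be delivered on CPU 2. */
        fprintf(f, "2\n");
        return fclose(f) ? 1 : 0;
}
#endif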
1797
1798 static ssize_t channel_pending_show(struct vmbus_channel *channel,
1799                                     char *buf)
1800 {
1801         return sprintf(buf, "%d\n",
1802                        channel_pending(channel,
1803                                        vmbus_connection.monitor_pages[1]));
1804 }
1805 static VMBUS_CHAN_ATTR(pending, S_IRUGO, channel_pending_show, NULL);
1806
1807 static ssize_t channel_latency_show(struct vmbus_channel *channel,
1808                                     char *buf)
1809 {
1810         return sprintf(buf, "%d\n",
1811                        channel_latency(channel,
1812                                        vmbus_connection.monitor_pages[1]));
1813 }
1814 static VMBUS_CHAN_ATTR(latency, S_IRUGO, channel_latency_show, NULL);
1815
1816 static ssize_t channel_interrupts_show(struct vmbus_channel *channel, char *buf)
1817 {
1818         return sprintf(buf, "%llu\n", channel->interrupts);
1819 }
1820 static VMBUS_CHAN_ATTR(interrupts, S_IRUGO, channel_interrupts_show, NULL);
1821
1822 static ssize_t channel_events_show(struct vmbus_channel *channel, char *buf)
1823 {
1824         return sprintf(buf, "%llu\n", channel->sig_events);
1825 }
1826 static VMBUS_CHAN_ATTR(events, S_IRUGO, channel_events_show, NULL);
1827
1828 static ssize_t channel_intr_in_full_show(struct vmbus_channel *channel,
1829                                          char *buf)
1830 {
1831         return sprintf(buf, "%llu\n",
1832                        (unsigned long long)channel->intr_in_full);
1833 }
1834 static VMBUS_CHAN_ATTR(intr_in_full, 0444, channel_intr_in_full_show, NULL);
1835
1836 static ssize_t channel_intr_out_empty_show(struct vmbus_channel *channel,
1837                                            char *buf)
1838 {
1839         return sprintf(buf, "%llu\n",
1840                        (unsigned long long)channel->intr_out_empty);
1841 }
1842 static VMBUS_CHAN_ATTR(intr_out_empty, 0444, channel_intr_out_empty_show, NULL);
1843
1844 static ssize_t channel_out_full_first_show(struct vmbus_channel *channel,
1845                                            char *buf)
1846 {
1847         return sprintf(buf, "%llu\n",
1848                        (unsigned long long)channel->out_full_first);
1849 }
1850 static VMBUS_CHAN_ATTR(out_full_first, 0444, channel_out_full_first_show, NULL);
1851
1852 static ssize_t channel_out_full_total_show(struct vmbus_channel *channel,
1853                                            char *buf)
1854 {
1855         return sprintf(buf, "%llu\n",
1856                        (unsigned long long)channel->out_full_total);
1857 }
1858 static VMBUS_CHAN_ATTR(out_full_total, 0444, channel_out_full_total_show, NULL);
1859
1860 static ssize_t subchannel_monitor_id_show(struct vmbus_channel *channel,
1861                                           char *buf)
1862 {
1863         return sprintf(buf, "%u\n", channel->offermsg.monitorid);
1864 }
1865 static VMBUS_CHAN_ATTR(monitor_id, S_IRUGO, subchannel_monitor_id_show, NULL);
1866
1867 static ssize_t subchannel_id_show(struct vmbus_channel *channel,
1868                                   char *buf)
1869 {
1870         return sprintf(buf, "%u\n",
1871                        channel->offermsg.offer.sub_channel_index);
1872 }
1873 static VMBUS_CHAN_ATTR_RO(subchannel_id);
1874
1875 static struct attribute *vmbus_chan_attrs[] = {
1876         &chan_attr_out_mask.attr,
1877         &chan_attr_in_mask.attr,
1878         &chan_attr_read_avail.attr,
1879         &chan_attr_write_avail.attr,
1880         &chan_attr_cpu.attr,
1881         &chan_attr_pending.attr,
1882         &chan_attr_latency.attr,
1883         &chan_attr_interrupts.attr,
1884         &chan_attr_events.attr,
1885         &chan_attr_intr_in_full.attr,
1886         &chan_attr_intr_out_empty.attr,
1887         &chan_attr_out_full_first.attr,
1888         &chan_attr_out_full_total.attr,
1889         &chan_attr_monitor_id.attr,
1890         &chan_attr_subchannel_id.attr,
1891         NULL
1892 };
1893
1894 /*
1895  * Channel-level attribute_group callback function. Returns the permission for
1896  * each attribute, and returns 0 if an attribute is not visible.
1897  */
1898 static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
1899                                           struct attribute *attr, int idx)
1900 {
1901         const struct vmbus_channel *channel =
1902                 container_of(kobj, struct vmbus_channel, kobj);
1903
1904         /* Hide the monitor attributes if the monitor mechanism is not used. */
1905         if (!channel->offermsg.monitor_allocated &&
1906             (attr == &chan_attr_pending.attr ||
1907              attr == &chan_attr_latency.attr ||
1908              attr == &chan_attr_monitor_id.attr))
1909                 return 0;
1910
1911         return attr->mode;
1912 }
1913
1914 static struct attribute_group vmbus_chan_group = {
1915         .attrs = vmbus_chan_attrs,
1916         .is_visible = vmbus_chan_attr_is_visible
1917 };
1918
1919 static struct kobj_type vmbus_chan_ktype = {
1920         .sysfs_ops = &vmbus_chan_sysfs_ops,
1921         .release = vmbus_chan_release,
1922 };
1923
1924 /*
1925  * vmbus_add_channel_kobj - setup a sub-directory under device/channels
1926  */
1927 int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel)
1928 {
1929         const struct device *device = &dev->device;
1930         struct kobject *kobj = &channel->kobj;
1931         u32 relid = channel->offermsg.child_relid;
1932         int ret;
1933
1934         kobj->kset = dev->channels_kset;
1935         ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
1936                                    "%u", relid);
1937         if (ret)
1938                 return ret;
1939
1940         ret = sysfs_create_group(kobj, &vmbus_chan_group);
1941
1942         if (ret) {
1943                 /*
1944                  * The calling functions' error handling paths will cleanup the
1945                  * empty channel directory.
1946                  */
1947                 dev_err(device, "Unable to set up channel sysfs files\n");
1948                 return ret;
1949         }
1950
1951         kobject_uevent(kobj, KOBJ_ADD);
1952
1953         return 0;
1954 }
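/*
 * Each kobject added above shows up as a numbered (relid) sub-directory of
 * the device's "channels" kset in sysfs.  A small user-space sketch that
 * lists them; the device GUID in the path is a placeholder:
 */
#if 0
#include <dirent.h>
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/bus/vmbus/devices/"
                "00000000-0000-0000-0000-000000000000/channels";
        struct dirent *de;
        DIR *dir = opendir(path);

        if (!dir)
                return 1;
        while ((de = readdir(dir)) != NULL)
                printf("%s\n", de->d_name);     /* relids, plus "." and ".." */
        closedir(dir);
        return 0;
}
#endif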
1955
1956 /*
1957  * vmbus_remove_channel_attr_group - remove the channel's attribute group
1958  */
1959 void vmbus_remove_channel_attr_group(struct vmbus_channel *channel)
1960 {
1961         sysfs_remove_group(&channel->kobj, &vmbus_chan_group);
1962 }
1963
1964 /*
1965  * vmbus_device_create - Creates and registers a new child device
1966  * on the vmbus.
1967  */
1968 struct hv_device *vmbus_device_create(const guid_t *type,
1969                                       const guid_t *instance,
1970                                       struct vmbus_channel *channel)
1971 {
1972         struct hv_device *child_device_obj;
1973
1974         child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL);
1975         if (!child_device_obj) {
1976                 pr_err("Unable to allocate device object for child device\n");
1977                 return NULL;
1978         }
1979
1980         child_device_obj->channel = channel;
1981         guid_copy(&child_device_obj->dev_type, type);
1982         guid_copy(&child_device_obj->dev_instance, instance);
1983         child_device_obj->vendor_id = 0x1414; /* MSFT vendor ID */
1984
1985         return child_device_obj;
1986 }
1987
1988 /*
1989  * vmbus_device_register - Register the child device
1990  */
1991 int vmbus_device_register(struct hv_device *child_device_obj)
1992 {
1993         struct kobject *kobj = &child_device_obj->device.kobj;
1994         int ret;
1995
1996         dev_set_name(&child_device_obj->device, "%pUl",
1997                      &child_device_obj->channel->offermsg.offer.if_instance);
1998
1999         child_device_obj->device.bus = &hv_bus;
2000         child_device_obj->device.parent = &hv_acpi_dev->dev;
2001         child_device_obj->device.release = vmbus_device_release;
2002
2003         /*
2004          * Register with the LDM. This will kick off the driver/device
2005          * binding...which will eventually call vmbus_match() and vmbus_probe()
2006          */
2007         ret = device_register(&child_device_obj->device);
2008         if (ret) {
2009                 pr_err("Unable to register child device\n");
2010                 return ret;
2011         }
2012
2013         child_device_obj->channels_kset = kset_create_and_add("channels",
2014                                                               NULL, kobj);
2015         if (!child_device_obj->channels_kset) {
2016                 ret = -ENOMEM;
2017                 goto err_dev_unregister;
2018         }
2019
2020         ret = vmbus_add_channel_kobj(child_device_obj,
2021                                      child_device_obj->channel);
2022         if (ret) {
2023                 pr_err("Unable to register primary channel\n");
2024                 goto err_kset_unregister;
2025         }
2026         hv_debug_add_dev_dir(child_device_obj);
2027
2028         return 0;
2029
2030 err_kset_unregister:
2031         kset_unregister(child_device_obj->channels_kset);
2032
2033 err_dev_unregister:
2034         device_unregister(&child_device_obj->device);
2035         return ret;
2036 }
2037
2038 /*
2039  * vmbus_device_unregister - Remove the specified child device
2040  * from the vmbus.
2041  */
2042 void vmbus_device_unregister(struct hv_device *device_obj)
2043 {
2044         pr_debug("child device %s unregistered\n",
2045                 dev_name(&device_obj->device));
2046
2047         kset_unregister(device_obj->channels_kset);
2048
2049         /*
2050          * Kick off the process of unregistering the device.
2051          * This will call vmbus_remove() and eventually vmbus_device_release()
2052          */
2053         device_unregister(&device_obj->device);
2054 }
2055
2056
2057 /*
2058  * VMBus is an ACPI-enumerated device. Get the information we
2059  * need from DSDT.
2060  */
2061 #define VTPM_BASE_ADDRESS 0xfed40000
2062 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
2063 {
2064         resource_size_t start = 0;
2065         resource_size_t end = 0;
2066         struct resource *new_res;
2067         struct resource **old_res = &hyperv_mmio;
2068         struct resource **prev_res = NULL;
2069
2070         switch (res->type) {
2071
2072         /*
2073          * "Address" descriptors are for bus windows. Ignore
2074          * "memory" descriptors, which are for registers on
2075          * devices.
2076          */
2077         case ACPI_RESOURCE_TYPE_ADDRESS32:
2078                 start = res->data.address32.address.minimum;
2079                 end = res->data.address32.address.maximum;
2080                 break;
2081
2082         case ACPI_RESOURCE_TYPE_ADDRESS64:
2083                 start = res->data.address64.address.minimum;
2084                 end = res->data.address64.address.maximum;
2085                 break;
2086
2087         default:
2088                 /* Unused resource type */
2089                 return AE_OK;
2090
2091         }
2092         /*
2093          * Ignore ranges that are below 1MB, as they're not
2094          * necessary or useful here.
2095          */
2096         if (end < 0x100000)
2097                 return AE_OK;
2098
2099         new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
2100         if (!new_res)
2101                 return AE_NO_MEMORY;
2102
2103         /* If this range overlaps the virtual TPM, truncate it. */
2104         if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
2105                 end = VTPM_BASE_ADDRESS;
2106
2107         new_res->name = "hyperv mmio";
2108         new_res->flags = IORESOURCE_MEM;
2109         new_res->start = start;
2110         new_res->end = end;
2111
2112         /*
2113          * If two ranges are adjacent, merge them.
2114          */
2115         do {
2116                 if (!*old_res) {
2117                         *old_res = new_res;
2118                         break;
2119                 }
2120
2121                 if (((*old_res)->end + 1) == new_res->start) {
2122                         (*old_res)->end = new_res->end;
2123                         kfree(new_res);
2124                         break;
2125                 }
2126
2127                 if ((*old_res)->start == new_res->end + 1) {
2128                         (*old_res)->start = new_res->start;
2129                         kfree(new_res);
2130                         break;
2131                 }
2132
2133                 if ((*old_res)->start > new_res->end) {
2134                         new_res->sibling = *old_res;
2135                         if (prev_res)
2136                                 (*prev_res)->sibling = new_res;
2137                         *old_res = new_res;
2138                         break;
2139                 }
2140
2141                 prev_res = old_res;
2142                 old_res = &(*old_res)->sibling;
2143
2144         } while (1);
2145
2146         return AE_OK;
2147 }
2148
2149 static int vmbus_acpi_remove(struct acpi_device *device)
2150 {
2151         struct resource *cur_res;
2152         struct resource *next_res;
2153
2154         if (hyperv_mmio) {
2155                 if (fb_mmio) {
2156                         __release_region(hyperv_mmio, fb_mmio->start,
2157                                          resource_size(fb_mmio));
2158                         fb_mmio = NULL;
2159                 }
2160
2161                 for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
2162                         next_res = cur_res->sibling;
2163                         kfree(cur_res);
2164                 }
2165         }
2166
2167         return 0;
2168 }
2169
2170 static void vmbus_reserve_fb(void)
2171 {
2172         int size;
2173         /*
2174          * Make a claim for the frame buffer in the resource tree under the
2175          * first node, which will be the one below 4GB.  The length seems to
2176          * be underreported, particularly in a Generation 1 VM.  So start out
2177          * reserving a larger area and make it smaller until it succeeds.
2178          */
2179
2180         if (screen_info.lfb_base) {
2181                 if (efi_enabled(EFI_BOOT))
2182                         size = max_t(__u32, screen_info.lfb_size, 0x800000);
2183                 else
2184                         size = max_t(__u32, screen_info.lfb_size, 0x4000000);
2185
2186                 for (; !fb_mmio && (size >= 0x100000); size >>= 1) {
2187                         fb_mmio = __request_region(hyperv_mmio,
2188                                                    screen_info.lfb_base, size,
2189                                                    fb_mmio_name, 0);
2190                 }
2191         }
2192 }
2193
2194 /**
2195  * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
2196  * @new:                If successful, supplies a pointer to the
2197  *                      allocated MMIO space.
2198  * @device_obj:         Identifies the caller
2199  * @min:                Minimum guest physical address of the
2200  *                      allocation
2201  * @max:                Maximum guest physical address of the allocation
2202  * @size:               Size of the range to be allocated
2203  * @align:              Alignment of the range to be allocated
2204  * @fb_overlap_ok:      Whether this allocation can be allowed
2205  *                      to overlap the video frame buffer.
2206  *
2207  * This function walks the resources granted to VMBus by the
2208  * _CRS object in the ACPI namespace underneath the parent
2209  * "bridge" whether that's a root PCI bus in the Generation 1
2210  * case or a Module Device in the Generation 2 case.  It then
2211  * attempts to allocate from the global MMIO pool in a way that
2212  * matches the constraints supplied in these parameters and by
2213  * that _CRS.
2214  *
2215  * Return: 0 on success, -errno on failure
2216  */
2217 int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
2218                         resource_size_t min, resource_size_t max,
2219                         resource_size_t size, resource_size_t align,
2220                         bool fb_overlap_ok)
2221 {
2222         struct resource *iter, *shadow;
2223         resource_size_t range_min, range_max, start;
2224         const char *dev_n = dev_name(&device_obj->device);
2225         int retval;
2226
2227         retval = -ENXIO;
2228         mutex_lock(&hyperv_mmio_lock);
2229
2230         /*
2231          * If overlaps with frame buffers are allowed, then first attempt to
2232          * make the allocation from within the reserved region.  Because it
2233          * is already reserved, no shadow allocation is necessary.
2234          */
2235         if (fb_overlap_ok && fb_mmio && !(min > fb_mmio->end) &&
2236             !(max < fb_mmio->start)) {
2237
2238                 range_min = fb_mmio->start;
2239                 range_max = fb_mmio->end;
2240                 start = (range_min + align - 1) & ~(align - 1);
2241                 for (; start + size - 1 <= range_max; start += align) {
2242                         *new = request_mem_region_exclusive(start, size, dev_n);
2243                         if (*new) {
2244                                 retval = 0;
2245                                 goto exit;
2246                         }
2247                 }
2248         }
2249
2250         for (iter = hyperv_mmio; iter; iter = iter->sibling) {
2251                 if ((iter->start >= max) || (iter->end <= min))
2252                         continue;
2253
2254                 range_min = iter->start;
2255                 range_max = iter->end;
2256                 start = (range_min + align - 1) & ~(align - 1);
2257                 for (; start + size - 1 <= range_max; start += align) {
2258                         shadow = __request_region(iter, start, size, NULL,
2259                                                   IORESOURCE_BUSY);
2260                         if (!shadow)
2261                                 continue;
2262
2263                         *new = request_mem_region_exclusive(start, size, dev_n);
2264                         if (*new) {
2265                                 shadow->name = (char *)*new;
2266                                 retval = 0;
2267                                 goto exit;
2268                         }
2269
2270                         __release_region(iter, start, size);
2271                 }
2272         }
2273
2274 exit:
2275         mutex_unlock(&hyperv_mmio_lock);
2276         return retval;
2277 }
2278 EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
2279
2280 /**
2281  * vmbus_free_mmio() - Free a memory-mapped I/O range.
2282  * @start:              Base address of region to release.
2283  * @size:               Size of the range to be released
2284  *
2285  * This function releases anything requested by
2286  * vmbus_allocate_mmio().
2287  */
2288 void vmbus_free_mmio(resource_size_t start, resource_size_t size)
2289 {
2290         struct resource *iter;
2291
2292         mutex_lock(&hyperv_mmio_lock);
2293         for (iter = hyperv_mmio; iter; iter = iter->sibling) {
2294                 if ((iter->start >= start + size) || (iter->end <= start))
2295                         continue;
2296
2297                 __release_region(iter, start, size);
2298         }
2299         release_mem_region(start, size);
2300         mutex_unlock(&hyperv_mmio_lock);
2301
2302 }
2303 EXPORT_SYMBOL_GPL(vmbus_free_mmio);
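/*
 * A sketch of how a child driver might pair the two helpers above to claim
 * and later release MMIO space.  The hv_sample_claim_mmio() wrapper, its
 * "hdev" argument and the 8 MiB size / 1 MiB alignment are hypothetical
 * choices for illustration; compare the real in-tree callers.
 */
#if 0
static int hv_sample_claim_mmio(struct hv_device *hdev)
{
        struct resource *mmio;
        int ret;

        ret = vmbus_allocate_mmio(&mmio, hdev, 0, -1, 0x800000, 0x100000,
                                  true);
        if (ret)
                return ret;

        /* ... ioremap(mmio->start, resource_size(mmio)) and use it ... */

        vmbus_free_mmio(mmio->start, resource_size(mmio));
        return 0;
}
#endif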
2304
2305 static int vmbus_acpi_add(struct acpi_device *device)
2306 {
2307         acpi_status result;
2308         int ret_val = -ENODEV;
2309         struct acpi_device *ancestor;
2310
2311         hv_acpi_dev = device;
2312
2313         result = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
2314                                         vmbus_walk_resources, NULL);
2315
2316         if (ACPI_FAILURE(result))
2317                 goto acpi_walk_err;
2318         /*
2319          * Some ancestor of the vmbus acpi device (Gen1 or Gen2
2320          * firmware) is the VMOD that has the mmio ranges. Get that.
2321          */
2322         for (ancestor = device->parent; ancestor; ancestor = ancestor->parent) {
2323                 result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
2324                                              vmbus_walk_resources, NULL);
2325
2326                 if (ACPI_FAILURE(result))
2327                         continue;
2328                 if (hyperv_mmio) {
2329                         vmbus_reserve_fb();
2330                         break;
2331                 }
2332         }
2333         ret_val = 0;
2334
2335 acpi_walk_err:
2336         complete(&probe_event);
2337         if (ret_val)
2338                 vmbus_acpi_remove(device);
2339         return ret_val;
2340 }
2341
2342 #ifdef CONFIG_PM_SLEEP
2343 static int vmbus_bus_suspend(struct device *dev)
2344 {
2345         struct vmbus_channel *channel, *sc;
2346
2347         while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
2348                 /*
2349                  * We wait here until the completion of any channel
2350                  * offers that are currently in progress.
2351                  */
2352                 msleep(1);
2353         }
2354
2355         mutex_lock(&vmbus_connection.channel_mutex);
2356         list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
2357                 if (!is_hvsock_channel(channel))
2358                         continue;
2359
2360                 vmbus_force_channel_rescinded(channel);
2361         }
2362         mutex_unlock(&vmbus_connection.channel_mutex);
2363
2364         /*
2365          * Wait until all the sub-channels and hv_sock channels have been
2366          * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
2367          * they would conflict with the new sub-channels that will be created
2368          * in the resume path. hv_sock channels should also be destroyed, but
2369          * a hv_sock channel of an established hv_sock connection can not be
2370          * a hv_sock channel of an established hv_sock connection cannot really
2371          * be destroyed since it may still be referenced by the userspace
2372          * by vmbus_force_channel_rescinded(), and the userspace application
2373          * will thoroughly destroy the channel after hibernation.
2374          *
2375          * Note: the counter nr_chan_close_on_suspend may never go above 0 if
2376          * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM.
2377          */
2378         if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
2379                 wait_for_completion(&vmbus_connection.ready_for_suspend_event);
2380
2381         WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0);
2382
2383         mutex_lock(&vmbus_connection.channel_mutex);
2384
2385         list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
2386                 /*
2387                  * Remove the channel from the array of channels and invalidate
2388                  * the channel's relid.  Upon resume, vmbus_onoffer() will fix
2389                  * up the relid (and other fields, if necessary) and add the
2390                  * channel back to the array.
2391                  */
2392                 vmbus_channel_unmap_relid(channel);
2393                 channel->offermsg.child_relid = INVALID_RELID;
2394
2395                 if (is_hvsock_channel(channel)) {
2396                         if (!channel->rescind) {
2397                                 pr_err("hv_sock channel not rescinded!\n");
2398                                 WARN_ON_ONCE(1);
2399                         }
2400                         continue;
2401                 }
2402
2403                 list_for_each_entry(sc, &channel->sc_list, sc_list) {
2404                         pr_err("Sub-channel not deleted!\n");
2405                         WARN_ON_ONCE(1);
2406                 }
2407
2408                 atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume);
2409         }
2410
2411         mutex_unlock(&vmbus_connection.channel_mutex);
2412
2413         vmbus_initiate_unload(false);
2414
2415         /* Reset the event for the next resume. */
2416         reinit_completion(&vmbus_connection.ready_for_resume_event);
2417
2418         return 0;
2419 }
2420
2421 static int vmbus_bus_resume(struct device *dev)
2422 {
2423         struct vmbus_channel_msginfo *msginfo;
2424         size_t msgsize;
2425         int ret;
2426
2427         /*
2428          * We only use the 'vmbus_proto_version', which was in use before
2429          * hibernation, to re-negotiate with the host.
2430          */
2431         if (!vmbus_proto_version) {
2432                 pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
2433                 return -EINVAL;
2434         }
2435
2436         msgsize = sizeof(*msginfo) +
2437                   sizeof(struct vmbus_channel_initiate_contact);
2438
2439         msginfo = kzalloc(msgsize, GFP_KERNEL);
2440
2441         if (msginfo == NULL)
2442                 return -ENOMEM;
2443
2444         ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
2445
2446         kfree(msginfo);
2447
2448         if (ret != 0)
2449                 return ret;
2450
2451         WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0);
2452
2453         vmbus_request_offers();
2454
2455         wait_for_completion(&vmbus_connection.ready_for_resume_event);
2456
2457         /* Reset the event for the next suspend. */
2458         reinit_completion(&vmbus_connection.ready_for_suspend_event);
2459
2460         return 0;
2461 }
2462 #else
2463 #define vmbus_bus_suspend NULL
2464 #define vmbus_bus_resume NULL
2465 #endif /* CONFIG_PM_SLEEP */
2466
2467 static const struct acpi_device_id vmbus_acpi_device_ids[] = {
2468         {"VMBUS", 0},
2469         {"VMBus", 0},
2470         {"", 0},
2471 };
2472 MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
2473
2474 /*
2475  * Note: we must use the "noirq" ops, otherwise hibernation cannot work with
2476  * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in
2477  * the resume path, the pci "noirq" restore op runs before "non-noirq" op (see
2478  * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
2479  * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
2480  * resume callback must also run via the "noirq" ops.
2481  *
2482  * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment
2483  * earlier in this file before vmbus_pm.
2484  */
2485
2486 static const struct dev_pm_ops vmbus_bus_pm = {
2487         .suspend_noirq  = NULL,
2488         .resume_noirq   = NULL,
2489         .freeze_noirq   = vmbus_bus_suspend,
2490         .thaw_noirq     = vmbus_bus_resume,
2491         .poweroff_noirq = vmbus_bus_suspend,
2492         .restore_noirq  = vmbus_bus_resume
2493 };
2494
2495 static struct acpi_driver vmbus_acpi_driver = {
2496         .name = "vmbus",
2497         .ids = vmbus_acpi_device_ids,
2498         .ops = {
2499                 .add = vmbus_acpi_add,
2500                 .remove = vmbus_acpi_remove,
2501         },
2502         .drv.pm = &vmbus_bus_pm,
2503 };
2504
2505 static void hv_kexec_handler(void)
2506 {
2507         hv_stimer_global_cleanup();
2508         vmbus_initiate_unload(false);
2509         /* Make sure conn_state is set as hv_synic_cleanup checks for it */
2510         mb();
2511         cpuhp_remove_state(hyperv_cpuhp_online);
2512         hyperv_cleanup();
2513 }
2514
2515 static void hv_crash_handler(struct pt_regs *regs)
2516 {
2517         int cpu;
2518
2519         vmbus_initiate_unload(true);
2520         /*
2521          * In the crash handler we can't schedule the synic cleanup for all
2522          * CPUs, so do the cleanup for the current CPU only. This should be
2523          * sufficient for kdump.
2524          */
2525         cpu = smp_processor_id();
2526         hv_stimer_cleanup(cpu);
2527         hv_synic_disable_regs(cpu);
2528         hyperv_cleanup();
2529 }
2530
2531 static int hv_synic_suspend(void)
2532 {
2533         /*
2534          * When we reach here, all the non-boot CPUs have been offlined.
2535          * If we're in a legacy configuration where stimer Direct Mode is
2536          * not enabled, the stimers on the non-boot CPUs have been unbound
2537          * in hv_synic_cleanup() -> hv_stimer_legacy_cleanup() ->
2538          * hv_stimer_cleanup() -> clockevents_unbind_device().
2539          *
2540          * hv_synic_suspend() only runs on CPU0 with interrupts disabled.
2541          * Here we do not call hv_stimer_legacy_cleanup() on CPU0 because:
2542          * 1) it's unnecessary as interrupts remain disabled between
2543          * syscore_suspend() and syscore_resume(): see create_image() and
2544          * resume_target_kernel()
2545          * 2) the stimer on CPU0 is automatically disabled later by
2546          * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
2547          * -> clockevents_shutdown() -> ... -> hv_ce_shutdown()
2548          * 3) a warning would be triggered if we call
2549          * clockevents_unbind_device(), which may sleep, in an
2550          * interrupts-disabled context.
2551          */
2552
2553         hv_synic_disable_regs(0);
2554
2555         return 0;
2556 }
2557
2558 static void hv_synic_resume(void)
2559 {
2560         hv_synic_enable_regs(0);
2561
2562         /*
2563          * Note: we don't need to call hv_stimer_init(0), because the timer
2564          * on CPU0 is not unbound in hv_synic_suspend(), and the timer is
2565          * automatically re-enabled in timekeeping_resume().
2566          */
2567 }
2568
2569 /* The callbacks run only on CPU0, with irqs_disabled. */
2570 static struct syscore_ops hv_synic_syscore_ops = {
2571         .suspend = hv_synic_suspend,
2572         .resume = hv_synic_resume,
2573 };
2574
2575 static int __init hv_acpi_init(void)
2576 {
2577         int ret, t;
2578
2579         if (!hv_is_hyperv_initialized())
2580                 return -ENODEV;
2581
2582         init_completion(&probe_event);
2583
2584         /*
2585          * Get ACPI resources first.
2586          */
2587         ret = acpi_bus_register_driver(&vmbus_acpi_driver);
2588
2589         if (ret)
2590                 return ret;
2591
2592         t = wait_for_completion_timeout(&probe_event, 5*HZ);
2593         if (t == 0) {
2594                 ret = -ETIMEDOUT;
2595                 goto cleanup;
2596         }
2597         hv_debug_init();
2598
2599         ret = vmbus_bus_init();
2600         if (ret)
2601                 goto cleanup;
2602
2603         hv_setup_kexec_handler(hv_kexec_handler);
2604         hv_setup_crash_handler(hv_crash_handler);
2605
2606         register_syscore_ops(&hv_synic_syscore_ops);
2607
2608         return 0;
2609
2610 cleanup:
2611         acpi_bus_unregister_driver(&vmbus_acpi_driver);
2612         hv_acpi_dev = NULL;
2613         return ret;
2614 }
2615
2616 static void __exit vmbus_exit(void)
2617 {
2618         int cpu;
2619
2620         unregister_syscore_ops(&hv_synic_syscore_ops);
2621
2622         hv_remove_kexec_handler();
2623         hv_remove_crash_handler();
2624         vmbus_connection.conn_state = DISCONNECTED;
2625         hv_stimer_global_cleanup();
2626         vmbus_disconnect();
2627         hv_remove_vmbus_irq();
2628         for_each_online_cpu(cpu) {
2629                 struct hv_per_cpu_context *hv_cpu
2630                         = per_cpu_ptr(hv_context.cpu_context, cpu);
2631
2632                 tasklet_kill(&hv_cpu->msg_dpc);
2633         }
2634         hv_debug_rm_all_dir();
2635
2636         vmbus_free_channels();
2637         kfree(vmbus_connection.channels);
2638
2639         if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
2640                 kmsg_dump_unregister(&hv_kmsg_dumper);
2641                 unregister_die_notifier(&hyperv_die_block);
2642                 atomic_notifier_chain_unregister(&panic_notifier_list,
2643                                                  &hyperv_panic_block);
2644         }
2645
2646         hv_free_hyperv_page((unsigned long)hv_panic_page);
2647         unregister_sysctl_table(hv_ctl_table_hdr);
2648         hv_ctl_table_hdr = NULL;
2649         bus_unregister(&hv_bus);
2650
2651         cpuhp_remove_state(hyperv_cpuhp_online);
2652         hv_synic_free();
2653         acpi_bus_unregister_driver(&vmbus_acpi_driver);
2654 }
2655
2656
2657 MODULE_LICENSE("GPL");
2658 MODULE_DESCRIPTION("Microsoft Hyper-V VMBus Driver");
2659
2660 subsys_initcall(hv_acpi_init);
2661 module_exit(vmbus_exit);