drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
  84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
  85
  86 #define AMDGPU_RESUME_MS                2000
  87
  88 const char *amdgpu_asic_name[] = {
  89         "TAHITI",
  90         "PITCAIRN",
  91         "VERDE",
  92         "OLAND",
  93         "HAINAN",
  94         "BONAIRE",
  95         "KAVERI",
  96         "KABINI",
  97         "HAWAII",
  98         "MULLINS",
  99         "TOPAZ",
 100         "TONGA",
 101         "FIJI",
 102         "CARRIZO",
 103         "STONEY",
 104         "POLARIS10",
 105         "POLARIS11",
 106         "POLARIS12",
 107         "VEGAM",
 108         "VEGA10",
 109         "VEGA12",
 110         "VEGA20",
 111         "RAVEN",
 112         "ARCTURUS",
 113         "RENOIR",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "LAST",
 120 };
 121
 122 /**
 123  * DOC: pcie_replay_count
 124  *
 125  * The amdgpu driver provides a sysfs API for reporting the total number
 126  * of PCIe replays (NAKs)
 127  * The file pcie_replay_count is used for this and returns the total
 128  * number of replays as a sum of the NAKs generated and NAKs received
 129  */
 130
 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 132                 struct device_attribute *attr, char *buf)
 133 {
 134         struct drm_device *ddev = dev_get_drvdata(dev);
 135         struct amdgpu_device *adev = drm_to_adev(ddev);
 136         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 137
 138         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 139 }
 140
 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 142                 amdgpu_device_get_pcie_replay_count, NULL);
 143
 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 145
/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */
 155
 156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 157                 struct device_attribute *attr, char *buf)
 158 {
 159         struct drm_device *ddev = dev_get_drvdata(dev);
 160         struct amdgpu_device *adev = drm_to_adev(ddev);
 161
 162         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 163 }
 164
 165 static DEVICE_ATTR(product_name, S_IRUGO,
 166                 amdgpu_device_get_product_name, NULL);
 167
/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */
 177
 178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 179                 struct device_attribute *attr, char *buf)
 180 {
 181         struct drm_device *ddev = dev_get_drvdata(dev);
 182         struct amdgpu_device *adev = drm_to_adev(ddev);
 183
 184         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 185 }
 186
 187 static DEVICE_ATTR(product_number, S_IRUGO,
 188                 amdgpu_device_get_product_number, NULL);
 189
 190 /**
 191  * DOC: serial_number
 192  *
 193  * The amdgpu driver provides a sysfs API for reporting the serial number
 194  * for the device
 195  * The file serial_number is used for this and returns the serial number
 196  * as returned from the FRU.
 197  * NOTE: This is only available for certain server cards
 198  */
 199
 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 201                 struct device_attribute *attr, char *buf)
 202 {
 203         struct drm_device *ddev = dev_get_drvdata(dev);
 204         struct amdgpu_device *adev = drm_to_adev(ddev);
 205
 206         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 207 }
 208
 209 static DEVICE_ATTR(serial_number, S_IRUGO,
 210                 amdgpu_device_get_serial_number, NULL);
 211
 212 /**
 213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 214  *
 215  * @dev: drm_device pointer
 216  *
 217  * Returns true if the device is a dGPU with HG/PX power control,
 218  * otherwise return false.
 219  */
 220 bool amdgpu_device_supports_boco(struct drm_device *dev)
 221 {
 222         struct amdgpu_device *adev = drm_to_adev(dev);
 223
 224         if (adev->flags & AMD_IS_PX)
 225                 return true;
 226         return false;
 227 }
 228
 229 /**
 230  * amdgpu_device_supports_baco - Does the device support BACO
 231  *
 232  * @dev: drm_device pointer
 233  *
 234  * Returns true if the device supporte BACO,
 235  * otherwise return false.
 236  */
 237 bool amdgpu_device_supports_baco(struct drm_device *dev)
 238 {
 239         struct amdgpu_device *adev = drm_to_adev(dev);
 240
 241         return amdgpu_asic_supports_baco(adev);
 242 }
 243
 244 /**
 245  * VRAM access helper functions.
 246  *
 247  * amdgpu_device_vram_access - read/write a buffer in vram
 248  *
 249  * @adev: amdgpu_device pointer
 250  * @pos: offset of the buffer in vram
 251  * @buf: virtual address of the buffer in system memory
 252  * @size: read/write size, sizeof(@buf) must > @size
 253  * @write: true - write to vram, otherwise - read from vram
 254  */
 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 256                                uint32_t *buf, size_t size, bool write)
 257 {
 258         unsigned long flags;
 259         uint32_t hi = ~0;
 260         uint64_t last;
 261
 262
 263 #ifdef CONFIG_64BIT
 264         last = min(pos + size, adev->gmc.visible_vram_size);
 265         if (last > pos) {
 266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 267                 size_t count = last - pos;
 268
 269                 if (write) {
 270                         memcpy_toio(addr, buf, count);
 271                         mb();
 272                         amdgpu_asic_flush_hdp(adev, NULL);
 273                 } else {
 274                         amdgpu_asic_invalidate_hdp(adev, NULL);
 275                         mb();
 276                         memcpy_fromio(buf, addr, count);
 277                 }
 278
 279                 if (count == size)
 280                         return;
 281
 282                 pos += count;
 283                 buf += count / 4;
 284                 size -= count;
 285         }
 286 #endif
 287
 288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 289         for (last = pos + size; pos < last; pos += 4) {
 290                 uint32_t tmp = pos >> 31;
 291
 292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 293                 if (tmp != hi) {
 294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 295                         hi = tmp;
 296                 }
 297                 if (write)
 298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 299                 else
 300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 301         }
 302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 303 }
 304
 305 /*
 306  * MMIO register access helper functions.
 307  */
 308 /**
 309  * amdgpu_mm_rreg - read a memory mapped IO register
 310  *
 311  * @adev: amdgpu_device pointer
 312  * @reg: dword aligned register offset
 313  * @acc_flags: access flags which require special behavior
 314  *
 315  * Returns the 32 bit value from the offset specified.
 316  */
 317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 318                         uint32_t acc_flags)
 319 {
 320         uint32_t ret;
 321
 322         if (adev->in_pci_err_recovery)
 323                 return 0;
 324
 325         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
 326             down_read_trylock(&adev->reset_sem)) {
 327                 ret = amdgpu_kiq_rreg(adev, reg);
 328                 up_read(&adev->reset_sem);
 329                 return ret;
 330         }
 331
 332         if ((reg * 4) < adev->rmmio_size)
 333                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 334         else {
 335                 unsigned long flags;
 336
 337                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 338                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 339                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 340                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 341         }
 342
 343         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
 344         return ret;
 345 }
 346
 347 /*
 348  * MMIO register read with bytes helper functions
 349  * @offset:bytes offset from MMIO start
 350  *
 351 */
 352
 353 /**
 354  * amdgpu_mm_rreg8 - read a memory mapped IO register
 355  *
 356  * @adev: amdgpu_device pointer
 357  * @offset: byte aligned register offset
 358  *
 359  * Returns the 8 bit value from the offset specified.
 360  */
 361 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
 362         if (adev->in_pci_err_recovery)
 363                 return 0;
 364
 365         if (offset < adev->rmmio_size)
 366                 return (readb(adev->rmmio + offset));
 367         BUG();
 368 }
 369
 370 /*
 371  * MMIO register write with bytes helper functions
 372  * @offset:bytes offset from MMIO start
 373  * @value: the value want to be written to the register
 374  *
 375 */
 376 /**
 377  * amdgpu_mm_wreg8 - read a memory mapped IO register
 378  *
 379  * @adev: amdgpu_device pointer
 380  * @offset: byte aligned register offset
 381  * @value: 8 bit value to write
 382  *
 383  * Writes the value specified to the offset specified.
 384  */
 385 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
 386         if (adev->in_pci_err_recovery)
 387                 return;
 388
 389         if (offset < adev->rmmio_size)
 390                 writeb(value, adev->rmmio + offset);
 391         else
 392                 BUG();
 393 }
 394
 395 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
 396                                        uint32_t reg, uint32_t v,
 397                                        uint32_t acc_flags)
 398 {
 399         if (adev->in_pci_err_recovery)
 400                 return;
 401
 402         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 403
 404         if ((reg * 4) < adev->rmmio_size)
 405                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 406         else {
 407                 unsigned long flags;
 408
 409                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 410                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 411                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 412                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 413         }
 414 }
 415
 416 /**
 417  * amdgpu_mm_wreg - write to a memory mapped IO register
 418  *
 419  * @adev: amdgpu_device pointer
 420  * @reg: dword aligned register offset
 421  * @v: 32 bit value to write to the register
 422  * @acc_flags: access flags which require special behavior
 423  *
 424  * Writes the value specified to the offset specified.
 425  */
 426 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 427                     uint32_t acc_flags)
 428 {
 429         if (adev->in_pci_err_recovery)
 430                 return;
 431
 432         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
 433             down_read_trylock(&adev->reset_sem)) {
 434                 amdgpu_kiq_wreg(adev, reg, v);
 435                 up_read(&adev->reset_sem);
 436                 return;
 437         }
 438
 439         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 440 }
 441
 442 /*
 443  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 444  *
 445  * this function is invoked only the debugfs register access
 446  * */
 447 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 448                     uint32_t acc_flags)
 449 {
 450         if (adev->in_pci_err_recovery)
 451                 return;
 452
 453         if (amdgpu_sriov_fullaccess(adev) &&
 454                 adev->gfx.rlc.funcs &&
 455                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
 456
 457                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 458                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 459         }
 460
 461         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 462 }
 463
 464 /**
 465  * amdgpu_io_rreg - read an IO register
 466  *
 467  * @adev: amdgpu_device pointer
 468  * @reg: dword aligned register offset
 469  *
 470  * Returns the 32 bit value from the offset specified.
 471  */
 472 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 473 {
 474         if (adev->in_pci_err_recovery)
 475                 return 0;
 476
 477         if ((reg * 4) < adev->rio_mem_size)
 478                 return ioread32(adev->rio_mem + (reg * 4));
 479         else {
 480                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 481                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 482         }
 483 }
 484
 485 /**
 486  * amdgpu_io_wreg - write to an IO register
 487  *
 488  * @adev: amdgpu_device pointer
 489  * @reg: dword aligned register offset
 490  * @v: 32 bit value to write to the register
 491  *
 492  * Writes the value specified to the offset specified.
 493  */
 494 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 495 {
 496         if (adev->in_pci_err_recovery)
 497                 return;
 498
 499         if ((reg * 4) < adev->rio_mem_size)
 500                 iowrite32(v, adev->rio_mem + (reg * 4));
 501         else {
 502                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 503                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 504         }
 505 }
 506
 507 /**
 508  * amdgpu_mm_rdoorbell - read a doorbell dword
 509  *
 510  * @adev: amdgpu_device pointer
 511  * @index: doorbell index
 512  *
 513  * Returns the value in the doorbell aperture at the
 514  * requested doorbell index (CIK).
 515  */
 516 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 517 {
 518         if (adev->in_pci_err_recovery)
 519                 return 0;
 520
 521         if (index < adev->doorbell.num_doorbells) {
 522                 return readl(adev->doorbell.ptr + index);
 523         } else {
 524                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 525                 return 0;
 526         }
 527 }
 528
 529 /**
 530  * amdgpu_mm_wdoorbell - write a doorbell dword
 531  *
 532  * @adev: amdgpu_device pointer
 533  * @index: doorbell index
 534  * @v: value to write
 535  *
 536  * Writes @v to the doorbell aperture at the
 537  * requested doorbell index (CIK).
 538  */
 539 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 540 {
 541         if (adev->in_pci_err_recovery)
 542                 return;
 543
 544         if (index < adev->doorbell.num_doorbells) {
 545                 writel(v, adev->doorbell.ptr + index);
 546         } else {
 547                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 548         }
 549 }
 550
 551 /**
 552  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 553  *
 554  * @adev: amdgpu_device pointer
 555  * @index: doorbell index
 556  *
 557  * Returns the value in the doorbell aperture at the
 558  * requested doorbell index (VEGA10+).
 559  */
 560 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 561 {
 562         if (adev->in_pci_err_recovery)
 563                 return 0;
 564
 565         if (index < adev->doorbell.num_doorbells) {
 566                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 567         } else {
 568                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 569                 return 0;
 570         }
 571 }
 572
 573 /**
 574  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 575  *
 576  * @adev: amdgpu_device pointer
 577  * @index: doorbell index
 578  * @v: value to write
 579  *
 580  * Writes @v to the doorbell aperture at the
 581  * requested doorbell index (VEGA10+).
 582  */
 583 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 584 {
 585         if (adev->in_pci_err_recovery)
 586                 return;
 587
 588         if (index < adev->doorbell.num_doorbells) {
 589                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 590         } else {
 591                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 592         }
 593 }
 594
 595 /**
 596  * amdgpu_invalid_rreg - dummy reg read function
 597  *
 598  * @adev: amdgpu device pointer
 599  * @reg: offset of register
 600  *
 601  * Dummy register read function.  Used for register blocks
 602  * that certain asics don't have (all asics).
 603  * Returns the value in the register.
 604  */
 605 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 606 {
 607         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 608         BUG();
 609         return 0;
 610 }
 611
 612 /**
 613  * amdgpu_invalid_wreg - dummy reg write function
 614  *
 615  * @adev: amdgpu device pointer
 616  * @reg: offset of register
 617  * @v: value to write to the register
 618  *
 619  * Dummy register read function.  Used for register blocks
 620  * that certain asics don't have (all asics).
 621  */
 622 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 623 {
 624         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 625                   reg, v);
 626         BUG();
 627 }
 628
 629 /**
 630  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 631  *
 632  * @adev: amdgpu device pointer
 633  * @reg: offset of register
 634  *
 635  * Dummy register read function.  Used for register blocks
 636  * that certain asics don't have (all asics).
 637  * Returns the value in the register.
 638  */
 639 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 640 {
 641         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 642         BUG();
 643         return 0;
 644 }
 645
 646 /**
 647  * amdgpu_invalid_wreg64 - dummy reg write function
 648  *
 649  * @adev: amdgpu device pointer
 650  * @reg: offset of register
 651  * @v: value to write to the register
 652  *
 653  * Dummy register read function.  Used for register blocks
 654  * that certain asics don't have (all asics).
 655  */
 656 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 657 {
 658         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 659                   reg, v);
 660         BUG();
 661 }
 662
 663 /**
 664  * amdgpu_block_invalid_rreg - dummy reg read function
 665  *
 666  * @adev: amdgpu device pointer
 667  * @block: offset of instance
 668  * @reg: offset of register
 669  *
 670  * Dummy register read function.  Used for register blocks
 671  * that certain asics don't have (all asics).
 672  * Returns the value in the register.
 673  */
 674 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 675                                           uint32_t block, uint32_t reg)
 676 {
 677         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 678                   reg, block);
 679         BUG();
 680         return 0;
 681 }
 682
 683 /**
 684  * amdgpu_block_invalid_wreg - dummy reg write function
 685  *
 686  * @adev: amdgpu device pointer
 687  * @block: offset of instance
 688  * @reg: offset of register
 689  * @v: value to write to the register
 690  *
 691  * Dummy register read function.  Used for register blocks
 692  * that certain asics don't have (all asics).
 693  */
 694 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 695                                       uint32_t block,
 696                                       uint32_t reg, uint32_t v)
 697 {
 698         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 699                   reg, block, v);
 700         BUG();
 701 }
 702
 703 /**
 704  * amdgpu_device_asic_init - Wrapper for atom asic_init
 705  *
 706  * @dev: drm_device pointer
 707  *
 708  * Does any asic specific work and then calls atom asic init.
 709  */
 710 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 711 {
 712         amdgpu_asic_pre_asic_init(adev);
 713
 714         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 715 }
 716
 717 /**
 718  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 719  *
 720  * @adev: amdgpu device pointer
 721  *
 722  * Allocates a scratch page of VRAM for use by various things in the
 723  * driver.
 724  */
 725 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 726 {
 727         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 728                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 729                                        &adev->vram_scratch.robj,
 730                                        &adev->vram_scratch.gpu_addr,
 731                                        (void **)&adev->vram_scratch.ptr);
 732 }
 733
 734 /**
 735  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 736  *
 737  * @adev: amdgpu device pointer
 738  *
 739  * Frees the VRAM scratch page.
 740  */
 741 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 742 {
 743         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 744 }
 745
 746 /**
 747  * amdgpu_device_program_register_sequence - program an array of registers.
 748  *
 749  * @adev: amdgpu_device pointer
 750  * @registers: pointer to the register array
 751  * @array_size: size of the register array
 752  *
 753  * Programs an array or registers with and and or masks.
 754  * This is a helper for setting golden registers.
 755  */
 756 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 757                                              const u32 *registers,
 758                                              const u32 array_size)
 759 {
 760         u32 tmp, reg, and_mask, or_mask;
 761         int i;
 762
 763         if (array_size % 3)
 764                 return;
 765
 766         for (i = 0; i < array_size; i +=3) {
 767                 reg = registers[i + 0];
 768                 and_mask = registers[i + 1];
 769                 or_mask = registers[i + 2];
 770
 771                 if (and_mask == 0xffffffff) {
 772                         tmp = or_mask;
 773                 } else {
 774                         tmp = RREG32(reg);
 775                         tmp &= ~and_mask;
 776                         if (adev->family >= AMDGPU_FAMILY_AI)
 777                                 tmp |= (or_mask & and_mask);
 778                         else
 779                                 tmp |= or_mask;
 780                 }
 781                 WREG32(reg, tmp);
 782         }
 783 }
 784
 785 /**
 786  * amdgpu_device_pci_config_reset - reset the GPU
 787  *
 788  * @adev: amdgpu_device pointer
 789  *
 790  * Resets the GPU using the pci config reset sequence.
 791  * Only applicable to asics prior to vega10.
 792  */
 793 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 794 {
 795         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 796 }
 797
 798 /*
 799  * GPU doorbell aperture helpers function.
 800  */
 801 /**
 802  * amdgpu_device_doorbell_init - Init doorbell driver information.
 803  *
 804  * @adev: amdgpu_device pointer
 805  *
 806  * Init doorbell driver information (CIK)
 807  * Returns 0 on success, error on failure.
 808  */
 809 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 810 {
 811
 812         /* No doorbell on SI hardware generation */
 813         if (adev->asic_type < CHIP_BONAIRE) {
 814                 adev->doorbell.base = 0;
 815                 adev->doorbell.size = 0;
 816                 adev->doorbell.num_doorbells = 0;
 817                 adev->doorbell.ptr = NULL;
 818                 return 0;
 819         }
 820
 821         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 822                 return -EINVAL;
 823
 824         amdgpu_asic_init_doorbell_index(adev);
 825
 826         /* doorbell bar mapping */
 827         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 828         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 829
 830         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 831                                              adev->doorbell_index.max_assignment+1);
 832         if (adev->doorbell.num_doorbells == 0)
 833                 return -EINVAL;
 834
 835         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 836          * paging queue doorbell use the second page. The
 837          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 838          * doorbells are in the first page. So with paging queue enabled,
 839          * the max num_doorbells should + 1 page (0x400 in dword)
 840          */
 841         if (adev->asic_type >= CHIP_VEGA10)
 842                 adev->doorbell.num_doorbells += 0x400;
 843
 844         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 845                                      adev->doorbell.num_doorbells *
 846                                      sizeof(u32));
 847         if (adev->doorbell.ptr == NULL)
 848                 return -ENOMEM;
 849
 850         return 0;
 851 }
 852
 853 /**
 854  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 855  *
 856  * @adev: amdgpu_device pointer
 857  *
 858  * Tear down doorbell driver information (CIK)
 859  */
 860 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 861 {
 862         iounmap(adev->doorbell.ptr);
 863         adev->doorbell.ptr = NULL;
 864 }
 865
 866
 867
 868 /*
 869  * amdgpu_device_wb_*()
 870  * Writeback is the method by which the GPU updates special pages in memory
 871  * with the status of certain GPU events (fences, ring pointers,etc.).
 872  */
 873
 874 /**
 875  * amdgpu_device_wb_fini - Disable Writeback and free memory
 876  *
 877  * @adev: amdgpu_device pointer
 878  *
 879  * Disables Writeback and frees the Writeback memory (all asics).
 880  * Used at driver shutdown.
 881  */
 882 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 883 {
 884         if (adev->wb.wb_obj) {
 885                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 886                                       &adev->wb.gpu_addr,
 887                                       (void **)&adev->wb.wb);
 888                 adev->wb.wb_obj = NULL;
 889         }
 890 }
 891
 892 /**
 893  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
 894  *
 895  * @adev: amdgpu_device pointer
 896  *
 897  * Initializes writeback and allocates writeback memory (all asics).
 898  * Used at driver startup.
 899  * Returns 0 on success or an -error on failure.
 900  */
 901 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
 902 {
 903         int r;
 904
 905         if (adev->wb.wb_obj == NULL) {
 906                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
 907                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
 908                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
 909                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
 910                                             (void **)&adev->wb.wb);
 911                 if (r) {
 912                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
 913                         return r;
 914                 }
 915
 916                 adev->wb.num_wb = AMDGPU_MAX_WB;
 917                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
 918
 919                 /* clear wb memory */
 920                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
 921         }
 922
 923         return 0;
 924 }
 925
 926 /**
 927  * amdgpu_device_wb_get - Allocate a wb entry
 928  *
 929  * @adev: amdgpu_device pointer
 930  * @wb: wb index
 931  *
 932  * Allocate a wb slot for use by the driver (all asics).
 933  * Returns 0 on success or -EINVAL on failure.
 934  */
 935 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 936 {
 937         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 938
 939         if (offset < adev->wb.num_wb) {
 940                 __set_bit(offset, adev->wb.used);
 941                 *wb = offset << 3; /* convert to dw offset */
 942                 return 0;
 943         } else {
 944                 return -EINVAL;
 945         }
 946 }
 947
 948 /**
 949  * amdgpu_device_wb_free - Free a wb entry
 950  *
 951  * @adev: amdgpu_device pointer
 952  * @wb: wb index
 953  *
 954  * Free a wb slot allocated for use by the driver (all asics)
 955  */
 956 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 957 {
 958         wb >>= 3;
 959         if (wb < adev->wb.num_wb)
 960                 __clear_bit(wb, adev->wb.used);
 961 }
 962
 963 /**
 964  * amdgpu_device_resize_fb_bar - try to resize FB BAR
 965  *
 966  * @adev: amdgpu_device pointer
 967  *
 968  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 969  * to fail, but if any of the BARs is not accessible after the size we abort
 970  * driver loading by returning -ENODEV.
 971  */
 972 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 973 {
 974         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
 975         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
 976         struct pci_bus *root;
 977         struct resource *res;
 978         unsigned i;
 979         u16 cmd;
 980         int r;
 981
 982         /* Bypass for VF */
 983         if (amdgpu_sriov_vf(adev))
 984                 return 0;
 985
 986         /* skip if the bios has already enabled large BAR */
 987         if (adev->gmc.real_vram_size &&
 988             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
 989                 return 0;
 990
 991         /* Check if the root BUS has 64bit memory resources */
 992         root = adev->pdev->bus;
 993         while (root->parent)
 994                 root = root->parent;
 995
 996         pci_bus_for_each_resource(root, res, i) {
 997                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
 998                     res->start > 0x100000000ull)
 999                         break;
1000         }
1001
1002         /* Trying to resize is pointless without a root hub window above 4GB */
1003         if (!res)
1004                 return 0;
1005
1006         /* Disable memory decoding while we change the BAR addresses and size */
1007         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1008         pci_write_config_word(adev->pdev, PCI_COMMAND,
1009                               cmd & ~PCI_COMMAND_MEMORY);
1010
1011         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1012         amdgpu_device_doorbell_fini(adev);
1013         if (adev->asic_type >= CHIP_BONAIRE)
1014                 pci_release_resource(adev->pdev, 2);
1015
1016         pci_release_resource(adev->pdev, 0);
1017
1018         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1019         if (r == -ENOSPC)
1020                 DRM_INFO("Not enough PCI address space for a large BAR.");
1021         else if (r && r != -ENOTSUPP)
1022                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1023
1024         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1025
1026         /* When the doorbell or fb BAR isn't available we have no chance of
1027          * using the device.
1028          */
1029         r = amdgpu_device_doorbell_init(adev);
1030         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1031                 return -ENODEV;
1032
1033         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1034
1035         return 0;
1036 }
1037
/*
 * GPU helper functions.
 */
1041 /**
1042  * amdgpu_device_need_post - check if the hw need post or not
1043  *
1044  * @adev: amdgpu_device pointer
1045  *
1046  * Check if the asic has been initialized (all asics) at driver startup
1047  * or post is needed if  hw reset is performed.
1048  * Returns true if need or false if not.
1049  */
1050 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1051 {
1052         uint32_t reg;
1053
1054         if (amdgpu_sriov_vf(adev))
1055                 return false;
1056
1057         if (amdgpu_passthrough(adev)) {
1058                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1059                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1060                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1061                  * vpost executed for smc version below 22.15
1062                  */
1063                 if (adev->asic_type == CHIP_FIJI) {
1064                         int err;
1065                         uint32_t fw_ver;
1066                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1067                         /* force vPost if error occured */
1068                         if (err)
1069                                 return true;
1070
1071                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1072                         if (fw_ver < 0x00160e00)
1073                                 return true;
1074                 }
1075         }
1076
1077         if (adev->has_hw_reset) {
1078                 adev->has_hw_reset = false;
1079                 return true;
1080         }
1081
1082         /* bios scratch used on CIK+ */
1083         if (adev->asic_type >= CHIP_BONAIRE)
1084                 return amdgpu_atombios_scratch_need_asic_init(adev);
1085
1086         /* check MEM_SIZE for older asics */
1087         reg = amdgpu_asic_get_config_memsize(adev);
1088
1089         if ((reg != 0) && (reg != 0xffffffff))
1090                 return false;
1091
1092         return true;
1093 }
1094
1095 /* if we get transitioned to only one device, take VGA back */
1096 /**
1097  * amdgpu_device_vga_set_decode - enable/disable vga decode
1098  *
1099  * @cookie: amdgpu_device pointer
1100  * @state: enable/disable vga decode
1101  *
1102  * Enable/disable vga decode (all asics).
1103  * Returns VGA resource flags.
1104  */
1105 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1106 {
1107         struct amdgpu_device *adev = cookie;
1108         amdgpu_asic_set_vga_state(adev, state);
1109         if (state)
1110                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1111                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1112         else
1113                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1114 }
1115
1116 /**
1117  * amdgpu_device_check_block_size - validate the vm block size
1118  *
1119  * @adev: amdgpu_device pointer
1120  *
1121  * Validates the vm block size specified via module parameter.
1122  * The vm block size defines number of bits in page table versus page directory,
1123  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1124  * page table and the remaining bits are in the page directory.
1125  */
1126 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1127 {
1128         /* defines number of bits in page table versus page directory,
1129          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1130          * page table and the remaining bits are in the page directory */
1131         if (amdgpu_vm_block_size == -1)
1132                 return;
1133
1134         if (amdgpu_vm_block_size < 9) {
1135                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1136                          amdgpu_vm_block_size);
1137                 amdgpu_vm_block_size = -1;
1138         }
1139 }
1140
1141 /**
1142  * amdgpu_device_check_vm_size - validate the vm size
1143  *
1144  * @adev: amdgpu_device pointer
1145  *
1146  * Validates the vm size in GB specified via module parameter.
1147  * The VM size is the size of the GPU virtual memory space in GB.
1148  */
1149 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1150 {
1151         /* no need to check the default value */
1152         if (amdgpu_vm_size == -1)
1153                 return;
1154
1155         if (amdgpu_vm_size < 1) {
1156                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1157                          amdgpu_vm_size);
1158                 amdgpu_vm_size = -1;
1159         }
1160 }
1161
1162 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1163 {
1164         struct sysinfo si;
1165         bool is_os_64 = (sizeof(void *) == 8);
1166         uint64_t total_memory;
1167         uint64_t dram_size_seven_GB = 0x1B8000000;
1168         uint64_t dram_size_three_GB = 0xB8000000;
1169
1170         if (amdgpu_smu_memory_pool_size == 0)
1171                 return;
1172
1173         if (!is_os_64) {
1174                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1175                 goto def_value;
1176         }
1177         si_meminfo(&si);
1178         total_memory = (uint64_t)si.totalram * si.mem_unit;
1179
1180         if ((amdgpu_smu_memory_pool_size == 1) ||
1181                 (amdgpu_smu_memory_pool_size == 2)) {
1182                 if (total_memory < dram_size_three_GB)
1183                         goto def_value1;
1184         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1185                 (amdgpu_smu_memory_pool_size == 8)) {
1186                 if (total_memory < dram_size_seven_GB)
1187                         goto def_value1;
1188         } else {
1189                 DRM_WARN("Smu memory pool size not supported\n");
1190                 goto def_value;
1191         }
1192         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1193
1194         return;
1195
1196 def_value1:
1197         DRM_WARN("No enough system memory\n");
1198 def_value:
1199         adev->pm.smu_prv_buffer_size = 0;
1200 }
1201
1202 /**
1203  * amdgpu_device_check_arguments - validate module params
1204  *
1205  * @adev: amdgpu_device pointer
1206  *
1207  * Validates certain module parameters and updates
1208  * the associated values used by the driver (all asics).
1209  */
1210 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1211 {
1212         if (amdgpu_sched_jobs < 4) {
1213                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1214                          amdgpu_sched_jobs);
1215                 amdgpu_sched_jobs = 4;
1216         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1217                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1218                          amdgpu_sched_jobs);
1219                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1220         }
1221
1222         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1223                 /* gart size must be greater or equal to 32M */
1224                 dev_warn(adev->dev, "gart size (%d) too small\n",
1225                          amdgpu_gart_size);
1226                 amdgpu_gart_size = -1;
1227         }
1228
1229         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1230                 /* gtt size must be greater or equal to 32M */
1231                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1232                                  amdgpu_gtt_size);
1233                 amdgpu_gtt_size = -1;
1234         }
1235
1236         /* valid range is between 4 and 9 inclusive */
1237         if (amdgpu_vm_fragment_size != -1 &&
1238             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1239                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1240                 amdgpu_vm_fragment_size = -1;
1241         }
1242
1243         if (amdgpu_sched_hw_submission < 2) {
1244                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1245                          amdgpu_sched_hw_submission);
1246                 amdgpu_sched_hw_submission = 2;
1247         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1248                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1249                          amdgpu_sched_hw_submission);
1250                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1251         }
1252
1253         amdgpu_device_check_smu_prv_buffer_size(adev);
1254
1255         amdgpu_device_check_vm_size(adev);
1256
1257         amdgpu_device_check_block_size(adev);
1258
1259         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1260
1261         amdgpu_gmc_tmz_set(adev);
1262
1263         if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1264                 amdgpu_num_kcq = 8;
1265                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1266         }
1267
1268         return 0;
1269 }
1270
1271 /**
1272  * amdgpu_switcheroo_set_state - set switcheroo state
1273  *
1274  * @pdev: pci dev pointer
1275  * @state: vga_switcheroo state
1276  *
1277  * Callback for the switcheroo driver.  Suspends or resumes the
1278  * the asics before or after it is powered up using ACPI methods.
1279  */
1280 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1281                                         enum vga_switcheroo_state state)
1282 {
1283         struct drm_device *dev = pci_get_drvdata(pdev);
1284         int r;
1285
1286         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1287                 return;
1288
1289         if (state == VGA_SWITCHEROO_ON) {
1290                 pr_info("switched on\n");
1291                 /* don't suspend or resume card normally */
1292                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1293
1294                 pci_set_power_state(dev->pdev, PCI_D0);
1295                 pci_restore_state(dev->pdev);
1296                 r = pci_enable_device(dev->pdev);
1297                 if (r)
1298                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1299                 amdgpu_device_resume(dev, true);
1300
1301                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1302                 drm_kms_helper_poll_enable(dev);
1303         } else {
1304                 pr_info("switched off\n");
1305                 drm_kms_helper_poll_disable(dev);
1306                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1307                 amdgpu_device_suspend(dev, true);
1308                 pci_save_state(dev->pdev);
1309                 /* Shut down the device */
1310                 pci_disable_device(dev->pdev);
1311                 pci_set_power_state(dev->pdev, PCI_D3cold);
1312                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1313         }
1314 }
1315
1316 /**
1317  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1318  *
1319  * @pdev: pci dev pointer
1320  *
1321  * Callback for the switcheroo driver.  Check of the switcheroo
1322  * state can be changed.
1323  * Returns true if the state can be changed, false if not.
1324  */
1325 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1326 {
1327         struct drm_device *dev = pci_get_drvdata(pdev);
1328
1329         /*
1330         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1331         * locking inversion with the driver load path. And the access here is
1332         * completely racy anyway. So don't bother with locking for now.
1333         */
1334         return atomic_read(&dev->open_count) == 0;
1335 }
1336
1337 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1338         .set_gpu_state = amdgpu_switcheroo_set_state,
1339         .reprobe = NULL,
1340         .can_switch = amdgpu_switcheroo_can_switch,
1341 };
1342
1343 /**
1344  * amdgpu_device_ip_set_clockgating_state - set the CG state
1345  *
1346  * @dev: amdgpu_device pointer
1347  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1348  * @state: clockgating state (gate or ungate)
1349  *
1350  * Sets the requested clockgating state for all instances of
1351  * the hardware IP specified.
1352  * Returns the error code from the last instance.
1353  */
1354 int amdgpu_device_ip_set_clockgating_state(void *dev,
1355                                            enum amd_ip_block_type block_type,
1356                                            enum amd_clockgating_state state)
1357 {
1358         struct amdgpu_device *adev = dev;
1359         int i, r = 0;
1360
1361         for (i = 0; i < adev->num_ip_blocks; i++) {
1362                 if (!adev->ip_blocks[i].status.valid)
1363                         continue;
1364                 if (adev->ip_blocks[i].version->type != block_type)
1365                         continue;
1366                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1367                         continue;
1368                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1369                         (void *)adev, state);
1370                 if (r)
1371                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1372                                   adev->ip_blocks[i].version->funcs->name, r);
1373         }
1374         return r;
1375 }
1376
1377 /**
1378  * amdgpu_device_ip_set_powergating_state - set the PG state
1379  *
1380  * @dev: amdgpu_device pointer
1381  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1382  * @state: powergating state (gate or ungate)
1383  *
1384  * Sets the requested powergating state for all instances of
1385  * the hardware IP specified.
1386  * Returns the error code from the last instance.
1387  */
1388 int amdgpu_device_ip_set_powergating_state(void *dev,
1389                                            enum amd_ip_block_type block_type,
1390                                            enum amd_powergating_state state)
1391 {
1392         struct amdgpu_device *adev = dev;
1393         int i, r = 0;
1394
1395         for (i = 0; i < adev->num_ip_blocks; i++) {
1396                 if (!adev->ip_blocks[i].status.valid)
1397                         continue;
1398                 if (adev->ip_blocks[i].version->type != block_type)
1399                         continue;
1400                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1401                         continue;
1402                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1403                         (void *)adev, state);
1404                 if (r)
1405                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1406                                   adev->ip_blocks[i].version->funcs->name, r);
1407         }
1408         return r;
1409 }
1410
1411 /**
1412  * amdgpu_device_ip_get_clockgating_state - get the CG state
1413  *
1414  * @adev: amdgpu_device pointer
1415  * @flags: clockgating feature flags
1416  *
1417  * Walks the list of IPs on the device and updates the clockgating
1418  * flags for each IP.
1419  * Updates @flags with the feature flags for each hardware IP where
1420  * clockgating is enabled.
1421  */
1422 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1423                                             u32 *flags)
1424 {
1425         int i;
1426
1427         for (i = 0; i < adev->num_ip_blocks; i++) {
1428                 if (!adev->ip_blocks[i].status.valid)
1429                         continue;
1430                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1431                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1432         }
1433 }
1434
1435 /**
1436  * amdgpu_device_ip_wait_for_idle - wait for idle
1437  *
1438  * @adev: amdgpu_device pointer
1439  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1440  *
1441  * Waits for the request hardware IP to be idle.
1442  * Returns 0 for success or a negative error code on failure.
1443  */
1444 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1445                                    enum amd_ip_block_type block_type)
1446 {
1447         int i, r;
1448
1449         for (i = 0; i < adev->num_ip_blocks; i++) {
1450                 if (!adev->ip_blocks[i].status.valid)
1451                         continue;
1452                 if (adev->ip_blocks[i].version->type == block_type) {
1453                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1454                         if (r)
1455                                 return r;
1456                         break;
1457                 }
1458         }
1459         return 0;
1460
1461 }
1462
1463 /**
1464  * amdgpu_device_ip_is_idle - is the hardware IP idle
1465  *
1466  * @adev: amdgpu_device pointer
1467  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1468  *
1469  * Check if the hardware IP is idle or not.
1470  * Returns true if it the IP is idle, false if not.
1471  */
1472 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1473                               enum amd_ip_block_type block_type)
1474 {
1475         int i;
1476
1477         for (i = 0; i < adev->num_ip_blocks; i++) {
1478                 if (!adev->ip_blocks[i].status.valid)
1479                         continue;
1480                 if (adev->ip_blocks[i].version->type == block_type)
1481                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1482         }
1483         return true;
1484
1485 }
1486
1487 /**
1488  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1489  *
1490  * @adev: amdgpu_device pointer
1491  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1492  *
1493  * Returns a pointer to the hardware IP block structure
1494  * if it exists for the asic, otherwise NULL.
1495  */
1496 struct amdgpu_ip_block *
1497 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1498                               enum amd_ip_block_type type)
1499 {
1500         int i;
1501
1502         for (i = 0; i < adev->num_ip_blocks; i++)
1503                 if (adev->ip_blocks[i].version->type == type)
1504                         return &adev->ip_blocks[i];
1505
1506         return NULL;
1507 }
1508
1509 /**
1510  * amdgpu_device_ip_block_version_cmp
1511  *
1512  * @adev: amdgpu_device pointer
1513  * @type: enum amd_ip_block_type
1514  * @major: major version
1515  * @minor: minor version
1516  *
1517  * return 0 if equal or greater
1518  * return 1 if smaller or the ip_block doesn't exist
1519  */
1520 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1521                                        enum amd_ip_block_type type,
1522                                        u32 major, u32 minor)
1523 {
1524         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1525
1526         if (ip_block && ((ip_block->version->major > major) ||
1527                         ((ip_block->version->major == major) &&
1528                         (ip_block->version->minor >= minor))))
1529                 return 0;
1530
1531         return 1;
1532 }
1533
1534 /**
1535  * amdgpu_device_ip_block_add
1536  *
1537  * @adev: amdgpu_device pointer
1538  * @ip_block_version: pointer to the IP to add
1539  *
1540  * Adds the IP block driver information to the collection of IPs
1541  * on the asic.
1542  */
1543 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1544                                const struct amdgpu_ip_block_version *ip_block_version)
1545 {
1546         if (!ip_block_version)
1547                 return -EINVAL;
1548
1549         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1550                   ip_block_version->funcs->name);
1551
1552         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1553
1554         return 0;
1555 }
1556
1557 /**
1558  * amdgpu_device_enable_virtual_display - enable virtual display feature
1559  *
1560  * @adev: amdgpu_device pointer
1561  *
1562  * Enabled the virtual display feature if the user has enabled it via
1563  * the module parameter virtual_display.  This feature provides a virtual
1564  * display hardware on headless boards or in virtualized environments.
1565  * This function parses and validates the configuration string specified by
1566  * the user and configues the virtual display configuration (number of
1567  * virtual connectors, crtcs, etc.) specified.
1568  */
1569 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1570 {
1571         adev->enable_virtual_display = false;
1572
1573         if (amdgpu_virtual_display) {
1574                 struct drm_device *ddev = adev_to_drm(adev);
1575                 const char *pci_address_name = pci_name(ddev->pdev);
1576                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1577
1578                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1579                 pciaddstr_tmp = pciaddstr;
1580                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1581                         pciaddname = strsep(&pciaddname_tmp, ",");
1582                         if (!strcmp("all", pciaddname)
1583                             || !strcmp(pci_address_name, pciaddname)) {
1584                                 long num_crtc;
1585                                 int res = -1;
1586
1587                                 adev->enable_virtual_display = true;
1588
1589                                 if (pciaddname_tmp)
1590                                         res = kstrtol(pciaddname_tmp, 10,
1591                                                       &num_crtc);
1592
1593                                 if (!res) {
1594                                         if (num_crtc < 1)
1595                                                 num_crtc = 1;
1596                                         if (num_crtc > 6)
1597                                                 num_crtc = 6;
1598                                         adev->mode_info.num_crtc = num_crtc;
1599                                 } else {
1600                                         adev->mode_info.num_crtc = 1;
1601                                 }
1602                                 break;
1603                         }
1604                 }
1605
1606                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1607                          amdgpu_virtual_display, pci_address_name,
1608                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1609
1610                 kfree(pciaddstr);
1611         }
1612 }
1613
1614 /**
1615  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1616  *
1617  * @adev: amdgpu_device pointer
1618  *
1619  * Parses the asic configuration parameters specified in the gpu info
1620  * firmware and makes them availale to the driver for use in configuring
1621  * the asic.
1622  * Returns 0 on success, -EINVAL on failure.
1623  */
1624 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1625 {
1626         const char *chip_name;
1627         char fw_name[40];
1628         int err;
1629         const struct gpu_info_firmware_header_v1_0 *hdr;
1630
1631         adev->firmware.gpu_info_fw = NULL;
1632
1633         if (adev->mman.discovery_bin) {
1634                 amdgpu_discovery_get_gfx_info(adev);
1635
1636                 /*
1637                  * FIXME: The bounding box is still needed by Navi12, so
1638                  * temporarily read it from gpu_info firmware. Should be droped
1639                  * when DAL no longer needs it.
1640                  */
1641                 if (adev->asic_type != CHIP_NAVI12)
1642                         return 0;
1643         }
1644
1645         switch (adev->asic_type) {
1646 #ifdef CONFIG_DRM_AMDGPU_SI
1647         case CHIP_VERDE:
1648         case CHIP_TAHITI:
1649         case CHIP_PITCAIRN:
1650         case CHIP_OLAND:
1651         case CHIP_HAINAN:
1652 #endif
1653 #ifdef CONFIG_DRM_AMDGPU_CIK
1654         case CHIP_BONAIRE:
1655         case CHIP_HAWAII:
1656         case CHIP_KAVERI:
1657         case CHIP_KABINI:
1658         case CHIP_MULLINS:
1659 #endif
1660         case CHIP_TOPAZ:
1661         case CHIP_TONGA:
1662         case CHIP_FIJI:
1663         case CHIP_POLARIS10:
1664         case CHIP_POLARIS11:
1665         case CHIP_POLARIS12:
1666         case CHIP_VEGAM:
1667         case CHIP_CARRIZO:
1668         case CHIP_STONEY:
1669         case CHIP_VEGA20:
1670         default:
1671                 return 0;
1672         case CHIP_VEGA10:
1673                 chip_name = "vega10";
1674                 break;
1675         case CHIP_VEGA12:
1676                 chip_name = "vega12";
1677                 break;
1678         case CHIP_RAVEN:
1679                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1680                         chip_name = "raven2";
1681                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1682                         chip_name = "picasso";
1683                 else
1684                         chip_name = "raven";
1685                 break;
1686         case CHIP_ARCTURUS:
1687                 chip_name = "arcturus";
1688                 break;
1689         case CHIP_RENOIR:
1690                 chip_name = "renoir";
1691                 break;
1692         case CHIP_NAVI10:
1693                 chip_name = "navi10";
1694                 break;
1695         case CHIP_NAVI14:
1696                 chip_name = "navi14";
1697                 break;
1698         case CHIP_NAVI12:
1699                 chip_name = "navi12";
1700                 break;
1701         case CHIP_SIENNA_CICHLID:
1702                 chip_name = "sienna_cichlid";
1703                 break;
1704         case CHIP_NAVY_FLOUNDER:
1705                 chip_name = "navy_flounder";
1706                 break;
1707         }
1708
1709         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1710         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1711         if (err) {
1712                 dev_err(adev->dev,
1713                         "Failed to load gpu_info firmware \"%s\"\n",
1714                         fw_name);
1715                 goto out;
1716         }
1717         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1718         if (err) {
1719                 dev_err(adev->dev,
1720                         "Failed to validate gpu_info firmware \"%s\"\n",
1721                         fw_name);
1722                 goto out;
1723         }
1724
1725         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1726         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1727
1728         switch (hdr->version_major) {
1729         case 1:
1730         {
1731                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1732                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1733                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1734
1735                 /*
1736                  * Should be droped when DAL no longer needs it.
1737                  */
1738                 if (adev->asic_type == CHIP_NAVI12)
1739                         goto parse_soc_bounding_box;
1740
1741                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1742                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1743                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1744                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1745                 adev->gfx.config.max_texture_channel_caches =
1746                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1747                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1748                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1749                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1750                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1751                 adev->gfx.config.double_offchip_lds_buf =
1752                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1753                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1754                 adev->gfx.cu_info.max_waves_per_simd =
1755                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1756                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1757                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1758                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1759                 if (hdr->version_minor >= 1) {
1760                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1761                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1762                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1763                         adev->gfx.config.num_sc_per_sh =
1764                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1765                         adev->gfx.config.num_packer_per_sc =
1766                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1767                 }
1768
1769 parse_soc_bounding_box:
1770                 /*
1771                  * soc bounding box info is not integrated in disocovery table,
1772                  * we always need to parse it from gpu info firmware if needed.
1773                  */
1774                 if (hdr->version_minor == 2) {
1775                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1776                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1777                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1778                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1779                 }
1780                 break;
1781         }
1782         default:
1783                 dev_err(adev->dev,
1784                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1785                 err = -EINVAL;
1786                 goto out;
1787         }
1788 out:
1789         return err;
1790 }
1791
1792 /**
1793  * amdgpu_device_ip_early_init - run early init for hardware IPs
1794  *
1795  * @adev: amdgpu_device pointer
1796  *
1797  * Early initialization pass for hardware IPs.  The hardware IPs that make
1798  * up each asic are discovered each IP's early_init callback is run.  This
1799  * is the first stage in initializing the asic.
1800  * Returns 0 on success, negative error code on failure.
1801  */
1802 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1803 {
1804         int i, r;
1805
1806         amdgpu_device_enable_virtual_display(adev);
1807
1808         if (amdgpu_sriov_vf(adev)) {
1809                 r = amdgpu_virt_request_full_gpu(adev, true);
1810                 if (r)
1811                         return r;
1812         }
1813
1814         switch (adev->asic_type) {
1815 #ifdef CONFIG_DRM_AMDGPU_SI
1816         case CHIP_VERDE:
1817         case CHIP_TAHITI:
1818         case CHIP_PITCAIRN:
1819         case CHIP_OLAND:
1820         case CHIP_HAINAN:
1821                 adev->family = AMDGPU_FAMILY_SI;
1822                 r = si_set_ip_blocks(adev);
1823                 if (r)
1824                         return r;
1825                 break;
1826 #endif
1827 #ifdef CONFIG_DRM_AMDGPU_CIK
1828         case CHIP_BONAIRE:
1829         case CHIP_HAWAII:
1830         case CHIP_KAVERI:
1831         case CHIP_KABINI:
1832         case CHIP_MULLINS:
1833                 if (adev->flags & AMD_IS_APU)
1834                         adev->family = AMDGPU_FAMILY_KV;
1835                 else
1836                         adev->family = AMDGPU_FAMILY_CI;
1837
1838                 r = cik_set_ip_blocks(adev);
1839                 if (r)
1840                         return r;
1841                 break;
1842 #endif
1843         case CHIP_TOPAZ:
1844         case CHIP_TONGA:
1845         case CHIP_FIJI:
1846         case CHIP_POLARIS10:
1847         case CHIP_POLARIS11:
1848         case CHIP_POLARIS12:
1849         case CHIP_VEGAM:
1850         case CHIP_CARRIZO:
1851         case CHIP_STONEY:
1852                 if (adev->flags & AMD_IS_APU)
1853                         adev->family = AMDGPU_FAMILY_CZ;
1854                 else
1855                         adev->family = AMDGPU_FAMILY_VI;
1856
1857                 r = vi_set_ip_blocks(adev);
1858                 if (r)
1859                         return r;
1860                 break;
1861         case CHIP_VEGA10:
1862         case CHIP_VEGA12:
1863         case CHIP_VEGA20:
1864         case CHIP_RAVEN:
1865         case CHIP_ARCTURUS:
1866         case CHIP_RENOIR:
1867                 if (adev->flags & AMD_IS_APU)
1868                         adev->family = AMDGPU_FAMILY_RV;
1869                 else
1870                         adev->family = AMDGPU_FAMILY_AI;
1871
1872                 r = soc15_set_ip_blocks(adev);
1873                 if (r)
1874                         return r;
1875                 break;
1876         case  CHIP_NAVI10:
1877         case  CHIP_NAVI14:
1878         case  CHIP_NAVI12:
1879         case  CHIP_SIENNA_CICHLID:
1880         case  CHIP_NAVY_FLOUNDER:
1881                 adev->family = AMDGPU_FAMILY_NV;
1882
1883                 r = nv_set_ip_blocks(adev);
1884                 if (r)
1885                         return r;
1886                 break;
1887         default:
1888                 /* FIXME: not supported yet */
1889                 return -EINVAL;
1890         }
1891
1892         amdgpu_amdkfd_device_probe(adev);
1893
1894         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1895         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1896                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1897
1898         for (i = 0; i < adev->num_ip_blocks; i++) {
1899                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1900                         DRM_ERROR("disabled ip block: %d <%s>\n",
1901                                   i, adev->ip_blocks[i].version->funcs->name);
1902                         adev->ip_blocks[i].status.valid = false;
1903                 } else {
1904                         if (adev->ip_blocks[i].version->funcs->early_init) {
1905                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1906                                 if (r == -ENOENT) {
1907                                         adev->ip_blocks[i].status.valid = false;
1908                                 } else if (r) {
1909                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1910                                                   adev->ip_blocks[i].version->funcs->name, r);
1911                                         return r;
1912                                 } else {
1913                                         adev->ip_blocks[i].status.valid = true;
1914                                 }
1915                         } else {
1916                                 adev->ip_blocks[i].status.valid = true;
1917                         }
1918                 }
1919                 /* get the vbios after the asic_funcs are set up */
1920                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1921                         r = amdgpu_device_parse_gpu_info_fw(adev);
1922                         if (r)
1923                                 return r;
1924
1925                         /* Read BIOS */
1926                         if (!amdgpu_get_bios(adev))
1927                                 return -EINVAL;
1928
1929                         r = amdgpu_atombios_init(adev);
1930                         if (r) {
1931                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1932                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1933                                 return r;
1934                         }
1935                 }
1936         }
1937
1938         adev->cg_flags &= amdgpu_cg_mask;
1939         adev->pg_flags &= amdgpu_pg_mask;
1940
1941         return 0;
1942 }
1943
1944 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1945 {
1946         int i, r;
1947
1948         for (i = 0; i < adev->num_ip_blocks; i++) {
1949                 if (!adev->ip_blocks[i].status.sw)
1950                         continue;
1951                 if (adev->ip_blocks[i].status.hw)
1952                         continue;
1953                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1954                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1955                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1956                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1957                         if (r) {
1958                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1959                                           adev->ip_blocks[i].version->funcs->name, r);
1960                                 return r;
1961                         }
1962                         adev->ip_blocks[i].status.hw = true;
1963                 }
1964         }
1965
1966         return 0;
1967 }
1968
1969 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1970 {
1971         int i, r;
1972
1973         for (i = 0; i < adev->num_ip_blocks; i++) {
1974                 if (!adev->ip_blocks[i].status.sw)
1975                         continue;
1976                 if (adev->ip_blocks[i].status.hw)
1977                         continue;
1978                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1979                 if (r) {
1980                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1981                                   adev->ip_blocks[i].version->funcs->name, r);
1982                         return r;
1983                 }
1984                 adev->ip_blocks[i].status.hw = true;
1985         }
1986
1987         return 0;
1988 }
1989
1990 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1991 {
1992         int r = 0;
1993         int i;
1994         uint32_t smu_version;
1995
1996         if (adev->asic_type >= CHIP_VEGA10) {
1997                 for (i = 0; i < adev->num_ip_blocks; i++) {
1998                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1999                                 continue;
2000
2001                         /* no need to do the fw loading again if already done*/
2002                         if (adev->ip_blocks[i].status.hw == true)
2003                                 break;
2004
2005                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2006                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2007                                 if (r) {
2008                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2009                                                           adev->ip_blocks[i].version->funcs->name, r);
2010                                         return r;
2011                                 }
2012                         } else {
2013                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2014                                 if (r) {
2015                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2016                                                           adev->ip_blocks[i].version->funcs->name, r);
2017                                         return r;
2018                                 }
2019                         }
2020
2021                         adev->ip_blocks[i].status.hw = true;
2022                         break;
2023                 }
2024         }
2025
2026         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2027                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2028
2029         return r;
2030 }
2031
2032 /**
2033  * amdgpu_device_ip_init - run init for hardware IPs
2034  *
2035  * @adev: amdgpu_device pointer
2036  *
2037  * Main initialization pass for hardware IPs.  The list of all the hardware
2038  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2039  * are run.  sw_init initializes the software state associated with each IP
2040  * and hw_init initializes the hardware associated with each IP.
2041  * Returns 0 on success, negative error code on failure.
2042  */
2043 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2044 {
2045         int i, r;
2046
2047         r = amdgpu_ras_init(adev);
2048         if (r)
2049                 return r;
2050
2051         for (i = 0; i < adev->num_ip_blocks; i++) {
2052                 if (!adev->ip_blocks[i].status.valid)
2053                         continue;
2054                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2055                 if (r) {
2056                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2057                                   adev->ip_blocks[i].version->funcs->name, r);
2058                         goto init_failed;
2059                 }
2060                 adev->ip_blocks[i].status.sw = true;
2061
2062                 /* need to do gmc hw init early so we can allocate gpu mem */
2063                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2064                         r = amdgpu_device_vram_scratch_init(adev);
2065                         if (r) {
2066                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2067                                 goto init_failed;
2068                         }
2069                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2070                         if (r) {
2071                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2072                                 goto init_failed;
2073                         }
2074                         r = amdgpu_device_wb_init(adev);
2075                         if (r) {
2076                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2077                                 goto init_failed;
2078                         }
2079                         adev->ip_blocks[i].status.hw = true;
2080
2081                         /* right after GMC hw init, we create CSA */
2082                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2083                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2084                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2085                                                                 AMDGPU_CSA_SIZE);
2086                                 if (r) {
2087                                         DRM_ERROR("allocate CSA failed %d\n", r);
2088                                         goto init_failed;
2089                                 }
2090                         }
2091                 }
2092         }
2093
2094         if (amdgpu_sriov_vf(adev))
2095                 amdgpu_virt_init_data_exchange(adev);
2096
2097         r = amdgpu_ib_pool_init(adev);
2098         if (r) {
2099                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2100                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2101                 goto init_failed;
2102         }
2103
2104         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2105         if (r)
2106                 goto init_failed;
2107
2108         r = amdgpu_device_ip_hw_init_phase1(adev);
2109         if (r)
2110                 goto init_failed;
2111
2112         r = amdgpu_device_fw_loading(adev);
2113         if (r)
2114                 goto init_failed;
2115
2116         r = amdgpu_device_ip_hw_init_phase2(adev);
2117         if (r)
2118                 goto init_failed;
2119
2120         /*
2121          * retired pages will be loaded from eeprom and reserved here,
2122          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2123          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2124          * for I2C communication which only true at this point.
2125          *
2126          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2127          * failure from bad gpu situation and stop amdgpu init process
2128          * accordingly. For other failed cases, it will still release all
2129          * the resource and print error message, rather than returning one
2130          * negative value to upper level.
2131          *
2132          * Note: theoretically, this should be called before all vram allocations
2133          * to protect retired page from abusing
2134          */
2135         r = amdgpu_ras_recovery_init(adev);
2136         if (r)
2137                 goto init_failed;
2138
2139         if (adev->gmc.xgmi.num_physical_nodes > 1)
2140                 amdgpu_xgmi_add_device(adev);
2141         amdgpu_amdkfd_device_init(adev);
2142
2143         amdgpu_fru_get_product_info(adev);
2144
2145 init_failed:
2146         if (amdgpu_sriov_vf(adev))
2147                 amdgpu_virt_release_full_gpu(adev, true);
2148
2149         return r;
2150 }
2151
2152 /**
2153  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2154  *
2155  * @adev: amdgpu_device pointer
2156  *
2157  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2158  * this function before a GPU reset.  If the value is retained after a
2159  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2160  */
2161 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2162 {
2163         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2164 }
2165
2166 /**
2167  * amdgpu_device_check_vram_lost - check if vram is valid
2168  *
2169  * @adev: amdgpu_device pointer
2170  *
2171  * Checks the reset magic value written to the gart pointer in VRAM.
2172  * The driver calls this after a GPU reset to see if the contents of
2173  * VRAM is lost or now.
2174  * returns true if vram is lost, false if not.
2175  */
2176 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2177 {
2178         if (memcmp(adev->gart.ptr, adev->reset_magic,
2179                         AMDGPU_RESET_MAGIC_NUM))
2180                 return true;
2181
2182         if (!amdgpu_in_reset(adev))
2183                 return false;
2184
2185         /*
2186          * For all ASICs with baco/mode1 reset, the VRAM is
2187          * always assumed to be lost.
2188          */
2189         switch (amdgpu_asic_reset_method(adev)) {
2190         case AMD_RESET_METHOD_BACO:
2191         case AMD_RESET_METHOD_MODE1:
2192                 return true;
2193         default:
2194                 return false;
2195         }
2196 }
2197
2198 /**
2199  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2200  *
2201  * @adev: amdgpu_device pointer
2202  * @state: clockgating state (gate or ungate)
2203  *
2204  * The list of all the hardware IPs that make up the asic is walked and the
2205  * set_clockgating_state callbacks are run.
2206  * Late initialization pass enabling clockgating for hardware IPs.
2207  * Fini or suspend, pass disabling clockgating for hardware IPs.
2208  * Returns 0 on success, negative error code on failure.
2209  */
2210
2211 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2212                                                 enum amd_clockgating_state state)
2213 {
2214         int i, j, r;
2215
2216         if (amdgpu_emu_mode == 1)
2217                 return 0;
2218
2219         for (j = 0; j < adev->num_ip_blocks; j++) {
2220                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2221                 if (!adev->ip_blocks[i].status.late_initialized)
2222                         continue;
2223                 /* skip CG for VCE/UVD, it's handled specially */
2224                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2225                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2226                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2227                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2228                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2229                         /* enable clockgating to save power */
2230                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2231                                                                                      state);
2232                         if (r) {
2233                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2234                                           adev->ip_blocks[i].version->funcs->name, r);
2235                                 return r;
2236                         }
2237                 }
2238         }
2239
2240         return 0;
2241 }
2242
2243 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2244 {
2245         int i, j, r;
2246
2247         if (amdgpu_emu_mode == 1)
2248                 return 0;
2249
2250         for (j = 0; j < adev->num_ip_blocks; j++) {
2251                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2252                 if (!adev->ip_blocks[i].status.late_initialized)
2253                         continue;
2254                 /* skip CG for VCE/UVD, it's handled specially */
2255                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2256                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2257                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2258                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2259                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2260                         /* enable powergating to save power */
2261                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2262                                                                                         state);
2263                         if (r) {
2264                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2265                                           adev->ip_blocks[i].version->funcs->name, r);
2266                                 return r;
2267                         }
2268                 }
2269         }
2270         return 0;
2271 }
2272
2273 static int amdgpu_device_enable_mgpu_fan_boost(void)
2274 {
2275         struct amdgpu_gpu_instance *gpu_ins;
2276         struct amdgpu_device *adev;
2277         int i, ret = 0;
2278
2279         mutex_lock(&mgpu_info.mutex);
2280
2281         /*
2282          * MGPU fan boost feature should be enabled
2283          * only when there are two or more dGPUs in
2284          * the system
2285          */
2286         if (mgpu_info.num_dgpu < 2)
2287                 goto out;
2288
2289         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2290                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2291                 adev = gpu_ins->adev;
2292                 if (!(adev->flags & AMD_IS_APU) &&
2293                     !gpu_ins->mgpu_fan_enabled) {
2294                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2295                         if (ret)
2296                                 break;
2297
2298                         gpu_ins->mgpu_fan_enabled = 1;
2299                 }
2300         }
2301
2302 out:
2303         mutex_unlock(&mgpu_info.mutex);
2304
2305         return ret;
2306 }
2307
2308 /**
2309  * amdgpu_device_ip_late_init - run late init for hardware IPs
2310  *
2311  * @adev: amdgpu_device pointer
2312  *
2313  * Late initialization pass for hardware IPs.  The list of all the hardware
2314  * IPs that make up the asic is walked and the late_init callbacks are run.
2315  * late_init covers any special initialization that an IP requires
2316  * after all of the have been initialized or something that needs to happen
2317  * late in the init process.
2318  * Returns 0 on success, negative error code on failure.
2319  */
2320 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2321 {
2322         struct amdgpu_gpu_instance *gpu_instance;
2323         int i = 0, r;
2324
2325         for (i = 0; i < adev->num_ip_blocks; i++) {
2326                 if (!adev->ip_blocks[i].status.hw)
2327                         continue;
2328                 if (adev->ip_blocks[i].version->funcs->late_init) {
2329                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2330                         if (r) {
2331                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2332                                           adev->ip_blocks[i].version->funcs->name, r);
2333                                 return r;
2334                         }
2335                 }
2336                 adev->ip_blocks[i].status.late_initialized = true;
2337         }
2338
2339         amdgpu_ras_set_error_query_ready(adev, true);
2340
2341         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2342         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2343
2344         amdgpu_device_fill_reset_magic(adev);
2345
2346         r = amdgpu_device_enable_mgpu_fan_boost();
2347         if (r)
2348                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2349
2350
2351         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2352                 mutex_lock(&mgpu_info.mutex);
2353
2354                 /*
2355                  * Reset device p-state to low as this was booted with high.
2356                  *
2357                  * This should be performed only after all devices from the same
2358                  * hive get initialized.
2359                  *
2360                  * However, it's unknown how many device in the hive in advance.
2361                  * As this is counted one by one during devices initializations.
2362                  *
2363                  * So, we wait for all XGMI interlinked devices initialized.
2364                  * This may bring some delays as those devices may come from
2365                  * different hives. But that should be OK.
2366                  */
2367                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2368                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2369                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2370                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2371                                         continue;
2372
2373                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2374                                                 AMDGPU_XGMI_PSTATE_MIN);
2375                                 if (r) {
2376                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2377                                         break;
2378                                 }
2379                         }
2380                 }
2381
2382                 mutex_unlock(&mgpu_info.mutex);
2383         }
2384
2385         return 0;
2386 }
2387
2388 /**
2389  * amdgpu_device_ip_fini - run fini for hardware IPs
2390  *
2391  * @adev: amdgpu_device pointer
2392  *
2393  * Main teardown pass for hardware IPs.  The list of all the hardware
2394  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2395  * are run.  hw_fini tears down the hardware associated with each IP
2396  * and sw_fini tears down any software state associated with each IP.
2397  * Returns 0 on success, negative error code on failure.
2398  */
2399 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2400 {
2401         int i, r;
2402
2403         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2404                 amdgpu_virt_release_ras_err_handler_data(adev);
2405
2406         amdgpu_ras_pre_fini(adev);
2407
2408         if (adev->gmc.xgmi.num_physical_nodes > 1)
2409                 amdgpu_xgmi_remove_device(adev);
2410
2411         amdgpu_amdkfd_device_fini(adev);
2412
2413         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2414         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2415
2416         /* need to disable SMC first */
2417         for (i = 0; i < adev->num_ip_blocks; i++) {
2418                 if (!adev->ip_blocks[i].status.hw)
2419                         continue;
2420                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2421                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2422                         /* XXX handle errors */
2423                         if (r) {
2424                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2425                                           adev->ip_blocks[i].version->funcs->name, r);
2426                         }
2427                         adev->ip_blocks[i].status.hw = false;
2428                         break;
2429                 }
2430         }
2431
2432         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2433                 if (!adev->ip_blocks[i].status.hw)
2434                         continue;
2435
2436                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2437                 /* XXX handle errors */
2438                 if (r) {
2439                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2440                                   adev->ip_blocks[i].version->funcs->name, r);
2441                 }
2442
2443                 adev->ip_blocks[i].status.hw = false;
2444         }
2445
2446
2447         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2448                 if (!adev->ip_blocks[i].status.sw)
2449                         continue;
2450
2451                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2452                         amdgpu_ucode_free_bo(adev);
2453                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2454                         amdgpu_device_wb_fini(adev);
2455                         amdgpu_device_vram_scratch_fini(adev);
2456                         amdgpu_ib_pool_fini(adev);
2457                 }
2458
2459                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2460                 /* XXX handle errors */
2461                 if (r) {
2462                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2463                                   adev->ip_blocks[i].version->funcs->name, r);
2464                 }
2465                 adev->ip_blocks[i].status.sw = false;
2466                 adev->ip_blocks[i].status.valid = false;
2467         }
2468
2469         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2470                 if (!adev->ip_blocks[i].status.late_initialized)
2471                         continue;
2472                 if (adev->ip_blocks[i].version->funcs->late_fini)
2473                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2474                 adev->ip_blocks[i].status.late_initialized = false;
2475         }
2476
2477         amdgpu_ras_fini(adev);
2478
2479         if (amdgpu_sriov_vf(adev))
2480                 if (amdgpu_virt_release_full_gpu(adev, false))
2481                         DRM_ERROR("failed to release exclusive mode on fini\n");
2482
2483         return 0;
2484 }
2485
2486 /**
2487  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2488  *
2489  * @work: work_struct.
2490  */
2491 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2492 {
2493         struct amdgpu_device *adev =
2494                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2495         int r;
2496
2497         r = amdgpu_ib_ring_tests(adev);
2498         if (r)
2499                 DRM_ERROR("ib ring test failed (%d).\n", r);
2500 }
2501
2502 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2503 {
2504         struct amdgpu_device *adev =
2505                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2506
2507         mutex_lock(&adev->gfx.gfx_off_mutex);
2508         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2509                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2510                         adev->gfx.gfx_off_state = true;
2511         }
2512         mutex_unlock(&adev->gfx.gfx_off_mutex);
2513 }
2514
2515 /**
2516  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2517  *
2518  * @adev: amdgpu_device pointer
2519  *
2520  * Main suspend function for hardware IPs.  The list of all the hardware
2521  * IPs that make up the asic is walked, clockgating is disabled and the
2522  * suspend callbacks are run.  suspend puts the hardware and software state
2523  * in each IP into a state suitable for suspend.
2524  * Returns 0 on success, negative error code on failure.
2525  */
2526 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2527 {
2528         int i, r;
2529
2530         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2531         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2532
2533         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2534                 if (!adev->ip_blocks[i].status.valid)
2535                         continue;
2536
2537                 /* displays are handled separately */
2538                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2539                         continue;
2540
2541                 /* XXX handle errors */
2542                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2543                 /* XXX handle errors */
2544                 if (r) {
2545                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2546                                   adev->ip_blocks[i].version->funcs->name, r);
2547                         return r;
2548                 }
2549
2550                 adev->ip_blocks[i].status.hw = false;
2551         }
2552
2553         return 0;
2554 }
2555
2556 /**
2557  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2558  *
2559  * @adev: amdgpu_device pointer
2560  *
2561  * Main suspend function for hardware IPs.  The list of all the hardware
2562  * IPs that make up the asic is walked, clockgating is disabled and the
2563  * suspend callbacks are run.  suspend puts the hardware and software state
2564  * in each IP into a state suitable for suspend.
2565  * Returns 0 on success, negative error code on failure.
2566  */
2567 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2568 {
2569         int i, r;
2570
2571         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2572                 if (!adev->ip_blocks[i].status.valid)
2573                         continue;
2574                 /* displays are handled in phase1 */
2575                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2576                         continue;
2577                 /* PSP lost connection when err_event_athub occurs */
2578                 if (amdgpu_ras_intr_triggered() &&
2579                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2580                         adev->ip_blocks[i].status.hw = false;
2581                         continue;
2582                 }
2583                 /* XXX handle errors */
2584                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2585                 /* XXX handle errors */
2586                 if (r) {
2587                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2588                                   adev->ip_blocks[i].version->funcs->name, r);
2589                 }
2590                 adev->ip_blocks[i].status.hw = false;
2591                 /* handle putting the SMC in the appropriate state */
2592                 if(!amdgpu_sriov_vf(adev)){
2593                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2594                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2595                                 if (r) {
2596                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2597                                                         adev->mp1_state, r);
2598                                         return r;
2599                                 }
2600                         }
2601                 }
2602                 adev->ip_blocks[i].status.hw = false;
2603         }
2604
2605         return 0;
2606 }
2607
2608 /**
2609  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2610  *
2611  * @adev: amdgpu_device pointer
2612  *
2613  * Main suspend function for hardware IPs.  The list of all the hardware
2614  * IPs that make up the asic is walked, clockgating is disabled and the
2615  * suspend callbacks are run.  suspend puts the hardware and software state
2616  * in each IP into a state suitable for suspend.
2617  * Returns 0 on success, negative error code on failure.
2618  */
2619 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2620 {
2621         int r;
2622
2623         if (amdgpu_sriov_vf(adev))
2624                 amdgpu_virt_request_full_gpu(adev, false);
2625
2626         r = amdgpu_device_ip_suspend_phase1(adev);
2627         if (r)
2628                 return r;
2629         r = amdgpu_device_ip_suspend_phase2(adev);
2630
2631         if (amdgpu_sriov_vf(adev))
2632                 amdgpu_virt_release_full_gpu(adev, false);
2633
2634         return r;
2635 }
2636
2637 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2638 {
2639         int i, r;
2640
2641         static enum amd_ip_block_type ip_order[] = {
2642                 AMD_IP_BLOCK_TYPE_GMC,
2643                 AMD_IP_BLOCK_TYPE_COMMON,
2644                 AMD_IP_BLOCK_TYPE_PSP,
2645                 AMD_IP_BLOCK_TYPE_IH,
2646         };
2647
2648         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2649                 int j;
2650                 struct amdgpu_ip_block *block;
2651
2652                 block = &adev->ip_blocks[i];
2653                 block->status.hw = false;
2654
2655                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2656
2657                         if (block->version->type != ip_order[j] ||
2658                                 !block->status.valid)
2659                                 continue;
2660
2661                         r = block->version->funcs->hw_init(adev);
2662                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2663                         if (r)
2664                                 return r;
2665                         block->status.hw = true;
2666                 }
2667         }
2668
2669         return 0;
2670 }
2671
2672 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2673 {
2674         int i, r;
2675
2676         static enum amd_ip_block_type ip_order[] = {
2677                 AMD_IP_BLOCK_TYPE_SMC,
2678                 AMD_IP_BLOCK_TYPE_DCE,
2679                 AMD_IP_BLOCK_TYPE_GFX,
2680                 AMD_IP_BLOCK_TYPE_SDMA,
2681                 AMD_IP_BLOCK_TYPE_UVD,
2682                 AMD_IP_BLOCK_TYPE_VCE,
2683                 AMD_IP_BLOCK_TYPE_VCN
2684         };
2685
2686         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2687                 int j;
2688                 struct amdgpu_ip_block *block;
2689
2690                 for (j = 0; j < adev->num_ip_blocks; j++) {
2691                         block = &adev->ip_blocks[j];
2692
2693                         if (block->version->type != ip_order[i] ||
2694                                 !block->status.valid ||
2695                                 block->status.hw)
2696                                 continue;
2697
2698                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2699                                 r = block->version->funcs->resume(adev);
2700                         else
2701                                 r = block->version->funcs->hw_init(adev);
2702
2703                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2704                         if (r)
2705                                 return r;
2706                         block->status.hw = true;
2707                 }
2708         }
2709
2710         return 0;
2711 }
2712
2713 /**
2714  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2715  *
2716  * @adev: amdgpu_device pointer
2717  *
2718  * First resume function for hardware IPs.  The list of all the hardware
2719  * IPs that make up the asic is walked and the resume callbacks are run for
2720  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2721  * after a suspend and updates the software state as necessary.  This
2722  * function is also used for restoring the GPU after a GPU reset.
2723  * Returns 0 on success, negative error code on failure.
2724  */
2725 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2726 {
2727         int i, r;
2728
2729         for (i = 0; i < adev->num_ip_blocks; i++) {
2730                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2731                         continue;
2732                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2733                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2734                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2735
2736                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2737                         if (r) {
2738                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2739                                           adev->ip_blocks[i].version->funcs->name, r);
2740                                 return r;
2741                         }
2742                         adev->ip_blocks[i].status.hw = true;
2743                 }
2744         }
2745
2746         return 0;
2747 }
2748
2749 /**
2750  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2751  *
2752  * @adev: amdgpu_device pointer
2753  *
2754  * First resume function for hardware IPs.  The list of all the hardware
2755  * IPs that make up the asic is walked and the resume callbacks are run for
2756  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2757  * functional state after a suspend and updates the software state as
2758  * necessary.  This function is also used for restoring the GPU after a GPU
2759  * reset.
2760  * Returns 0 on success, negative error code on failure.
2761  */
2762 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2763 {
2764         int i, r;
2765
2766         for (i = 0; i < adev->num_ip_blocks; i++) {
2767                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2768                         continue;
2769                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2770                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2771                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2772                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2773                         continue;
2774                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2775                 if (r) {
2776                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2777                                   adev->ip_blocks[i].version->funcs->name, r);
2778                         return r;
2779                 }
2780                 adev->ip_blocks[i].status.hw = true;
2781         }
2782
2783         return 0;
2784 }
2785
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The resume work is split into two
 * phase functions because they are also used when recovering from a GPU
 * reset, where additional steps need to be taken between them.  In this
 * case (S3/S4) they are run sequentially, with firmware loading in between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
        int ret;

        ret = amdgpu_device_ip_resume_phase1(adev);
        if (!ret)
                ret = amdgpu_device_fw_loading(adev);
        if (!ret)
                ret = amdgpu_device_ip_resume_phase2(adev);

        return ret;
}
2814
2815 /**
2816  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2817  *
2818  * @adev: amdgpu_device pointer
2819  *
2820  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2821  */
2822 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2823 {
2824         if (amdgpu_sriov_vf(adev)) {
2825                 if (adev->is_atom_fw) {
2826                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2827                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2828                 } else {
2829                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2830                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2831                 }
2832
2833                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2834                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2835         }
2836 }
2837
2838 /**
2839  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2840  *
2841  * @asic_type: AMD asic type
2842  *
2843  * Check if there is DC (new modesetting infrastructre) support for an asic.
2844  * returns true if DC has support, false if not.
2845  */
2846 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2847 {
2848         switch (asic_type) {
2849 #if defined(CONFIG_DRM_AMD_DC)
2850 #if defined(CONFIG_DRM_AMD_DC_SI)
2851         case CHIP_TAHITI:
2852         case CHIP_PITCAIRN:
2853         case CHIP_VERDE:
2854         case CHIP_OLAND:
2855 #endif
2856         case CHIP_BONAIRE:
2857         case CHIP_KAVERI:
2858         case CHIP_KABINI:
2859         case CHIP_MULLINS:
2860                 /*
2861                  * We have systems in the wild with these ASICs that require
2862                  * LVDS and VGA support which is not supported with DC.
2863                  *
2864                  * Fallback to the non-DC driver here by default so as not to
2865                  * cause regressions.
2866                  */
2867                 return amdgpu_dc > 0;
2868         case CHIP_HAWAII:
2869         case CHIP_CARRIZO:
2870         case CHIP_STONEY:
2871         case CHIP_POLARIS10:
2872         case CHIP_POLARIS11:
2873         case CHIP_POLARIS12:
2874         case CHIP_VEGAM:
2875         case CHIP_TONGA:
2876         case CHIP_FIJI:
2877         case CHIP_VEGA10:
2878         case CHIP_VEGA12:
2879         case CHIP_VEGA20:
2880 #if defined(CONFIG_DRM_AMD_DC_DCN)
2881         case CHIP_RAVEN:
2882         case CHIP_NAVI10:
2883         case CHIP_NAVI14:
2884         case CHIP_NAVI12:
2885         case CHIP_RENOIR:
2886 #endif
2887 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2888         case CHIP_SIENNA_CICHLID:
2889         case CHIP_NAVY_FLOUNDER:
2890 #endif
2891                 return amdgpu_dc != 0;
2892 #endif
2893         default:
2894                 if (amdgpu_dc > 0)
2895                         DRM_INFO("Display Core has been requested via kernel parameter "
2896                                          "but isn't supported by ASIC, ignoring\n");
2897                 return false;
2898         }
2899 }
2900
2901 /**
2902  * amdgpu_device_has_dc_support - check if dc is supported
2903  *
2904  * @adev: amdgpu_device_pointer
2905  *
2906  * Returns true for supported, false for not supported
2907  */
2908 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2909 {
2910         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2911                 return false;
2912
2913         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2914 }
2915
2916
2917 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2918 {
2919         struct amdgpu_device *adev =
2920                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2921         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2922
2923         /* It's a bug to not have a hive within this function */
2924         if (WARN_ON(!hive))
2925                 return;
2926
2927         /*
2928          * Use task barrier to synchronize all xgmi reset works across the
2929          * hive. task_barrier_enter and task_barrier_exit will block
2930          * until all the threads running the xgmi reset works reach
2931          * those points. task_barrier_full will do both blocks.
2932          */
2933         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2934
2935                 task_barrier_enter(&hive->tb);
2936                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
2937
2938                 if (adev->asic_reset_res)
2939                         goto fail;
2940
2941                 task_barrier_exit(&hive->tb);
2942                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
2943
2944                 if (adev->asic_reset_res)
2945                         goto fail;
2946
2947                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2948                         adev->mmhub.funcs->reset_ras_error_count(adev);
2949         } else {
2950
2951                 task_barrier_full(&hive->tb);
2952                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2953         }
2954
2955 fail:
2956         if (adev->asic_reset_res)
2957                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2958                          adev->asic_reset_res, adev_to_drm(adev)->unique);
2959         amdgpu_put_xgmi_hive(hive);
2960 }
2961
2962 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2963 {
2964         char *input = amdgpu_lockup_timeout;
2965         char *timeout_setting = NULL;
2966         int index = 0;
2967         long timeout;
2968         int ret = 0;
2969
2970         /*
2971          * By default timeout for non compute jobs is 10000.
2972          * And there is no timeout enforced on compute jobs.
2973          * In SR-IOV or passthrough mode, timeout for compute
2974          * jobs are 60000 by default.
2975          */
2976         adev->gfx_timeout = msecs_to_jiffies(10000);
2977         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2978         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2979                 adev->compute_timeout =  msecs_to_jiffies(60000);
2980         else
2981                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2982
2983         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2984                 while ((timeout_setting = strsep(&input, ",")) &&
2985                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2986                         ret = kstrtol(timeout_setting, 0, &timeout);
2987                         if (ret)
2988                                 return ret;
2989
2990                         if (timeout == 0) {
2991                                 index++;
2992                                 continue;
2993                         } else if (timeout < 0) {
2994                                 timeout = MAX_SCHEDULE_TIMEOUT;
2995                         } else {
2996                                 timeout = msecs_to_jiffies(timeout);
2997                         }
2998
2999                         switch (index++) {
3000                         case 0:
3001                                 adev->gfx_timeout = timeout;
3002                                 break;
3003                         case 1:
3004                                 adev->compute_timeout = timeout;
3005                                 break;
3006                         case 2:
3007                                 adev->sdma_timeout = timeout;
3008                                 break;
3009                         case 3:
3010                                 adev->video_timeout = timeout;
3011                                 break;
3012                         default:
3013                                 break;
3014                         }
3015                 }
3016                 /*
3017                  * There is only one value specified and
3018                  * it should apply to all non-compute jobs.
3019                  */
3020                 if (index == 1) {
3021                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3022                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3023                                 adev->compute_timeout = adev->gfx_timeout;
3024                 }
3025         }
3026
3027         return ret;
3028 }
3029
/*
 * Attribute list built from the dev_attr_* entries for product name,
 * product number, serial number and PCIe replay count.  The list is
 * terminated by a NULL entry.
 */
static const struct attribute *amdgpu_dev_attributes[] = {
        &dev_attr_product_name.attr,
        &dev_attr_product_number.attr,
        &dev_attr_serial_number.attr,
        &dev_attr_pcie_replay_count.attr,
        NULL
};
3037
3038
3039 /**
3040  * amdgpu_device_init - initialize the driver
3041  *
3042  * @adev: amdgpu_device pointer
3043  * @flags: driver flags
3044  *
3045  * Initializes the driver info and hw (all asics).
3046  * Returns 0 for success or an error on failure.
3047  * Called at driver startup.
3048  */
3049 int amdgpu_device_init(struct amdgpu_device *adev,
3050                        uint32_t flags)
3051 {
3052         struct drm_device *ddev = adev_to_drm(adev);
3053         struct pci_dev *pdev = adev->pdev;
3054         int r, i;
3055         bool boco = false;
3056         u32 max_MBps;
3057
3058         adev->shutdown = false;
3059         adev->flags = flags;
3060
3061         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3062                 adev->asic_type = amdgpu_force_asic_type;
3063         else
3064                 adev->asic_type = flags & AMD_ASIC_MASK;
3065
3066         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3067         if (amdgpu_emu_mode == 1)
3068                 adev->usec_timeout *= 10;
3069         adev->gmc.gart_size = 512 * 1024 * 1024;
3070         adev->accel_working = false;
3071         adev->num_rings = 0;
3072         adev->mman.buffer_funcs = NULL;
3073         adev->mman.buffer_funcs_ring = NULL;
3074         adev->vm_manager.vm_pte_funcs = NULL;
3075         adev->vm_manager.vm_pte_num_scheds = 0;
3076         adev->gmc.gmc_funcs = NULL;
3077         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3078         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3079
3080         adev->smc_rreg = &amdgpu_invalid_rreg;
3081         adev->smc_wreg = &amdgpu_invalid_wreg;
3082         adev->pcie_rreg = &amdgpu_invalid_rreg;
3083         adev->pcie_wreg = &amdgpu_invalid_wreg;
3084         adev->pciep_rreg = &amdgpu_invalid_rreg;
3085         adev->pciep_wreg = &amdgpu_invalid_wreg;
3086         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3087         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3088         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3089         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3090         adev->didt_rreg = &amdgpu_invalid_rreg;
3091         adev->didt_wreg = &amdgpu_invalid_wreg;
3092         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3093         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3094         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3095         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3096
3097         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3098                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3099                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3100
3101         /* mutex initialization are all done here so we
3102          * can recall function without having locking issues */
3103         atomic_set(&adev->irq.ih.lock, 0);
3104         mutex_init(&adev->firmware.mutex);
3105         mutex_init(&adev->pm.mutex);
3106         mutex_init(&adev->gfx.gpu_clock_mutex);
3107         mutex_init(&adev->srbm_mutex);
3108         mutex_init(&adev->gfx.pipe_reserve_mutex);
3109         mutex_init(&adev->gfx.gfx_off_mutex);
3110         mutex_init(&adev->grbm_idx_mutex);
3111         mutex_init(&adev->mn_lock);
3112         mutex_init(&adev->virt.vf_errors.lock);
3113         hash_init(adev->mn_hash);
3114         atomic_set(&adev->in_gpu_reset, 0);
3115         init_rwsem(&adev->reset_sem);
3116         mutex_init(&adev->psp.mutex);
3117         mutex_init(&adev->notifier_lock);
3118
3119         r = amdgpu_device_check_arguments(adev);
3120         if (r)
3121                 return r;
3122
3123         spin_lock_init(&adev->mmio_idx_lock);
3124         spin_lock_init(&adev->smc_idx_lock);
3125         spin_lock_init(&adev->pcie_idx_lock);
3126         spin_lock_init(&adev->uvd_ctx_idx_lock);
3127         spin_lock_init(&adev->didt_idx_lock);
3128         spin_lock_init(&adev->gc_cac_idx_lock);
3129         spin_lock_init(&adev->se_cac_idx_lock);
3130         spin_lock_init(&adev->audio_endpt_idx_lock);
3131         spin_lock_init(&adev->mm_stats.lock);
3132
3133         INIT_LIST_HEAD(&adev->shadow_list);
3134         mutex_init(&adev->shadow_list_lock);
3135
3136         INIT_DELAYED_WORK(&adev->delayed_init_work,
3137                           amdgpu_device_delayed_init_work_handler);
3138         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3139                           amdgpu_device_delay_enable_gfx_off);
3140
3141         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3142
3143         adev->gfx.gfx_off_req_count = 1;
3144         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3145
3146         atomic_set(&adev->throttling_logging_enabled, 1);
3147         /*
3148          * If throttling continues, logging will be performed every minute
3149          * to avoid log flooding. "-1" is subtracted since the thermal
3150          * throttling interrupt comes every second. Thus, the total logging
3151          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3152          * for throttling interrupt) = 60 seconds.
3153          */
3154         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3155         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3156
3157         /* Registers mapping */
3158         /* TODO: block userspace mapping of io register */
3159         if (adev->asic_type >= CHIP_BONAIRE) {
3160                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3161                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3162         } else {
3163                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3164                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3165         }
3166
3167         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3168         if (adev->rmmio == NULL) {
3169                 return -ENOMEM;
3170         }
3171         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3172         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3173
3174         /* io port mapping */
3175         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3176                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3177                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3178                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3179                         break;
3180                 }
3181         }
3182         if (adev->rio_mem == NULL)
3183                 DRM_INFO("PCI I/O BAR is not found.\n");
3184
3185         /* enable PCIE atomic ops */
3186         r = pci_enable_atomic_ops_to_root(adev->pdev,
3187                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3188                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3189         if (r) {
3190                 adev->have_atomics_support = false;
3191                 DRM_INFO("PCIE atomic ops is not supported\n");
3192         } else {
3193                 adev->have_atomics_support = true;
3194         }
3195
3196         amdgpu_device_get_pcie_info(adev);
3197
3198         if (amdgpu_mcbp)
3199                 DRM_INFO("MCBP is enabled\n");
3200
3201         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3202                 adev->enable_mes = true;
3203
3204         /* detect hw virtualization here */
3205         amdgpu_detect_virtualization(adev);
3206
3207         r = amdgpu_device_get_job_timeout_settings(adev);
3208         if (r) {
3209                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3210                 return r;
3211         }
3212
3213         /* early init functions */
3214         r = amdgpu_device_ip_early_init(adev);
3215         if (r)
3216                 return r;
3217
3218         /* doorbell bar mapping and doorbell index init*/
3219         amdgpu_device_doorbell_init(adev);
3220
3221         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3222         /* this will fail for cards that aren't VGA class devices, just
3223          * ignore it */
3224         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3225
3226         if (amdgpu_device_supports_boco(ddev))
3227                 boco = true;
3228         if (amdgpu_has_atpx() &&
3229             (amdgpu_is_atpx_hybrid() ||
3230              amdgpu_has_atpx_dgpu_power_cntl()) &&
3231             !pci_is_thunderbolt_attached(adev->pdev))
3232                 vga_switcheroo_register_client(adev->pdev,
3233                                                &amdgpu_switcheroo_ops, boco);
3234         if (boco)
3235                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3236
3237         if (amdgpu_emu_mode == 1) {
3238                 /* post the asic on emulation mode */
3239                 emu_soc_asic_init(adev);
3240                 goto fence_driver_init;
3241         }
3242
3243         /* detect if we are with an SRIOV vbios */
3244         amdgpu_device_detect_sriov_bios(adev);
3245
3246         /* check if we need to reset the asic
3247          *  E.g., driver was not cleanly unloaded previously, etc.
3248          */
3249         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3250                 r = amdgpu_asic_reset(adev);
3251                 if (r) {
3252                         dev_err(adev->dev, "asic reset on init failed\n");
3253                         goto failed;
3254                 }
3255         }
3256
3257         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3258
3259         /* Post card if necessary */
3260         if (amdgpu_device_need_post(adev)) {
3261                 if (!adev->bios) {
3262                         dev_err(adev->dev, "no vBIOS found\n");
3263                         r = -EINVAL;
3264                         goto failed;
3265                 }
3266                 DRM_INFO("GPU posting now...\n");
3267                 r = amdgpu_device_asic_init(adev);
3268                 if (r) {
3269                         dev_err(adev->dev, "gpu post error!\n");
3270                         goto failed;
3271                 }
3272         }
3273
3274         if (adev->is_atom_fw) {
3275                 /* Initialize clocks */
3276                 r = amdgpu_atomfirmware_get_clock_info(adev);
3277                 if (r) {
3278                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3279                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3280                         goto failed;
3281                 }
3282         } else {
3283                 /* Initialize clocks */
3284                 r = amdgpu_atombios_get_clock_info(adev);
3285                 if (r) {
3286                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3287                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3288                         goto failed;
3289                 }
3290                 /* init i2c buses */
3291                 if (!amdgpu_device_has_dc_support(adev))
3292                         amdgpu_atombios_i2c_init(adev);
3293         }
3294
3295 fence_driver_init:
3296         /* Fence driver */
3297         r = amdgpu_fence_driver_init(adev);
3298         if (r) {
3299                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3300                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3301                 goto failed;
3302         }
3303
3304         /* init the mode config */
3305         drm_mode_config_init(adev_to_drm(adev));
3306
3307         r = amdgpu_device_ip_init(adev);
3308         if (r) {
3309                 /* failed in exclusive mode due to timeout */
3310                 if (amdgpu_sriov_vf(adev) &&
3311                     !amdgpu_sriov_runtime(adev) &&
3312                     amdgpu_virt_mmio_blocked(adev) &&
3313                     !amdgpu_virt_wait_reset(adev)) {
3314                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3315                         /* Don't send request since VF is inactive. */
3316                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3317                         adev->virt.ops = NULL;
3318                         r = -EAGAIN;
3319                         goto failed;
3320                 }
3321                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3322                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3323                 goto failed;
3324         }
3325
3326         dev_info(adev->dev,
3327                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3328                         adev->gfx.config.max_shader_engines,
3329                         adev->gfx.config.max_sh_per_se,
3330                         adev->gfx.config.max_cu_per_sh,
3331                         adev->gfx.cu_info.number);
3332
3333         adev->accel_working = true;
3334
3335         amdgpu_vm_check_compute_bug(adev);
3336
3337         /* Initialize the buffer migration limit. */
3338         if (amdgpu_moverate >= 0)
3339                 max_MBps = amdgpu_moverate;
3340         else
3341                 max_MBps = 8; /* Allow 8 MB/s. */
3342         /* Get a log2 for easy divisions. */
3343         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3344
3345         amdgpu_fbdev_init(adev);
3346
3347         r = amdgpu_pm_sysfs_init(adev);
3348         if (r) {
3349                 adev->pm_sysfs_en = false;
3350                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3351         } else
3352                 adev->pm_sysfs_en = true;
3353
3354         r = amdgpu_ucode_sysfs_init(adev);
3355         if (r) {
3356                 adev->ucode_sysfs_en = false;
3357                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3358         } else
3359                 adev->ucode_sysfs_en = true;
3360
3361         if ((amdgpu_testing & 1)) {
3362                 if (adev->accel_working)
3363                         amdgpu_test_moves(adev);
3364                 else
3365                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3366         }
3367         if (amdgpu_benchmarking) {
3368                 if (adev->accel_working)
3369                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3370                 else
3371                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3372         }
3373
3374         /*
3375          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3376          * Otherwise the mgpu fan boost feature will be skipped due to the
3377          * gpu instance is counted less.
3378          */
3379         amdgpu_register_gpu_instance(adev);
3380
3381         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3382          * explicit gating rather than handling it automatically.
3383          */
3384         r = amdgpu_device_ip_late_init(adev);
3385         if (r) {
3386                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3387                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3388                 goto failed;
3389         }
3390
3391         /* must succeed. */
3392         amdgpu_ras_resume(adev);
3393
3394         queue_delayed_work(system_wq, &adev->delayed_init_work,
3395                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3396
3397         if (amdgpu_sriov_vf(adev))
3398                 flush_delayed_work(&adev->delayed_init_work);
3399
3400         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3401         if (r) {
3402                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3403                 return r;
3404         }
3405
3406         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3407                 r = amdgpu_pmu_init(adev);
3408         if (r)
3409                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3410
3411         return 0;
3412
3413 failed:
3414         amdgpu_vf_error_trans_all(adev);
3415         if (boco)
3416                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3417
3418         return r;
3419 }
3420
3421 /**
3422  * amdgpu_device_fini - tear down the driver
3423  *
3424  * @adev: amdgpu_device pointer
3425  *
3426  * Tear down the driver info (all asics).
3427  * Called at driver shutdown.
3428  */
3429 void amdgpu_device_fini(struct amdgpu_device *adev)
3430 {
3431         dev_info(adev->dev, "amdgpu: finishing device.\n");
3432         flush_delayed_work(&adev->delayed_init_work);
3433         adev->shutdown = true;
3434
3435         /* make sure IB test finished before entering exclusive mode
3436          * to avoid preemption on IB test
3437          * */
3438         if (amdgpu_sriov_vf(adev))
3439                 amdgpu_virt_request_full_gpu(adev, false);
3440
3441         /* disable all interrupts */
3442         amdgpu_irq_disable_all(adev);
3443         if (adev->mode_info.mode_config_initialized){
3444                 if (!amdgpu_device_has_dc_support(adev))
3445                         drm_helper_force_disable_all(adev_to_drm(adev));
3446                 else
3447                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3448         }
3449         amdgpu_fence_driver_fini(adev);
3450         if (adev->pm_sysfs_en)
3451                 amdgpu_pm_sysfs_fini(adev);
3452         amdgpu_fbdev_fini(adev);
3453         amdgpu_device_ip_fini(adev);
3454         release_firmware(adev->firmware.gpu_info_fw);
3455         adev->firmware.gpu_info_fw = NULL;
3456         adev->accel_working = false;
3457         /* free i2c buses */
3458         if (!amdgpu_device_has_dc_support(adev))
3459                 amdgpu_i2c_fini(adev);
3460
3461         if (amdgpu_emu_mode != 1)
3462                 amdgpu_atombios_fini(adev);
3463
3464         kfree(adev->bios);
3465         adev->bios = NULL;
3466         if (amdgpu_has_atpx() &&
3467             (amdgpu_is_atpx_hybrid() ||
3468              amdgpu_has_atpx_dgpu_power_cntl()) &&
3469             !pci_is_thunderbolt_attached(adev->pdev))
3470                 vga_switcheroo_unregister_client(adev->pdev);
3471         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3472                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3473         vga_client_register(adev->pdev, NULL, NULL, NULL);
3474         if (adev->rio_mem)
3475                 pci_iounmap(adev->pdev, adev->rio_mem);
3476         adev->rio_mem = NULL;
3477         iounmap(adev->rmmio);
3478         adev->rmmio = NULL;
3479         amdgpu_device_doorbell_fini(adev);
3480
3481         if (adev->ucode_sysfs_en)
3482                 amdgpu_ucode_sysfs_fini(adev);
3483
3484         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3485         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3486                 amdgpu_pmu_fini(adev);
3487         if (adev->mman.discovery_bin)
3488                 amdgpu_discovery_fini(adev);
3489 }
3490
3491
3492 /*
3493  * Suspend & resume.
3494  */
3495 /**
3496  * amdgpu_device_suspend - initiate device suspend
3497  *
3498  * @dev: drm dev pointer
3499  * @fbcon : notify the fbdev of suspend
3500  *
3501  * Puts the hw in the suspend state (all asics).
3502  * Returns 0 for success or an error on failure.
3503  * Called at driver suspend.
3504  */
3505 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3506 {
3507         struct amdgpu_device *adev;
3508         struct drm_crtc *crtc;
3509         struct drm_connector *connector;
3510         struct drm_connector_list_iter iter;
3511         int r;
3512
3513         adev = drm_to_adev(dev);
3514
3515         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3516                 return 0;
3517
3518         adev->in_suspend = true;
3519         drm_kms_helper_poll_disable(dev);
3520
3521         if (fbcon)
3522                 amdgpu_fbdev_set_suspend(adev, 1);
3523
3524         cancel_delayed_work_sync(&adev->delayed_init_work);
3525
3526         if (!amdgpu_device_has_dc_support(adev)) {
3527                 /* turn off display hw */
3528                 drm_modeset_lock_all(dev);
3529                 drm_connector_list_iter_begin(dev, &iter);
3530                 drm_for_each_connector_iter(connector, &iter)
3531                         drm_helper_connector_dpms(connector,
3532                                                   DRM_MODE_DPMS_OFF);
3533                 drm_connector_list_iter_end(&iter);
3534                 drm_modeset_unlock_all(dev);
3535                         /* unpin the front buffers and cursors */
3536                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3537                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3538                         struct drm_framebuffer *fb = crtc->primary->fb;
3539                         struct amdgpu_bo *robj;
3540
3541                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3542                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3543                                 r = amdgpu_bo_reserve(aobj, true);
3544                                 if (r == 0) {
3545                                         amdgpu_bo_unpin(aobj);
3546                                         amdgpu_bo_unreserve(aobj);
3547                                 }
3548                         }
3549
3550                         if (fb == NULL || fb->obj[0] == NULL) {
3551                                 continue;
3552                         }
3553                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3554                         /* don't unpin kernel fb objects */
3555                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3556                                 r = amdgpu_bo_reserve(robj, true);
3557                                 if (r == 0) {
3558                                         amdgpu_bo_unpin(robj);
3559                                         amdgpu_bo_unreserve(robj);
3560                                 }
3561                         }
3562                 }
3563         }
3564
3565         amdgpu_ras_suspend(adev);
3566
3567         r = amdgpu_device_ip_suspend_phase1(adev);
3568
3569         amdgpu_amdkfd_suspend(adev, !fbcon);
3570
3571         /* evict vram memory */
3572         amdgpu_bo_evict_vram(adev);
3573
3574         amdgpu_fence_driver_suspend(adev);
3575
3576         r = amdgpu_device_ip_suspend_phase2(adev);
3577
3578         /* evict remaining vram memory
3579          * This second call to evict vram is to evict the gart page table
3580          * using the CPU.
3581          */
3582         amdgpu_bo_evict_vram(adev);
3583
3584         return 0;
3585 }
3586
3587 /**
3588  * amdgpu_device_resume - initiate device resume
3589  *
3590  * @dev: drm dev pointer
3591  * @fbcon : notify the fbdev of resume
3592  *
3593  * Bring the hw back to operating state (all asics).
3594  * Returns 0 for success or an error on failure.
3595  * Called at driver resume.
3596  */
3597 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3598 {
3599         struct drm_connector *connector;
3600         struct drm_connector_list_iter iter;
3601         struct amdgpu_device *adev = drm_to_adev(dev);
3602         struct drm_crtc *crtc;
3603         int r = 0;
3604
3605         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3606                 return 0;
3607
3608         /* post card */
3609         if (amdgpu_device_need_post(adev)) {
3610                 r = amdgpu_device_asic_init(adev);
3611                 if (r)
3612                         dev_err(adev->dev, "amdgpu asic init failed\n");
3613         }
3614
3615         r = amdgpu_device_ip_resume(adev);
3616         if (r) {
3617                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3618                 return r;
3619         }
3620         amdgpu_fence_driver_resume(adev);
3621
3622
3623         r = amdgpu_device_ip_late_init(adev);
3624         if (r)
3625                 return r;
3626
3627         queue_delayed_work(system_wq, &adev->delayed_init_work,
3628                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3629
3630         if (!amdgpu_device_has_dc_support(adev)) {
3631                 /* pin cursors */
3632                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3633                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3634
3635                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3636                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3637                                 r = amdgpu_bo_reserve(aobj, true);
3638                                 if (r == 0) {
3639                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3640                                         if (r != 0)
3641                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3642                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3643                                         amdgpu_bo_unreserve(aobj);
3644                                 }
3645                         }
3646                 }
3647         }
3648         r = amdgpu_amdkfd_resume(adev, !fbcon);
3649         if (r)
3650                 return r;
3651
3652         /* Make sure IB tests flushed */
3653         flush_delayed_work(&adev->delayed_init_work);
3654
3655         /* blat the mode back in */
3656         if (fbcon) {
3657                 if (!amdgpu_device_has_dc_support(adev)) {
3658                         /* pre DCE11 */
3659                         drm_helper_resume_force_mode(dev);
3660
3661                         /* turn on display hw */
3662                         drm_modeset_lock_all(dev);
3663
3664                         drm_connector_list_iter_begin(dev, &iter);
3665                         drm_for_each_connector_iter(connector, &iter)
3666                                 drm_helper_connector_dpms(connector,
3667                                                           DRM_MODE_DPMS_ON);
3668                         drm_connector_list_iter_end(&iter);
3669
3670                         drm_modeset_unlock_all(dev);
3671                 }
3672                 amdgpu_fbdev_set_suspend(adev, 0);
3673         }
3674
3675         drm_kms_helper_poll_enable(dev);
3676
3677         amdgpu_ras_resume(adev);
3678
3679         /*
3680          * Most of the connector probing functions try to acquire runtime pm
3681          * refs to ensure that the GPU is powered on when connector polling is
3682          * performed. Since we're calling this from a runtime PM callback,
3683          * trying to acquire rpm refs will cause us to deadlock.
3684          *
3685          * Since we're guaranteed to be holding the rpm lock, it's safe to
3686          * temporarily disable the rpm helpers so this doesn't deadlock us.
3687          */
3688 #ifdef CONFIG_PM
3689         dev->dev->power.disable_depth++;
3690 #endif
3691         if (!amdgpu_device_has_dc_support(adev))
3692                 drm_helper_hpd_irq_event(dev);
3693         else
3694                 drm_kms_helper_hotplug_event(dev);
3695 #ifdef CONFIG_PM
3696         dev->dev->power.disable_depth--;
3697 #endif
3698         adev->in_suspend = false;
3699
3700         return 0;
3701 }
3702
3703 /**
3704  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3705  *
3706  * @adev: amdgpu_device pointer
3707  *
3708  * The list of all the hardware IPs that make up the asic is walked and
3709  * the check_soft_reset callbacks are run.  check_soft_reset determines
3710  * if the asic is still hung or not.
3711  * Returns true if any of the IPs are still in a hung state, false if not.
3712  */
3713 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3714 {
3715         int i;
3716         bool asic_hang = false;
3717
3718         if (amdgpu_sriov_vf(adev))
3719                 return true;
3720
3721         if (amdgpu_asic_need_full_reset(adev))
3722                 return true;
3723
3724         for (i = 0; i < adev->num_ip_blocks; i++) {
3725                 if (!adev->ip_blocks[i].status.valid)
3726                         continue;
3727                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3728                         adev->ip_blocks[i].status.hang =
3729                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3730                 if (adev->ip_blocks[i].status.hang) {
3731                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3732                         asic_hang = true;
3733                 }
3734         }
3735         return asic_hang;
3736 }
3737
3738 /**
3739  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3740  *
3741  * @adev: amdgpu_device pointer
3742  *
3743  * The list of all the hardware IPs that make up the asic is walked and the
3744  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3745  * handles any IP specific hardware or software state changes that are
3746  * necessary for a soft reset to succeed.
3747  * Returns 0 on success, negative error code on failure.
3748  */
3749 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3750 {
3751         int i, r = 0;
3752
3753         for (i = 0; i < adev->num_ip_blocks; i++) {
3754                 if (!adev->ip_blocks[i].status.valid)
3755                         continue;
3756                 if (adev->ip_blocks[i].status.hang &&
3757                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3758                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3759                         if (r)
3760                                 return r;
3761                 }
3762         }
3763
3764         return 0;
3765 }
3766
3767 /**
3768  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3769  *
3770  * @adev: amdgpu_device pointer
3771  *
3772  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3773  * reset is necessary to recover.
3774  * Returns true if a full asic reset is required, false if not.
3775  */
3776 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3777 {
3778         int i;
3779
3780         if (amdgpu_asic_need_full_reset(adev))
3781                 return true;
3782
3783         for (i = 0; i < adev->num_ip_blocks; i++) {
3784                 if (!adev->ip_blocks[i].status.valid)
3785                         continue;
3786                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3787                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3788                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3789                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3790                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3791                         if (adev->ip_blocks[i].status.hang) {
3792                                 dev_info(adev->dev, "Some block need full reset!\n");
3793                                 return true;
3794                         }
3795                 }
3796         }
3797         return false;
3798 }
3799
3800 /**
3801  * amdgpu_device_ip_soft_reset - do a soft reset
3802  *
3803  * @adev: amdgpu_device pointer
3804  *
3805  * The list of all the hardware IPs that make up the asic is walked and the
3806  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3807  * IP specific hardware or software state changes that are necessary to soft
3808  * reset the IP.
3809  * Returns 0 on success, negative error code on failure.
3810  */
3811 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3812 {
3813         int i, r = 0;
3814
3815         for (i = 0; i < adev->num_ip_blocks; i++) {
3816                 if (!adev->ip_blocks[i].status.valid)
3817                         continue;
3818                 if (adev->ip_blocks[i].status.hang &&
3819                     adev->ip_blocks[i].version->funcs->soft_reset) {
3820                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3821                         if (r)
3822                                 return r;
3823                 }
3824         }
3825
3826         return 0;
3827 }
3828
3829 /**
3830  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3831  *
3832  * @adev: amdgpu_device pointer
3833  *
3834  * The list of all the hardware IPs that make up the asic is walked and the
3835  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3836  * handles any IP specific hardware or software state changes that are
3837  * necessary after the IP has been soft reset.
3838  * Returns 0 on success, negative error code on failure.
3839  */
3840 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3841 {
3842         int i, r = 0;
3843
3844         for (i = 0; i < adev->num_ip_blocks; i++) {
3845                 if (!adev->ip_blocks[i].status.valid)
3846                         continue;
3847                 if (adev->ip_blocks[i].status.hang &&
3848                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3849                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3850                 if (r)
3851                         return r;
3852         }
3853
3854         return 0;
3855 }
3856
3857 /**
3858  * amdgpu_device_recover_vram - Recover some VRAM contents
3859  *
3860  * @adev: amdgpu_device pointer
3861  *
3862  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3863  * restore things like GPUVM page tables after a GPU reset where
3864  * the contents of VRAM might be lost.
3865  *
3866  * Returns:
3867  * 0 on success, negative error code on failure.
3868  */
3869 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3870 {
3871         struct dma_fence *fence = NULL, *next = NULL;
3872         struct amdgpu_bo *shadow;
3873         long r = 1, tmo;
3874
3875         if (amdgpu_sriov_runtime(adev))
3876                 tmo = msecs_to_jiffies(8000);
3877         else
3878                 tmo = msecs_to_jiffies(100);
3879
3880         dev_info(adev->dev, "recover vram bo from shadow start\n");
3881         mutex_lock(&adev->shadow_list_lock);
3882         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3883
3884                 /* No need to recover an evicted BO */
3885                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3886                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3887                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3888                         continue;
3889
3890                 r = amdgpu_bo_restore_shadow(shadow, &next);
3891                 if (r)
3892                         break;
3893
3894                 if (fence) {
3895                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3896                         dma_fence_put(fence);
3897                         fence = next;
3898                         if (tmo == 0) {
3899                                 r = -ETIMEDOUT;
3900                                 break;
3901                         } else if (tmo < 0) {
3902                                 r = tmo;
3903                                 break;
3904                         }
3905                 } else {
3906                         fence = next;
3907                 }
3908         }
3909         mutex_unlock(&adev->shadow_list_lock);
3910
3911         if (fence)
3912                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3913         dma_fence_put(fence);
3914
3915         if (r < 0 || tmo <= 0) {
3916                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3917                 return -EIO;
3918         }
3919
3920         dev_info(adev->dev, "recover vram bo from shadow done\n");
3921         return 0;
3922 }
3923
3924
3925 /**
3926  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3927  *
3928  * @adev: amdgpu device pointer
3929  * @from_hypervisor: request from hypervisor
3930  *
3931  * do VF FLR and reinitialize Asic
3932  * return 0 means succeeded otherwise failed
3933  */
3934 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3935                                      bool from_hypervisor)
3936 {
3937         int r;
3938
3939         if (from_hypervisor)
3940                 r = amdgpu_virt_request_full_gpu(adev, true);
3941         else
3942                 r = amdgpu_virt_reset_gpu(adev);
3943         if (r)
3944                 return r;
3945
3946         amdgpu_amdkfd_pre_reset(adev);
3947
3948         /* Resume IP prior to SMC */
3949         r = amdgpu_device_ip_reinit_early_sriov(adev);
3950         if (r)
3951                 goto error;
3952
3953         amdgpu_virt_init_data_exchange(adev);
3954         /* we need recover gart prior to run SMC/CP/SDMA resume */
3955         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
3956
3957         r = amdgpu_device_fw_loading(adev);
3958         if (r)
3959                 return r;
3960
3961         /* now we are okay to resume SMC/CP/SDMA */
3962         r = amdgpu_device_ip_reinit_late_sriov(adev);
3963         if (r)
3964                 goto error;
3965
3966         amdgpu_irq_gpu_reset_resume_helper(adev);
3967         r = amdgpu_ib_ring_tests(adev);
3968         amdgpu_amdkfd_post_reset(adev);
3969
3970 error:
3971         amdgpu_virt_release_full_gpu(adev, true);
3972         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3973                 amdgpu_inc_vram_lost(adev);
3974                 r = amdgpu_device_recover_vram(adev);
3975         }
3976
3977         return r;
3978 }
3979
3980 /**
3981  * amdgpu_device_has_job_running - check if there is any job in mirror list
3982  *
3983  * @adev: amdgpu device pointer
3984  *
3985  * check if there is any job in mirror list
3986  */
3987 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3988 {
3989         int i;
3990         struct drm_sched_job *job;
3991
3992         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3993                 struct amdgpu_ring *ring = adev->rings[i];
3994
3995                 if (!ring || !ring->sched.thread)
3996                         continue;
3997
3998                 spin_lock(&ring->sched.job_list_lock);
3999                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4000                                 struct drm_sched_job, node);
4001                 spin_unlock(&ring->sched.job_list_lock);
4002                 if (job)
4003                         return true;
4004         }
4005         return false;
4006 }
4007
4008 /**
4009  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4010  *
4011  * @adev: amdgpu device pointer
4012  *
4013  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4014  * a hung GPU.
4015  */
4016 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4017 {
4018         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4019                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4020                 return false;
4021         }
4022
4023         if (amdgpu_gpu_recovery == 0)
4024                 goto disabled;
4025
4026         if (amdgpu_sriov_vf(adev))
4027                 return true;
4028
4029         if (amdgpu_gpu_recovery == -1) {
4030                 switch (adev->asic_type) {
4031                 case CHIP_BONAIRE:
4032                 case CHIP_HAWAII:
4033                 case CHIP_TOPAZ:
4034                 case CHIP_TONGA:
4035                 case CHIP_FIJI:
4036                 case CHIP_POLARIS10:
4037                 case CHIP_POLARIS11:
4038                 case CHIP_POLARIS12:
4039                 case CHIP_VEGAM:
4040                 case CHIP_VEGA20:
4041                 case CHIP_VEGA10:
4042                 case CHIP_VEGA12:
4043                 case CHIP_RAVEN:
4044                 case CHIP_ARCTURUS:
4045                 case CHIP_RENOIR:
4046                 case CHIP_NAVI10:
4047                 case CHIP_NAVI14:
4048                 case CHIP_NAVI12:
4049                 case CHIP_SIENNA_CICHLID:
4050                         break;
4051                 default:
4052                         goto disabled;
4053                 }
4054         }
4055
4056         return true;
4057
4058 disabled:
4059                 dev_info(adev->dev, "GPU recovery disabled.\n");
4060                 return false;
4061 }
4062
4063
4064 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4065                                         struct amdgpu_job *job,
4066                                         bool *need_full_reset_arg)
4067 {
4068         int i, r = 0;
4069         bool need_full_reset  = *need_full_reset_arg;
4070
4071         amdgpu_debugfs_wait_dump(adev);
4072
4073         /* block all schedulers and reset given job's ring */
4074         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4075                 struct amdgpu_ring *ring = adev->rings[i];
4076
4077                 if (!ring || !ring->sched.thread)
4078                         continue;
4079
4080                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4081                 amdgpu_fence_driver_force_completion(ring);
4082         }
4083
4084         if(job)
4085                 drm_sched_increase_karma(&job->base);
4086
4087         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4088         if (!amdgpu_sriov_vf(adev)) {
4089
4090                 if (!need_full_reset)
4091                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4092
4093                 if (!need_full_reset) {
4094                         amdgpu_device_ip_pre_soft_reset(adev);
4095                         r = amdgpu_device_ip_soft_reset(adev);
4096                         amdgpu_device_ip_post_soft_reset(adev);
4097                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4098                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4099                                 need_full_reset = true;
4100                         }
4101                 }
4102
4103                 if (need_full_reset)
4104                         r = amdgpu_device_ip_suspend(adev);
4105
4106                 *need_full_reset_arg = need_full_reset;
4107         }
4108
4109         return r;
4110 }
4111
4112 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4113                                struct list_head *device_list_handle,
4114                                bool *need_full_reset_arg)
4115 {
4116         struct amdgpu_device *tmp_adev = NULL;
4117         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4118         int r = 0;
4119
4120         /*
4121          * ASIC reset has to be done on all HGMI hive nodes ASAP
4122          * to allow proper links negotiation in FW (within 1 sec)
4123          */
4124         if (need_full_reset) {
4125                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4126                         /* For XGMI run all resets in parallel to speed up the process */
4127                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4128                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4129                                         r = -EALREADY;
4130                         } else
4131                                 r = amdgpu_asic_reset(tmp_adev);
4132
4133                         if (r) {
4134                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4135                                          r, adev_to_drm(tmp_adev)->unique);
4136                                 break;
4137                         }
4138                 }
4139
4140                 /* For XGMI wait for all resets to complete before proceed */
4141                 if (!r) {
4142                         list_for_each_entry(tmp_adev, device_list_handle,
4143                                             gmc.xgmi.head) {
4144                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4145                                         flush_work(&tmp_adev->xgmi_reset_work);
4146                                         r = tmp_adev->asic_reset_res;
4147                                         if (r)
4148                                                 break;
4149                                 }
4150                         }
4151                 }
4152         }
4153
4154         if (!r && amdgpu_ras_intr_triggered()) {
4155                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4156                         if (tmp_adev->mmhub.funcs &&
4157                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4158                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4159                 }
4160
4161                 amdgpu_ras_intr_cleared();
4162         }
4163
4164         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4165                 if (need_full_reset) {
4166                         /* post card */
4167                         if (amdgpu_device_asic_init(tmp_adev))
4168                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4169
4170                         if (!r) {
4171                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4172                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4173                                 if (r)
4174                                         goto out;
4175
4176                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4177                                 if (vram_lost) {
4178                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4179                                         amdgpu_inc_vram_lost(tmp_adev);
4180                                 }
4181
4182                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4183                                 if (r)
4184                                         goto out;
4185
4186                                 r = amdgpu_device_fw_loading(tmp_adev);
4187                                 if (r)
4188                                         return r;
4189
4190                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4191                                 if (r)
4192                                         goto out;
4193
4194                                 if (vram_lost)
4195                                         amdgpu_device_fill_reset_magic(tmp_adev);
4196
4197                                 /*
4198                                  * Add this ASIC as tracked as reset was already
4199                                  * complete successfully.
4200                                  */
4201                                 amdgpu_register_gpu_instance(tmp_adev);
4202
4203                                 r = amdgpu_device_ip_late_init(tmp_adev);
4204                                 if (r)
4205                                         goto out;
4206
4207                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4208
4209                                 /*
4210                                  * The GPU enters bad state once faulty pages
4211                                  * by ECC has reached the threshold, and ras
4212                                  * recovery is scheduled next. So add one check
4213                                  * here to break recovery if it indeed exceeds
4214                                  * bad page threshold, and remind user to
4215                                  * retire this GPU or setting one bigger
4216                                  * bad_page_threshold value to fix this once
4217                                  * probing driver again.
4218                                  */
4219                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4220                                         /* must succeed. */
4221                                         amdgpu_ras_resume(tmp_adev);
4222                                 } else {
4223                                         r = -EINVAL;
4224                                         goto out;
4225                                 }
4226
4227                                 /* Update PSP FW topology after reset */
4228                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4229                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4230                         }
4231                 }
4232
4233 out:
4234                 if (!r) {
4235                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4236                         r = amdgpu_ib_ring_tests(tmp_adev);
4237                         if (r) {
4238                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4239                                 r = amdgpu_device_ip_suspend(tmp_adev);
4240                                 need_full_reset = true;
4241                                 r = -EAGAIN;
4242                                 goto end;
4243                         }
4244                 }
4245
4246                 if (!r)
4247                         r = amdgpu_device_recover_vram(tmp_adev);
4248                 else
4249                         tmp_adev->asic_reset_res = r;
4250         }
4251
4252 end:
4253         *need_full_reset_arg = need_full_reset;
4254         return r;
4255 }
4256
4257 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4258                                 struct amdgpu_hive_info *hive)
4259 {
4260         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4261                 return false;
4262
4263         if (hive) {
4264                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4265         } else {
4266                 down_write(&adev->reset_sem);
4267         }
4268
4269         atomic_inc(&adev->gpu_reset_counter);
4270         switch (amdgpu_asic_reset_method(adev)) {
4271         case AMD_RESET_METHOD_MODE1:
4272                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4273                 break;
4274         case AMD_RESET_METHOD_MODE2:
4275                 adev->mp1_state = PP_MP1_STATE_RESET;
4276                 break;
4277         default:
4278                 adev->mp1_state = PP_MP1_STATE_NONE;
4279                 break;
4280         }
4281
4282         return true;
4283 }
4284
4285 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4286 {
4287         amdgpu_vf_error_trans_all(adev);
4288         adev->mp1_state = PP_MP1_STATE_NONE;
4289         atomic_set(&adev->in_gpu_reset, 0);
4290         up_write(&adev->reset_sem);
4291 }
4292
4293 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4294 {
4295         struct pci_dev *p = NULL;
4296
4297         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4298                         adev->pdev->bus->number, 1);
4299         if (p) {
4300                 pm_runtime_enable(&(p->dev));
4301                 pm_runtime_resume(&(p->dev));
4302         }
4303 }
4304
4305 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4306 {
4307         enum amd_reset_method reset_method;
4308         struct pci_dev *p = NULL;
4309         u64 expires;
4310
4311         /*
4312          * For now, only BACO and mode1 reset are confirmed
4313          * to suffer the audio issue without proper suspended.
4314          */
4315         reset_method = amdgpu_asic_reset_method(adev);
4316         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4317              (reset_method != AMD_RESET_METHOD_MODE1))
4318                 return -EINVAL;
4319
4320         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4321                         adev->pdev->bus->number, 1);
4322         if (!p)
4323                 return -ENODEV;
4324
4325         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4326         if (!expires)
4327                 /*
4328                  * If we cannot get the audio device autosuspend delay,
4329                  * a fixed 4S interval will be used. Considering 3S is
4330                  * the audio controller default autosuspend delay setting.
4331                  * 4S used here is guaranteed to cover that.
4332                  */
4333                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4334
4335         while (!pm_runtime_status_suspended(&(p->dev))) {
4336                 if (!pm_runtime_suspend(&(p->dev)))
4337                         break;
4338
4339                 if (expires < ktime_get_mono_fast_ns()) {
4340                         dev_warn(adev->dev, "failed to suspend display audio\n");
4341                         /* TODO: abort the succeeding gpu reset? */
4342                         return -ETIMEDOUT;
4343                 }
4344         }
4345
4346         pm_runtime_disable(&(p->dev));
4347
4348         return 0;
4349 }
4350
4351 /**
4352  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4353  *
4354  * @adev: amdgpu device pointer
4355  * @job: which job trigger hang
4356  *
4357  * Attempt to reset the GPU if it has hung (all asics).
4358  * Attempt to do soft-reset or full-reset and reinitialize Asic
4359  * Returns 0 for success or an error on failure.
4360  */
4361
4362 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4363                               struct amdgpu_job *job)
4364 {
4365         struct list_head device_list, *device_list_handle =  NULL;
4366         bool need_full_reset = false;
4367         bool job_signaled = false;
4368         struct amdgpu_hive_info *hive = NULL;
4369         struct amdgpu_device *tmp_adev = NULL;
4370         int i, r = 0;
4371         bool need_emergency_restart = false;
4372         bool audio_suspended = false;
4373
4374         /**
4375          * Special case: RAS triggered and full reset isn't supported
4376          */
4377         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4378
4379         /*
4380          * Flush RAM to disk so that after reboot
4381          * the user can read log and see why the system rebooted.
4382          */
4383         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4384                 DRM_WARN("Emergency reboot.");
4385
4386                 ksys_sync_helper();
4387                 emergency_restart();
4388         }
4389
4390         dev_info(adev->dev, "GPU %s begin!\n",
4391                 need_emergency_restart ? "jobs stop":"reset");
4392
4393         /*
4394          * Here we trylock to avoid chain of resets executing from
4395          * either trigger by jobs on different adevs in XGMI hive or jobs on
4396          * different schedulers for same device while this TO handler is running.
4397          * We always reset all schedulers for device and all devices for XGMI
4398          * hive so that should take care of them too.
4399          */
4400         hive = amdgpu_get_xgmi_hive(adev);
4401         if (hive) {
4402                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4403                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4404                                 job ? job->base.id : -1, hive->hive_id);
4405                         amdgpu_put_xgmi_hive(hive);
4406                         return 0;
4407                 }
4408                 mutex_lock(&hive->hive_lock);
4409         }
4410
4411         /*
4412          * Build list of devices to reset.
4413          * In case we are in XGMI hive mode, resort the device list
4414          * to put adev in the 1st position.
4415          */
4416         INIT_LIST_HEAD(&device_list);
4417         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4418                 if (!hive)
4419                         return -ENODEV;
4420                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4421                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4422                 device_list_handle = &hive->device_list;
4423         } else {
4424                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4425                 device_list_handle = &device_list;
4426         }
4427
4428         /* block all schedulers and reset given job's ring */
4429         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4430                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4431                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4432                                   job ? job->base.id : -1);
4433                         r = 0;
4434                         goto skip_recovery;
4435                 }
4436
4437                 /*
4438                  * Try to put the audio codec into suspend state
4439                  * before gpu reset started.
4440                  *
4441                  * Due to the power domain of the graphics device
4442                  * is shared with AZ power domain. Without this,
4443                  * we may change the audio hardware from behind
4444                  * the audio driver's back. That will trigger
4445                  * some audio codec errors.
4446                  */
4447                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4448                         audio_suspended = true;
4449
4450                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4451
4452                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4453
4454                 if (!amdgpu_sriov_vf(tmp_adev))
4455                         amdgpu_amdkfd_pre_reset(tmp_adev);
4456
4457                 /*
4458                  * Mark these ASICs to be reseted as untracked first
4459                  * And add them back after reset completed
4460                  */
4461                 amdgpu_unregister_gpu_instance(tmp_adev);
4462
4463                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4464
4465                 /* disable ras on ALL IPs */
4466                 if (!need_emergency_restart &&
4467                       amdgpu_device_ip_need_full_reset(tmp_adev))
4468                         amdgpu_ras_suspend(tmp_adev);
4469
4470                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4471                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4472
4473                         if (!ring || !ring->sched.thread)
4474                                 continue;
4475
4476                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4477
4478                         if (need_emergency_restart)
4479                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4480                 }
4481         }
4482
4483         if (need_emergency_restart)
4484                 goto skip_sched_resume;
4485
4486         /*
4487          * Must check guilty signal here since after this point all old
4488          * HW fences are force signaled.
4489          *
4490          * job->base holds a reference to parent fence
4491          */
4492         if (job && job->base.s_fence->parent &&
4493             dma_fence_is_signaled(job->base.s_fence->parent)) {
4494                 job_signaled = true;
4495                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4496                 goto skip_hw_reset;
4497         }
4498
4499 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4500         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4501                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4502                                                  NULL,
4503                                                  &need_full_reset);
4504                 /*TODO Should we stop ?*/
4505                 if (r) {
4506                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4507                                   r, adev_to_drm(tmp_adev)->unique);
4508                         tmp_adev->asic_reset_res = r;
4509                 }
4510         }
4511
4512         /* Actual ASIC resets if needed.*/
4513         /* TODO Implement XGMI hive reset logic for SRIOV */
4514         if (amdgpu_sriov_vf(adev)) {
4515                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4516                 if (r)
4517                         adev->asic_reset_res = r;
4518         } else {
4519                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4520                 if (r && r == -EAGAIN)
4521                         goto retry;
4522         }
4523
4524 skip_hw_reset:
4525
4526         /* Post ASIC reset for all devs .*/
4527         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4528
4529                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4530                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4531
4532                         if (!ring || !ring->sched.thread)
4533                                 continue;
4534
4535                         /* No point to resubmit jobs if we didn't HW reset*/
4536                         if (!tmp_adev->asic_reset_res && !job_signaled)
4537                                 drm_sched_resubmit_jobs(&ring->sched);
4538
4539                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4540                 }
4541
4542                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4543                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4544                 }
4545
4546                 tmp_adev->asic_reset_res = 0;
4547
4548                 if (r) {
4549                         /* bad news, how to tell it to userspace ? */
4550                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4551                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4552                 } else {
4553                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4554                 }
4555         }
4556
4557 skip_sched_resume:
4558         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4559                 /*unlock kfd: SRIOV would do it separately */
4560                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4561                         amdgpu_amdkfd_post_reset(tmp_adev);
4562                 if (audio_suspended)
4563                         amdgpu_device_resume_display_audio(tmp_adev);
4564                 amdgpu_device_unlock_adev(tmp_adev);
4565         }
4566
4567 skip_recovery:
4568         if (hive) {
4569                 atomic_set(&hive->in_reset, 0);
4570                 mutex_unlock(&hive->hive_lock);
4571                 amdgpu_put_xgmi_hive(hive);
4572         }
4573
4574         if (r)
4575                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4576         return r;
4577 }
4578
4579 /**
4580  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4581  *
4582  * @adev: amdgpu_device pointer
4583  *
4584  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4585  * and lanes) of the slot the device is in. Handles APUs and
4586  * virtualized environments where PCIE config space may not be available.
4587  */
4588 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4589 {
4590         struct pci_dev *pdev;
4591         enum pci_bus_speed speed_cap, platform_speed_cap;
4592         enum pcie_link_width platform_link_width;
4593
4594         if (amdgpu_pcie_gen_cap)
4595                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4596
4597         if (amdgpu_pcie_lane_cap)
4598                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4599
4600         /* covers APUs as well */
4601         if (pci_is_root_bus(adev->pdev->bus)) {
4602                 if (adev->pm.pcie_gen_mask == 0)
4603                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4604                 if (adev->pm.pcie_mlw_mask == 0)
4605                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4606                 return;
4607         }
4608
4609         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4610                 return;
4611
4612         pcie_bandwidth_available(adev->pdev, NULL,
4613                                  &platform_speed_cap, &platform_link_width);
4614
4615         if (adev->pm.pcie_gen_mask == 0) {
4616                 /* asic caps */
4617                 pdev = adev->pdev;
4618                 speed_cap = pcie_get_speed_cap(pdev);
4619                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4620                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4621                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4622                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4623                 } else {
4624                         if (speed_cap == PCIE_SPEED_16_0GT)
4625                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4626                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4627                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4628                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4629                         else if (speed_cap == PCIE_SPEED_8_0GT)
4630                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4631                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4632                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4633                         else if (speed_cap == PCIE_SPEED_5_0GT)
4634                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4635                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4636                         else
4637                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4638                 }
4639                 /* platform caps */
4640                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4641                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4642                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4643                 } else {
4644                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4645                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4646                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4647                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4648                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4649                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4650                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4651                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4652                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4653                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4654                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4655                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4656                         else
4657                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4658
4659                 }
4660         }
4661         if (adev->pm.pcie_mlw_mask == 0) {
4662                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4663                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4664                 } else {
4665                         switch (platform_link_width) {
4666                         case PCIE_LNK_X32:
4667                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4668                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4669                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4670                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4671                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4672                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4673                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4674                                 break;
4675                         case PCIE_LNK_X16:
4676                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4677                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4678                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4679                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4680                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4681                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4682                                 break;
4683                         case PCIE_LNK_X12:
4684                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4685                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4686                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4687                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4688                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4689                                 break;
4690                         case PCIE_LNK_X8:
4691                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4692                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4693                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4694                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4695                                 break;
4696                         case PCIE_LNK_X4:
4697                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4698                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4699                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4700                                 break;
4701                         case PCIE_LNK_X2:
4702                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4703                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4704                                 break;
4705                         case PCIE_LNK_X1:
4706                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4707                                 break;
4708                         default:
4709                                 break;
4710                         }
4711                 }
4712         }
4713 }
4714
4715 int amdgpu_device_baco_enter(struct drm_device *dev)
4716 {
4717         struct amdgpu_device *adev = drm_to_adev(dev);
4718         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4719
4720         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4721                 return -ENOTSUPP;
4722
4723         if (ras && ras->supported)
4724                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4725
4726         return amdgpu_dpm_baco_enter(adev);
4727 }
4728
4729 int amdgpu_device_baco_exit(struct drm_device *dev)
4730 {
4731         struct amdgpu_device *adev = drm_to_adev(dev);
4732         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4733         int ret = 0;
4734
4735         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4736                 return -ENOTSUPP;
4737
4738         ret = amdgpu_dpm_baco_exit(adev);
4739         if (ret)
4740                 return ret;
4741
4742         if (ras && ras->supported)
4743                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4744
4745         return 0;
4746 }
4747
4748 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4749 {
4750         int i;
4751
4752         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4753                 struct amdgpu_ring *ring = adev->rings[i];
4754
4755                 if (!ring || !ring->sched.thread)
4756                         continue;
4757
4758                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4759         }
4760 }
4761
4762 /**
4763  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4764  * @pdev: PCI device struct
4765  * @state: PCI channel state
4766  *
4767  * Description: Called when a PCI error is detected.
4768  *
4769  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4770  */
4771 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4772 {
4773         struct drm_device *dev = pci_get_drvdata(pdev);
4774         struct amdgpu_device *adev = drm_to_adev(dev);
4775         int i;
4776
4777         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4778
4779         switch (state) {
4780         case pci_channel_io_normal:
4781                 return PCI_ERS_RESULT_CAN_RECOVER;
4782         /* Fatal error, prepare for slot reset */
4783         case pci_channel_io_frozen:
4784                 /*
4785                  * Cancel and wait for all TDRs in progress if failing to
4786                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4787                  *
4788                  * Locking adev->reset_sem will prevent any external access
4789                  * to GPU during PCI error recovery
4790                  */
4791                 while (!amdgpu_device_lock_adev(adev, NULL))
4792                         amdgpu_cancel_all_tdr(adev);
4793
4794                 /*
4795                  * Block any work scheduling as we do for regular GPU reset
4796                  * for the duration of the recovery
4797                  */
4798                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4799                         struct amdgpu_ring *ring = adev->rings[i];
4800
4801                         if (!ring || !ring->sched.thread)
4802                                 continue;
4803
4804                         drm_sched_stop(&ring->sched, NULL);
4805                 }
4806                 return PCI_ERS_RESULT_NEED_RESET;
4807         case pci_channel_io_perm_failure:
4808                 /* Permanent error, prepare for device removal */
4809                 return PCI_ERS_RESULT_DISCONNECT;
4810         }
4811
4812         return PCI_ERS_RESULT_NEED_RESET;
4813 }
4814
4815 /**
4816  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4817  * @pdev: pointer to PCI device
4818  */
4819 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4820 {
4821
4822         DRM_INFO("PCI error: mmio enabled callback!!\n");
4823
4824         /* TODO - dump whatever for debugging purposes */
4825
4826         /* This called only if amdgpu_pci_error_detected returns
4827          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4828          * works, no need to reset slot.
4829          */
4830
4831         return PCI_ERS_RESULT_RECOVERED;
4832 }
4833
4834 /**
4835  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4836  * @pdev: PCI device struct
4837  *
4838  * Description: This routine is called by the pci error recovery
4839  * code after the PCI slot has been reset, just before we
4840  * should resume normal operations.
4841  */
4842 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4843 {
4844         struct drm_device *dev = pci_get_drvdata(pdev);
4845         struct amdgpu_device *adev = drm_to_adev(dev);
4846         int r, i;
4847         bool vram_lost;
4848         u32 memsize;
4849
4850         DRM_INFO("PCI error: slot reset callback!!\n");
4851
4852         /* wait for asic to come out of reset */
4853         msleep(500);
4854
4855         pci_restore_state(pdev);
4856
4857         /* confirm  ASIC came out of reset */
4858         for (i = 0; i < adev->usec_timeout; i++) {
4859                 memsize = amdgpu_asic_get_config_memsize(adev);
4860
4861                 if (memsize != 0xffffffff)
4862                         break;
4863                 udelay(1);
4864         }
4865         if (memsize == 0xffffffff) {
4866                 r = -ETIME;
4867                 goto out;
4868         }
4869
4870         /* TODO Call amdgpu_pre_asic_reset instead */
4871         adev->in_pci_err_recovery = true;
4872         r = amdgpu_device_ip_suspend(adev);
4873         adev->in_pci_err_recovery = false;
4874         if (r)
4875                 goto out;
4876
4877
4878         /* post card */
4879         r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
4880         if (r)
4881                 goto out;
4882
4883         r = amdgpu_device_ip_resume_phase1(adev);
4884         if (r)
4885                 goto out;
4886
4887         vram_lost = amdgpu_device_check_vram_lost(adev);
4888         if (vram_lost) {
4889                 DRM_INFO("VRAM is lost due to GPU reset!\n");
4890                 amdgpu_inc_vram_lost(adev);
4891         }
4892
4893         r = amdgpu_gtt_mgr_recover(
4894                 &adev->mman.bdev.man[TTM_PL_TT]);
4895         if (r)
4896                 goto out;
4897
4898         r = amdgpu_device_fw_loading(adev);
4899         if (r)
4900                 return r;
4901
4902         r = amdgpu_device_ip_resume_phase2(adev);
4903         if (r)
4904                 goto out;
4905
4906         if (vram_lost)
4907                 amdgpu_device_fill_reset_magic(adev);
4908
4909         /*
4910          * Add this ASIC as tracked as reset was already
4911          * complete successfully.
4912          */
4913         amdgpu_register_gpu_instance(adev);
4914
4915         r = amdgpu_device_ip_late_init(adev);
4916         if (r)
4917                 goto out;
4918
4919         amdgpu_fbdev_set_suspend(adev, 0);
4920
4921         /* must succeed. */
4922         amdgpu_ras_resume(adev);
4923
4924
4925         amdgpu_irq_gpu_reset_resume_helper(adev);
4926         r = amdgpu_ib_ring_tests(adev);
4927         if (r)
4928                 goto out;
4929
4930         r = amdgpu_device_recover_vram(adev);
4931
4932 out:
4933
4934         if (!r) {
4935                 DRM_INFO("PCIe error recovery succeeded\n");
4936         } else {
4937                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
4938                 amdgpu_device_unlock_adev(adev);
4939         }
4940
4941         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
4942 }
4943
4944 /**
4945  * amdgpu_pci_resume() - resume normal ops after PCI reset
4946  * @pdev: pointer to PCI device
4947  *
4948  * Called when the error recovery driver tells us that its
4949  * OK to resume normal operation. Use completion to allow
4950  * halted scsi ops to resume.
4951  */
4952 void amdgpu_pci_resume(struct pci_dev *pdev)
4953 {
4954         struct drm_device *dev = pci_get_drvdata(pdev);
4955         struct amdgpu_device *adev = drm_to_adev(dev);
4956         int i;
4957
4958
4959         DRM_INFO("PCI error: resume callback!!\n");
4960
4961         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4962                 struct amdgpu_ring *ring = adev->rings[i];
4963
4964                 if (!ring || !ring->sched.thread)
4965                         continue;
4966
4967
4968                 drm_sched_resubmit_jobs(&ring->sched);
4969                 drm_sched_start(&ring->sched, true);
4970         }
4971
4972         amdgpu_device_unlock_adev(adev);
4973 }