drm/amdgpu: unmap register bar on device init failure
[linux-2.6-microblaze.git] drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
85
86 #define AMDGPU_RESUME_MS                2000
87
88 const char *amdgpu_asic_name[] = {
89         "TAHITI",
90         "PITCAIRN",
91         "VERDE",
92         "OLAND",
93         "HAINAN",
94         "BONAIRE",
95         "KAVERI",
96         "KABINI",
97         "HAWAII",
98         "MULLINS",
99         "TOPAZ",
100         "TONGA",
101         "FIJI",
102         "CARRIZO",
103         "STONEY",
104         "POLARIS10",
105         "POLARIS11",
106         "POLARIS12",
107         "VEGAM",
108         "VEGA10",
109         "VEGA12",
110         "VEGA20",
111         "RAVEN",
112         "ARCTURUS",
113         "RENOIR",
114         "NAVI10",
115         "NAVI14",
116         "NAVI12",
117         "SIENNA_CICHLID",
118         "NAVY_FLOUNDER",
119         "LAST",
120 };
121
122 /**
123  * DOC: pcie_replay_count
124  *
125  * The amdgpu driver provides a sysfs API for reporting the total number
126  * of PCIe replays (NAKs).
127  * The file pcie_replay_count is used for this and returns the total
128  * number of replays as a sum of the NAKs generated and NAKs received
129  */
130
131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
132                 struct device_attribute *attr, char *buf)
133 {
134         struct drm_device *ddev = dev_get_drvdata(dev);
135         struct amdgpu_device *adev = drm_to_adev(ddev);
136         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
137
138         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
139 }
140
141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
142                 amdgpu_device_get_pcie_replay_count, NULL);
143
144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
145
146 /**
147  * DOC: product_name
148  *
149  * The amdgpu driver provides a sysfs API for reporting the product name
150  * for the device.
151  * The file product_name is used for this and returns the product name
152  * as returned from the FRU.
153  * NOTE: This is only available for certain server cards
154  */
155
156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
157                 struct device_attribute *attr, char *buf)
158 {
159         struct drm_device *ddev = dev_get_drvdata(dev);
160         struct amdgpu_device *adev = drm_to_adev(ddev);
161
162         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
163 }
164
165 static DEVICE_ATTR(product_name, S_IRUGO,
166                 amdgpu_device_get_product_name, NULL);
167
168 /**
169  * DOC: product_number
170  *
171  * The amdgpu driver provides a sysfs API for reporting the part number
172  * for the device.
173  * The file product_number is used for this and returns the part number
174  * as returned from the FRU.
175  * NOTE: This is only available for certain server cards
176  */
177
178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
179                 struct device_attribute *attr, char *buf)
180 {
181         struct drm_device *ddev = dev_get_drvdata(dev);
182         struct amdgpu_device *adev = drm_to_adev(ddev);
183
184         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
185 }
186
187 static DEVICE_ATTR(product_number, S_IRUGO,
188                 amdgpu_device_get_product_number, NULL);
189
190 /**
191  * DOC: serial_number
192  *
193  * The amdgpu driver provides a sysfs API for reporting the serial number
194  * for the device.
195  * The file serial_number is used for this and returns the serial number
196  * as returned from the FRU.
197  * NOTE: This is only available for certain server cards
198  */
199
200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
201                 struct device_attribute *attr, char *buf)
202 {
203         struct drm_device *ddev = dev_get_drvdata(dev);
204         struct amdgpu_device *adev = drm_to_adev(ddev);
205
206         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
207 }
208
209 static DEVICE_ATTR(serial_number, S_IRUGO,
210                 amdgpu_device_get_serial_number, NULL);
211
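/*
 * A minimal usage sketch for the sysfs attributes above, assuming the
 * amdgpu device is exposed as card0 (the attributes are created on the
 * parent PCI device, so they appear under /sys/class/drm/card0/device/):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *   cat /sys/class/drm/card0/device/product_name
 *   cat /sys/class/drm/card0/device/product_number
 *   cat /sys/class/drm/card0/device/serial_number
 *
 * product_name, product_number and serial_number are only populated on
 * boards whose FRU EEPROM is readable.
 */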
212 /**
213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
214  *
215  * @dev: drm_device pointer
216  *
217  * Returns true if the device is a dGPU with HG/PX power control,
218  * otherwise return false.
219  */
220 bool amdgpu_device_supports_boco(struct drm_device *dev)
221 {
222         struct amdgpu_device *adev = drm_to_adev(dev);
223
224         if (adev->flags & AMD_IS_PX)
225                 return true;
226         return false;
227 }
228
229 /**
230  * amdgpu_device_supports_baco - Does the device support BACO
231  *
232  * @dev: drm_device pointer
233  *
234  * Returns true if the device supports BACO,
235  * otherwise return false.
236  */
237 bool amdgpu_device_supports_baco(struct drm_device *dev)
238 {
239         struct amdgpu_device *adev = drm_to_adev(dev);
240
241         return amdgpu_asic_supports_baco(adev);
242 }
243
244 /**
245  * VRAM access helper functions.
246  *
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size, sizeof(@buf) must be > @size
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256                                uint32_t *buf, size_t size, bool write)
257 {
258         unsigned long flags;
259         uint32_t hi = ~0;
260         uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264         last = min(pos + size, adev->gmc.visible_vram_size);
265         if (last > pos) {
266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267                 size_t count = last - pos;
268
269                 if (write) {
270                         memcpy_toio(addr, buf, count);
271                         mb();
272                         amdgpu_asic_flush_hdp(adev, NULL);
273                 } else {
274                         amdgpu_asic_invalidate_hdp(adev, NULL);
275                         mb();
276                         memcpy_fromio(buf, addr, count);
277                 }
278
279                 if (count == size)
280                         return;
281
282                 pos += count;
283                 buf += count / 4;
284                 size -= count;
285         }
286 #endif
287
288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289         for (last = pos + size; pos < last; pos += 4) {
290                 uint32_t tmp = pos >> 31;
291
292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293                 if (tmp != hi) {
294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295                         hi = tmp;
296                 }
297                 if (write)
298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
299                 else
300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301         }
302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
304
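/*
 * A minimal usage sketch for amdgpu_device_vram_access() (the offset and
 * buffer size are made up for illustration): read the first 128 bytes of
 * VRAM into a local buffer, then write them back.
 *
 *   uint32_t buf[64];
 *
 *   amdgpu_device_vram_access(adev, 0, buf, 128, false);
 *   amdgpu_device_vram_access(adev, 0, buf, 128, true);
 */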
305 /*
306  * MMIO register access helper functions.
307  */
308 /**
309  * amdgpu_mm_rreg - read a memory mapped IO register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
318                         uint32_t acc_flags)
319 {
320         uint32_t ret;
321
322         if (adev->in_pci_err_recovery)
323                 return 0;
324
325         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
326             down_read_trylock(&adev->reset_sem)) {
327                 ret = amdgpu_kiq_rreg(adev, reg);
328                 up_read(&adev->reset_sem);
329                 return ret;
330         }
331
332         if ((reg * 4) < adev->rmmio_size)
333                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
334         else {
335                 unsigned long flags;
336
337                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
338                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
339                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
340                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
341         }
342
343         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
344         return ret;
345 }
346
347 /*
348  * MMIO register byte read helper function
349  * @offset: byte offset from MMIO start
350  *
351  */
352
353 /**
354  * amdgpu_mm_rreg8 - read a memory mapped IO register
355  *
356  * @adev: amdgpu_device pointer
357  * @offset: byte aligned register offset
358  *
359  * Returns the 8 bit value from the offset specified.
360  */
361 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
362 {
363         if (adev->in_pci_err_recovery)
364                 return 0;
365
366         if (offset < adev->rmmio_size)
367                 return (readb(adev->rmmio + offset));
368         BUG();
369 }
370
371 /*
372  * MMIO register byte write helper function
373  * @offset: byte offset from MMIO start
374  * @value: the value to be written to the register
375  *
376  */
377 /**
378  * amdgpu_mm_wreg8 - write to a memory mapped IO register
379  *
380  * @adev: amdgpu_device pointer
381  * @offset: byte aligned register offset
382  * @value: 8 bit value to write
383  *
384  * Writes the value specified to the offset specified.
385  */
386 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
387 {
388         if (adev->in_pci_err_recovery)
389                 return;
390
391         if (offset < adev->rmmio_size)
392                 writeb(value, adev->rmmio + offset);
393         else
394                 BUG();
395 }
396
397 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
398                                        uint32_t reg, uint32_t v,
399                                        uint32_t acc_flags)
400 {
401         if (adev->in_pci_err_recovery)
402                 return;
403
404         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
405
406         if ((reg * 4) < adev->rmmio_size)
407                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
408         else {
409                 unsigned long flags;
410
411                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
412                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
413                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
414                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
415         }
416 }
417
418 /**
419  * amdgpu_mm_wreg - write to a memory mapped IO register
420  *
421  * @adev: amdgpu_device pointer
422  * @reg: dword aligned register offset
423  * @v: 32 bit value to write to the register
424  * @acc_flags: access flags which require special behavior
425  *
426  * Writes the value specified to the offset specified.
427  */
428 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
429                     uint32_t acc_flags)
430 {
431         if (adev->in_pci_err_recovery)
432                 return;
433
434         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
435             down_read_trylock(&adev->reset_sem)) {
436                 amdgpu_kiq_wreg(adev, reg, v);
437                 up_read(&adev->reset_sem);
438                 return;
439         }
440
441         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
442 }
443
444 /*
445  * amdgpu_mm_wreg_mmio_rlc - write a register either through MMIO or through the RLC path if it is in range
446  *
447  * This function is invoked only for debugfs register access.
448  */
449 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
450                     uint32_t acc_flags)
451 {
452         if (adev->in_pci_err_recovery)
453                 return;
454
455         if (amdgpu_sriov_fullaccess(adev) &&
456                 adev->gfx.rlc.funcs &&
457                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
458
459                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
460                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
461         }
462
463         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
464 }
465
466 /**
467  * amdgpu_io_rreg - read an IO register
468  *
469  * @adev: amdgpu_device pointer
470  * @reg: dword aligned register offset
471  *
472  * Returns the 32 bit value from the offset specified.
473  */
474 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
475 {
476         if (adev->in_pci_err_recovery)
477                 return 0;
478
479         if ((reg * 4) < adev->rio_mem_size)
480                 return ioread32(adev->rio_mem + (reg * 4));
481         else {
482                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
483                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
484         }
485 }
486
487 /**
488  * amdgpu_io_wreg - write to an IO register
489  *
490  * @adev: amdgpu_device pointer
491  * @reg: dword aligned register offset
492  * @v: 32 bit value to write to the register
493  *
494  * Writes the value specified to the offset specified.
495  */
496 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
497 {
498         if (adev->in_pci_err_recovery)
499                 return;
500
501         if ((reg * 4) < adev->rio_mem_size)
502                 iowrite32(v, adev->rio_mem + (reg * 4));
503         else {
504                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
505                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
506         }
507 }
508
509 /**
510  * amdgpu_mm_rdoorbell - read a doorbell dword
511  *
512  * @adev: amdgpu_device pointer
513  * @index: doorbell index
514  *
515  * Returns the value in the doorbell aperture at the
516  * requested doorbell index (CIK).
517  */
518 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
519 {
520         if (adev->in_pci_err_recovery)
521                 return 0;
522
523         if (index < adev->doorbell.num_doorbells) {
524                 return readl(adev->doorbell.ptr + index);
525         } else {
526                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
527                 return 0;
528         }
529 }
530
531 /**
532  * amdgpu_mm_wdoorbell - write a doorbell dword
533  *
534  * @adev: amdgpu_device pointer
535  * @index: doorbell index
536  * @v: value to write
537  *
538  * Writes @v to the doorbell aperture at the
539  * requested doorbell index (CIK).
540  */
541 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
542 {
543         if (adev->in_pci_err_recovery)
544                 return;
545
546         if (index < adev->doorbell.num_doorbells) {
547                 writel(v, adev->doorbell.ptr + index);
548         } else {
549                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
550         }
551 }
552
553 /**
554  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
555  *
556  * @adev: amdgpu_device pointer
557  * @index: doorbell index
558  *
559  * Returns the value in the doorbell aperture at the
560  * requested doorbell index (VEGA10+).
561  */
562 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
563 {
564         if (adev->in_pci_err_recovery)
565                 return 0;
566
567         if (index < adev->doorbell.num_doorbells) {
568                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
569         } else {
570                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
571                 return 0;
572         }
573 }
574
575 /**
576  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
577  *
578  * @adev: amdgpu_device pointer
579  * @index: doorbell index
580  * @v: value to write
581  *
582  * Writes @v to the doorbell aperture at the
583  * requested doorbell index (VEGA10+).
584  */
585 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
586 {
587         if (adev->in_pci_err_recovery)
588                 return;
589
590         if (index < adev->doorbell.num_doorbells) {
591                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
592         } else {
593                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
594         }
595 }
596
597 /**
598  * amdgpu_invalid_rreg - dummy reg read function
599  *
600  * @adev: amdgpu device pointer
601  * @reg: offset of register
602  *
603  * Dummy register read function.  Used for register blocks
604  * that certain asics don't have (all asics).
605  * Returns the value in the register.
606  */
607 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
608 {
609         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
610         BUG();
611         return 0;
612 }
613
614 /**
615  * amdgpu_invalid_wreg - dummy reg write function
616  *
617  * @adev: amdgpu device pointer
618  * @reg: offset of register
619  * @v: value to write to the register
620  *
621  * Dummy register write function.  Used for register blocks
622  * that certain asics don't have (all asics).
623  */
624 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
625 {
626         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
627                   reg, v);
628         BUG();
629 }
630
631 /**
632  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
633  *
634  * @adev: amdgpu device pointer
635  * @reg: offset of register
636  *
637  * Dummy register read function.  Used for register blocks
638  * that certain asics don't have (all asics).
639  * Returns the value in the register.
640  */
641 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
642 {
643         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
644         BUG();
645         return 0;
646 }
647
648 /**
649  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
650  *
651  * @adev: amdgpu device pointer
652  * @reg: offset of register
653  * @v: value to write to the register
654  *
655  * Dummy register write function.  Used for register blocks
656  * that certain asics don't have (all asics).
657  */
658 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
659 {
660         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
661                   reg, v);
662         BUG();
663 }
664
665 /**
666  * amdgpu_block_invalid_rreg - dummy reg read function
667  *
668  * @adev: amdgpu device pointer
669  * @block: offset of instance
670  * @reg: offset of register
671  *
672  * Dummy register read function.  Used for register blocks
673  * that certain asics don't have (all asics).
674  * Returns the value in the register.
675  */
676 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
677                                           uint32_t block, uint32_t reg)
678 {
679         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
680                   reg, block);
681         BUG();
682         return 0;
683 }
684
685 /**
686  * amdgpu_block_invalid_wreg - dummy reg write function
687  *
688  * @adev: amdgpu device pointer
689  * @block: offset of instance
690  * @reg: offset of register
691  * @v: value to write to the register
692  *
693  * Dummy register write function.  Used for register blocks
694  * that certain asics don't have (all asics).
695  */
696 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
697                                       uint32_t block,
698                                       uint32_t reg, uint32_t v)
699 {
700         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
701                   reg, block, v);
702         BUG();
703 }
704
705 /**
706  * amdgpu_device_asic_init - Wrapper for atom asic_init
707  *
708  * @adev: amdgpu_device pointer
709  *
710  * Does any asic specific work and then calls atom asic init.
711  */
712 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
713 {
714         amdgpu_asic_pre_asic_init(adev);
715
716         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
717 }
718
719 /**
720  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
721  *
722  * @adev: amdgpu device pointer
723  *
724  * Allocates a scratch page of VRAM for use by various things in the
725  * driver.
726  */
727 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
728 {
729         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
730                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
731                                        &adev->vram_scratch.robj,
732                                        &adev->vram_scratch.gpu_addr,
733                                        (void **)&adev->vram_scratch.ptr);
734 }
735
736 /**
737  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
738  *
739  * @adev: amdgpu device pointer
740  *
741  * Frees the VRAM scratch page.
742  */
743 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
744 {
745         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
746 }
747
748 /**
749  * amdgpu_device_program_register_sequence - program an array of registers.
750  *
751  * @adev: amdgpu_device pointer
752  * @registers: pointer to the register array
753  * @array_size: size of the register array
754  *
755  * Programs an array of registers with AND and OR masks.
756  * This is a helper for setting golden registers.
757  */
758 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
759                                              const u32 *registers,
760                                              const u32 array_size)
761 {
762         u32 tmp, reg, and_mask, or_mask;
763         int i;
764
765         if (array_size % 3)
766                 return;
767
768         for (i = 0; i < array_size; i +=3) {
769                 reg = registers[i + 0];
770                 and_mask = registers[i + 1];
771                 or_mask = registers[i + 2];
772
773                 if (and_mask == 0xffffffff) {
774                         tmp = or_mask;
775                 } else {
776                         tmp = RREG32(reg);
777                         tmp &= ~and_mask;
778                         if (adev->family >= AMDGPU_FAMILY_AI)
779                                 tmp |= (or_mask & and_mask);
780                         else
781                                 tmp |= or_mask;
782                 }
783                 WREG32(reg, tmp);
784         }
785 }
786
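/*
 * The register array above is consumed as {offset, and_mask, or_mask}
 * triplets: the bits set in and_mask are cleared and or_mask is ORed in
 * (an and_mask of 0xffffffff writes or_mask verbatim).  A sketch with a
 * made-up register offset, forcing the low 16 bits to 0x1234:
 *
 *   static const u32 golden_settings_example[] = {
 *           0x1234, 0x0000ffff, 0x00001234,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 */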
787 /**
788  * amdgpu_device_pci_config_reset - reset the GPU
789  *
790  * @adev: amdgpu_device pointer
791  *
792  * Resets the GPU using the pci config reset sequence.
793  * Only applicable to asics prior to vega10.
794  */
795 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
796 {
797         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
798 }
799
800 /*
801  * GPU doorbell aperture helpers function.
802  */
803 /**
804  * amdgpu_device_doorbell_init - Init doorbell driver information.
805  *
806  * @adev: amdgpu_device pointer
807  *
808  * Init doorbell driver information (CIK)
809  * Returns 0 on success, error on failure.
810  */
811 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
812 {
813
814         /* No doorbell on SI hardware generation */
815         if (adev->asic_type < CHIP_BONAIRE) {
816                 adev->doorbell.base = 0;
817                 adev->doorbell.size = 0;
818                 adev->doorbell.num_doorbells = 0;
819                 adev->doorbell.ptr = NULL;
820                 return 0;
821         }
822
823         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
824                 return -EINVAL;
825
826         amdgpu_asic_init_doorbell_index(adev);
827
828         /* doorbell bar mapping */
829         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
830         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
831
832         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
833                                              adev->doorbell_index.max_assignment+1);
834         if (adev->doorbell.num_doorbells == 0)
835                 return -EINVAL;
836
837         /* For Vega, reserve and map two pages on the doorbell BAR since the
838          * SDMA paging queue doorbell uses the second page. The
839          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
840          * doorbells are in the first page, so with the paging queue enabled
841          * the max num_doorbells is extended by one page (0x400 in dwords).
842          */
843         if (adev->asic_type >= CHIP_VEGA10)
844                 adev->doorbell.num_doorbells += 0x400;
845
846         adev->doorbell.ptr = ioremap(adev->doorbell.base,
847                                      adev->doorbell.num_doorbells *
848                                      sizeof(u32));
849         if (adev->doorbell.ptr == NULL)
850                 return -ENOMEM;
851
852         return 0;
853 }
854
855 /**
856  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
857  *
858  * @adev: amdgpu_device pointer
859  *
860  * Tear down doorbell driver information (CIK)
861  */
862 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
863 {
864         iounmap(adev->doorbell.ptr);
865         adev->doorbell.ptr = NULL;
866 }
867
868
869
870 /*
871  * amdgpu_device_wb_*()
872  * Writeback is the method by which the GPU updates special pages in memory
873  * with the status of certain GPU events (fences, ring pointers,etc.).
874  */
875
876 /**
877  * amdgpu_device_wb_fini - Disable Writeback and free memory
878  *
879  * @adev: amdgpu_device pointer
880  *
881  * Disables Writeback and frees the Writeback memory (all asics).
882  * Used at driver shutdown.
883  */
884 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
885 {
886         if (adev->wb.wb_obj) {
887                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
888                                       &adev->wb.gpu_addr,
889                                       (void **)&adev->wb.wb);
890                 adev->wb.wb_obj = NULL;
891         }
892 }
893
894 /**
895  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
896  *
897  * @adev: amdgpu_device pointer
898  *
899  * Initializes writeback and allocates writeback memory (all asics).
900  * Used at driver startup.
901  * Returns 0 on success or a negative error code on failure.
902  */
903 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
904 {
905         int r;
906
907         if (adev->wb.wb_obj == NULL) {
908                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
909                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
910                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
911                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
912                                             (void **)&adev->wb.wb);
913                 if (r) {
914                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
915                         return r;
916                 }
917
918                 adev->wb.num_wb = AMDGPU_MAX_WB;
919                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
920
921                 /* clear wb memory */
922                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
923         }
924
925         return 0;
926 }
927
928 /**
929  * amdgpu_device_wb_get - Allocate a wb entry
930  *
931  * @adev: amdgpu_device pointer
932  * @wb: wb index
933  *
934  * Allocate a wb slot for use by the driver (all asics).
935  * Returns 0 on success or -EINVAL on failure.
936  */
937 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
938 {
939         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
940
941         if (offset < adev->wb.num_wb) {
942                 __set_bit(offset, adev->wb.used);
943                 *wb = offset << 3; /* convert to dw offset */
944                 return 0;
945         } else {
946                 return -EINVAL;
947         }
948 }
949
950 /**
951  * amdgpu_device_wb_free - Free a wb entry
952  *
953  * @adev: amdgpu_device pointer
954  * @wb: wb index
955  *
956  * Free a wb slot allocated for use by the driver (all asics)
957  */
958 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
959 {
960         wb >>= 3;
961         if (wb < adev->wb.num_wb)
962                 __clear_bit(wb, adev->wb.used);
963 }
964
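/*
 * A usage sketch for the writeback helpers above (variable names are
 * hypothetical): each slot is 256 bits wide, and the index returned by
 * amdgpu_device_wb_get() is already converted to a dword offset, so the
 * GPU address of a slot is adev->wb.gpu_addr + offs * 4 and its CPU view
 * is &adev->wb.wb[offs].
 *
 *   u32 offs;
 *
 *   if (!amdgpu_device_wb_get(adev, &offs)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + offs * 4;
 *           u32 val = adev->wb.wb[offs];
 *
 *           amdgpu_device_wb_free(adev, offs);
 *   }
 */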
965 /**
966  * amdgpu_device_resize_fb_bar - try to resize FB BAR
967  *
968  * @adev: amdgpu_device pointer
969  *
970  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
971  * to fail, but if any of the BARs is not accessible after the resize we abort
972  * driver loading by returning -ENODEV.
973  */
974 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
975 {
976         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
977         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
978         struct pci_bus *root;
979         struct resource *res;
980         unsigned i;
981         u16 cmd;
982         int r;
983
984         /* Bypass for VF */
985         if (amdgpu_sriov_vf(adev))
986                 return 0;
987
988         /* skip if the bios has already enabled large BAR */
989         if (adev->gmc.real_vram_size &&
990             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
991                 return 0;
992
993         /* Check if the root bus has 64-bit memory resources */
994         root = adev->pdev->bus;
995         while (root->parent)
996                 root = root->parent;
997
998         pci_bus_for_each_resource(root, res, i) {
999                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1000                     res->start > 0x100000000ull)
1001                         break;
1002         }
1003
1004         /* Trying to resize is pointless without a root hub window above 4GB */
1005         if (!res)
1006                 return 0;
1007
1008         /* Disable memory decoding while we change the BAR addresses and size */
1009         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1010         pci_write_config_word(adev->pdev, PCI_COMMAND,
1011                               cmd & ~PCI_COMMAND_MEMORY);
1012
1013         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1014         amdgpu_device_doorbell_fini(adev);
1015         if (adev->asic_type >= CHIP_BONAIRE)
1016                 pci_release_resource(adev->pdev, 2);
1017
1018         pci_release_resource(adev->pdev, 0);
1019
1020         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1021         if (r == -ENOSPC)
1022                 DRM_INFO("Not enough PCI address space for a large BAR.");
1023         else if (r && r != -ENOTSUPP)
1024                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1025
1026         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1027
1028         /* When the doorbell or fb BAR isn't available we have no chance of
1029          * using the device.
1030          */
1031         r = amdgpu_device_doorbell_init(adev);
1032         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1033                 return -ENODEV;
1034
1035         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1036
1037         return 0;
1038 }
1039
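/*
 * A worked example of the rbar_size computation above, assuming 8 GB of
 * VRAM: space_needed = 8 GB, space_needed >> 20 = 8192 (MB),
 * order_base_2(8192 | 1) = order_base_2(8193) = 14, and 14 - 1 = 13,
 * which is the resizable BAR size encoding for 2^13 MB = 8 GB.
 */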
1040 /*
1041  * GPU helpers function.
1042  */
1043 /**
1044  * amdgpu_device_need_post - check if the hw needs to be posted or not
1045  *
1046  * @adev: amdgpu_device pointer
1047  *
1048  * Check if the asic has been initialized (all asics) at driver startup,
1049  * or if a post is needed because a hw reset was performed.
1050  * Returns true if a post is needed, false if not.
1051  */
1052 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1053 {
1054         uint32_t reg;
1055
1056         if (amdgpu_sriov_vf(adev))
1057                 return false;
1058
1059         if (amdgpu_passthrough(adev)) {
1060                 /* for FIJI: in the whole-GPU pass-through virtualization case, after a
1061                  * VM reboot some old SMC firmware still needs the driver to do a vPost,
1062                  * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
1063                  * this flaw, so force vPost for SMC versions below 22.15.
1064                  */
1065                 if (adev->asic_type == CHIP_FIJI) {
1066                         int err;
1067                         uint32_t fw_ver;
1068                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1069                         /* force vPost if error occurred */
1070                         if (err)
1071                                 return true;
1072
1073                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1074                         if (fw_ver < 0x00160e00)
1075                                 return true;
1076                 }
1077         }
1078
1079         if (adev->has_hw_reset) {
1080                 adev->has_hw_reset = false;
1081                 return true;
1082         }
1083
1084         /* bios scratch used on CIK+ */
1085         if (adev->asic_type >= CHIP_BONAIRE)
1086                 return amdgpu_atombios_scratch_need_asic_init(adev);
1087
1088         /* check MEM_SIZE for older asics */
1089         reg = amdgpu_asic_get_config_memsize(adev);
1090
1091         if ((reg != 0) && (reg != 0xffffffff))
1092                 return false;
1093
1094         return true;
1095 }
1096
1097 /* if we get transitioned to only one device, take VGA back */
1098 /**
1099  * amdgpu_device_vga_set_decode - enable/disable vga decode
1100  *
1101  * @cookie: amdgpu_device pointer
1102  * @state: enable/disable vga decode
1103  *
1104  * Enable/disable vga decode (all asics).
1105  * Returns VGA resource flags.
1106  */
1107 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1108 {
1109         struct amdgpu_device *adev = cookie;
1110         amdgpu_asic_set_vga_state(adev, state);
1111         if (state)
1112                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1113                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1114         else
1115                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1116 }
1117
1118 /**
1119  * amdgpu_device_check_block_size - validate the vm block size
1120  *
1121  * @adev: amdgpu_device pointer
1122  *
1123  * Validates the vm block size specified via module parameter.
1124  * The vm block size defines number of bits in page table versus page directory,
1125  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1126  * page table and the remaining bits are in the page directory.
1127  */
1128 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1129 {
1130         /* defines number of bits in page table versus page directory,
1131          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1132          * page table and the remaining bits are in the page directory */
1133         if (amdgpu_vm_block_size == -1)
1134                 return;
1135
1136         if (amdgpu_vm_block_size < 9) {
1137                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1138                          amdgpu_vm_block_size);
1139                 amdgpu_vm_block_size = -1;
1140         }
1141 }
1142
1143 /**
1144  * amdgpu_device_check_vm_size - validate the vm size
1145  *
1146  * @adev: amdgpu_device pointer
1147  *
1148  * Validates the vm size in GB specified via module parameter.
1149  * The VM size is the size of the GPU virtual memory space in GB.
1150  */
1151 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1152 {
1153         /* no need to check the default value */
1154         if (amdgpu_vm_size == -1)
1155                 return;
1156
1157         if (amdgpu_vm_size < 1) {
1158                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1159                          amdgpu_vm_size);
1160                 amdgpu_vm_size = -1;
1161         }
1162 }
1163
1164 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1165 {
1166         struct sysinfo si;
1167         bool is_os_64 = (sizeof(void *) == 8);
1168         uint64_t total_memory;
1169         uint64_t dram_size_seven_GB = 0x1B8000000;
1170         uint64_t dram_size_three_GB = 0xB8000000;
1171
1172         if (amdgpu_smu_memory_pool_size == 0)
1173                 return;
1174
1175         if (!is_os_64) {
1176                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1177                 goto def_value;
1178         }
1179         si_meminfo(&si);
1180         total_memory = (uint64_t)si.totalram * si.mem_unit;
1181
1182         if ((amdgpu_smu_memory_pool_size == 1) ||
1183                 (amdgpu_smu_memory_pool_size == 2)) {
1184                 if (total_memory < dram_size_three_GB)
1185                         goto def_value1;
1186         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1187                 (amdgpu_smu_memory_pool_size == 8)) {
1188                 if (total_memory < dram_size_seven_GB)
1189                         goto def_value1;
1190         } else {
1191                 DRM_WARN("Smu memory pool size not supported\n");
1192                 goto def_value;
1193         }
1194         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1195
1196         return;
1197
1198 def_value1:
1199         DRM_WARN("Not enough system memory\n");
1200 def_value:
1201         adev->pm.smu_prv_buffer_size = 0;
1202 }
1203
1204 /**
1205  * amdgpu_device_check_arguments - validate module params
1206  *
1207  * @adev: amdgpu_device pointer
1208  *
1209  * Validates certain module parameters and updates
1210  * the associated values used by the driver (all asics).
1211  */
1212 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1213 {
1214         if (amdgpu_sched_jobs < 4) {
1215                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1216                          amdgpu_sched_jobs);
1217                 amdgpu_sched_jobs = 4;
1218         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1219                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1220                          amdgpu_sched_jobs);
1221                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1222         }
1223
1224         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1225                 /* gart size must be greater than or equal to 32M */
1226                 dev_warn(adev->dev, "gart size (%d) too small\n",
1227                          amdgpu_gart_size);
1228                 amdgpu_gart_size = -1;
1229         }
1230
1231         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1232                 /* gtt size must be greater than or equal to 32M */
1233                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1234                                  amdgpu_gtt_size);
1235                 amdgpu_gtt_size = -1;
1236         }
1237
1238         /* valid range is between 4 and 9 inclusive */
1239         if (amdgpu_vm_fragment_size != -1 &&
1240             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1241                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1242                 amdgpu_vm_fragment_size = -1;
1243         }
1244
1245         if (amdgpu_sched_hw_submission < 2) {
1246                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1247                          amdgpu_sched_hw_submission);
1248                 amdgpu_sched_hw_submission = 2;
1249         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1250                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1251                          amdgpu_sched_hw_submission);
1252                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1253         }
1254
1255         amdgpu_device_check_smu_prv_buffer_size(adev);
1256
1257         amdgpu_device_check_vm_size(adev);
1258
1259         amdgpu_device_check_block_size(adev);
1260
1261         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1262
1263         amdgpu_gmc_tmz_set(adev);
1264
1265         if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1266                 amdgpu_num_kcq = 8;
1267                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1268         }
1269
1270         return 0;
1271 }
1272
1273 /**
1274  * amdgpu_switcheroo_set_state - set switcheroo state
1275  *
1276  * @pdev: pci dev pointer
1277  * @state: vga_switcheroo state
1278  *
1279  * Callback for the switcheroo driver.  Suspends or resumes the
1280  * asics before or after it is powered up using ACPI methods.
1281  */
1282 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1283                                         enum vga_switcheroo_state state)
1284 {
1285         struct drm_device *dev = pci_get_drvdata(pdev);
1286         int r;
1287
1288         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1289                 return;
1290
1291         if (state == VGA_SWITCHEROO_ON) {
1292                 pr_info("switched on\n");
1293                 /* don't suspend or resume card normally */
1294                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1295
1296                 pci_set_power_state(dev->pdev, PCI_D0);
1297                 amdgpu_device_load_pci_state(dev->pdev);
1298                 r = pci_enable_device(dev->pdev);
1299                 if (r)
1300                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1301                 amdgpu_device_resume(dev, true);
1302
1303                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1304                 drm_kms_helper_poll_enable(dev);
1305         } else {
1306                 pr_info("switched off\n");
1307                 drm_kms_helper_poll_disable(dev);
1308                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1309                 amdgpu_device_suspend(dev, true);
1310                 amdgpu_device_cache_pci_state(dev->pdev);
1311                 /* Shut down the device */
1312                 pci_disable_device(dev->pdev);
1313                 pci_set_power_state(dev->pdev, PCI_D3cold);
1314                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1315         }
1316 }
1317
1318 /**
1319  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1320  *
1321  * @pdev: pci dev pointer
1322  *
1323  * Callback for the switcheroo driver.  Check if the switcheroo
1324  * state can be changed.
1325  * Returns true if the state can be changed, false if not.
1326  */
1327 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1328 {
1329         struct drm_device *dev = pci_get_drvdata(pdev);
1330
1331         /*
1332         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1333         * locking inversion with the driver load path. And the access here is
1334         * completely racy anyway. So don't bother with locking for now.
1335         */
1336         return atomic_read(&dev->open_count) == 0;
1337 }
1338
1339 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1340         .set_gpu_state = amdgpu_switcheroo_set_state,
1341         .reprobe = NULL,
1342         .can_switch = amdgpu_switcheroo_can_switch,
1343 };
1344
1345 /**
1346  * amdgpu_device_ip_set_clockgating_state - set the CG state
1347  *
1348  * @dev: amdgpu_device pointer
1349  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1350  * @state: clockgating state (gate or ungate)
1351  *
1352  * Sets the requested clockgating state for all instances of
1353  * the hardware IP specified.
1354  * Returns the error code from the last instance.
1355  */
1356 int amdgpu_device_ip_set_clockgating_state(void *dev,
1357                                            enum amd_ip_block_type block_type,
1358                                            enum amd_clockgating_state state)
1359 {
1360         struct amdgpu_device *adev = dev;
1361         int i, r = 0;
1362
1363         for (i = 0; i < adev->num_ip_blocks; i++) {
1364                 if (!adev->ip_blocks[i].status.valid)
1365                         continue;
1366                 if (adev->ip_blocks[i].version->type != block_type)
1367                         continue;
1368                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1369                         continue;
1370                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1371                         (void *)adev, state);
1372                 if (r)
1373                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1374                                   adev->ip_blocks[i].version->funcs->name, r);
1375         }
1376         return r;
1377 }
1378
1379 /**
1380  * amdgpu_device_ip_set_powergating_state - set the PG state
1381  *
1382  * @dev: amdgpu_device pointer
1383  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1384  * @state: powergating state (gate or ungate)
1385  *
1386  * Sets the requested powergating state for all instances of
1387  * the hardware IP specified.
1388  * Returns the error code from the last instance.
1389  */
1390 int amdgpu_device_ip_set_powergating_state(void *dev,
1391                                            enum amd_ip_block_type block_type,
1392                                            enum amd_powergating_state state)
1393 {
1394         struct amdgpu_device *adev = dev;
1395         int i, r = 0;
1396
1397         for (i = 0; i < adev->num_ip_blocks; i++) {
1398                 if (!adev->ip_blocks[i].status.valid)
1399                         continue;
1400                 if (adev->ip_blocks[i].version->type != block_type)
1401                         continue;
1402                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1403                         continue;
1404                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1405                         (void *)adev, state);
1406                 if (r)
1407                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1408                                   adev->ip_blocks[i].version->funcs->name, r);
1409         }
1410         return r;
1411 }
1412
1413 /**
1414  * amdgpu_device_ip_get_clockgating_state - get the CG state
1415  *
1416  * @adev: amdgpu_device pointer
1417  * @flags: clockgating feature flags
1418  *
1419  * Walks the list of IPs on the device and updates the clockgating
1420  * flags for each IP.
1421  * Updates @flags with the feature flags for each hardware IP where
1422  * clockgating is enabled.
1423  */
1424 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1425                                             u32 *flags)
1426 {
1427         int i;
1428
1429         for (i = 0; i < adev->num_ip_blocks; i++) {
1430                 if (!adev->ip_blocks[i].status.valid)
1431                         continue;
1432                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1433                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1434         }
1435 }
1436
1437 /**
1438  * amdgpu_device_ip_wait_for_idle - wait for idle
1439  *
1440  * @adev: amdgpu_device pointer
1441  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1442  *
1443  * Waits for the requested hardware IP to be idle.
1444  * Returns 0 for success or a negative error code on failure.
1445  */
1446 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1447                                    enum amd_ip_block_type block_type)
1448 {
1449         int i, r;
1450
1451         for (i = 0; i < adev->num_ip_blocks; i++) {
1452                 if (!adev->ip_blocks[i].status.valid)
1453                         continue;
1454                 if (adev->ip_blocks[i].version->type == block_type) {
1455                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1456                         if (r)
1457                                 return r;
1458                         break;
1459                 }
1460         }
1461         return 0;
1462
1463 }
1464
1465 /**
1466  * amdgpu_device_ip_is_idle - is the hardware IP idle
1467  *
1468  * @adev: amdgpu_device pointer
1469  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1470  *
1471  * Check if the hardware IP is idle or not.
1472  * Returns true if the IP is idle, false if not.
1473  */
1474 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1475                               enum amd_ip_block_type block_type)
1476 {
1477         int i;
1478
1479         for (i = 0; i < adev->num_ip_blocks; i++) {
1480                 if (!adev->ip_blocks[i].status.valid)
1481                         continue;
1482                 if (adev->ip_blocks[i].version->type == block_type)
1483                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1484         }
1485         return true;
1486
1487 }
1488
1489 /**
1490  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1491  *
1492  * @adev: amdgpu_device pointer
1493  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1494  *
1495  * Returns a pointer to the hardware IP block structure
1496  * if it exists for the asic, otherwise NULL.
1497  */
1498 struct amdgpu_ip_block *
1499 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1500                               enum amd_ip_block_type type)
1501 {
1502         int i;
1503
1504         for (i = 0; i < adev->num_ip_blocks; i++)
1505                 if (adev->ip_blocks[i].version->type == type)
1506                         return &adev->ip_blocks[i];
1507
1508         return NULL;
1509 }
1510
1511 /**
1512  * amdgpu_device_ip_block_version_cmp
1513  *
1514  * @adev: amdgpu_device pointer
1515  * @type: enum amd_ip_block_type
1516  * @major: major version
1517  * @minor: minor version
1518  *
1519  * return 0 if equal or greater
1520  * return 1 if smaller or the ip_block doesn't exist
1521  */
1522 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1523                                        enum amd_ip_block_type type,
1524                                        u32 major, u32 minor)
1525 {
1526         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1527
1528         if (ip_block && ((ip_block->version->major > major) ||
1529                         ((ip_block->version->major == major) &&
1530                         (ip_block->version->minor >= minor))))
1531                 return 0;
1532
1533         return 1;
1534 }
1535
1536 /**
1537  * amdgpu_device_ip_block_add
1538  *
1539  * @adev: amdgpu_device pointer
1540  * @ip_block_version: pointer to the IP to add
1541  *
1542  * Adds the IP block driver information to the collection of IPs
1543  * on the asic.
1544  */
1545 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1546                                const struct amdgpu_ip_block_version *ip_block_version)
1547 {
1548         if (!ip_block_version)
1549                 return -EINVAL;
1550
1551         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1552                   ip_block_version->funcs->name);
1553
1554         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1555
1556         return 0;
1557 }
1558
1559 /**
1560  * amdgpu_device_enable_virtual_display - enable virtual display feature
1561  *
1562  * @adev: amdgpu_device pointer
1563  *
1564  * Enables the virtual display feature if the user has enabled it via
1565  * the module parameter virtual_display.  This feature provides a virtual
1566  * display hardware on headless boards or in virtualized environments.
1567  * This function parses and validates the configuration string specified by
1568  * the user and configures the virtual display configuration (number of
1569  * virtual connectors, crtcs, etc.) specified.
1570  */
1571 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1572 {
1573         adev->enable_virtual_display = false;
1574
1575         if (amdgpu_virtual_display) {
1576                 struct drm_device *ddev = adev_to_drm(adev);
1577                 const char *pci_address_name = pci_name(ddev->pdev);
1578                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1579
1580                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1581                 pciaddstr_tmp = pciaddstr;
1582                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1583                         pciaddname = strsep(&pciaddname_tmp, ",");
1584                         if (!strcmp("all", pciaddname)
1585                             || !strcmp(pci_address_name, pciaddname)) {
1586                                 long num_crtc;
1587                                 int res = -1;
1588
1589                                 adev->enable_virtual_display = true;
1590
1591                                 if (pciaddname_tmp)
1592                                         res = kstrtol(pciaddname_tmp, 10,
1593                                                       &num_crtc);
1594
1595                                 if (!res) {
1596                                         if (num_crtc < 1)
1597                                                 num_crtc = 1;
1598                                         if (num_crtc > 6)
1599                                                 num_crtc = 6;
1600                                         adev->mode_info.num_crtc = num_crtc;
1601                                 } else {
1602                                         adev->mode_info.num_crtc = 1;
1603                                 }
1604                                 break;
1605                         }
1606                 }
1607
1608                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1609                          amdgpu_virtual_display, pci_address_name,
1610                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1611
1612                 kfree(pciaddstr);
1613         }
1614 }
1615
1616 /**
1617  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1618  *
1619  * @adev: amdgpu_device pointer
1620  *
1621  * Parses the asic configuration parameters specified in the gpu info
1622  * firmware and makes them available to the driver for use in configuring
1623  * the asic.
1624  * Returns 0 on success, negative error code on failure.
1625  */
1626 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1627 {
1628         const char *chip_name;
1629         char fw_name[40];
1630         int err;
1631         const struct gpu_info_firmware_header_v1_0 *hdr;
1632
1633         adev->firmware.gpu_info_fw = NULL;
1634
1635         if (adev->mman.discovery_bin) {
1636                 amdgpu_discovery_get_gfx_info(adev);
1637
1638                 /*
1639                  * FIXME: The bounding box is still needed by Navi12, so
1640                  * temporarily read it from gpu_info firmware. Should be dropped
1641                  * when DAL no longer needs it.
1642                  */
1643                 if (adev->asic_type != CHIP_NAVI12)
1644                         return 0;
1645         }
1646
1647         switch (adev->asic_type) {
1648 #ifdef CONFIG_DRM_AMDGPU_SI
1649         case CHIP_VERDE:
1650         case CHIP_TAHITI:
1651         case CHIP_PITCAIRN:
1652         case CHIP_OLAND:
1653         case CHIP_HAINAN:
1654 #endif
1655 #ifdef CONFIG_DRM_AMDGPU_CIK
1656         case CHIP_BONAIRE:
1657         case CHIP_HAWAII:
1658         case CHIP_KAVERI:
1659         case CHIP_KABINI:
1660         case CHIP_MULLINS:
1661 #endif
1662         case CHIP_TOPAZ:
1663         case CHIP_TONGA:
1664         case CHIP_FIJI:
1665         case CHIP_POLARIS10:
1666         case CHIP_POLARIS11:
1667         case CHIP_POLARIS12:
1668         case CHIP_VEGAM:
1669         case CHIP_CARRIZO:
1670         case CHIP_STONEY:
1671         case CHIP_VEGA20:
1672         default:
1673                 return 0;
1674         case CHIP_VEGA10:
1675                 chip_name = "vega10";
1676                 break;
1677         case CHIP_VEGA12:
1678                 chip_name = "vega12";
1679                 break;
1680         case CHIP_RAVEN:
1681                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1682                         chip_name = "raven2";
1683                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1684                         chip_name = "picasso";
1685                 else
1686                         chip_name = "raven";
1687                 break;
1688         case CHIP_ARCTURUS:
1689                 chip_name = "arcturus";
1690                 break;
1691         case CHIP_RENOIR:
1692                 chip_name = "renoir";
1693                 break;
1694         case CHIP_NAVI10:
1695                 chip_name = "navi10";
1696                 break;
1697         case CHIP_NAVI14:
1698                 chip_name = "navi14";
1699                 break;
1700         case CHIP_NAVI12:
1701                 chip_name = "navi12";
1702                 break;
1703         case CHIP_SIENNA_CICHLID:
1704                 chip_name = "sienna_cichlid";
1705                 break;
1706         case CHIP_NAVY_FLOUNDER:
1707                 chip_name = "navy_flounder";
1708                 break;
1709         }
1710
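        /* Build the gpu_info firmware file name, e.g. "amdgpu/navi10_gpu_info.bin". */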
1711         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1712         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1713         if (err) {
1714                 dev_err(adev->dev,
1715                         "Failed to load gpu_info firmware \"%s\"\n",
1716                         fw_name);
1717                 goto out;
1718         }
1719         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1720         if (err) {
1721                 dev_err(adev->dev,
1722                         "Failed to validate gpu_info firmware \"%s\"\n",
1723                         fw_name);
1724                 goto out;
1725         }
1726
1727         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1728         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1729
1730         switch (hdr->version_major) {
1731         case 1:
1732         {
1733                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1734                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1735                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1736
1737                 /*
1738                  * Should be dropped when DAL no longer needs it.
1739                  */
1740                 if (adev->asic_type == CHIP_NAVI12)
1741                         goto parse_soc_bounding_box;
1742
1743                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1744                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1745                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1746                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1747                 adev->gfx.config.max_texture_channel_caches =
1748                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1749                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1750                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1751                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1752                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1753                 adev->gfx.config.double_offchip_lds_buf =
1754                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1755                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1756                 adev->gfx.cu_info.max_waves_per_simd =
1757                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1758                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1759                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1760                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1761                 if (hdr->version_minor >= 1) {
1762                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1763                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1764                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1765                         adev->gfx.config.num_sc_per_sh =
1766                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1767                         adev->gfx.config.num_packer_per_sc =
1768                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1769                 }
1770
1771 parse_soc_bounding_box:
1772                 /*
1773                  * SOC bounding box info is not integrated into the discovery table,
1774                  * so when it is needed it must always be parsed from the gpu info firmware.
1775                  */
1776                 if (hdr->version_minor == 2) {
1777                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1778                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1779                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1780                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1781                 }
1782                 break;
1783         }
1784         default:
1785                 dev_err(adev->dev,
1786                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1787                 err = -EINVAL;
1788                 goto out;
1789         }
1790 out:
1791         return err;
1792 }
1793
1794 /**
1795  * amdgpu_device_ip_early_init - run early init for hardware IPs
1796  *
1797  * @adev: amdgpu_device pointer
1798  *
1799  * Early initialization pass for hardware IPs.  The hardware IPs that make
1800  * up each asic are discovered and each IP's early_init callback is run.  This
1801  * is the first stage in initializing the asic.
1802  * Returns 0 on success, negative error code on failure.
1803  */
1804 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1805 {
1806         int i, r;
1807
1808         amdgpu_device_enable_virtual_display(adev);
1809
1810         if (amdgpu_sriov_vf(adev)) {
1811                 r = amdgpu_virt_request_full_gpu(adev, true);
1812                 if (r)
1813                         return r;
1814         }
1815
1816         switch (adev->asic_type) {
1817 #ifdef CONFIG_DRM_AMDGPU_SI
1818         case CHIP_VERDE:
1819         case CHIP_TAHITI:
1820         case CHIP_PITCAIRN:
1821         case CHIP_OLAND:
1822         case CHIP_HAINAN:
1823                 adev->family = AMDGPU_FAMILY_SI;
1824                 r = si_set_ip_blocks(adev);
1825                 if (r)
1826                         return r;
1827                 break;
1828 #endif
1829 #ifdef CONFIG_DRM_AMDGPU_CIK
1830         case CHIP_BONAIRE:
1831         case CHIP_HAWAII:
1832         case CHIP_KAVERI:
1833         case CHIP_KABINI:
1834         case CHIP_MULLINS:
1835                 if (adev->flags & AMD_IS_APU)
1836                         adev->family = AMDGPU_FAMILY_KV;
1837                 else
1838                         adev->family = AMDGPU_FAMILY_CI;
1839
1840                 r = cik_set_ip_blocks(adev);
1841                 if (r)
1842                         return r;
1843                 break;
1844 #endif
1845         case CHIP_TOPAZ:
1846         case CHIP_TONGA:
1847         case CHIP_FIJI:
1848         case CHIP_POLARIS10:
1849         case CHIP_POLARIS11:
1850         case CHIP_POLARIS12:
1851         case CHIP_VEGAM:
1852         case CHIP_CARRIZO:
1853         case CHIP_STONEY:
1854                 if (adev->flags & AMD_IS_APU)
1855                         adev->family = AMDGPU_FAMILY_CZ;
1856                 else
1857                         adev->family = AMDGPU_FAMILY_VI;
1858
1859                 r = vi_set_ip_blocks(adev);
1860                 if (r)
1861                         return r;
1862                 break;
1863         case CHIP_VEGA10:
1864         case CHIP_VEGA12:
1865         case CHIP_VEGA20:
1866         case CHIP_RAVEN:
1867         case CHIP_ARCTURUS:
1868         case CHIP_RENOIR:
1869                 if (adev->flags & AMD_IS_APU)
1870                         adev->family = AMDGPU_FAMILY_RV;
1871                 else
1872                         adev->family = AMDGPU_FAMILY_AI;
1873
1874                 r = soc15_set_ip_blocks(adev);
1875                 if (r)
1876                         return r;
1877                 break;
1878         case  CHIP_NAVI10:
1879         case  CHIP_NAVI14:
1880         case  CHIP_NAVI12:
1881         case  CHIP_SIENNA_CICHLID:
1882         case  CHIP_NAVY_FLOUNDER:
1883                 adev->family = AMDGPU_FAMILY_NV;
1884
1885                 r = nv_set_ip_blocks(adev);
1886                 if (r)
1887                         return r;
1888                 break;
1889         default:
1890                 /* FIXME: not supported yet */
1891                 return -EINVAL;
1892         }
1893
1894         amdgpu_amdkfd_device_probe(adev);
1895
1896         adev->pm.pp_feature = amdgpu_pp_feature_mask;
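        /* GFXOFF is masked out of the PP feature flags when running as an
         * SR-IOV VF or with the no-HWS KFD scheduling policy.
         */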
1897         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1898                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1899
1900         for (i = 0; i < adev->num_ip_blocks; i++) {
1901                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1902                         DRM_ERROR("disabled ip block: %d <%s>\n",
1903                                   i, adev->ip_blocks[i].version->funcs->name);
1904                         adev->ip_blocks[i].status.valid = false;
1905                 } else {
1906                         if (adev->ip_blocks[i].version->funcs->early_init) {
1907                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1908                                 if (r == -ENOENT) {
1909                                         adev->ip_blocks[i].status.valid = false;
1910                                 } else if (r) {
1911                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1912                                                   adev->ip_blocks[i].version->funcs->name, r);
1913                                         return r;
1914                                 } else {
1915                                         adev->ip_blocks[i].status.valid = true;
1916                                 }
1917                         } else {
1918                                 adev->ip_blocks[i].status.valid = true;
1919                         }
1920                 }
1921                 /* get the vbios after the asic_funcs are set up */
1922                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1923                         r = amdgpu_device_parse_gpu_info_fw(adev);
1924                         if (r)
1925                                 return r;
1926
1927                         /* Read BIOS */
1928                         if (!amdgpu_get_bios(adev))
1929                                 return -EINVAL;
1930
1931                         r = amdgpu_atombios_init(adev);
1932                         if (r) {
1933                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1934                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1935                                 return r;
1936                         }
1937                 }
1938         }
1939
1940         adev->cg_flags &= amdgpu_cg_mask;
1941         adev->pg_flags &= amdgpu_pg_mask;
1942
1943         return 0;
1944 }
1945
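/*
 * First hardware init pass: bring up the COMMON and IH blocks (and PSP when
 * running as an SR-IOV VF) ahead of firmware loading; the remaining blocks
 * are initialized in phase 2.
 */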
1946 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1947 {
1948         int i, r;
1949
1950         for (i = 0; i < adev->num_ip_blocks; i++) {
1951                 if (!adev->ip_blocks[i].status.sw)
1952                         continue;
1953                 if (adev->ip_blocks[i].status.hw)
1954                         continue;
1955                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1956                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1957                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1958                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1959                         if (r) {
1960                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1961                                           adev->ip_blocks[i].version->funcs->name, r);
1962                                 return r;
1963                         }
1964                         adev->ip_blocks[i].status.hw = true;
1965                 }
1966         }
1967
1968         return 0;
1969 }
1970
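/*
 * Second hardware init pass: bring up every remaining software-initialized
 * block that was not already handled in phase 1 or by firmware loading.
 */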
1971 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1972 {
1973         int i, r;
1974
1975         for (i = 0; i < adev->num_ip_blocks; i++) {
1976                 if (!adev->ip_blocks[i].status.sw)
1977                         continue;
1978                 if (adev->ip_blocks[i].status.hw)
1979                         continue;
1980                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1981                 if (r) {
1982                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1983                                   adev->ip_blocks[i].version->funcs->name, r);
1984                         return r;
1985                 }
1986                 adev->ip_blocks[i].status.hw = true;
1987         }
1988
1989         return 0;
1990 }
1991
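/*
 * Load the microcode: on CHIP_VEGA10 and later this is driven through the PSP
 * block (resumed on reset/suspend paths, hw_init otherwise), after which the
 * SMU firmware is loaded for bare-metal devices (and Tonga under SR-IOV).
 */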
1992 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1993 {
1994         int r = 0;
1995         int i;
1996         uint32_t smu_version;
1997
1998         if (adev->asic_type >= CHIP_VEGA10) {
1999                 for (i = 0; i < adev->num_ip_blocks; i++) {
2000                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2001                                 continue;
2002
2003                         /* no need to do the fw loading again if already done */
2004                         if (adev->ip_blocks[i].status.hw == true)
2005                                 break;
2006
2007                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2008                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2009                                 if (r) {
2010                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2011                                                           adev->ip_blocks[i].version->funcs->name, r);
2012                                         return r;
2013                                 }
2014                         } else {
2015                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2016                                 if (r) {
2017                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2018                                                           adev->ip_blocks[i].version->funcs->name, r);
2019                                         return r;
2020                                 }
2021                         }
2022
2023                         adev->ip_blocks[i].status.hw = true;
2024                         break;
2025                 }
2026         }
2027
2028         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2029                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2030
2031         return r;
2032 }
2033
2034 /**
2035  * amdgpu_device_ip_init - run init for hardware IPs
2036  *
2037  * @adev: amdgpu_device pointer
2038  *
2039  * Main initialization pass for hardware IPs.  The list of all the hardware
2040  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2041  * are run.  sw_init initializes the software state associated with each IP
2042  * and hw_init initializes the hardware associated with each IP.
2043  * Returns 0 on success, negative error code on failure.
2044  */
2045 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2046 {
2047         int i, r;
2048
2049         r = amdgpu_ras_init(adev);
2050         if (r)
2051                 return r;
2052
2053         for (i = 0; i < adev->num_ip_blocks; i++) {
2054                 if (!adev->ip_blocks[i].status.valid)
2055                         continue;
2056                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2057                 if (r) {
2058                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2059                                   adev->ip_blocks[i].version->funcs->name, r);
2060                         goto init_failed;
2061                 }
2062                 adev->ip_blocks[i].status.sw = true;
2063
2064                 /* need to do gmc hw init early so we can allocate gpu mem */
2065                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2066                         r = amdgpu_device_vram_scratch_init(adev);
2067                         if (r) {
2068                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2069                                 goto init_failed;
2070                         }
2071                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2072                         if (r) {
2073                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2074                                 goto init_failed;
2075                         }
2076                         r = amdgpu_device_wb_init(adev);
2077                         if (r) {
2078                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2079                                 goto init_failed;
2080                         }
2081                         adev->ip_blocks[i].status.hw = true;
2082
2083                         /* right after GMC hw init, we create CSA */
2084                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2085                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2086                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2087                                                                 AMDGPU_CSA_SIZE);
2088                                 if (r) {
2089                                         DRM_ERROR("allocate CSA failed %d\n", r);
2090                                         goto init_failed;
2091                                 }
2092                         }
2093                 }
2094         }
2095
2096         if (amdgpu_sriov_vf(adev))
2097                 amdgpu_virt_init_data_exchange(adev);
2098
2099         r = amdgpu_ib_pool_init(adev);
2100         if (r) {
2101                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2102                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2103                 goto init_failed;
2104         }
2105
2106         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2107         if (r)
2108                 goto init_failed;
2109
2110         r = amdgpu_device_ip_hw_init_phase1(adev);
2111         if (r)
2112                 goto init_failed;
2113
2114         r = amdgpu_device_fw_loading(adev);
2115         if (r)
2116                 goto init_failed;
2117
2118         r = amdgpu_device_ip_hw_init_phase2(adev);
2119         if (r)
2120                 goto init_failed;
2121
2122         /*
2123          * Retired pages will be loaded from EEPROM and reserved here.
2124          * This must be called after amdgpu_device_ip_hw_init_phase2, since
2125          * on some ASICs the RAS EEPROM code relies on the SMU being fully
2126          * functional for I2C communication, which is only true at this point.
2127          *
2128          * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2129          * about failures caused by a bad GPU state and stops the amdgpu init
2130          * process accordingly.  For other failures it still releases all the
2131          * resources and prints an error message rather than returning a
2132          * negative value to the upper level.
2133          *
2134          * Note: in theory this should be called before any VRAM allocation
2135          * to protect retired pages from being reused.
2136          */
2137         r = amdgpu_ras_recovery_init(adev);
2138         if (r)
2139                 goto init_failed;
2140
2141         if (adev->gmc.xgmi.num_physical_nodes > 1)
2142                 amdgpu_xgmi_add_device(adev);
2143         amdgpu_amdkfd_device_init(adev);
2144
2145         amdgpu_fru_get_product_info(adev);
2146
2147 init_failed:
2148         if (amdgpu_sriov_vf(adev))
2149                 amdgpu_virt_release_full_gpu(adev, true);
2150
2151         return r;
2152 }
2153
2154 /**
2155  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2156  *
2157  * @adev: amdgpu_device pointer
2158  *
2159  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2160  * this function before a GPU reset.  If the value is retained after a
2161  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2162  */
2163 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2164 {
2165         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2166 }
2167
2168 /**
2169  * amdgpu_device_check_vram_lost - check if vram is valid
2170  *
2171  * @adev: amdgpu_device pointer
2172  *
2173  * Checks the reset magic value written to the gart pointer in VRAM.
2174  * The driver calls this after a GPU reset to see if the contents of
2175  * VRAM have been lost or not.
2176  * Returns true if VRAM is lost, false if not.
2177  */
2178 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2179 {
2180         if (memcmp(adev->gart.ptr, adev->reset_magic,
2181                         AMDGPU_RESET_MAGIC_NUM))
2182                 return true;
2183
2184         if (!amdgpu_in_reset(adev))
2185                 return false;
2186
2187         /*
2188          * For all ASICs with baco/mode1 reset, the VRAM is
2189          * always assumed to be lost.
2190          */
2191         switch (amdgpu_asic_reset_method(adev)) {
2192         case AMD_RESET_METHOD_BACO:
2193         case AMD_RESET_METHOD_MODE1:
2194                 return true;
2195         default:
2196                 return false;
2197         }
2198 }
2199
2200 /**
2201  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2202  *
2203  * @adev: amdgpu_device pointer
2204  * @state: clockgating state (gate or ungate)
2205  *
2206  * The list of all the hardware IPs that make up the asic is walked and the
2207  * set_clockgating_state callbacks are run.
2208  * During late init this pass enables clockgating for the hardware IPs;
2209  * during fini or suspend it disables clockgating for them.
2210  * Returns 0 on success, negative error code on failure.
2211  */
2212
2213 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2214                                                 enum amd_clockgating_state state)
2215 {
2216         int i, j, r;
2217
2218         if (amdgpu_emu_mode == 1)
2219                 return 0;
2220
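        /* Walk the IP list forward when gating and in reverse when ungating. */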
2221         for (j = 0; j < adev->num_ip_blocks; j++) {
2222                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2223                 if (!adev->ip_blocks[i].status.late_initialized)
2224                         continue;
2225                 /* skip CG for VCE/UVD, it's handled specially */
2226                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2227                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2228                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2229                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2230                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2231                         /* enable clockgating to save power */
2232                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2233                                                                                      state);
2234                         if (r) {
2235                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2236                                           adev->ip_blocks[i].version->funcs->name, r);
2237                                 return r;
2238                         }
2239                 }
2240         }
2241
2242         return 0;
2243 }
2244
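/*
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * Same walk as amdgpu_device_set_cg_state(), but running the
 * set_powergating_state callbacks instead.
 */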
2245 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2246 {
2247         int i, j, r;
2248
2249         if (amdgpu_emu_mode == 1)
2250                 return 0;
2251
2252         for (j = 0; j < adev->num_ip_blocks; j++) {
2253                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2254                 if (!adev->ip_blocks[i].status.late_initialized)
2255                         continue;
2256                 /* skip PG for VCE/UVD, it's handled specially */
2257                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2258                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2259                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2260                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2261                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2262                         /* enable powergating to save power */
2263                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2264                                                                                         state);
2265                         if (r) {
2266                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2267                                           adev->ip_blocks[i].version->funcs->name, r);
2268                                 return r;
2269                         }
2270                 }
2271         }
2272         return 0;
2273 }
2274
2275 static int amdgpu_device_enable_mgpu_fan_boost(void)
2276 {
2277         struct amdgpu_gpu_instance *gpu_ins;
2278         struct amdgpu_device *adev;
2279         int i, ret = 0;
2280
2281         mutex_lock(&mgpu_info.mutex);
2282
2283         /*
2284          * MGPU fan boost feature should be enabled
2285          * only when there are two or more dGPUs in
2286          * the system
2287          */
2288         if (mgpu_info.num_dgpu < 2)
2289                 goto out;
2290
2291         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2292                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2293                 adev = gpu_ins->adev;
2294                 if (!(adev->flags & AMD_IS_APU) &&
2295                     !gpu_ins->mgpu_fan_enabled) {
2296                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2297                         if (ret)
2298                                 break;
2299
2300                         gpu_ins->mgpu_fan_enabled = 1;
2301                 }
2302         }
2303
2304 out:
2305         mutex_unlock(&mgpu_info.mutex);
2306
2307         return ret;
2308 }
2309
2310 /**
2311  * amdgpu_device_ip_late_init - run late init for hardware IPs
2312  *
2313  * @adev: amdgpu_device pointer
2314  *
2315  * Late initialization pass for hardware IPs.  The list of all the hardware
2316  * IPs that make up the asic is walked and the late_init callbacks are run.
2317  * late_init covers any special initialization that an IP requires
2318  * after all of them have been initialized or something that needs to happen
2319  * late in the init process.
2320  * Returns 0 on success, negative error code on failure.
2321  */
2322 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2323 {
2324         struct amdgpu_gpu_instance *gpu_instance;
2325         int i = 0, r;
2326
2327         for (i = 0; i < adev->num_ip_blocks; i++) {
2328                 if (!adev->ip_blocks[i].status.hw)
2329                         continue;
2330                 if (adev->ip_blocks[i].version->funcs->late_init) {
2331                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2332                         if (r) {
2333                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2334                                           adev->ip_blocks[i].version->funcs->name, r);
2335                                 return r;
2336                         }
2337                 }
2338                 adev->ip_blocks[i].status.late_initialized = true;
2339         }
2340
2341         amdgpu_ras_set_error_query_ready(adev, true);
2342
2343         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2344         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2345
2346         amdgpu_device_fill_reset_magic(adev);
2347
2348         r = amdgpu_device_enable_mgpu_fan_boost();
2349         if (r)
2350                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2351
2352
2353         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2354                 mutex_lock(&mgpu_info.mutex);
2355
2356                 /*
2357                  * Reset the device p-state to low, as it was booted with it high.
2358                  *
2359                  * This should be performed only after all devices from the same
2360                  * hive have been initialized.
2361                  *
2362                  * However, the number of devices in the hive is not known in
2363                  * advance; it is counted one by one as the devices initialize.
2364                  *
2365                  * So we wait until all XGMI-interlinked devices have initialized.
2366                  * This may introduce some delay, as those devices may come from
2367                  * different hives, but that should be OK.
2368                  */
2369                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2370                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2371                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2372                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2373                                         continue;
2374
2375                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2376                                                 AMDGPU_XGMI_PSTATE_MIN);
2377                                 if (r) {
2378                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2379                                         break;
2380                                 }
2381                         }
2382                 }
2383
2384                 mutex_unlock(&mgpu_info.mutex);
2385         }
2386
2387         return 0;
2388 }
2389
2390 /**
2391  * amdgpu_device_ip_fini - run fini for hardware IPs
2392  *
2393  * @adev: amdgpu_device pointer
2394  *
2395  * Main teardown pass for hardware IPs.  The list of all the hardware
2396  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2397  * are run.  hw_fini tears down the hardware associated with each IP
2398  * and sw_fini tears down any software state associated with each IP.
2399  * Returns 0 on success, negative error code on failure.
2400  */
2401 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2402 {
2403         int i, r;
2404
2405         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2406                 amdgpu_virt_release_ras_err_handler_data(adev);
2407
2408         amdgpu_ras_pre_fini(adev);
2409
2410         if (adev->gmc.xgmi.num_physical_nodes > 1)
2411                 amdgpu_xgmi_remove_device(adev);
2412
2413         amdgpu_amdkfd_device_fini(adev);
2414
2415         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2416         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2417
2418         /* need to disable SMC first */
2419         for (i = 0; i < adev->num_ip_blocks; i++) {
2420                 if (!adev->ip_blocks[i].status.hw)
2421                         continue;
2422                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2423                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2424                         /* XXX handle errors */
2425                         if (r) {
2426                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2427                                           adev->ip_blocks[i].version->funcs->name, r);
2428                         }
2429                         adev->ip_blocks[i].status.hw = false;
2430                         break;
2431                 }
2432         }
2433
2434         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2435                 if (!adev->ip_blocks[i].status.hw)
2436                         continue;
2437
2438                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2439                 /* XXX handle errors */
2440                 if (r) {
2441                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2442                                   adev->ip_blocks[i].version->funcs->name, r);
2443                 }
2444
2445                 adev->ip_blocks[i].status.hw = false;
2446         }
2447
2448
2449         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2450                 if (!adev->ip_blocks[i].status.sw)
2451                         continue;
2452
2453                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2454                         amdgpu_ucode_free_bo(adev);
2455                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2456                         amdgpu_device_wb_fini(adev);
2457                         amdgpu_device_vram_scratch_fini(adev);
2458                         amdgpu_ib_pool_fini(adev);
2459                 }
2460
2461                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2462                 /* XXX handle errors */
2463                 if (r) {
2464                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2465                                   adev->ip_blocks[i].version->funcs->name, r);
2466                 }
2467                 adev->ip_blocks[i].status.sw = false;
2468                 adev->ip_blocks[i].status.valid = false;
2469         }
2470
2471         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2472                 if (!adev->ip_blocks[i].status.late_initialized)
2473                         continue;
2474                 if (adev->ip_blocks[i].version->funcs->late_fini)
2475                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2476                 adev->ip_blocks[i].status.late_initialized = false;
2477         }
2478
2479         amdgpu_ras_fini(adev);
2480
2481         if (amdgpu_sriov_vf(adev))
2482                 if (amdgpu_virt_release_full_gpu(adev, false))
2483                         DRM_ERROR("failed to release exclusive mode on fini\n");
2484
2485         return 0;
2486 }
2487
2488 /**
2489  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2490  *
2491  * @work: work_struct.
2492  */
2493 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2494 {
2495         struct amdgpu_device *adev =
2496                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2497         int r;
2498
2499         r = amdgpu_ib_ring_tests(adev);
2500         if (r)
2501                 DRM_ERROR("ib ring test failed (%d).\n", r);
2502 }
2503
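/*
 * Delayed work handler that re-enables GFXOFF once no caller is holding a
 * gfx_off request and it has not already been enabled.
 */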
2504 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2505 {
2506         struct amdgpu_device *adev =
2507                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2508
2509         mutex_lock(&adev->gfx.gfx_off_mutex);
2510         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2511                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2512                         adev->gfx.gfx_off_state = true;
2513         }
2514         mutex_unlock(&adev->gfx.gfx_off_mutex);
2515 }
2516
2517 /**
2518  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2519  *
2520  * @adev: amdgpu_device pointer
2521  *
2522  * First suspend pass for hardware IPs.  Clockgating and powergating are
2523  * disabled and the suspend callbacks are run for the display (DCE) blocks
2524  * only, putting their hardware and software state into a state suitable
2525  * for suspend.  The remaining blocks are handled in phase 2.
2526  * Returns 0 on success, negative error code on failure.
2527  */
2528 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2529 {
2530         int i, r;
2531
2532         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2533         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2534
2535         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2536                 if (!adev->ip_blocks[i].status.valid)
2537                         continue;
2538
2539                 /* displays are handled separately */
2540                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2541                         continue;
2542
2543                 /* XXX handle errors */
2544                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2545                 /* XXX handle errors */
2546                 if (r) {
2547                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2548                                   adev->ip_blocks[i].version->funcs->name, r);
2549                         return r;
2550                 }
2551
2552                 adev->ip_blocks[i].status.hw = false;
2553         }
2554
2555         return 0;
2556 }
2557
2558 /**
2559  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2560  *
2561  * @adev: amdgpu_device pointer
2562  *
2563  * Second suspend pass for hardware IPs.  The list of all the hardware
2564  * IPs that make up the asic is walked and the suspend callbacks are run
2565  * for every block except the displays (handled in phase 1), putting the
2566  * hardware and software state of each IP into a state suitable for suspend.
2567  * Returns 0 on success, negative error code on failure.
2568  */
2569 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2570 {
2571         int i, r;
2572
2573         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2574                 if (!adev->ip_blocks[i].status.valid)
2575                         continue;
2576                 /* displays are handled in phase1 */
2577                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2578                         continue;
2579                 /* PSP lost connection when err_event_athub occurs */
2580                 if (amdgpu_ras_intr_triggered() &&
2581                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2582                         adev->ip_blocks[i].status.hw = false;
2583                         continue;
2584                 }
2585                 /* XXX handle errors */
2586                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2587                 /* XXX handle errors */
2588                 if (r) {
2589                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2590                                   adev->ip_blocks[i].version->funcs->name, r);
2591                 }
2592                 adev->ip_blocks[i].status.hw = false;
2593                 /* handle putting the SMC in the appropriate state */
2594                 if (!amdgpu_sriov_vf(adev)) {
2595                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2596                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2597                                 if (r) {
2598                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2599                                                         adev->mp1_state, r);
2600                                         return r;
2601                                 }
2602                         }
2603                 }
2604                 adev->ip_blocks[i].status.hw = false;
2605         }
2606
2607         return 0;
2608 }
2609
2610 /**
2611  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2612  *
2613  * @adev: amdgpu_device pointer
2614  *
2615  * Main suspend function for hardware IPs.  The list of all the hardware
2616  * IPs that make up the asic is walked, clockgating is disabled and the
2617  * suspend callbacks are run.  suspend puts the hardware and software state
2618  * in each IP into a state suitable for suspend.
2619  * Returns 0 on success, negative error code on failure.
2620  */
2621 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2622 {
2623         int r;
2624
2625         if (amdgpu_sriov_vf(adev))
2626                 amdgpu_virt_request_full_gpu(adev, false);
2627
2628         r = amdgpu_device_ip_suspend_phase1(adev);
2629         if (r)
2630                 return r;
2631         r = amdgpu_device_ip_suspend_phase2(adev);
2632
2633         if (amdgpu_sriov_vf(adev))
2634                 amdgpu_virt_release_full_gpu(adev, false);
2635
2636         return r;
2637 }
2638
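/*
 * Re-initialize the basic hardware blocks (GMC, COMMON, PSP, IH) in a fixed
 * order when re-initializing an SR-IOV VF.
 */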
2639 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2640 {
2641         int i, r;
2642
2643         static enum amd_ip_block_type ip_order[] = {
2644                 AMD_IP_BLOCK_TYPE_GMC,
2645                 AMD_IP_BLOCK_TYPE_COMMON,
2646                 AMD_IP_BLOCK_TYPE_PSP,
2647                 AMD_IP_BLOCK_TYPE_IH,
2648         };
2649
2650         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2651                 int j;
2652                 struct amdgpu_ip_block *block;
2653
2654                 block = &adev->ip_blocks[i];
2655                 block->status.hw = false;
2656
2657                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2658
2659                         if (block->version->type != ip_order[j] ||
2660                                 !block->status.valid)
2661                                 continue;
2662
2663                         r = block->version->funcs->hw_init(adev);
2664                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2665                         if (r)
2666                                 return r;
2667                         block->status.hw = true;
2668                 }
2669         }
2670
2671         return 0;
2672 }
2673
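/*
 * Re-initialize the remaining hardware blocks of an SR-IOV VF in a fixed
 * order; the SMC block is resumed rather than re-initialized from scratch.
 */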
2674 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2675 {
2676         int i, r;
2677
2678         static enum amd_ip_block_type ip_order[] = {
2679                 AMD_IP_BLOCK_TYPE_SMC,
2680                 AMD_IP_BLOCK_TYPE_DCE,
2681                 AMD_IP_BLOCK_TYPE_GFX,
2682                 AMD_IP_BLOCK_TYPE_SDMA,
2683                 AMD_IP_BLOCK_TYPE_UVD,
2684                 AMD_IP_BLOCK_TYPE_VCE,
2685                 AMD_IP_BLOCK_TYPE_VCN
2686         };
2687
2688         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2689                 int j;
2690                 struct amdgpu_ip_block *block;
2691
2692                 for (j = 0; j < adev->num_ip_blocks; j++) {
2693                         block = &adev->ip_blocks[j];
2694
2695                         if (block->version->type != ip_order[i] ||
2696                                 !block->status.valid ||
2697                                 block->status.hw)
2698                                 continue;
2699
2700                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2701                                 r = block->version->funcs->resume(adev);
2702                         else
2703                                 r = block->version->funcs->hw_init(adev);
2704
2705                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2706                         if (r)
2707                                 return r;
2708                         block->status.hw = true;
2709                 }
2710         }
2711
2712         return 0;
2713 }
2714
2715 /**
2716  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2717  *
2718  * @adev: amdgpu_device pointer
2719  *
2720  * First resume function for hardware IPs.  The list of all the hardware
2721  * IPs that make up the asic is walked and the resume callbacks are run for
2722  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2723  * after a suspend and updates the software state as necessary.  This
2724  * function is also used for restoring the GPU after a GPU reset.
2725  * Returns 0 on success, negative error code on failure.
2726  */
2727 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2728 {
2729         int i, r;
2730
2731         for (i = 0; i < adev->num_ip_blocks; i++) {
2732                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2733                         continue;
2734                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2735                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2736                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2737
2738                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2739                         if (r) {
2740                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2741                                           adev->ip_blocks[i].version->funcs->name, r);
2742                                 return r;
2743                         }
2744                         adev->ip_blocks[i].status.hw = true;
2745                 }
2746         }
2747
2748         return 0;
2749 }
2750
2751 /**
2752  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2753  *
2754  * @adev: amdgpu_device pointer
2755  *
2756  * Second resume function for hardware IPs.  The list of all the hardware
2757  * IPs that make up the asic is walked and the resume callbacks are run for
2758  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2759  * functional state after a suspend and updates the software state as
2760  * necessary.  This function is also used for restoring the GPU after a GPU
2761  * reset.
2762  * Returns 0 on success, negative error code on failure.
2763  */
2764 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2765 {
2766         int i, r;
2767
2768         for (i = 0; i < adev->num_ip_blocks; i++) {
2769                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2770                         continue;
2771                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2772                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2773                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2774                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2775                         continue;
2776                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2777                 if (r) {
2778                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2779                                   adev->ip_blocks[i].version->funcs->name, r);
2780                         return r;
2781                 }
2782                 adev->ip_blocks[i].status.hw = true;
2783         }
2784
2785         return 0;
2786 }
2787
2788 /**
2789  * amdgpu_device_ip_resume - run resume for hardware IPs
2790  *
2791  * @adev: amdgpu_device pointer
2792  *
2793  * Main resume function for hardware IPs.  The hardware IPs
2794  * are split into two resume functions because they are
2795  * also used in recovering from a GPU reset and some additional
2796  * steps need to be taken between them.  In this case (S3/S4) they are
2797  * run sequentially.
2798  * Returns 0 on success, negative error code on failure.
2799  */
2800 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2801 {
2802         int r;
2803
2804         r = amdgpu_device_ip_resume_phase1(adev);
2805         if (r)
2806                 return r;
2807
2808         r = amdgpu_device_fw_loading(adev);
2809         if (r)
2810                 return r;
2811
2812         r = amdgpu_device_ip_resume_phase2(adev);
2813
2814         return r;
2815 }
2816
2817 /**
2818  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2819  *
2820  * @adev: amdgpu_device pointer
2821  *
2822  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2823  */
2824 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2825 {
2826         if (amdgpu_sriov_vf(adev)) {
2827                 if (adev->is_atom_fw) {
2828                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2829                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2830                 } else {
2831                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2832                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2833                 }
2834
2835                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2836                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2837         }
2838 }
2839
2840 /**
2841  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2842  *
2843  * @asic_type: AMD asic type
2844  *
2845  * Check if there is DC (new modesetting infrastructure) support for an asic.
2846  * Returns true if DC has support, false if not.
2847  */
2848 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2849 {
2850         switch (asic_type) {
2851 #if defined(CONFIG_DRM_AMD_DC)
2852 #if defined(CONFIG_DRM_AMD_DC_SI)
2853         case CHIP_TAHITI:
2854         case CHIP_PITCAIRN:
2855         case CHIP_VERDE:
2856         case CHIP_OLAND:
2857 #endif
2858         case CHIP_BONAIRE:
2859         case CHIP_KAVERI:
2860         case CHIP_KABINI:
2861         case CHIP_MULLINS:
2862                 /*
2863                  * We have systems in the wild with these ASICs that require
2864                  * LVDS and VGA support which is not supported with DC.
2865                  *
2866                  * Fallback to the non-DC driver here by default so as not to
2867                  * cause regressions.
2868                  */
2869                 return amdgpu_dc > 0;
2870         case CHIP_HAWAII:
2871         case CHIP_CARRIZO:
2872         case CHIP_STONEY:
2873         case CHIP_POLARIS10:
2874         case CHIP_POLARIS11:
2875         case CHIP_POLARIS12:
2876         case CHIP_VEGAM:
2877         case CHIP_TONGA:
2878         case CHIP_FIJI:
2879         case CHIP_VEGA10:
2880         case CHIP_VEGA12:
2881         case CHIP_VEGA20:
2882 #if defined(CONFIG_DRM_AMD_DC_DCN)
2883         case CHIP_RAVEN:
2884         case CHIP_NAVI10:
2885         case CHIP_NAVI14:
2886         case CHIP_NAVI12:
2887         case CHIP_RENOIR:
2888 #endif
2889 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2890         case CHIP_SIENNA_CICHLID:
2891         case CHIP_NAVY_FLOUNDER:
2892 #endif
2893                 return amdgpu_dc != 0;
2894 #endif
2895         default:
2896                 if (amdgpu_dc > 0)
2897                         DRM_INFO("Display Core has been requested via kernel parameter "
2898                                          "but isn't supported by ASIC, ignoring\n");
2899                 return false;
2900         }
2901 }
2902
2903 /**
2904  * amdgpu_device_has_dc_support - check if dc is supported
2905  *
2906  * @adev: amdgpu_device pointer
2907  *
2908  * Returns true for supported, false for not supported
2909  */
2910 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2911 {
2912         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2913                 return false;
2914
2915         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2916 }
2917
2918
2919 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2920 {
2921         struct amdgpu_device *adev =
2922                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2923         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2924
2925         /* It's a bug to not have a hive within this function */
2926         if (WARN_ON(!hive))
2927                 return;
2928
2929         /*
2930          * Use task barrier to synchronize all xgmi reset works across the
2931          * hive. task_barrier_enter and task_barrier_exit will block
2932          * until all the threads running the xgmi reset works reach
2933          * those points. task_barrier_full will do both blocks.
2934          */
2935         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2936
2937                 task_barrier_enter(&hive->tb);
2938                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
2939
2940                 if (adev->asic_reset_res)
2941                         goto fail;
2942
2943                 task_barrier_exit(&hive->tb);
2944                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
2945
2946                 if (adev->asic_reset_res)
2947                         goto fail;
2948
2949                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2950                         adev->mmhub.funcs->reset_ras_error_count(adev);
2951         } else {
2952
2953                 task_barrier_full(&hive->tb);
2954                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2955         }
2956
2957 fail:
2958         if (adev->asic_reset_res)
2959                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2960                          adev->asic_reset_res, adev_to_drm(adev)->unique);
2961         amdgpu_put_xgmi_hive(hive);
2962 }
2963
2964 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2965 {
2966         char *input = amdgpu_lockup_timeout;
2967         char *timeout_setting = NULL;
2968         int index = 0;
2969         long timeout;
2970         int ret = 0;
2971
2972         /*
2973          * By default, the timeout for non-compute jobs is 10000 ms
2974          * and no timeout is enforced on compute jobs.
2975          * In SR-IOV or passthrough mode, the timeout for compute
2976          * jobs is 60000 ms by default.
2977          */
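        /*
         * Illustrative example (hypothetical values): a module option such as
         * amdgpu.lockup_timeout=10000,60000,10000,10000 is parsed by the loop
         * below into the gfx, compute, sdma and video timeouts respectively;
         * a single value applies to all non-compute jobs.
         */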
2978         adev->gfx_timeout = msecs_to_jiffies(10000);
2979         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2980         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2981                 adev->compute_timeout =  msecs_to_jiffies(60000);
2982         else
2983                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2984
2985         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2986                 while ((timeout_setting = strsep(&input, ",")) &&
2987                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2988                         ret = kstrtol(timeout_setting, 0, &timeout);
2989                         if (ret)
2990                                 return ret;
2991
2992                         if (timeout == 0) {
2993                                 index++;
2994                                 continue;
2995                         } else if (timeout < 0) {
2996                                 timeout = MAX_SCHEDULE_TIMEOUT;
2997                         } else {
2998                                 timeout = msecs_to_jiffies(timeout);
2999                         }
3000
3001                         switch (index++) {
3002                         case 0:
3003                                 adev->gfx_timeout = timeout;
3004                                 break;
3005                         case 1:
3006                                 adev->compute_timeout = timeout;
3007                                 break;
3008                         case 2:
3009                                 adev->sdma_timeout = timeout;
3010                                 break;
3011                         case 3:
3012                                 adev->video_timeout = timeout;
3013                                 break;
3014                         default:
3015                                 break;
3016                         }
3017                 }
3018                 /*
3019                  * There is only one value specified and
3020                  * it should apply to all non-compute jobs.
3021                  */
3022                 if (index == 1) {
3023                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3024                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3025                                 adev->compute_timeout = adev->gfx_timeout;
3026                 }
3027         }
3028
3029         return ret;
3030 }
3031
3032 static const struct attribute *amdgpu_dev_attributes[] = {
3033         &dev_attr_product_name.attr,
3034         &dev_attr_product_number.attr,
3035         &dev_attr_serial_number.attr,
3036         &dev_attr_pcie_replay_count.attr,
3037         NULL
3038 };
3039
3040
3041 /**
3042  * amdgpu_device_init - initialize the driver
3043  *
3044  * @adev: amdgpu_device pointer
3045  * @flags: driver flags
3046  *
3047  * Initializes the driver info and hw (all asics).
3048  * Returns 0 for success or an error on failure.
3049  * Called at driver startup.
3050  */
3051 int amdgpu_device_init(struct amdgpu_device *adev,
3052                        uint32_t flags)
3053 {
3054         struct drm_device *ddev = adev_to_drm(adev);
3055         struct pci_dev *pdev = adev->pdev;
3056         int r, i;
3057         bool boco = false;
3058         u32 max_MBps;
3059
3060         adev->shutdown = false;
3061         adev->flags = flags;
3062
3063         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3064                 adev->asic_type = amdgpu_force_asic_type;
3065         else
3066                 adev->asic_type = flags & AMD_ASIC_MASK;
3067
3068         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3069         if (amdgpu_emu_mode == 1)
3070                 adev->usec_timeout *= 10;
3071         adev->gmc.gart_size = 512 * 1024 * 1024;
3072         adev->accel_working = false;
3073         adev->num_rings = 0;
3074         adev->mman.buffer_funcs = NULL;
3075         adev->mman.buffer_funcs_ring = NULL;
3076         adev->vm_manager.vm_pte_funcs = NULL;
3077         adev->vm_manager.vm_pte_num_scheds = 0;
3078         adev->gmc.gmc_funcs = NULL;
3079         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3080         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3081
3082         adev->smc_rreg = &amdgpu_invalid_rreg;
3083         adev->smc_wreg = &amdgpu_invalid_wreg;
3084         adev->pcie_rreg = &amdgpu_invalid_rreg;
3085         adev->pcie_wreg = &amdgpu_invalid_wreg;
3086         adev->pciep_rreg = &amdgpu_invalid_rreg;
3087         adev->pciep_wreg = &amdgpu_invalid_wreg;
3088         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3089         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3090         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3091         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3092         adev->didt_rreg = &amdgpu_invalid_rreg;
3093         adev->didt_wreg = &amdgpu_invalid_wreg;
3094         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3095         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3096         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3097         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3098
3099         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3100                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3101                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3102
3103         /* mutex initializations are all done here so we
3104          * can recall functions without having locking issues */
3105         atomic_set(&adev->irq.ih.lock, 0);
3106         mutex_init(&adev->firmware.mutex);
3107         mutex_init(&adev->pm.mutex);
3108         mutex_init(&adev->gfx.gpu_clock_mutex);
3109         mutex_init(&adev->srbm_mutex);
3110         mutex_init(&adev->gfx.pipe_reserve_mutex);
3111         mutex_init(&adev->gfx.gfx_off_mutex);
3112         mutex_init(&adev->grbm_idx_mutex);
3113         mutex_init(&adev->mn_lock);
3114         mutex_init(&adev->virt.vf_errors.lock);
3115         hash_init(adev->mn_hash);
3116         atomic_set(&adev->in_gpu_reset, 0);
3117         init_rwsem(&adev->reset_sem);
3118         mutex_init(&adev->psp.mutex);
3119         mutex_init(&adev->notifier_lock);
3120
3121         r = amdgpu_device_check_arguments(adev);
3122         if (r)
3123                 return r;
3124
3125         spin_lock_init(&adev->mmio_idx_lock);
3126         spin_lock_init(&adev->smc_idx_lock);
3127         spin_lock_init(&adev->pcie_idx_lock);
3128         spin_lock_init(&adev->uvd_ctx_idx_lock);
3129         spin_lock_init(&adev->didt_idx_lock);
3130         spin_lock_init(&adev->gc_cac_idx_lock);
3131         spin_lock_init(&adev->se_cac_idx_lock);
3132         spin_lock_init(&adev->audio_endpt_idx_lock);
3133         spin_lock_init(&adev->mm_stats.lock);
3134
3135         INIT_LIST_HEAD(&adev->shadow_list);
3136         mutex_init(&adev->shadow_list_lock);
3137
3138         INIT_DELAYED_WORK(&adev->delayed_init_work,
3139                           amdgpu_device_delayed_init_work_handler);
3140         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3141                           amdgpu_device_delay_enable_gfx_off);
3142
3143         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3144
3145         adev->gfx.gfx_off_req_count = 1;
3146         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3147
3148         atomic_set(&adev->throttling_logging_enabled, 1);
3149         /*
3150          * If throttling continues, logging will be performed every minute
3151          * to avoid log flooding. "-1" is subtracted since the thermal
3152          * throttling interrupt comes every second. Thus, the total logging
3153          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3154          * for throttling interrupt) = 60 seconds.
3155          */
3156         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3157         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3158
3159         /* Registers mapping */
3160         /* TODO: block userspace mapping of io register */
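        /* MMIO registers are exposed through BAR 5 on CIK and newer ASICs,
         * and through BAR 2 on older (SI) parts, as selected below.
         */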
3161         if (adev->asic_type >= CHIP_BONAIRE) {
3162                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3163                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3164         } else {
3165                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3166                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3167         }
3168
3169         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3170         if (adev->rmmio == NULL) {
3171                 return -ENOMEM;
3172         }
3173         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3174         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3175
3176         /* io port mapping */
3177         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3178                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3179                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3180                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3181                         break;
3182                 }
3183         }
3184         if (adev->rio_mem == NULL)
3185                 DRM_INFO("PCI I/O BAR is not found.\n");
3186
3187         /* enable PCIE atomic ops */
3188         r = pci_enable_atomic_ops_to_root(adev->pdev,
3189                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3190                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3191         if (r) {
3192                 adev->have_atomics_support = false;
3193                 DRM_INFO("PCIE atomic ops are not supported\n");
3194         } else {
3195                 adev->have_atomics_support = true;
3196         }
3197
3198         amdgpu_device_get_pcie_info(adev);
3199
3200         if (amdgpu_mcbp)
3201                 DRM_INFO("MCBP is enabled\n");
3202
3203         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3204                 adev->enable_mes = true;
3205
3206         /* detect hw virtualization here */
3207         amdgpu_detect_virtualization(adev);
3208
3209         r = amdgpu_device_get_job_timeout_settings(adev);
3210         if (r) {
3211                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3212                 goto failed_unmap;
3213         }
3214
3215         /* early init functions */
3216         r = amdgpu_device_ip_early_init(adev);
3217         if (r)
3218                 goto failed_unmap;
3219
3220         /* doorbell bar mapping and doorbell index init */
3221         amdgpu_device_doorbell_init(adev);
3222
3223         /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3224         /* this will fail for cards that aren't VGA class devices, just
3225          * ignore it */
3226         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3227
3228         if (amdgpu_device_supports_boco(ddev))
3229                 boco = true;
3230         if (amdgpu_has_atpx() &&
3231             (amdgpu_is_atpx_hybrid() ||
3232              amdgpu_has_atpx_dgpu_power_cntl()) &&
3233             !pci_is_thunderbolt_attached(adev->pdev))
3234                 vga_switcheroo_register_client(adev->pdev,
3235                                                &amdgpu_switcheroo_ops, boco);
3236         if (boco)
3237                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3238
3239         if (amdgpu_emu_mode == 1) {
3240                 /* post the asic in emulation mode */
3241                 emu_soc_asic_init(adev);
3242                 goto fence_driver_init;
3243         }
3244
3245         /* detect whether we are running with an SR-IOV vBIOS */
3246         amdgpu_device_detect_sriov_bios(adev);
3247
3248         /* check if we need to reset the asic
3249          *  E.g., driver was not cleanly unloaded previously, etc.
3250          */
3251         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3252                 r = amdgpu_asic_reset(adev);
3253                 if (r) {
3254                         dev_err(adev->dev, "asic reset on init failed\n");
3255                         goto failed;
3256                 }
3257         }
3258
3259         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3260
3261         /* Post card if necessary */
3262         if (amdgpu_device_need_post(adev)) {
3263                 if (!adev->bios) {
3264                         dev_err(adev->dev, "no vBIOS found\n");
3265                         r = -EINVAL;
3266                         goto failed;
3267                 }
3268                 DRM_INFO("GPU posting now...\n");
3269                 r = amdgpu_device_asic_init(adev);
3270                 if (r) {
3271                         dev_err(adev->dev, "gpu post error!\n");
3272                         goto failed;
3273                 }
3274         }
3275
3276         if (adev->is_atom_fw) {
3277                 /* Initialize clocks */
3278                 r = amdgpu_atomfirmware_get_clock_info(adev);
3279                 if (r) {
3280                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3281                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3282                         goto failed;
3283                 }
3284         } else {
3285                 /* Initialize clocks */
3286                 r = amdgpu_atombios_get_clock_info(adev);
3287                 if (r) {
3288                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3289                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3290                         goto failed;
3291                 }
3292                 /* init i2c buses */
3293                 if (!amdgpu_device_has_dc_support(adev))
3294                         amdgpu_atombios_i2c_init(adev);
3295         }
3296
3297 fence_driver_init:
3298         /* Fence driver */
3299         r = amdgpu_fence_driver_init(adev);
3300         if (r) {
3301                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3302                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3303                 goto failed;
3304         }
3305
3306         /* init the mode config */
3307         drm_mode_config_init(adev_to_drm(adev));
3308
3309         r = amdgpu_device_ip_init(adev);
3310         if (r) {
3311                 /* failed in exclusive mode due to timeout */
3312                 if (amdgpu_sriov_vf(adev) &&
3313                     !amdgpu_sriov_runtime(adev) &&
3314                     amdgpu_virt_mmio_blocked(adev) &&
3315                     !amdgpu_virt_wait_reset(adev)) {
3316                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3317                         /* Don't send request since VF is inactive. */
3318                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3319                         adev->virt.ops = NULL;
3320                         r = -EAGAIN;
3321                         goto failed;
3322                 }
3323                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3324                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3325                 goto failed;
3326         }
3327
3328         dev_info(adev->dev,
3329                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3330                         adev->gfx.config.max_shader_engines,
3331                         adev->gfx.config.max_sh_per_se,
3332                         adev->gfx.config.max_cu_per_sh,
3333                         adev->gfx.cu_info.number);
3334
3335         adev->accel_working = true;
3336
3337         amdgpu_vm_check_compute_bug(adev);
3338
3339         /* Initialize the buffer migration limit. */
3340         if (amdgpu_moverate >= 0)
3341                 max_MBps = amdgpu_moverate;
3342         else
3343                 max_MBps = 8; /* Allow 8 MB/s. */
3344         /* Get a log2 for easy divisions. */
3345         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3346
3347         amdgpu_fbdev_init(adev);
3348
3349         r = amdgpu_pm_sysfs_init(adev);
3350         if (r) {
3351                 adev->pm_sysfs_en = false;
3352                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3353         } else
3354                 adev->pm_sysfs_en = true;
3355
3356         r = amdgpu_ucode_sysfs_init(adev);
3357         if (r) {
3358                 adev->ucode_sysfs_en = false;
3359                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3360         } else
3361                 adev->ucode_sysfs_en = true;
3362
3363         if ((amdgpu_testing & 1)) {
3364                 if (adev->accel_working)
3365                         amdgpu_test_moves(adev);
3366                 else
3367                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3368         }
3369         if (amdgpu_benchmarking) {
3370                 if (adev->accel_working)
3371                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3372                 else
3373                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3374         }
3375
3376         /*
3377          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3378          * Otherwise the mgpu fan boost feature will be skipped since the
3379          * gpu instance would not yet be counted.
3380          */
3381         amdgpu_register_gpu_instance(adev);
3382
3383         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3384          * explicit gating rather than handling it automatically.
3385          */
3386         r = amdgpu_device_ip_late_init(adev);
3387         if (r) {
3388                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3389                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3390                 goto failed;
3391         }
3392
3393         /* must succeed. */
3394         amdgpu_ras_resume(adev);
3395
3396         queue_delayed_work(system_wq, &adev->delayed_init_work,
3397                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3398
3399         if (amdgpu_sriov_vf(adev))
3400                 flush_delayed_work(&adev->delayed_init_work);
3401
3402         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3403         if (r)
3404                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3405
3406         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3407                 r = amdgpu_pmu_init(adev);
3408                 if (r)
3409                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
             }
3410
3411         /* Keep a cached copy of the PCI config space for restore after a sudden PCI error */
3412         if (amdgpu_device_cache_pci_state(adev->pdev))
3413                 pci_restore_state(pdev);
3414
3415         return 0;
3416
3417 failed:
3418         amdgpu_vf_error_trans_all(adev);
3419         if (boco)
3420                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3421
3422 failed_unmap:
3423         iounmap(adev->rmmio);
3424         adev->rmmio = NULL;
3425
3426         return r;
3427 }
3428
3429 /**
3430  * amdgpu_device_fini - tear down the driver
3431  *
3432  * @adev: amdgpu_device pointer
3433  *
3434  * Tear down the driver info (all asics).
3435  * Called at driver shutdown.
3436  */
3437 void amdgpu_device_fini(struct amdgpu_device *adev)
3438 {
3439         dev_info(adev->dev, "amdgpu: finishing device.\n");
3440         flush_delayed_work(&adev->delayed_init_work);
3441         adev->shutdown = true;
3442
3443         kfree(adev->pci_state);
3444
3445         /* make sure IB tests are finished before entering exclusive mode
3446          * to avoid preemption on IB tests
3447          */
3448         if (amdgpu_sriov_vf(adev))
3449                 amdgpu_virt_request_full_gpu(adev, false);
3450
3451         /* disable all interrupts */
3452         amdgpu_irq_disable_all(adev);
3453         if (adev->mode_info.mode_config_initialized) {
3454                 if (!amdgpu_device_has_dc_support(adev))
3455                         drm_helper_force_disable_all(adev_to_drm(adev));
3456                 else
3457                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3458         }
3459         amdgpu_fence_driver_fini(adev);
3460         if (adev->pm_sysfs_en)
3461                 amdgpu_pm_sysfs_fini(adev);
3462         amdgpu_fbdev_fini(adev);
3463         amdgpu_device_ip_fini(adev);
3464         release_firmware(adev->firmware.gpu_info_fw);
3465         adev->firmware.gpu_info_fw = NULL;
3466         adev->accel_working = false;
3467         /* free i2c buses */
3468         if (!amdgpu_device_has_dc_support(adev))
3469                 amdgpu_i2c_fini(adev);
3470
3471         if (amdgpu_emu_mode != 1)
3472                 amdgpu_atombios_fini(adev);
3473
3474         kfree(adev->bios);
3475         adev->bios = NULL;
3476         if (amdgpu_has_atpx() &&
3477             (amdgpu_is_atpx_hybrid() ||
3478              amdgpu_has_atpx_dgpu_power_cntl()) &&
3479             !pci_is_thunderbolt_attached(adev->pdev))
3480                 vga_switcheroo_unregister_client(adev->pdev);
3481         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3482                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3483         vga_client_register(adev->pdev, NULL, NULL, NULL);
3484         if (adev->rio_mem)
3485                 pci_iounmap(adev->pdev, adev->rio_mem);
3486         adev->rio_mem = NULL;
3487         iounmap(adev->rmmio);
3488         adev->rmmio = NULL;
3489         amdgpu_device_doorbell_fini(adev);
3490
3491         if (adev->ucode_sysfs_en)
3492                 amdgpu_ucode_sysfs_fini(adev);
3493
3494         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3495         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3496                 amdgpu_pmu_fini(adev);
3497         if (adev->mman.discovery_bin)
3498                 amdgpu_discovery_fini(adev);
3499 }
3500
3501
3502 /*
3503  * Suspend & resume.
3504  */
3505 /**
3506  * amdgpu_device_suspend - initiate device suspend
3507  *
3508  * @dev: drm dev pointer
3509  * @fbcon: notify the fbdev of suspend
3510  *
3511  * Puts the hw in the suspend state (all asics).
3512  * Returns 0 for success or an error on failure.
3513  * Called at driver suspend.
3514  */
3515 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3516 {
3517         struct amdgpu_device *adev;
3518         struct drm_crtc *crtc;
3519         struct drm_connector *connector;
3520         struct drm_connector_list_iter iter;
3521         int r;
3522
3523         adev = drm_to_adev(dev);
3524
3525         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3526                 return 0;
3527
3528         adev->in_suspend = true;
3529         drm_kms_helper_poll_disable(dev);
3530
3531         if (fbcon)
3532                 amdgpu_fbdev_set_suspend(adev, 1);
3533
3534         cancel_delayed_work_sync(&adev->delayed_init_work);
3535
3536         if (!amdgpu_device_has_dc_support(adev)) {
3537                 /* turn off display hw */
3538                 drm_modeset_lock_all(dev);
3539                 drm_connector_list_iter_begin(dev, &iter);
3540                 drm_for_each_connector_iter(connector, &iter)
3541                         drm_helper_connector_dpms(connector,
3542                                                   DRM_MODE_DPMS_OFF);
3543                 drm_connector_list_iter_end(&iter);
3544                 drm_modeset_unlock_all(dev);
3545                         /* unpin the front buffers and cursors */
3546                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3547                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3548                         struct drm_framebuffer *fb = crtc->primary->fb;
3549                         struct amdgpu_bo *robj;
3550
3551                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3552                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3553                                 r = amdgpu_bo_reserve(aobj, true);
3554                                 if (r == 0) {
3555                                         amdgpu_bo_unpin(aobj);
3556                                         amdgpu_bo_unreserve(aobj);
3557                                 }
3558                         }
3559
3560                         if (fb == NULL || fb->obj[0] == NULL) {
3561                                 continue;
3562                         }
3563                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3564                         /* don't unpin kernel fb objects */
3565                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3566                                 r = amdgpu_bo_reserve(robj, true);
3567                                 if (r == 0) {
3568                                         amdgpu_bo_unpin(robj);
3569                                         amdgpu_bo_unreserve(robj);
3570                                 }
3571                         }
3572                 }
3573         }
3574
3575         amdgpu_ras_suspend(adev);
3576
3577         r = amdgpu_device_ip_suspend_phase1(adev);
3578
3579         amdgpu_amdkfd_suspend(adev, !fbcon);
3580
3581         /* evict vram memory */
3582         amdgpu_bo_evict_vram(adev);
3583
3584         amdgpu_fence_driver_suspend(adev);
3585
3586         r = amdgpu_device_ip_suspend_phase2(adev);
3587
3588         /* evict remaining vram memory
3589          * This second call to evict vram is to evict the gart page table
3590          * using the CPU.
3591          */
3592         amdgpu_bo_evict_vram(adev);
3593
3594         return 0;
3595 }
3596
3597 /**
3598  * amdgpu_device_resume - initiate device resume
3599  *
3600  * @dev: drm dev pointer
3601  * @fbcon: notify the fbdev of resume
3602  *
3603  * Bring the hw back to operating state (all asics).
3604  * Returns 0 for success or an error on failure.
3605  * Called at driver resume.
3606  */
3607 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3608 {
3609         struct drm_connector *connector;
3610         struct drm_connector_list_iter iter;
3611         struct amdgpu_device *adev = drm_to_adev(dev);
3612         struct drm_crtc *crtc;
3613         int r = 0;
3614
3615         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3616                 return 0;
3617
3618         /* post card */
3619         if (amdgpu_device_need_post(adev)) {
3620                 r = amdgpu_device_asic_init(adev);
3621                 if (r)
3622                         dev_err(adev->dev, "amdgpu asic init failed\n");
3623         }
3624
3625         r = amdgpu_device_ip_resume(adev);
3626         if (r) {
3627                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3628                 return r;
3629         }
3630         amdgpu_fence_driver_resume(adev);
3631
3632
3633         r = amdgpu_device_ip_late_init(adev);
3634         if (r)
3635                 return r;
3636
3637         queue_delayed_work(system_wq, &adev->delayed_init_work,
3638                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3639
3640         if (!amdgpu_device_has_dc_support(adev)) {
3641                 /* pin cursors */
3642                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3643                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3644
3645                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3646                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3647                                 r = amdgpu_bo_reserve(aobj, true);
3648                                 if (r == 0) {
3649                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3650                                         if (r != 0)
3651                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3652                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3653                                         amdgpu_bo_unreserve(aobj);
3654                                 }
3655                         }
3656                 }
3657         }
3658         r = amdgpu_amdkfd_resume(adev, !fbcon);
3659         if (r)
3660                 return r;
3661
3662         /* Make sure IB tests flushed */
3663         flush_delayed_work(&adev->delayed_init_work);
3664
3665         /* blat the mode back in */
3666         if (fbcon) {
3667                 if (!amdgpu_device_has_dc_support(adev)) {
3668                         /* pre DCE11 */
3669                         drm_helper_resume_force_mode(dev);
3670
3671                         /* turn on display hw */
3672                         drm_modeset_lock_all(dev);
3673
3674                         drm_connector_list_iter_begin(dev, &iter);
3675                         drm_for_each_connector_iter(connector, &iter)
3676                                 drm_helper_connector_dpms(connector,
3677                                                           DRM_MODE_DPMS_ON);
3678                         drm_connector_list_iter_end(&iter);
3679
3680                         drm_modeset_unlock_all(dev);
3681                 }
3682                 amdgpu_fbdev_set_suspend(adev, 0);
3683         }
3684
3685         drm_kms_helper_poll_enable(dev);
3686
3687         amdgpu_ras_resume(adev);
3688
3689         /*
3690          * Most of the connector probing functions try to acquire runtime pm
3691          * refs to ensure that the GPU is powered on when connector polling is
3692          * performed. Since we're calling this from a runtime PM callback,
3693          * trying to acquire rpm refs will cause us to deadlock.
3694          *
3695          * Since we're guaranteed to be holding the rpm lock, it's safe to
3696          * temporarily disable the rpm helpers so this doesn't deadlock us.
3697          */
3698 #ifdef CONFIG_PM
3699         dev->dev->power.disable_depth++;
3700 #endif
3701         if (!amdgpu_device_has_dc_support(adev))
3702                 drm_helper_hpd_irq_event(dev);
3703         else
3704                 drm_kms_helper_hotplug_event(dev);
3705 #ifdef CONFIG_PM
3706         dev->dev->power.disable_depth--;
3707 #endif
3708         adev->in_suspend = false;
3709
3710         return 0;
3711 }
3712
3713 /**
3714  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3715  *
3716  * @adev: amdgpu_device pointer
3717  *
3718  * The list of all the hardware IPs that make up the asic is walked and
3719  * the check_soft_reset callbacks are run.  check_soft_reset determines
3720  * if the asic is still hung or not.
3721  * Returns true if any of the IPs are still in a hung state, false if not.
3722  */
3723 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3724 {
3725         int i;
3726         bool asic_hang = false;
3727
3728         if (amdgpu_sriov_vf(adev))
3729                 return true;
3730
3731         if (amdgpu_asic_need_full_reset(adev))
3732                 return true;
3733
3734         for (i = 0; i < adev->num_ip_blocks; i++) {
3735                 if (!adev->ip_blocks[i].status.valid)
3736                         continue;
3737                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3738                         adev->ip_blocks[i].status.hang =
3739                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3740                 if (adev->ip_blocks[i].status.hang) {
3741                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3742                         asic_hang = true;
3743                 }
3744         }
3745         return asic_hang;
3746 }
3747
3748 /**
3749  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3750  *
3751  * @adev: amdgpu_device pointer
3752  *
3753  * The list of all the hardware IPs that make up the asic is walked and the
3754  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3755  * handles any IP specific hardware or software state changes that are
3756  * necessary for a soft reset to succeed.
3757  * Returns 0 on success, negative error code on failure.
3758  */
3759 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3760 {
3761         int i, r = 0;
3762
3763         for (i = 0; i < adev->num_ip_blocks; i++) {
3764                 if (!adev->ip_blocks[i].status.valid)
3765                         continue;
3766                 if (adev->ip_blocks[i].status.hang &&
3767                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3768                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3769                         if (r)
3770                                 return r;
3771                 }
3772         }
3773
3774         return 0;
3775 }
3776
3777 /**
3778  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3779  *
3780  * @adev: amdgpu_device pointer
3781  *
3782  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3783  * reset is necessary to recover.
3784  * Returns true if a full asic reset is required, false if not.
3785  */
3786 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3787 {
3788         int i;
3789
3790         if (amdgpu_asic_need_full_reset(adev))
3791                 return true;
3792
3793         for (i = 0; i < adev->num_ip_blocks; i++) {
3794                 if (!adev->ip_blocks[i].status.valid)
3795                         continue;
3796                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3797                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3798                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3799                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3800                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3801                         if (adev->ip_blocks[i].status.hang) {
3802                                 dev_info(adev->dev, "Some blocks need full reset!\n");
3803                                 return true;
3804                         }
3805                 }
3806         }
3807         return false;
3808 }
3809
3810 /**
3811  * amdgpu_device_ip_soft_reset - do a soft reset
3812  *
3813  * @adev: amdgpu_device pointer
3814  *
3815  * The list of all the hardware IPs that make up the asic is walked and the
3816  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3817  * IP specific hardware or software state changes that are necessary to soft
3818  * reset the IP.
3819  * Returns 0 on success, negative error code on failure.
3820  */
3821 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3822 {
3823         int i, r = 0;
3824
3825         for (i = 0; i < adev->num_ip_blocks; i++) {
3826                 if (!adev->ip_blocks[i].status.valid)
3827                         continue;
3828                 if (adev->ip_blocks[i].status.hang &&
3829                     adev->ip_blocks[i].version->funcs->soft_reset) {
3830                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3831                         if (r)
3832                                 return r;
3833                 }
3834         }
3835
3836         return 0;
3837 }
3838
3839 /**
3840  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3841  *
3842  * @adev: amdgpu_device pointer
3843  *
3844  * The list of all the hardware IPs that make up the asic is walked and the
3845  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3846  * handles any IP specific hardware or software state changes that are
3847  * necessary after the IP has been soft reset.
3848  * Returns 0 on success, negative error code on failure.
3849  */
3850 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3851 {
3852         int i, r = 0;
3853
3854         for (i = 0; i < adev->num_ip_blocks; i++) {
3855                 if (!adev->ip_blocks[i].status.valid)
3856                         continue;
3857                 if (adev->ip_blocks[i].status.hang &&
3858                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3859                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3860                 if (r)
3861                         return r;
3862         }
3863
3864         return 0;
3865 }
3866
3867 /**
3868  * amdgpu_device_recover_vram - Recover some VRAM contents
3869  *
3870  * @adev: amdgpu_device pointer
3871  *
3872  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3873  * restore things like GPUVM page tables after a GPU reset where
3874  * the contents of VRAM might be lost.
3875  *
3876  * Returns:
3877  * 0 on success, negative error code on failure.
3878  */
3879 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3880 {
3881         struct dma_fence *fence = NULL, *next = NULL;
3882         struct amdgpu_bo *shadow;
3883         long r = 1, tmo;
3884
3885         if (amdgpu_sriov_runtime(adev))
3886                 tmo = msecs_to_jiffies(8000);
3887         else
3888                 tmo = msecs_to_jiffies(100);
3889
3890         dev_info(adev->dev, "recover vram bo from shadow start\n");
3891         mutex_lock(&adev->shadow_list_lock);
3892         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3893
3894                 /* No need to recover an evicted BO */
3895                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3896                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3897                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3898                         continue;
3899
3900                 r = amdgpu_bo_restore_shadow(shadow, &next);
3901                 if (r)
3902                         break;
3903
3904                 if (fence) {
3905                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3906                         dma_fence_put(fence);
3907                         fence = next;
3908                         if (tmo == 0) {
3909                                 r = -ETIMEDOUT;
3910                                 break;
3911                         } else if (tmo < 0) {
3912                                 r = tmo;
3913                                 break;
3914                         }
3915                 } else {
3916                         fence = next;
3917                 }
3918         }
3919         mutex_unlock(&adev->shadow_list_lock);
3920
3921         if (fence)
3922                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3923         dma_fence_put(fence);
3924
3925         if (r < 0 || tmo <= 0) {
3926                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3927                 return -EIO;
3928         }
3929
3930         dev_info(adev->dev, "recover vram bo from shadow done\n");
3931         return 0;
3932 }
3933
3934
3935 /**
3936  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3937  *
3938  * @adev: amdgpu device pointer
3939  * @from_hypervisor: request from hypervisor
3940  *
3941  * Do a VF FLR and reinitialize the ASIC.
3942  * Returns 0 on success, negative error code on failure.
3943  */
3944 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3945                                      bool from_hypervisor)
3946 {
3947         int r;
3948
3949         if (from_hypervisor)
3950                 r = amdgpu_virt_request_full_gpu(adev, true);
3951         else
3952                 r = amdgpu_virt_reset_gpu(adev);
3953         if (r)
3954                 return r;
3955
3956         amdgpu_amdkfd_pre_reset(adev);
3957
3958         /* Resume IP prior to SMC */
3959         r = amdgpu_device_ip_reinit_early_sriov(adev);
3960         if (r)
3961                 goto error;
3962
3963         amdgpu_virt_init_data_exchange(adev);
3964         /* we need to recover the GART prior to running SMC/CP/SDMA resume */
3965         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
3966
3967         r = amdgpu_device_fw_loading(adev);
3968         if (r)
3969                 return r;
3970
3971         /* now we are okay to resume SMC/CP/SDMA */
3972         r = amdgpu_device_ip_reinit_late_sriov(adev);
3973         if (r)
3974                 goto error;
3975
3976         amdgpu_irq_gpu_reset_resume_helper(adev);
3977         r = amdgpu_ib_ring_tests(adev);
3978         amdgpu_amdkfd_post_reset(adev);
3979
3980 error:
3981         amdgpu_virt_release_full_gpu(adev, true);
3982         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3983                 amdgpu_inc_vram_lost(adev);
3984                 r = amdgpu_device_recover_vram(adev);
3985         }
3986
3987         return r;
3988 }
3989
3990 /**
3991  * amdgpu_device_has_job_running - check if there is any job in mirror list
3992  *
3993  * @adev: amdgpu device pointer
3994  *
3995  * Check if there is any job still pending in any ring's scheduler mirror list.
3996  */
3997 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3998 {
3999         int i;
4000         struct drm_sched_job *job;
4001
4002         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4003                 struct amdgpu_ring *ring = adev->rings[i];
4004
4005                 if (!ring || !ring->sched.thread)
4006                         continue;
4007
4008                 spin_lock(&ring->sched.job_list_lock);
4009                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4010                                 struct drm_sched_job, node);
4011                 spin_unlock(&ring->sched.job_list_lock);
4012                 if (job)
4013                         return true;
4014         }
4015         return false;
4016 }
4017
4018 /**
4019  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4020  *
4021  * @adev: amdgpu device pointer
4022  *
4023  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4024  * a hung GPU.
4025  */
4026 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4027 {
4028         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4029                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4030                 return false;
4031         }
4032
4033         if (amdgpu_gpu_recovery == 0)
4034                 goto disabled;
4035
4036         if (amdgpu_sriov_vf(adev))
4037                 return true;
4038
4039         if (amdgpu_gpu_recovery == -1) {
4040                 switch (adev->asic_type) {
4041                 case CHIP_BONAIRE:
4042                 case CHIP_HAWAII:
4043                 case CHIP_TOPAZ:
4044                 case CHIP_TONGA:
4045                 case CHIP_FIJI:
4046                 case CHIP_POLARIS10:
4047                 case CHIP_POLARIS11:
4048                 case CHIP_POLARIS12:
4049                 case CHIP_VEGAM:
4050                 case CHIP_VEGA20:
4051                 case CHIP_VEGA10:
4052                 case CHIP_VEGA12:
4053                 case CHIP_RAVEN:
4054                 case CHIP_ARCTURUS:
4055                 case CHIP_RENOIR:
4056                 case CHIP_NAVI10:
4057                 case CHIP_NAVI14:
4058                 case CHIP_NAVI12:
4059                 case CHIP_SIENNA_CICHLID:
4060                         break;
4061                 default:
4062                         goto disabled;
4063                 }
4064         }
4065
4066         return true;
4067
4068 disabled:
4069         dev_info(adev->dev, "GPU recovery disabled.\n");
4070         return false;
4071 }
4072
4073
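/*
 * Prepare a device for ASIC reset: force-complete its outstanding hardware
 * fences, increase the karma of the offending job and, on bare metal, try a
 * soft reset first, falling back to (and suspending the IP blocks for) a
 * full reset when needed.
 */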
4074 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4075                                         struct amdgpu_job *job,
4076                                         bool *need_full_reset_arg)
4077 {
4078         int i, r = 0;
4079         bool need_full_reset  = *need_full_reset_arg;
4080
4081         amdgpu_debugfs_wait_dump(adev);
4082
4083         /* block all schedulers and reset given job's ring */
4084         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4085                 struct amdgpu_ring *ring = adev->rings[i];
4086
4087                 if (!ring || !ring->sched.thread)
4088                         continue;
4089
4090                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4091                 amdgpu_fence_driver_force_completion(ring);
4092         }
4093
4094         if (job)
4095                 drm_sched_increase_karma(&job->base);
4096
4097         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4098         if (!amdgpu_sriov_vf(adev)) {
4099
4100                 if (!need_full_reset)
4101                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4102
4103                 if (!need_full_reset) {
4104                         amdgpu_device_ip_pre_soft_reset(adev);
4105                         r = amdgpu_device_ip_soft_reset(adev);
4106                         amdgpu_device_ip_post_soft_reset(adev);
4107                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4108                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4109                                 need_full_reset = true;
4110                         }
4111                 }
4112
4113                 if (need_full_reset)
4114                         r = amdgpu_device_ip_suspend(adev);
4115
4116                 *need_full_reset_arg = need_full_reset;
4117         }
4118
4119         return r;
4120 }
4121
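/*
 * Perform the actual reset for every device on @device_list_handle: reset
 * the ASICs when a full reset is needed (XGMI nodes in parallel), re-post
 * and resume their IP blocks, then run IB tests and recover VRAM contents
 * from the GTT shadows.
 */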
4122 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4123                                struct list_head *device_list_handle,
4124                                bool *need_full_reset_arg,
4125                                bool skip_hw_reset)
4126 {
4127         struct amdgpu_device *tmp_adev = NULL;
4128         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4129         int r = 0;
4130
4131         /*
4132          * ASIC reset has to be done on all XGMI hive nodes ASAP
4133          * to allow proper link negotiation in FW (within 1 sec)
4134          */
4135         if (!skip_hw_reset && need_full_reset) {
4136                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4137                         /* For XGMI run all resets in parallel to speed up the process */
4138                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4139                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4140                                         r = -EALREADY;
4141                         } else
4142                                 r = amdgpu_asic_reset(tmp_adev);
4143
4144                         if (r) {
4145                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4146                                          r, adev_to_drm(tmp_adev)->unique);
4147                                 break;
4148                         }
4149                 }
4150
4151                 /* For XGMI wait for all resets to complete before proceeding */
4152                 if (!r) {
4153                         list_for_each_entry(tmp_adev, device_list_handle,
4154                                             gmc.xgmi.head) {
4155                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4156                                         flush_work(&tmp_adev->xgmi_reset_work);
4157                                         r = tmp_adev->asic_reset_res;
4158                                         if (r)
4159                                                 break;
4160                                 }
4161                         }
4162                 }
4163         }
4164
4165         if (!r && amdgpu_ras_intr_triggered()) {
4166                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4167                         if (tmp_adev->mmhub.funcs &&
4168                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4169                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4170                 }
4171
4172                 amdgpu_ras_intr_cleared();
4173         }
4174
4175         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4176                 if (need_full_reset) {
4177                         /* post card */
4178                         if (amdgpu_device_asic_init(tmp_adev))
4179                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4180
4181                         if (!r) {
4182                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4183                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4184                                 if (r)
4185                                         goto out;
4186
4187                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4188                                 if (vram_lost) {
4189                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4190                                         amdgpu_inc_vram_lost(tmp_adev);
4191                                 }
4192
4193                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4194                                 if (r)
4195                                         goto out;
4196
4197                                 r = amdgpu_device_fw_loading(tmp_adev);
4198                                 if (r)
4199                                         return r;
4200
4201                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4202                                 if (r)
4203                                         goto out;
4204
4205                                 if (vram_lost)
4206                                         amdgpu_device_fill_reset_magic(tmp_adev);
4207
4208                                 /*
4209                                  * Add this ASIC as tracked, as the reset has
4210                                  * already completed successfully.
4211                                  */
4212                                 amdgpu_register_gpu_instance(tmp_adev);
4213
4214                                 r = amdgpu_device_ip_late_init(tmp_adev);
4215                                 if (r)
4216                                         goto out;
4217
4218                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4219
4220                                 /*
4221                                  * The GPU enters a bad state once the number of
4222                                  * faulty pages reported by ECC reaches the
4223                                  * threshold, and RAS recovery is scheduled next.
4224                                  * Check here and abort recovery if the bad page
4225                                  * threshold has indeed been exceeded, reminding
4226                                  * the user to either retire this GPU or set a
4227                                  * larger bad_page_threshold the next time the
4228                                  * driver is probed.
4229                                  */
4230                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4231                                         /* must succeed. */
4232                                         amdgpu_ras_resume(tmp_adev);
4233                                 } else {
4234                                         r = -EINVAL;
4235                                         goto out;
4236                                 }
4237
4238                                 /* Update PSP FW topology after reset */
4239                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4240                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4241                         }
4242                 }
4243
4244 out:
4245                 if (!r) {
4246                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4247                         r = amdgpu_ib_ring_tests(tmp_adev);
4248                         if (r) {
4249                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4250                                 r = amdgpu_device_ip_suspend(tmp_adev);
4251                                 need_full_reset = true;
4252                                 r = -EAGAIN;
4253                                 goto end;
4254                         }
4255                 }
4256
4257                 if (!r)
4258                         r = amdgpu_device_recover_vram(tmp_adev);
4259                 else
4260                         tmp_adev->asic_reset_res = r;
4261         }
4262
4263 end:
4264         *need_full_reset_arg = need_full_reset;
4265         return r;
4266 }
4267
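/*
 * Take exclusive ownership of a device for GPU recovery: in_gpu_reset and
 * reset_sem serialize concurrent reset attempts (nested against the hive
 * lock for XGMI), and mp1_state tells the SMU which kind of reset is coming.
 * Paired with amdgpu_device_unlock_adev() below.
 */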
4268 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4269                                 struct amdgpu_hive_info *hive)
4270 {
4271         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4272                 return false;
4273
4274         if (hive) {
4275                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4276         } else {
4277                 down_write(&adev->reset_sem);
4278         }
4279
4280         atomic_inc(&adev->gpu_reset_counter);
4281         switch (amdgpu_asic_reset_method(adev)) {
4282         case AMD_RESET_METHOD_MODE1:
4283                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4284                 break;
4285         case AMD_RESET_METHOD_MODE2:
4286                 adev->mp1_state = PP_MP1_STATE_RESET;
4287                 break;
4288         default:
4289                 adev->mp1_state = PP_MP1_STATE_NONE;
4290                 break;
4291         }
4292
4293         return true;
4294 }
4295
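     /**
      * amdgpu_device_unlock_adev - release the GPU reset lock for a device
      * @adev: amdgpu_device pointer
      *
      * Transfers any pending VF error information, resets the MP1 state,
      * clears the in_gpu_reset flag and releases reset_sem taken in
      * amdgpu_device_lock_adev().
      */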
4296 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4297 {
4298         amdgpu_vf_error_trans_all(adev);
4299         adev->mp1_state = PP_MP1_STATE_NONE;
4300         atomic_set(&adev->in_gpu_reset, 0);
4301         up_write(&adev->reset_sem);
4302 }
4303
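     /**
      * amdgpu_device_resume_display_audio - wake the HDA function back up
      * @adev: amdgpu_device pointer
      *
      * Re-enables runtime PM for the display audio function (PCI function 1
      * of the GPU's slot) and resumes it, undoing
      * amdgpu_device_suspend_display_audio().
      */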
4304 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4305 {
4306         struct pci_dev *p = NULL;
4307
4308         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4309                         adev->pdev->bus->number, 1);
4310         if (p) {
4311                 pm_runtime_enable(&(p->dev));
4312                 pm_runtime_resume(&(p->dev));
4313         }
4314 }
4315
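     /**
      * amdgpu_device_suspend_display_audio - quiesce the HDA function
      * @adev: amdgpu_device pointer
      *
      * Forces the display audio function (PCI function 1 of the GPU's slot)
      * into runtime suspend before a BACO or mode1 reset and disables its
      * runtime PM so it cannot wake up while the reset is in flight.
      *
      * Returns 0 on success, -EINVAL if the chosen reset method does not
      * need this, -ENODEV if there is no audio function, or -ETIMEDOUT if
      * the suspend did not complete in time.
      */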
4316 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4317 {
4318         enum amd_reset_method reset_method;
4319         struct pci_dev *p = NULL;
4320         u64 expires;
4321
4322         /*
4323          * For now, only BACO and mode1 reset are confirmed to
4324          * suffer the audio issue if the codec is not suspended first.
4325          */
4326         reset_method = amdgpu_asic_reset_method(adev);
4327         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4328              (reset_method != AMD_RESET_METHOD_MODE1))
4329                 return -EINVAL;
4330
4331         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4332                         adev->pdev->bus->number, 1);
4333         if (!p)
4334                 return -ENODEV;
4335
4336         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4337         if (!expires)
4338                 /*
4339                  * If we cannot get the audio device autosuspend expiration,
4340                  * fall back to a fixed 4s interval. The audio controller's
4341                  * default autosuspend delay is 3s, so 4s is guaranteed to
4342                  * cover it.
4343                  */
4344                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4345
4346         while (!pm_runtime_status_suspended(&(p->dev))) {
4347                 if (!pm_runtime_suspend(&(p->dev)))
4348                         break;
4349
4350                 if (expires < ktime_get_mono_fast_ns()) {
4351                         dev_warn(adev->dev, "failed to suspend display audio\n");
4352                         /* TODO: abort the succeeding gpu reset? */
4353                         return -ETIMEDOUT;
4354                 }
4355         }
4356
4357         pm_runtime_disable(&(p->dev));
4358
4359         return 0;
4360 }
4361
4362 /**
4363  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4364  *
4365  * @adev: amdgpu_device pointer
4366  * @job: the job that triggered the hang, or NULL
4367  *
4368  * Attempt to reset the GPU if it has hung (all asics).
4369  * Attempt to do a soft or full reset and reinitialize the ASIC.
4370  * Returns 0 for success or an error on failure.
4371  */
4372
4373 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4374                               struct amdgpu_job *job)
4375 {
4376         struct list_head device_list, *device_list_handle = NULL;
4377         bool need_full_reset = false;
4378         bool job_signaled = false;
4379         struct amdgpu_hive_info *hive = NULL;
4380         struct amdgpu_device *tmp_adev = NULL;
4381         int i, r = 0;
4382         bool need_emergency_restart = false;
4383         bool audio_suspended = false;
4384
4385         /*
4386          * Special case: RAS triggered and full reset isn't supported
4387          */
4388         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4389
4390         /*
4391          * Flush RAM to disk so that after reboot
4392          * the user can read the log and see why the system rebooted.
4393          */
4394         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4395                 DRM_WARN("Emergency reboot.");
4396
4397                 ksys_sync_helper();
4398                 emergency_restart();
4399         }
4400
4401         dev_info(adev->dev, "GPU %s begin!\n",
4402                 need_emergency_restart ? "jobs stop":"reset");
4403
4404         /*
4405          * Here we trylock to avoid a chain of resets executing while this
4406          * timeout handler runs, triggered either by jobs on different adevs
4407          * in the XGMI hive or by jobs on different schedulers of the same
4408          * device. We always reset all schedulers of a device and all devices
4409          * of an XGMI hive, so that should take care of them too.
4410          */
4411         hive = amdgpu_get_xgmi_hive(adev);
4412         if (hive) {
4413                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4414                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4415                                 job ? job->base.id : -1, hive->hive_id);
4416                         amdgpu_put_xgmi_hive(hive);
4417                         return 0;
4418                 }
4419                 mutex_lock(&hive->hive_lock);
4420         }
4421
4422         /*
4423          * Build the list of devices to reset.
4424          * In XGMI hive mode, reorder the device list so that
4425          * adev is in the first position.
4426          */
4427         INIT_LIST_HEAD(&device_list);
4428         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4429                 if (!hive)
4430                         return -ENODEV;
4431                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4432                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4433                 device_list_handle = &hive->device_list;
4434         } else {
4435                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4436                 device_list_handle = &device_list;
4437         }
4438
4439         /* block all schedulers and reset given job's ring */
4440         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4441                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4442                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4443                                   job ? job->base.id : -1);
4444                         r = 0;
4445                         goto skip_recovery;
4446                 }
4447
4448                 /*
4449                  * Try to put the audio codec into suspend state
4450                  * before the gpu reset starts.
4451                  *
4452                  * The power domain of the graphics device is
4453                  * shared with the AZ (audio) power domain. Without
4454                  * this, we may change the audio hardware behind
4455                  * the audio driver's back, which will trigger
4456                  * audio codec errors.
4457                  */
4458                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4459                         audio_suspended = true;
4460
4461                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4462
4463                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4464
4465                 if (!amdgpu_sriov_vf(tmp_adev))
4466                         amdgpu_amdkfd_pre_reset(tmp_adev);
4467
4468                 /*
4469                  * Mark the ASICs to be reset as untracked first,
4470                  * and add them back after the reset has completed.
4471                  */
4472                 amdgpu_unregister_gpu_instance(tmp_adev);
4473
4474                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4475
4476                 /* disable ras on ALL IPs */
4477                 if (!need_emergency_restart &&
4478                       amdgpu_device_ip_need_full_reset(tmp_adev))
4479                         amdgpu_ras_suspend(tmp_adev);
4480
4481                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4482                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4483
4484                         if (!ring || !ring->sched.thread)
4485                                 continue;
4486
4487                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4488
4489                         if (need_emergency_restart)
4490                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4491                 }
4492         }
4493
4494         if (need_emergency_restart)
4495                 goto skip_sched_resume;
4496
4497         /*
4498          * Must check whether the guilty job already signaled here, since
4499          * after this point all old HW fences are force signaled.
4500          *
4501          * job->base holds a reference to the parent fence.
4502          */
4503         if (job && job->base.s_fence->parent &&
4504             dma_fence_is_signaled(job->base.s_fence->parent)) {
4505                 job_signaled = true;
4506                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4507                 goto skip_hw_reset;
4508         }
4509
4510 retry:  /* Pre asic reset for the rest of the adevs in the XGMI hive. */
4511         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4512                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4513                                                  NULL,
4514                                                  &need_full_reset);
4515                 /* TODO: Should we stop? */
4516                 if (r) {
4517                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4518                                   r, adev_to_drm(tmp_adev)->unique);
4519                         tmp_adev->asic_reset_res = r;
4520                 }
4521         }
4522
4523         /* Actual ASIC resets if needed. */
4524         /* TODO Implement XGMI hive reset logic for SRIOV */
4525         if (amdgpu_sriov_vf(adev)) {
4526                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4527                 if (r)
4528                         adev->asic_reset_res = r;
4529         } else {
4530                 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4531                 if (r && r == -EAGAIN)
4532                         goto retry;
4533         }
4534
4535 skip_hw_reset:
4536
4537         /* Post ASIC reset for all devs. */
4538         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4539
4540                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4541                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4542
4543                         if (!ring || !ring->sched.thread)
4544                                 continue;
4545
4546                         /* No point in resubmitting jobs if we didn't do a HW reset */
4547                         if (!tmp_adev->asic_reset_res && !job_signaled)
4548                                 drm_sched_resubmit_jobs(&ring->sched);
4549
4550                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4551                 }
4552
4553                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4554                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4555                 }
4556
4557                 tmp_adev->asic_reset_res = 0;
4558
4559                 if (r) {
4560                         /* bad news, how to tell it to userspace? */
4561                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4562                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4563                 } else {
4564                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4565                 }
4566         }
4567
4568 skip_sched_resume:
4569         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4570                 /* unlock kfd: SRIOV would do it separately */
4571                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4572                         amdgpu_amdkfd_post_reset(tmp_adev);
4573                 if (audio_suspended)
4574                         amdgpu_device_resume_display_audio(tmp_adev);
4575                 amdgpu_device_unlock_adev(tmp_adev);
4576         }
4577
4578 skip_recovery:
4579         if (hive) {
4580                 atomic_set(&hive->in_reset, 0);
4581                 mutex_unlock(&hive->hive_lock);
4582                 amdgpu_put_xgmi_hive(hive);
4583         }
4584
4585         if (r)
4586                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4587         return r;
4588 }
4589
4590 /**
4591  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4592  *
4593  * @adev: amdgpu_device pointer
4594  *
4595  * Fetches and stores in the driver the PCIE capabilities (gen speed
4596  * and lanes) of the slot the device is in. Handles APUs and
4597  * virtualized environments where PCIE config space may not be available.
4598  */
4599 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4600 {
4601         struct pci_dev *pdev;
4602         enum pci_bus_speed speed_cap, platform_speed_cap;
4603         enum pcie_link_width platform_link_width;
4604
4605         if (amdgpu_pcie_gen_cap)
4606                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4607
4608         if (amdgpu_pcie_lane_cap)
4609                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4610
4611         /* covers APUs as well */
4612         if (pci_is_root_bus(adev->pdev->bus)) {
4613                 if (adev->pm.pcie_gen_mask == 0)
4614                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4615                 if (adev->pm.pcie_mlw_mask == 0)
4616                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4617                 return;
4618         }
4619
4620         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4621                 return;
4622
4623         pcie_bandwidth_available(adev->pdev, NULL,
4624                                  &platform_speed_cap, &platform_link_width);
4625
4626         if (adev->pm.pcie_gen_mask == 0) {
4627                 /* asic caps */
4628                 pdev = adev->pdev;
4629                 speed_cap = pcie_get_speed_cap(pdev);
4630                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4631                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4632                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4633                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4634                 } else {
4635                         if (speed_cap == PCIE_SPEED_16_0GT)
4636                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4637                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4638                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4639                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4640                         else if (speed_cap == PCIE_SPEED_8_0GT)
4641                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4642                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4643                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4644                         else if (speed_cap == PCIE_SPEED_5_0GT)
4645                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4646                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4647                         else
4648                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4649                 }
4650                 /* platform caps */
4651                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4652                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4653                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4654                 } else {
4655                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4656                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4657                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4658                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4659                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4660                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4661                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4662                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4663                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4664                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4665                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4666                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4667                         else
4668                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4669
4670                 }
4671         }
4672         if (adev->pm.pcie_mlw_mask == 0) {
4673                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4674                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4675                 } else {
4676                         switch (platform_link_width) {
4677                         case PCIE_LNK_X32:
4678                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4679                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4680                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4681                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4682                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4683                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4684                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4685                                 break;
4686                         case PCIE_LNK_X16:
4687                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4688                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4689                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4690                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4691                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4692                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4693                                 break;
4694                         case PCIE_LNK_X12:
4695                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4696                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4697                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4698                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4699                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4700                                 break;
4701                         case PCIE_LNK_X8:
4702                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4703                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4704                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4705                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4706                                 break;
4707                         case PCIE_LNK_X4:
4708                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4709                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4710                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4711                                 break;
4712                         case PCIE_LNK_X2:
4713                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4714                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4715                                 break;
4716                         case PCIE_LNK_X1:
4717                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4718                                 break;
4719                         default:
4720                                 break;
4721                         }
4722                 }
4723         }
4724 }
4725
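     /**
      * amdgpu_device_baco_enter - put the ASIC into the BACO state
      * @dev: drm_device pointer
      *
      * Disables the doorbell interrupt when RAS is supported and asks the
      * power-management code to enter BACO (Bus Active, Chip Off).
      *
      * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
      * or a negative error code from the DPM layer.
      */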
4726 int amdgpu_device_baco_enter(struct drm_device *dev)
4727 {
4728         struct amdgpu_device *adev = drm_to_adev(dev);
4729         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4730
4731         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4732                 return -ENOTSUPP;
4733
4734         if (ras && ras->supported)
4735                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4736
4737         return amdgpu_dpm_baco_enter(adev);
4738 }
4739
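     /**
      * amdgpu_device_baco_exit - bring the ASIC back out of BACO
      * @dev: drm_device pointer
      *
      * Asks the power-management code to leave BACO and re-enables the
      * doorbell interrupt when RAS is supported.
      *
      * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
      * or a negative error code from the DPM layer.
      */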
4740 int amdgpu_device_baco_exit(struct drm_device *dev)
4741 {
4742         struct amdgpu_device *adev = drm_to_adev(dev);
4743         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4744         int ret = 0;
4745
4746         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4747                 return -ENOTSUPP;
4748
4749         ret = amdgpu_dpm_baco_exit(adev);
4750         if (ret)
4751                 return ret;
4752
4753         if (ras && ras->supported)
4754                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4755
4756         return 0;
4757 }
4758
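     /**
      * amdgpu_cancel_all_tdr - cancel pending timeout handling on all rings
      * @adev: amdgpu_device pointer
      *
      * Cancels and waits for the scheduler timeout (TDR) work of every ring
      * so that no recovery can race with PCI error handling.
      */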
4759 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4760 {
4761         int i;
4762
4763         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4764                 struct amdgpu_ring *ring = adev->rings[i];
4765
4766                 if (!ring || !ring->sched.thread)
4767                         continue;
4768
4769                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4770         }
4771 }
4772
4773 /**
4774  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4775  * @pdev: PCI device struct
4776  * @state: PCI channel state
4777  *
4778  * Description: Called when a PCI error is detected.
4779  *
4780  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4781  */
4782 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4783 {
4784         struct drm_device *dev = pci_get_drvdata(pdev);
4785         struct amdgpu_device *adev = drm_to_adev(dev);
4786         int i;
4787
4788         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4789
4790         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4791                 DRM_WARN("No support for XGMI hive yet...");
4792                 return PCI_ERS_RESULT_DISCONNECT;
4793         }
4794
4795         switch (state) {
4796         case pci_channel_io_normal:
4797                 return PCI_ERS_RESULT_CAN_RECOVER;
4798         /* Fatal error, prepare for slot reset */
4799         case pci_channel_io_frozen:
4800                 /*
4801                  * Cancel and wait for all TDRs in progress if we fail to
4802                  * set adev->in_gpu_reset in amdgpu_device_lock_adev.
4803                  *
4804                  * Locking adev->reset_sem will prevent any external access
4805                  * to the GPU during PCI error recovery.
4806                  */
4807                 while (!amdgpu_device_lock_adev(adev, NULL))
4808                         amdgpu_cancel_all_tdr(adev);
4809
4810                 /*
4811                  * Block any work scheduling as we do for regular GPU reset
4812                  * for the duration of the recovery
4813                  */
4814                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4815                         struct amdgpu_ring *ring = adev->rings[i];
4816
4817                         if (!ring || !ring->sched.thread)
4818                                 continue;
4819
4820                         drm_sched_stop(&ring->sched, NULL);
4821                 }
4822                 return PCI_ERS_RESULT_NEED_RESET;
4823         case pci_channel_io_perm_failure:
4824                 /* Permanent error, prepare for device removal */
4825                 return PCI_ERS_RESULT_DISCONNECT;
4826         }
4827
4828         return PCI_ERS_RESULT_NEED_RESET;
4829 }
4830
4831 /**
4832  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4833  * @pdev: pointer to PCI device
4834  */
4835 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4836 {
4837
4838         DRM_INFO("PCI error: mmio enabled callback!!\n");
4839
4840         /* TODO - dump whatever for debugging purposes */
4841
4842         /* This is called only if amdgpu_pci_error_detected returns
4843          * PCI_ERS_RESULT_CAN_RECOVER. Reads and writes to the device still
4844          * work, so there is no need to reset the slot.
4845          */
4846
4847         return PCI_ERS_RESULT_RECOVERED;
4848 }
4849
4850 /**
4851  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4852  * @pdev: PCI device struct
4853  *
4854  * Description: This routine is called by the pci error recovery
4855  * code after the PCI slot has been reset, just before we
4856  * should resume normal operations.
4857  */
4858 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4859 {
4860         struct drm_device *dev = pci_get_drvdata(pdev);
4861         struct amdgpu_device *adev = drm_to_adev(dev);
4862         int r, i;
4863         bool need_full_reset = true;
4864         u32 memsize;
4865         struct list_head device_list;
4866
4867         DRM_INFO("PCI error: slot reset callback!!\n");
4868
4869         INIT_LIST_HEAD(&device_list);
4870         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4871
4872         /* wait for asic to come out of reset */
4873         msleep(500);
4874
4875         /* Restore the PCI config space */
4876         amdgpu_device_load_pci_state(pdev);
4877
4878         /* confirm the ASIC came out of reset */
4879         for (i = 0; i < adev->usec_timeout; i++) {
4880                 memsize = amdgpu_asic_get_config_memsize(adev);
4881
4882                 if (memsize != 0xffffffff)
4883                         break;
4884                 udelay(1);
4885         }
4886         if (memsize == 0xffffffff) {
4887                 r = -ETIME;
4888                 goto out;
4889         }
4890
4891         adev->in_pci_err_recovery = true;
4892         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
4893         adev->in_pci_err_recovery = false;
4894         if (r)
4895                 goto out;
4896
4897         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
4898
4899 out:
4900         if (!r) {
4901                 if (amdgpu_device_cache_pci_state(adev->pdev))
4902                         pci_restore_state(adev->pdev);
4903
4904                 DRM_INFO("PCIe error recovery succeeded\n");
4905         } else {
4906                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
4907                 amdgpu_device_unlock_adev(adev);
4908         }
4909
4910         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
4911 }
4912
4913 /**
4914  * amdgpu_pci_resume() - resume normal ops after PCI reset
4915  * @pdev: pointer to PCI device
4916  *
4917  * Called when the error recovery driver tells us that it is
4918  * OK to resume normal operation. Resubmit the pending jobs and
4919  * restart the ring schedulers.
4920  */
4921 void amdgpu_pci_resume(struct pci_dev *pdev)
4922 {
4923         struct drm_device *dev = pci_get_drvdata(pdev);
4924         struct amdgpu_device *adev = drm_to_adev(dev);
4925         int i;
4926
4928         DRM_INFO("PCI error: resume callback!!\n");
4929
4930         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4931                 struct amdgpu_ring *ring = adev->rings[i];
4932
4933                 if (!ring || !ring->sched.thread)
4934                         continue;
4935
4937                 drm_sched_resubmit_jobs(&ring->sched);
4938                 drm_sched_start(&ring->sched, true);
4939         }
4940
4941         amdgpu_device_unlock_adev(adev);
4942 }
4943
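     /**
      * amdgpu_device_cache_pci_state - save the current PCI config space
      * @pdev: PCI device struct
      *
      * Saves the device's PCI configuration space and keeps a copy in
      * adev->pci_state so it can be restored after a reset or PCI error.
      *
      * Returns true on success, false otherwise.
      */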
4944 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
4945 {
4946         struct drm_device *dev = pci_get_drvdata(pdev);
4947         struct amdgpu_device *adev = drm_to_adev(dev);
4948         int r;
4949
4950         r = pci_save_state(pdev);
4951         if (!r) {
4952                 kfree(adev->pci_state);
4953
4954                 adev->pci_state = pci_store_saved_state(pdev);
4955
4956                 if (!adev->pci_state) {
4957                         DRM_ERROR("Failed to store PCI saved state");
4958                         return false;
4959                 }
4960         } else {
4961                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
4962                 return false;
4963         }
4964
4965         return true;
4966 }
4967
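     /**
      * amdgpu_device_load_pci_state - restore the cached PCI config space
      * @pdev: PCI device struct
      *
      * Loads the configuration space previously cached by
      * amdgpu_device_cache_pci_state() and writes it back to the device.
      *
      * Returns true on success, false if there is no cached state or the
      * restore failed.
      */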
4968 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
4969 {
4970         struct drm_device *dev = pci_get_drvdata(pdev);
4971         struct amdgpu_device *adev = drm_to_adev(dev);
4972         int r;
4973
4974         if (!adev->pci_state)
4975                 return false;
4976
4977         r = pci_load_saved_state(pdev, adev->pci_state);
4978
4979         if (!r) {
4980                 pci_restore_state(pdev);
4981         } else {
4982                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
4983                 return false;
4984         }
4985
4986         return true;
4987 }
4988
4989