drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  84 MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");
  85
  86 #define AMDGPU_RESUME_MS                2000
  87
  88 const char *amdgpu_asic_name[] = {
  89         "TAHITI",
  90         "PITCAIRN",
  91         "VERDE",
  92         "OLAND",
  93         "HAINAN",
  94         "BONAIRE",
  95         "KAVERI",
  96         "KABINI",
  97         "HAWAII",
  98         "MULLINS",
  99         "TOPAZ",
 100         "TONGA",
 101         "FIJI",
 102         "CARRIZO",
 103         "STONEY",
 104         "POLARIS10",
 105         "POLARIS11",
 106         "POLARIS12",
 107         "VEGAM",
 108         "VEGA10",
 109         "VEGA12",
 110         "VEGA20",
 111         "RAVEN",
 112         "ARCTURUS",
 113         "RENOIR",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "VANGOGH",
 120         "DIMGREY_CAVEFISH",
 121         "LAST",
 122 };
 123
 124 /**
 125  * DOC: pcie_replay_count
 126  *
 127  * The amdgpu driver provides a sysfs API for reporting the total number
 128  * of PCIe replays (NAKs)
 129  * The file pcie_replay_count is used for this and returns the total
 130  * number of replays as a sum of the NAKs generated and NAKs received
 131  */
 132
 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 134                 struct device_attribute *attr, char *buf)
 135 {
 136         struct drm_device *ddev = dev_get_drvdata(dev);
 137         struct amdgpu_device *adev = drm_to_adev(ddev);
 138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 139
 140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 141 }
 142
 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 144                 amdgpu_device_get_pcie_replay_count, NULL);
 145
 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 147
 148 /**
 149  * DOC: product_name
 150  *
 151  * The amdgpu driver provides a sysfs API for reporting the product name
 152  * for the device
 153  * The file serial_number is used for this and returns the product name
 154  * as returned from the FRU.
 155  * NOTE: This is only available for certain server cards
 156  */
 157
 158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 159                 struct device_attribute *attr, char *buf)
 160 {
 161         struct drm_device *ddev = dev_get_drvdata(dev);
 162         struct amdgpu_device *adev = drm_to_adev(ddev);
 163
 164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 165 }
 166
 167 static DEVICE_ATTR(product_name, S_IRUGO,
 168                 amdgpu_device_get_product_name, NULL);
 169
 170 /**
 171  * DOC: product_number
 172  *
 173  * The amdgpu driver provides a sysfs API for reporting the part number
 174  * for the device
 175  * The file serial_number is used for this and returns the part number
 176  * as returned from the FRU.
 177  * NOTE: This is only available for certain server cards
 178  */
 179
 180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 181                 struct device_attribute *attr, char *buf)
 182 {
 183         struct drm_device *ddev = dev_get_drvdata(dev);
 184         struct amdgpu_device *adev = drm_to_adev(ddev);
 185
 186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 187 }
 188
 189 static DEVICE_ATTR(product_number, S_IRUGO,
 190                 amdgpu_device_get_product_number, NULL);
 191
 192 /**
 193  * DOC: serial_number
 194  *
 195  * The amdgpu driver provides a sysfs API for reporting the serial number
 196  * for the device
 197  * The file serial_number is used for this and returns the serial number
 198  * as returned from the FRU.
 199  * NOTE: This is only available for certain server cards
 200  */
 201
 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 203                 struct device_attribute *attr, char *buf)
 204 {
 205         struct drm_device *ddev = dev_get_drvdata(dev);
 206         struct amdgpu_device *adev = drm_to_adev(ddev);
 207
 208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 209 }
 210
 211 static DEVICE_ATTR(serial_number, S_IRUGO,
 212                 amdgpu_device_get_serial_number, NULL);
 213
 214 /**
 215  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 216  *
 217  * @dev: drm_device pointer
 218  *
 219  * Returns true if the device is a dGPU with HG/PX power control,
 220  * otherwise return false.
 221  */
 222 bool amdgpu_device_supports_boco(struct drm_device *dev)
 223 {
 224         struct amdgpu_device *adev = drm_to_adev(dev);
 225
 226         if (adev->flags & AMD_IS_PX)
 227                 return true;
 228         return false;
 229 }
 230
 231 /**
 232  * amdgpu_device_supports_baco - Does the device support BACO
 233  *
 234  * @dev: drm_device pointer
 235  *
 236  * Returns true if the device supporte BACO,
 237  * otherwise return false.
 238  */
 239 bool amdgpu_device_supports_baco(struct drm_device *dev)
 240 {
 241         struct amdgpu_device *adev = drm_to_adev(dev);
 242
 243         return amdgpu_asic_supports_baco(adev);
 244 }
 245
 246 /**
 247  * VRAM access helper functions.
 248  *
 249  * amdgpu_device_vram_access - read/write a buffer in vram
 250  *
 251  * @adev: amdgpu_device pointer
 252  * @pos: offset of the buffer in vram
 253  * @buf: virtual address of the buffer in system memory
 254  * @size: read/write size, sizeof(@buf) must > @size
 255  * @write: true - write to vram, otherwise - read from vram
 256  */
 257 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 258                                uint32_t *buf, size_t size, bool write)
 259 {
 260         unsigned long flags;
 261         uint32_t hi = ~0;
 262         uint64_t last;
 263
 264
 265 #ifdef CONFIG_64BIT
 266         last = min(pos + size, adev->gmc.visible_vram_size);
 267         if (last > pos) {
 268                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 269                 size_t count = last - pos;
 270
 271                 if (write) {
 272                         memcpy_toio(addr, buf, count);
 273                         mb();
 274                         amdgpu_asic_flush_hdp(adev, NULL);
 275                 } else {
 276                         amdgpu_asic_invalidate_hdp(adev, NULL);
 277                         mb();
 278                         memcpy_fromio(buf, addr, count);
 279                 }
 280
 281                 if (count == size)
 282                         return;
 283
 284                 pos += count;
 285                 buf += count / 4;
 286                 size -= count;
 287         }
 288 #endif
 289
 290         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 291         for (last = pos + size; pos < last; pos += 4) {
 292                 uint32_t tmp = pos >> 31;
 293
 294                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 295                 if (tmp != hi) {
 296                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 297                         hi = tmp;
 298                 }
 299                 if (write)
 300                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 301                 else
 302                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 303         }
 304         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 305 }
 306
 307 /*
 308  * register access helper functions.
 309  */
 310 /**
 311  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 312  *
 313  * @adev: amdgpu_device pointer
 314  * @reg: dword aligned register offset
 315  * @acc_flags: access flags which require special behavior
 316  *
 317  * Returns the 32 bit value from the offset specified.
 318  */
 319 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 320                             uint32_t reg, uint32_t acc_flags)
 321 {
 322         uint32_t ret;
 323
 324         if (adev->in_pci_err_recovery)
 325                 return 0;
 326
 327         if ((reg * 4) < adev->rmmio_size) {
 328                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 329                     amdgpu_sriov_runtime(adev) &&
 330                     down_read_trylock(&adev->reset_sem)) {
 331                         ret = amdgpu_kiq_rreg(adev, reg);
 332                         up_read(&adev->reset_sem);
 333                 } else {
 334                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 335                 }
 336         } else {
 337                 ret = adev->pcie_rreg(adev, reg * 4);
 338         }
 339
 340         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 341
 342         return ret;
 343 }
 344
 345 /*
 346  * MMIO register read with bytes helper functions
 347  * @offset:bytes offset from MMIO start
 348  *
 349 */
 350
 351 /**
 352  * amdgpu_mm_rreg8 - read a memory mapped IO register
 353  *
 354  * @adev: amdgpu_device pointer
 355  * @offset: byte aligned register offset
 356  *
 357  * Returns the 8 bit value from the offset specified.
 358  */
 359 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 360 {
 361         if (adev->in_pci_err_recovery)
 362                 return 0;
 363
 364         if (offset < adev->rmmio_size)
 365                 return (readb(adev->rmmio + offset));
 366         BUG();
 367 }
 368
 369 /*
 370  * MMIO register write with bytes helper functions
 371  * @offset:bytes offset from MMIO start
 372  * @value: the value want to be written to the register
 373  *
 374 */
 375 /**
 376  * amdgpu_mm_wreg8 - read a memory mapped IO register
 377  *
 378  * @adev: amdgpu_device pointer
 379  * @offset: byte aligned register offset
 380  * @value: 8 bit value to write
 381  *
 382  * Writes the value specified to the offset specified.
 383  */
 384 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 385 {
 386         if (adev->in_pci_err_recovery)
 387                 return;
 388
 389         if (offset < adev->rmmio_size)
 390                 writeb(value, adev->rmmio + offset);
 391         else
 392                 BUG();
 393 }
 394
 395 /**
 396  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 397  *
 398  * @adev: amdgpu_device pointer
 399  * @reg: dword aligned register offset
 400  * @v: 32 bit value to write to the register
 401  * @acc_flags: access flags which require special behavior
 402  *
 403  * Writes the value specified to the offset specified.
 404  */
 405 void amdgpu_device_wreg(struct amdgpu_device *adev,
 406                         uint32_t reg, uint32_t v,
 407                         uint32_t acc_flags)
 408 {
 409         if (adev->in_pci_err_recovery)
 410                 return;
 411
 412         if ((reg * 4) < adev->rmmio_size) {
 413                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 414                     amdgpu_sriov_runtime(adev) &&
 415                     down_read_trylock(&adev->reset_sem)) {
 416                         amdgpu_kiq_wreg(adev, reg, v);
 417                         up_read(&adev->reset_sem);
 418                 } else {
 419                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 420                 }
 421         } else {
 422                 adev->pcie_wreg(adev, reg * 4, v);
 423         }
 424
 425         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 426 }
 427
 428 /*
 429  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 430  *
 431  * this function is invoked only the debugfs register access
 432  * */
 433 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 434                              uint32_t reg, uint32_t v)
 435 {
 436         if (adev->in_pci_err_recovery)
 437                 return;
 438
 439         if (amdgpu_sriov_fullaccess(adev) &&
 440             adev->gfx.rlc.funcs &&
 441             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 442                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 443                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 444         } else {
 445                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 446         }
 447 }
 448
 449 /**
 450  * amdgpu_io_rreg - read an IO register
 451  *
 452  * @adev: amdgpu_device pointer
 453  * @reg: dword aligned register offset
 454  *
 455  * Returns the 32 bit value from the offset specified.
 456  */
 457 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 458 {
 459         if (adev->in_pci_err_recovery)
 460                 return 0;
 461
 462         if ((reg * 4) < adev->rio_mem_size)
 463                 return ioread32(adev->rio_mem + (reg * 4));
 464         else {
 465                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 466                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 467         }
 468 }
 469
 470 /**
 471  * amdgpu_io_wreg - write to an IO register
 472  *
 473  * @adev: amdgpu_device pointer
 474  * @reg: dword aligned register offset
 475  * @v: 32 bit value to write to the register
 476  *
 477  * Writes the value specified to the offset specified.
 478  */
 479 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 480 {
 481         if (adev->in_pci_err_recovery)
 482                 return;
 483
 484         if ((reg * 4) < adev->rio_mem_size)
 485                 iowrite32(v, adev->rio_mem + (reg * 4));
 486         else {
 487                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 488                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 489         }
 490 }
 491
 492 /**
 493  * amdgpu_mm_rdoorbell - read a doorbell dword
 494  *
 495  * @adev: amdgpu_device pointer
 496  * @index: doorbell index
 497  *
 498  * Returns the value in the doorbell aperture at the
 499  * requested doorbell index (CIK).
 500  */
 501 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 502 {
 503         if (adev->in_pci_err_recovery)
 504                 return 0;
 505
 506         if (index < adev->doorbell.num_doorbells) {
 507                 return readl(adev->doorbell.ptr + index);
 508         } else {
 509                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 510                 return 0;
 511         }
 512 }
 513
 514 /**
 515  * amdgpu_mm_wdoorbell - write a doorbell dword
 516  *
 517  * @adev: amdgpu_device pointer
 518  * @index: doorbell index
 519  * @v: value to write
 520  *
 521  * Writes @v to the doorbell aperture at the
 522  * requested doorbell index (CIK).
 523  */
 524 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 525 {
 526         if (adev->in_pci_err_recovery)
 527                 return;
 528
 529         if (index < adev->doorbell.num_doorbells) {
 530                 writel(v, adev->doorbell.ptr + index);
 531         } else {
 532                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 533         }
 534 }
 535
 536 /**
 537  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 538  *
 539  * @adev: amdgpu_device pointer
 540  * @index: doorbell index
 541  *
 542  * Returns the value in the doorbell aperture at the
 543  * requested doorbell index (VEGA10+).
 544  */
 545 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 546 {
 547         if (adev->in_pci_err_recovery)
 548                 return 0;
 549
 550         if (index < adev->doorbell.num_doorbells) {
 551                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 552         } else {
 553                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 554                 return 0;
 555         }
 556 }
 557
 558 /**
 559  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 560  *
 561  * @adev: amdgpu_device pointer
 562  * @index: doorbell index
 563  * @v: value to write
 564  *
 565  * Writes @v to the doorbell aperture at the
 566  * requested doorbell index (VEGA10+).
 567  */
 568 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 569 {
 570         if (adev->in_pci_err_recovery)
 571                 return;
 572
 573         if (index < adev->doorbell.num_doorbells) {
 574                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 575         } else {
 576                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 577         }
 578 }
 579
 580 /**
 581  * amdgpu_device_indirect_rreg - read an indirect register
 582  *
 583  * @adev: amdgpu_device pointer
 584  * @pcie_index: mmio register offset
 585  * @pcie_data: mmio register offset
 586  *
 587  * Returns the value of indirect register @reg_addr
 588  */
 589 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 590                                 u32 pcie_index, u32 pcie_data,
 591                                 u32 reg_addr)
 592 {
 593         unsigned long flags;
 594         u32 r;
 595         void __iomem *pcie_index_offset;
 596         void __iomem *pcie_data_offset;
 597
 598         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 599         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 600         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 601
 602         writel(reg_addr, pcie_index_offset);
 603         readl(pcie_index_offset);
 604         r = readl(pcie_data_offset);
 605         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 606
 607         return r;
 608 }
 609
 610 /**
 611  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 612  *
 613  * @adev: amdgpu_device pointer
 614  * @pcie_index: mmio register offset
 615  * @pcie_data: mmio register offset
 616  *
 617  * Returns the value of indirect register @reg_addr
 618  */
 619 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 620                                   u32 pcie_index, u32 pcie_data,
 621                                   u32 reg_addr)
 622 {
 623         unsigned long flags;
 624         u64 r;
 625         void __iomem *pcie_index_offset;
 626         void __iomem *pcie_data_offset;
 627
 628         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 629         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 630         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 631
 632         /* read low 32 bits */
 633         writel(reg_addr, pcie_index_offset);
 634         readl(pcie_index_offset);
 635         r = readl(pcie_data_offset);
 636         /* read high 32 bits */
 637         writel(reg_addr + 4, pcie_index_offset);
 638         readl(pcie_index_offset);
 639         r |= ((u64)readl(pcie_data_offset) << 32);
 640         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 641
 642         return r;
 643 }
 644
 645 /**
 646  * amdgpu_device_indirect_wreg - write an indirect register address
 647  *
 648  * @adev: amdgpu_device pointer
 649  * @pcie_index: mmio register offset
 650  * @pcie_data: mmio register offset
 651  * @reg_addr: indirect register offset
 652  * @reg_data: indirect register data
 653  *
 654  */
 655 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 656                                  u32 pcie_index, u32 pcie_data,
 657                                  u32 reg_addr, u32 reg_data)
 658 {
 659         unsigned long flags;
 660         void __iomem *pcie_index_offset;
 661         void __iomem *pcie_data_offset;
 662
 663         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 664         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 665         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 666
 667         writel(reg_addr, pcie_index_offset);
 668         readl(pcie_index_offset);
 669         writel(reg_data, pcie_data_offset);
 670         readl(pcie_data_offset);
 671         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 672 }
 673
 674 /**
 675  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 676  *
 677  * @adev: amdgpu_device pointer
 678  * @pcie_index: mmio register offset
 679  * @pcie_data: mmio register offset
 680  * @reg_addr: indirect register offset
 681  * @reg_data: indirect register data
 682  *
 683  */
 684 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 685                                    u32 pcie_index, u32 pcie_data,
 686                                    u32 reg_addr, u64 reg_data)
 687 {
 688         unsigned long flags;
 689         void __iomem *pcie_index_offset;
 690         void __iomem *pcie_data_offset;
 691
 692         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 693         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 694         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 695
 696         /* write low 32 bits */
 697         writel(reg_addr, pcie_index_offset);
 698         readl(pcie_index_offset);
 699         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 700         readl(pcie_data_offset);
 701         /* write high 32 bits */
 702         writel(reg_addr + 4, pcie_index_offset);
 703         readl(pcie_index_offset);
 704         writel((u32)(reg_data >> 32), pcie_data_offset);
 705         readl(pcie_data_offset);
 706         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 707 }
 708
 709 /**
 710  * amdgpu_invalid_rreg - dummy reg read function
 711  *
 712  * @adev: amdgpu device pointer
 713  * @reg: offset of register
 714  *
 715  * Dummy register read function.  Used for register blocks
 716  * that certain asics don't have (all asics).
 717  * Returns the value in the register.
 718  */
 719 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 720 {
 721         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 722         BUG();
 723         return 0;
 724 }
 725
 726 /**
 727  * amdgpu_invalid_wreg - dummy reg write function
 728  *
 729  * @adev: amdgpu device pointer
 730  * @reg: offset of register
 731  * @v: value to write to the register
 732  *
 733  * Dummy register read function.  Used for register blocks
 734  * that certain asics don't have (all asics).
 735  */
 736 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 737 {
 738         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 739                   reg, v);
 740         BUG();
 741 }
 742
 743 /**
 744  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 745  *
 746  * @adev: amdgpu device pointer
 747  * @reg: offset of register
 748  *
 749  * Dummy register read function.  Used for register blocks
 750  * that certain asics don't have (all asics).
 751  * Returns the value in the register.
 752  */
 753 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 754 {
 755         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 756         BUG();
 757         return 0;
 758 }
 759
 760 /**
 761  * amdgpu_invalid_wreg64 - dummy reg write function
 762  *
 763  * @adev: amdgpu device pointer
 764  * @reg: offset of register
 765  * @v: value to write to the register
 766  *
 767  * Dummy register read function.  Used for register blocks
 768  * that certain asics don't have (all asics).
 769  */
 770 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 771 {
 772         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 773                   reg, v);
 774         BUG();
 775 }
 776
 777 /**
 778  * amdgpu_block_invalid_rreg - dummy reg read function
 779  *
 780  * @adev: amdgpu device pointer
 781  * @block: offset of instance
 782  * @reg: offset of register
 783  *
 784  * Dummy register read function.  Used for register blocks
 785  * that certain asics don't have (all asics).
 786  * Returns the value in the register.
 787  */
 788 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 789                                           uint32_t block, uint32_t reg)
 790 {
 791         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 792                   reg, block);
 793         BUG();
 794         return 0;
 795 }
 796
 797 /**
 798  * amdgpu_block_invalid_wreg - dummy reg write function
 799  *
 800  * @adev: amdgpu device pointer
 801  * @block: offset of instance
 802  * @reg: offset of register
 803  * @v: value to write to the register
 804  *
 805  * Dummy register read function.  Used for register blocks
 806  * that certain asics don't have (all asics).
 807  */
 808 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 809                                       uint32_t block,
 810                                       uint32_t reg, uint32_t v)
 811 {
 812         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 813                   reg, block, v);
 814         BUG();
 815 }
 816
 817 /**
 818  * amdgpu_device_asic_init - Wrapper for atom asic_init
 819  *
 820  * @dev: drm_device pointer
 821  *
 822  * Does any asic specific work and then calls atom asic init.
 823  */
 824 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 825 {
 826         amdgpu_asic_pre_asic_init(adev);
 827
 828         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 829 }
 830
 831 /**
 832  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 833  *
 834  * @adev: amdgpu device pointer
 835  *
 836  * Allocates a scratch page of VRAM for use by various things in the
 837  * driver.
 838  */
 839 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 840 {
 841         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 842                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 843                                        &adev->vram_scratch.robj,
 844                                        &adev->vram_scratch.gpu_addr,
 845                                        (void **)&adev->vram_scratch.ptr);
 846 }
 847
 848 /**
 849  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 850  *
 851  * @adev: amdgpu device pointer
 852  *
 853  * Frees the VRAM scratch page.
 854  */
 855 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 856 {
 857         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 858 }
 859
 860 /**
 861  * amdgpu_device_program_register_sequence - program an array of registers.
 862  *
 863  * @adev: amdgpu_device pointer
 864  * @registers: pointer to the register array
 865  * @array_size: size of the register array
 866  *
 867  * Programs an array or registers with and and or masks.
 868  * This is a helper for setting golden registers.
 869  */
 870 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 871                                              const u32 *registers,
 872                                              const u32 array_size)
 873 {
 874         u32 tmp, reg, and_mask, or_mask;
 875         int i;
 876
 877         if (array_size % 3)
 878                 return;
 879
 880         for (i = 0; i < array_size; i +=3) {
 881                 reg = registers[i + 0];
 882                 and_mask = registers[i + 1];
 883                 or_mask = registers[i + 2];
 884
 885                 if (and_mask == 0xffffffff) {
 886                         tmp = or_mask;
 887                 } else {
 888                         tmp = RREG32(reg);
 889                         tmp &= ~and_mask;
 890                         if (adev->family >= AMDGPU_FAMILY_AI)
 891                                 tmp |= (or_mask & and_mask);
 892                         else
 893                                 tmp |= or_mask;
 894                 }
 895                 WREG32(reg, tmp);
 896         }
 897 }
 898
 899 /**
 900  * amdgpu_device_pci_config_reset - reset the GPU
 901  *
 902  * @adev: amdgpu_device pointer
 903  *
 904  * Resets the GPU using the pci config reset sequence.
 905  * Only applicable to asics prior to vega10.
 906  */
 907 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 908 {
 909         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 910 }
 911
 912 /*
 913  * GPU doorbell aperture helpers function.
 914  */
 915 /**
 916  * amdgpu_device_doorbell_init - Init doorbell driver information.
 917  *
 918  * @adev: amdgpu_device pointer
 919  *
 920  * Init doorbell driver information (CIK)
 921  * Returns 0 on success, error on failure.
 922  */
 923 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 924 {
 925
 926         /* No doorbell on SI hardware generation */
 927         if (adev->asic_type < CHIP_BONAIRE) {
 928                 adev->doorbell.base = 0;
 929                 adev->doorbell.size = 0;
 930                 adev->doorbell.num_doorbells = 0;
 931                 adev->doorbell.ptr = NULL;
 932                 return 0;
 933         }
 934
 935         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 936                 return -EINVAL;
 937
 938         amdgpu_asic_init_doorbell_index(adev);
 939
 940         /* doorbell bar mapping */
 941         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 942         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 943
 944         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 945                                              adev->doorbell_index.max_assignment+1);
 946         if (adev->doorbell.num_doorbells == 0)
 947                 return -EINVAL;
 948
 949         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 950          * paging queue doorbell use the second page. The
 951          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 952          * doorbells are in the first page. So with paging queue enabled,
 953          * the max num_doorbells should + 1 page (0x400 in dword)
 954          */
 955         if (adev->asic_type >= CHIP_VEGA10)
 956                 adev->doorbell.num_doorbells += 0x400;
 957
 958         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 959                                      adev->doorbell.num_doorbells *
 960                                      sizeof(u32));
 961         if (adev->doorbell.ptr == NULL)
 962                 return -ENOMEM;
 963
 964         return 0;
 965 }
 966
 967 /**
 968  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 969  *
 970  * @adev: amdgpu_device pointer
 971  *
 972  * Tear down doorbell driver information (CIK)
 973  */
 974 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 975 {
 976         iounmap(adev->doorbell.ptr);
 977         adev->doorbell.ptr = NULL;
 978 }
 979
 980
 981
 982 /*
 983  * amdgpu_device_wb_*()
 984  * Writeback is the method by which the GPU updates special pages in memory
 985  * with the status of certain GPU events (fences, ring pointers,etc.).
 986  */
 987
 988 /**
 989  * amdgpu_device_wb_fini - Disable Writeback and free memory
 990  *
 991  * @adev: amdgpu_device pointer
 992  *
 993  * Disables Writeback and frees the Writeback memory (all asics).
 994  * Used at driver shutdown.
 995  */
 996 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 997 {
 998         if (adev->wb.wb_obj) {
 999                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1000                                       &adev->wb.gpu_addr,
1001                                       (void **)&adev->wb.wb);
1002                 adev->wb.wb_obj = NULL;
1003         }
1004 }
1005
1006 /**
1007  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1008  *
1009  * @adev: amdgpu_device pointer
1010  *
1011  * Initializes writeback and allocates writeback memory (all asics).
1012  * Used at driver startup.
1013  * Returns 0 on success or an -error on failure.
1014  */
1015 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1016 {
1017         int r;
1018
1019         if (adev->wb.wb_obj == NULL) {
1020                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1021                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1022                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1023                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1024                                             (void **)&adev->wb.wb);
1025                 if (r) {
1026                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1027                         return r;
1028                 }
1029
1030                 adev->wb.num_wb = AMDGPU_MAX_WB;
1031                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1032
1033                 /* clear wb memory */
1034                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1035         }
1036
1037         return 0;
1038 }
1039
1040 /**
1041  * amdgpu_device_wb_get - Allocate a wb entry
1042  *
1043  * @adev: amdgpu_device pointer
1044  * @wb: wb index
1045  *
1046  * Allocate a wb slot for use by the driver (all asics).
1047  * Returns 0 on success or -EINVAL on failure.
1048  */
1049 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1050 {
1051         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1052
1053         if (offset < adev->wb.num_wb) {
1054                 __set_bit(offset, adev->wb.used);
1055                 *wb = offset << 3; /* convert to dw offset */
1056                 return 0;
1057         } else {
1058                 return -EINVAL;
1059         }
1060 }
1061
1062 /**
1063  * amdgpu_device_wb_free - Free a wb entry
1064  *
1065  * @adev: amdgpu_device pointer
1066  * @wb: wb index
1067  *
1068  * Free a wb slot allocated for use by the driver (all asics)
1069  */
1070 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1071 {
1072         wb >>= 3;
1073         if (wb < adev->wb.num_wb)
1074                 __clear_bit(wb, adev->wb.used);
1075 }
1076
1077 /**
1078  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1079  *
1080  * @adev: amdgpu_device pointer
1081  *
1082  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1083  * to fail, but if any of the BARs is not accessible after the size we abort
1084  * driver loading by returning -ENODEV.
1085  */
1086 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1087 {
1088         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1089         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1090         struct pci_bus *root;
1091         struct resource *res;
1092         unsigned i;
1093         u16 cmd;
1094         int r;
1095
1096         /* Bypass for VF */
1097         if (amdgpu_sriov_vf(adev))
1098                 return 0;
1099
1100         /* skip if the bios has already enabled large BAR */
1101         if (adev->gmc.real_vram_size &&
1102             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1103                 return 0;
1104
1105         /* Check if the root BUS has 64bit memory resources */
1106         root = adev->pdev->bus;
1107         while (root->parent)
1108                 root = root->parent;
1109
1110         pci_bus_for_each_resource(root, res, i) {
1111                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1112                     res->start > 0x100000000ull)
1113                         break;
1114         }
1115
1116         /* Trying to resize is pointless without a root hub window above 4GB */
1117         if (!res)
1118                 return 0;
1119
1120         /* Disable memory decoding while we change the BAR addresses and size */
1121         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1122         pci_write_config_word(adev->pdev, PCI_COMMAND,
1123                               cmd & ~PCI_COMMAND_MEMORY);
1124
1125         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1126         amdgpu_device_doorbell_fini(adev);
1127         if (adev->asic_type >= CHIP_BONAIRE)
1128                 pci_release_resource(adev->pdev, 2);
1129
1130         pci_release_resource(adev->pdev, 0);
1131
1132         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1133         if (r == -ENOSPC)
1134                 DRM_INFO("Not enough PCI address space for a large BAR.");
1135         else if (r && r != -ENOTSUPP)
1136                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1137
1138         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1139
1140         /* When the doorbell or fb BAR isn't available we have no chance of
1141          * using the device.
1142          */
1143         r = amdgpu_device_doorbell_init(adev);
1144         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1145                 return -ENODEV;
1146
1147         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1148
1149         return 0;
1150 }
1151
1152 /*
1153  * GPU helpers function.
1154  */
1155 /**
1156  * amdgpu_device_need_post - check if the hw need post or not
1157  *
1158  * @adev: amdgpu_device pointer
1159  *
1160  * Check if the asic has been initialized (all asics) at driver startup
1161  * or post is needed if  hw reset is performed.
1162  * Returns true if need or false if not.
1163  */
1164 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1165 {
1166         uint32_t reg;
1167
1168         if (amdgpu_sriov_vf(adev))
1169                 return false;
1170
1171         if (amdgpu_passthrough(adev)) {
1172                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1173                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1174                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1175                  * vpost executed for smc version below 22.15
1176                  */
1177                 if (adev->asic_type == CHIP_FIJI) {
1178                         int err;
1179                         uint32_t fw_ver;
1180                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1181                         /* force vPost if error occured */
1182                         if (err)
1183                                 return true;
1184
1185                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1186                         if (fw_ver < 0x00160e00)
1187                                 return true;
1188                 }
1189         }
1190
1191         if (adev->has_hw_reset) {
1192                 adev->has_hw_reset = false;
1193                 return true;
1194         }
1195
1196         /* bios scratch used on CIK+ */
1197         if (adev->asic_type >= CHIP_BONAIRE)
1198                 return amdgpu_atombios_scratch_need_asic_init(adev);
1199
1200         /* check MEM_SIZE for older asics */
1201         reg = amdgpu_asic_get_config_memsize(adev);
1202
1203         if ((reg != 0) && (reg != 0xffffffff))
1204                 return false;
1205
1206         return true;
1207 }
1208
1209 /* if we get transitioned to only one device, take VGA back */
1210 /**
1211  * amdgpu_device_vga_set_decode - enable/disable vga decode
1212  *
1213  * @cookie: amdgpu_device pointer
1214  * @state: enable/disable vga decode
1215  *
1216  * Enable/disable vga decode (all asics).
1217  * Returns VGA resource flags.
1218  */
1219 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1220 {
1221         struct amdgpu_device *adev = cookie;
1222         amdgpu_asic_set_vga_state(adev, state);
1223         if (state)
1224                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1225                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1226         else
1227                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1228 }
1229
1230 /**
1231  * amdgpu_device_check_block_size - validate the vm block size
1232  *
1233  * @adev: amdgpu_device pointer
1234  *
1235  * Validates the vm block size specified via module parameter.
1236  * The vm block size defines number of bits in page table versus page directory,
1237  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1238  * page table and the remaining bits are in the page directory.
1239  */
1240 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1241 {
1242         /* defines number of bits in page table versus page directory,
1243          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1244          * page table and the remaining bits are in the page directory */
1245         if (amdgpu_vm_block_size == -1)
1246                 return;
1247
1248         if (amdgpu_vm_block_size < 9) {
1249                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1250                          amdgpu_vm_block_size);
1251                 amdgpu_vm_block_size = -1;
1252         }
1253 }
1254
1255 /**
1256  * amdgpu_device_check_vm_size - validate the vm size
1257  *
1258  * @adev: amdgpu_device pointer
1259  *
1260  * Validates the vm size in GB specified via module parameter.
1261  * The VM size is the size of the GPU virtual memory space in GB.
1262  */
1263 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1264 {
1265         /* no need to check the default value */
1266         if (amdgpu_vm_size == -1)
1267                 return;
1268
1269         if (amdgpu_vm_size < 1) {
1270                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1271                          amdgpu_vm_size);
1272                 amdgpu_vm_size = -1;
1273         }
1274 }
1275
1276 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1277 {
1278         struct sysinfo si;
1279         bool is_os_64 = (sizeof(void *) == 8);
1280         uint64_t total_memory;
1281         uint64_t dram_size_seven_GB = 0x1B8000000;
1282         uint64_t dram_size_three_GB = 0xB8000000;
1283
1284         if (amdgpu_smu_memory_pool_size == 0)
1285                 return;
1286
1287         if (!is_os_64) {
1288                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1289                 goto def_value;
1290         }
1291         si_meminfo(&si);
1292         total_memory = (uint64_t)si.totalram * si.mem_unit;
1293
1294         if ((amdgpu_smu_memory_pool_size == 1) ||
1295                 (amdgpu_smu_memory_pool_size == 2)) {
1296                 if (total_memory < dram_size_three_GB)
1297                         goto def_value1;
1298         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1299                 (amdgpu_smu_memory_pool_size == 8)) {
1300                 if (total_memory < dram_size_seven_GB)
1301                         goto def_value1;
1302         } else {
1303                 DRM_WARN("Smu memory pool size not supported\n");
1304                 goto def_value;
1305         }
1306         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1307
1308         return;
1309
1310 def_value1:
1311         DRM_WARN("No enough system memory\n");
1312 def_value:
1313         adev->pm.smu_prv_buffer_size = 0;
1314 }
1315
1316 /**
1317  * amdgpu_device_check_arguments - validate module params
1318  *
1319  * @adev: amdgpu_device pointer
1320  *
1321  * Validates certain module parameters and updates
1322  * the associated values used by the driver (all asics).
1323  */
1324 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1325 {
1326         if (amdgpu_sched_jobs < 4) {
1327                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1328                          amdgpu_sched_jobs);
1329                 amdgpu_sched_jobs = 4;
1330         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1331                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1332                          amdgpu_sched_jobs);
1333                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1334         }
1335
1336         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1337                 /* gart size must be greater or equal to 32M */
1338                 dev_warn(adev->dev, "gart size (%d) too small\n",
1339                          amdgpu_gart_size);
1340                 amdgpu_gart_size = -1;
1341         }
1342
1343         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1344                 /* gtt size must be greater or equal to 32M */
1345                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1346                                  amdgpu_gtt_size);
1347                 amdgpu_gtt_size = -1;
1348         }
1349
1350         /* valid range is between 4 and 9 inclusive */
1351         if (amdgpu_vm_fragment_size != -1 &&
1352             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1353                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1354                 amdgpu_vm_fragment_size = -1;
1355         }
1356
1357         if (amdgpu_sched_hw_submission < 2) {
1358                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1359                          amdgpu_sched_hw_submission);
1360                 amdgpu_sched_hw_submission = 2;
1361         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1362                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1363                          amdgpu_sched_hw_submission);
1364                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1365         }
1366
1367         amdgpu_device_check_smu_prv_buffer_size(adev);
1368
1369         amdgpu_device_check_vm_size(adev);
1370
1371         amdgpu_device_check_block_size(adev);
1372
1373         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1374
1375         amdgpu_gmc_tmz_set(adev);
1376
1377         if (amdgpu_num_kcq == -1) {
1378                 amdgpu_num_kcq = 8;
1379         } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1380                 amdgpu_num_kcq = 8;
1381                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1382         }
1383
1384         amdgpu_gmc_noretry_set(adev);
1385
1386         return 0;
1387 }
1388
1389 /**
1390  * amdgpu_switcheroo_set_state - set switcheroo state
1391  *
1392  * @pdev: pci dev pointer
1393  * @state: vga_switcheroo state
1394  *
1395  * Callback for the switcheroo driver.  Suspends or resumes the
1396  * the asics before or after it is powered up using ACPI methods.
1397  */
1398 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1399                                         enum vga_switcheroo_state state)
1400 {
1401         struct drm_device *dev = pci_get_drvdata(pdev);
1402         int r;
1403
1404         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1405                 return;
1406
1407         if (state == VGA_SWITCHEROO_ON) {
1408                 pr_info("switched on\n");
1409                 /* don't suspend or resume card normally */
1410                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1411
1412                 pci_set_power_state(dev->pdev, PCI_D0);
1413                 amdgpu_device_load_pci_state(dev->pdev);
1414                 r = pci_enable_device(dev->pdev);
1415                 if (r)
1416                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1417                 amdgpu_device_resume(dev, true);
1418
1419                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1420                 drm_kms_helper_poll_enable(dev);
1421         } else {
1422                 pr_info("switched off\n");
1423                 drm_kms_helper_poll_disable(dev);
1424                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1425                 amdgpu_device_suspend(dev, true);
1426                 amdgpu_device_cache_pci_state(dev->pdev);
1427                 /* Shut down the device */
1428                 pci_disable_device(dev->pdev);
1429                 pci_set_power_state(dev->pdev, PCI_D3cold);
1430                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1431         }
1432 }
1433
1434 /**
1435  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1436  *
1437  * @pdev: pci dev pointer
1438  *
1439  * Callback for the switcheroo driver.  Check of the switcheroo
1440  * state can be changed.
1441  * Returns true if the state can be changed, false if not.
1442  */
1443 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1444 {
1445         struct drm_device *dev = pci_get_drvdata(pdev);
1446
1447         /*
1448         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1449         * locking inversion with the driver load path. And the access here is
1450         * completely racy anyway. So don't bother with locking for now.
1451         */
1452         return atomic_read(&dev->open_count) == 0;
1453 }
1454
1455 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1456         .set_gpu_state = amdgpu_switcheroo_set_state,
1457         .reprobe = NULL,
1458         .can_switch = amdgpu_switcheroo_can_switch,
1459 };
1460
1461 /**
1462  * amdgpu_device_ip_set_clockgating_state - set the CG state
1463  *
1464  * @dev: amdgpu_device pointer
1465  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1466  * @state: clockgating state (gate or ungate)
1467  *
1468  * Sets the requested clockgating state for all instances of
1469  * the hardware IP specified.
1470  * Returns the error code from the last instance.
1471  */
1472 int amdgpu_device_ip_set_clockgating_state(void *dev,
1473                                            enum amd_ip_block_type block_type,
1474                                            enum amd_clockgating_state state)
1475 {
1476         struct amdgpu_device *adev = dev;
1477         int i, r = 0;
1478
1479         for (i = 0; i < adev->num_ip_blocks; i++) {
1480                 if (!adev->ip_blocks[i].status.valid)
1481                         continue;
1482                 if (adev->ip_blocks[i].version->type != block_type)
1483                         continue;
1484                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1485                         continue;
1486                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1487                         (void *)adev, state);
1488                 if (r)
1489                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1490                                   adev->ip_blocks[i].version->funcs->name, r);
1491         }
1492         return r;
1493 }
1494
1495 /**
1496  * amdgpu_device_ip_set_powergating_state - set the PG state
1497  *
1498  * @dev: amdgpu_device pointer
1499  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1500  * @state: powergating state (gate or ungate)
1501  *
1502  * Sets the requested powergating state for all instances of
1503  * the hardware IP specified.
1504  * Returns the error code from the last instance.
1505  */
1506 int amdgpu_device_ip_set_powergating_state(void *dev,
1507                                            enum amd_ip_block_type block_type,
1508                                            enum amd_powergating_state state)
1509 {
1510         struct amdgpu_device *adev = dev;
1511         int i, r = 0;
1512
1513         for (i = 0; i < adev->num_ip_blocks; i++) {
1514                 if (!adev->ip_blocks[i].status.valid)
1515                         continue;
1516                 if (adev->ip_blocks[i].version->type != block_type)
1517                         continue;
1518                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1519                         continue;
1520                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1521                         (void *)adev, state);
1522                 if (r)
1523                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1524                                   adev->ip_blocks[i].version->funcs->name, r);
1525         }
1526         return r;
1527 }
1528
1529 /**
1530  * amdgpu_device_ip_get_clockgating_state - get the CG state
1531  *
1532  * @adev: amdgpu_device pointer
1533  * @flags: clockgating feature flags
1534  *
1535  * Walks the list of IPs on the device and updates the clockgating
1536  * flags for each IP.
1537  * Updates @flags with the feature flags for each hardware IP where
1538  * clockgating is enabled.
1539  */
1540 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1541                                             u32 *flags)
1542 {
1543         int i;
1544
1545         for (i = 0; i < adev->num_ip_blocks; i++) {
1546                 if (!adev->ip_blocks[i].status.valid)
1547                         continue;
1548                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1549                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1550         }
1551 }
1552
1553 /**
1554  * amdgpu_device_ip_wait_for_idle - wait for idle
1555  *
1556  * @adev: amdgpu_device pointer
1557  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1558  *
1559  * Waits for the request hardware IP to be idle.
1560  * Returns 0 for success or a negative error code on failure.
1561  */
1562 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1563                                    enum amd_ip_block_type block_type)
1564 {
1565         int i, r;
1566
1567         for (i = 0; i < adev->num_ip_blocks; i++) {
1568                 if (!adev->ip_blocks[i].status.valid)
1569                         continue;
1570                 if (adev->ip_blocks[i].version->type == block_type) {
1571                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1572                         if (r)
1573                                 return r;
1574                         break;
1575                 }
1576         }
1577         return 0;
1578
1579 }
1580
1581 /**
1582  * amdgpu_device_ip_is_idle - is the hardware IP idle
1583  *
1584  * @adev: amdgpu_device pointer
1585  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1586  *
1587  * Check if the hardware IP is idle or not.
1588  * Returns true if it the IP is idle, false if not.
1589  */
1590 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1591                               enum amd_ip_block_type block_type)
1592 {
1593         int i;
1594
1595         for (i = 0; i < adev->num_ip_blocks; i++) {
1596                 if (!adev->ip_blocks[i].status.valid)
1597                         continue;
1598                 if (adev->ip_blocks[i].version->type == block_type)
1599                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1600         }
1601         return true;
1602
1603 }
1604
1605 /**
1606  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1607  *
1608  * @adev: amdgpu_device pointer
1609  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1610  *
1611  * Returns a pointer to the hardware IP block structure
1612  * if it exists for the asic, otherwise NULL.
1613  */
1614 struct amdgpu_ip_block *
1615 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1616                               enum amd_ip_block_type type)
1617 {
1618         int i;
1619
1620         for (i = 0; i < adev->num_ip_blocks; i++)
1621                 if (adev->ip_blocks[i].version->type == type)
1622                         return &adev->ip_blocks[i];
1623
1624         return NULL;
1625 }
1626
1627 /**
1628  * amdgpu_device_ip_block_version_cmp
1629  *
1630  * @adev: amdgpu_device pointer
1631  * @type: enum amd_ip_block_type
1632  * @major: major version
1633  * @minor: minor version
1634  *
1635  * return 0 if equal or greater
1636  * return 1 if smaller or the ip_block doesn't exist
1637  */
1638 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1639                                        enum amd_ip_block_type type,
1640                                        u32 major, u32 minor)
1641 {
1642         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1643
1644         if (ip_block && ((ip_block->version->major > major) ||
1645                         ((ip_block->version->major == major) &&
1646                         (ip_block->version->minor >= minor))))
1647                 return 0;
1648
1649         return 1;
1650 }
1651
1652 /**
1653  * amdgpu_device_ip_block_add
1654  *
1655  * @adev: amdgpu_device pointer
1656  * @ip_block_version: pointer to the IP to add
1657  *
1658  * Adds the IP block driver information to the collection of IPs
1659  * on the asic.
1660  */
1661 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1662                                const struct amdgpu_ip_block_version *ip_block_version)
1663 {
1664         if (!ip_block_version)
1665                 return -EINVAL;
1666
1667         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1668                   ip_block_version->funcs->name);
1669
1670         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1671
1672         return 0;
1673 }
1674
1675 /**
1676  * amdgpu_device_enable_virtual_display - enable virtual display feature
1677  *
1678  * @adev: amdgpu_device pointer
1679  *
1680  * Enabled the virtual display feature if the user has enabled it via
1681  * the module parameter virtual_display.  This feature provides a virtual
1682  * display hardware on headless boards or in virtualized environments.
1683  * This function parses and validates the configuration string specified by
1684  * the user and configues the virtual display configuration (number of
1685  * virtual connectors, crtcs, etc.) specified.
1686  */
1687 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1688 {
1689         adev->enable_virtual_display = false;
1690
1691         if (amdgpu_virtual_display) {
1692                 struct drm_device *ddev = adev_to_drm(adev);
1693                 const char *pci_address_name = pci_name(ddev->pdev);
1694                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1695
1696                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1697                 pciaddstr_tmp = pciaddstr;
1698                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1699                         pciaddname = strsep(&pciaddname_tmp, ",");
1700                         if (!strcmp("all", pciaddname)
1701                             || !strcmp(pci_address_name, pciaddname)) {
1702                                 long num_crtc;
1703                                 int res = -1;
1704
1705                                 adev->enable_virtual_display = true;
1706
1707                                 if (pciaddname_tmp)
1708                                         res = kstrtol(pciaddname_tmp, 10,
1709                                                       &num_crtc);
1710
1711                                 if (!res) {
1712                                         if (num_crtc < 1)
1713                                                 num_crtc = 1;
1714                                         if (num_crtc > 6)
1715                                                 num_crtc = 6;
1716                                         adev->mode_info.num_crtc = num_crtc;
1717                                 } else {
1718                                         adev->mode_info.num_crtc = 1;
1719                                 }
1720                                 break;
1721                         }
1722                 }
1723
1724                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1725                          amdgpu_virtual_display, pci_address_name,
1726                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1727
1728                 kfree(pciaddstr);
1729         }
1730 }
1731
1732 /**
1733  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1734  *
1735  * @adev: amdgpu_device pointer
1736  *
1737  * Parses the asic configuration parameters specified in the gpu info
1738  * firmware and makes them availale to the driver for use in configuring
1739  * the asic.
1740  * Returns 0 on success, -EINVAL on failure.
1741  */
1742 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1743 {
1744         const char *chip_name;
1745         char fw_name[40];
1746         int err;
1747         const struct gpu_info_firmware_header_v1_0 *hdr;
1748
1749         adev->firmware.gpu_info_fw = NULL;
1750
1751         if (adev->mman.discovery_bin) {
1752                 amdgpu_discovery_get_gfx_info(adev);
1753
1754                 /*
1755                  * FIXME: The bounding box is still needed by Navi12, so
1756                  * temporarily read it from gpu_info firmware. Should be droped
1757                  * when DAL no longer needs it.
1758                  */
1759                 if (adev->asic_type != CHIP_NAVI12)
1760                         return 0;
1761         }
1762
1763         switch (adev->asic_type) {
1764 #ifdef CONFIG_DRM_AMDGPU_SI
1765         case CHIP_VERDE:
1766         case CHIP_TAHITI:
1767         case CHIP_PITCAIRN:
1768         case CHIP_OLAND:
1769         case CHIP_HAINAN:
1770 #endif
1771 #ifdef CONFIG_DRM_AMDGPU_CIK
1772         case CHIP_BONAIRE:
1773         case CHIP_HAWAII:
1774         case CHIP_KAVERI:
1775         case CHIP_KABINI:
1776         case CHIP_MULLINS:
1777 #endif
1778         case CHIP_TOPAZ:
1779         case CHIP_TONGA:
1780         case CHIP_FIJI:
1781         case CHIP_POLARIS10:
1782         case CHIP_POLARIS11:
1783         case CHIP_POLARIS12:
1784         case CHIP_VEGAM:
1785         case CHIP_CARRIZO:
1786         case CHIP_STONEY:
1787         case CHIP_VEGA20:
1788         case CHIP_SIENNA_CICHLID:
1789         case CHIP_NAVY_FLOUNDER:
1790         case CHIP_DIMGREY_CAVEFISH:
1791         default:
1792                 return 0;
1793         case CHIP_VEGA10:
1794                 chip_name = "vega10";
1795                 break;
1796         case CHIP_VEGA12:
1797                 chip_name = "vega12";
1798                 break;
1799         case CHIP_RAVEN:
1800                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1801                         chip_name = "raven2";
1802                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1803                         chip_name = "picasso";
1804                 else
1805                         chip_name = "raven";
1806                 break;
1807         case CHIP_ARCTURUS:
1808                 chip_name = "arcturus";
1809                 break;
1810         case CHIP_RENOIR:
1811                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1812                         chip_name = "renoir";
1813                 else
1814                         chip_name = "green_sardine";
1815                 break;
1816         case CHIP_NAVI10:
1817                 chip_name = "navi10";
1818                 break;
1819         case CHIP_NAVI14:
1820                 chip_name = "navi14";
1821                 break;
1822         case CHIP_NAVI12:
1823                 chip_name = "navi12";
1824                 break;
1825         case CHIP_VANGOGH:
1826                 chip_name = "vangogh";
1827                 break;
1828         }
1829
1830         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1831         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1832         if (err) {
1833                 dev_err(adev->dev,
1834                         "Failed to load gpu_info firmware \"%s\"\n",
1835                         fw_name);
1836                 goto out;
1837         }
1838         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1839         if (err) {
1840                 dev_err(adev->dev,
1841                         "Failed to validate gpu_info firmware \"%s\"\n",
1842                         fw_name);
1843                 goto out;
1844         }
1845
1846         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1847         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1848
1849         switch (hdr->version_major) {
1850         case 1:
1851         {
1852                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1853                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1854                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1855
1856                 /*
1857                  * Should be droped when DAL no longer needs it.
1858                  */
1859                 if (adev->asic_type == CHIP_NAVI12)
1860                         goto parse_soc_bounding_box;
1861
1862                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1863                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1864                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1865                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1866                 adev->gfx.config.max_texture_channel_caches =
1867                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1868                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1869                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1870                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1871                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1872                 adev->gfx.config.double_offchip_lds_buf =
1873                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1874                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1875                 adev->gfx.cu_info.max_waves_per_simd =
1876                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1877                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1878                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1879                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1880                 if (hdr->version_minor >= 1) {
1881                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1882                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1883                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1884                         adev->gfx.config.num_sc_per_sh =
1885                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1886                         adev->gfx.config.num_packer_per_sc =
1887                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1888                 }
1889
1890 parse_soc_bounding_box:
1891                 /*
1892                  * soc bounding box info is not integrated in disocovery table,
1893                  * we always need to parse it from gpu info firmware if needed.
1894                  */
1895                 if (hdr->version_minor == 2) {
1896                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1897                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1898                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1899                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1900                 }
1901                 break;
1902         }
1903         default:
1904                 dev_err(adev->dev,
1905                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1906                 err = -EINVAL;
1907                 goto out;
1908         }
1909 out:
1910         return err;
1911 }
1912
1913 /**
1914  * amdgpu_device_ip_early_init - run early init for hardware IPs
1915  *
1916  * @adev: amdgpu_device pointer
1917  *
1918  * Early initialization pass for hardware IPs.  The hardware IPs that make
1919  * up each asic are discovered each IP's early_init callback is run.  This
1920  * is the first stage in initializing the asic.
1921  * Returns 0 on success, negative error code on failure.
1922  */
1923 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1924 {
1925         int i, r;
1926
1927         amdgpu_device_enable_virtual_display(adev);
1928
1929         if (amdgpu_sriov_vf(adev)) {
1930                 r = amdgpu_virt_request_full_gpu(adev, true);
1931                 if (r)
1932                         return r;
1933         }
1934
1935         switch (adev->asic_type) {
1936 #ifdef CONFIG_DRM_AMDGPU_SI
1937         case CHIP_VERDE:
1938         case CHIP_TAHITI:
1939         case CHIP_PITCAIRN:
1940         case CHIP_OLAND:
1941         case CHIP_HAINAN:
1942                 adev->family = AMDGPU_FAMILY_SI;
1943                 r = si_set_ip_blocks(adev);
1944                 if (r)
1945                         return r;
1946                 break;
1947 #endif
1948 #ifdef CONFIG_DRM_AMDGPU_CIK
1949         case CHIP_BONAIRE:
1950         case CHIP_HAWAII:
1951         case CHIP_KAVERI:
1952         case CHIP_KABINI:
1953         case CHIP_MULLINS:
1954                 if (adev->flags & AMD_IS_APU)
1955                         adev->family = AMDGPU_FAMILY_KV;
1956                 else
1957                         adev->family = AMDGPU_FAMILY_CI;
1958
1959                 r = cik_set_ip_blocks(adev);
1960                 if (r)
1961                         return r;
1962                 break;
1963 #endif
1964         case CHIP_TOPAZ:
1965         case CHIP_TONGA:
1966         case CHIP_FIJI:
1967         case CHIP_POLARIS10:
1968         case CHIP_POLARIS11:
1969         case CHIP_POLARIS12:
1970         case CHIP_VEGAM:
1971         case CHIP_CARRIZO:
1972         case CHIP_STONEY:
1973                 if (adev->flags & AMD_IS_APU)
1974                         adev->family = AMDGPU_FAMILY_CZ;
1975                 else
1976                         adev->family = AMDGPU_FAMILY_VI;
1977
1978                 r = vi_set_ip_blocks(adev);
1979                 if (r)
1980                         return r;
1981                 break;
1982         case CHIP_VEGA10:
1983         case CHIP_VEGA12:
1984         case CHIP_VEGA20:
1985         case CHIP_RAVEN:
1986         case CHIP_ARCTURUS:
1987         case CHIP_RENOIR:
1988                 if (adev->flags & AMD_IS_APU)
1989                         adev->family = AMDGPU_FAMILY_RV;
1990                 else
1991                         adev->family = AMDGPU_FAMILY_AI;
1992
1993                 r = soc15_set_ip_blocks(adev);
1994                 if (r)
1995                         return r;
1996                 break;
1997         case  CHIP_NAVI10:
1998         case  CHIP_NAVI14:
1999         case  CHIP_NAVI12:
2000         case  CHIP_SIENNA_CICHLID:
2001         case  CHIP_NAVY_FLOUNDER:
2002         case  CHIP_DIMGREY_CAVEFISH:
2003         case CHIP_VANGOGH:
2004                 if (adev->asic_type == CHIP_VANGOGH)
2005                         adev->family = AMDGPU_FAMILY_VGH;
2006                 else
2007                         adev->family = AMDGPU_FAMILY_NV;
2008
2009                 r = nv_set_ip_blocks(adev);
2010                 if (r)
2011                         return r;
2012                 break;
2013         default:
2014                 /* FIXME: not supported yet */
2015                 return -EINVAL;
2016         }
2017
2018         amdgpu_amdkfd_device_probe(adev);
2019
2020         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2021         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2022                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2023
2024         for (i = 0; i < adev->num_ip_blocks; i++) {
2025                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2026                         DRM_ERROR("disabled ip block: %d <%s>\n",
2027                                   i, adev->ip_blocks[i].version->funcs->name);
2028                         adev->ip_blocks[i].status.valid = false;
2029                 } else {
2030                         if (adev->ip_blocks[i].version->funcs->early_init) {
2031                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2032                                 if (r == -ENOENT) {
2033                                         adev->ip_blocks[i].status.valid = false;
2034                                 } else if (r) {
2035                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2036                                                   adev->ip_blocks[i].version->funcs->name, r);
2037                                         return r;
2038                                 } else {
2039                                         adev->ip_blocks[i].status.valid = true;
2040                                 }
2041                         } else {
2042                                 adev->ip_blocks[i].status.valid = true;
2043                         }
2044                 }
2045                 /* get the vbios after the asic_funcs are set up */
2046                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2047                         r = amdgpu_device_parse_gpu_info_fw(adev);
2048                         if (r)
2049                                 return r;
2050
2051                         /* Read BIOS */
2052                         if (!amdgpu_get_bios(adev))
2053                                 return -EINVAL;
2054
2055                         r = amdgpu_atombios_init(adev);
2056                         if (r) {
2057                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2058                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2059                                 return r;
2060                         }
2061                 }
2062         }
2063
2064         adev->cg_flags &= amdgpu_cg_mask;
2065         adev->pg_flags &= amdgpu_pg_mask;
2066
2067         return 0;
2068 }
2069
2070 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2071 {
2072         int i, r;
2073
2074         for (i = 0; i < adev->num_ip_blocks; i++) {
2075                 if (!adev->ip_blocks[i].status.sw)
2076                         continue;
2077                 if (adev->ip_blocks[i].status.hw)
2078                         continue;
2079                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2080                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2081                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2082                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2083                         if (r) {
2084                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2085                                           adev->ip_blocks[i].version->funcs->name, r);
2086                                 return r;
2087                         }
2088                         adev->ip_blocks[i].status.hw = true;
2089                 }
2090         }
2091
2092         return 0;
2093 }
2094
2095 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2096 {
2097         int i, r;
2098
2099         for (i = 0; i < adev->num_ip_blocks; i++) {
2100                 if (!adev->ip_blocks[i].status.sw)
2101                         continue;
2102                 if (adev->ip_blocks[i].status.hw)
2103                         continue;
2104                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2105                 if (r) {
2106                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2107                                   adev->ip_blocks[i].version->funcs->name, r);
2108                         return r;
2109                 }
2110                 adev->ip_blocks[i].status.hw = true;
2111         }
2112
2113         return 0;
2114 }
2115
2116 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2117 {
2118         int r = 0;
2119         int i;
2120         uint32_t smu_version;
2121
2122         if (adev->asic_type >= CHIP_VEGA10) {
2123                 for (i = 0; i < adev->num_ip_blocks; i++) {
2124                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2125                                 continue;
2126
2127                         /* no need to do the fw loading again if already done*/
2128                         if (adev->ip_blocks[i].status.hw == true)
2129                                 break;
2130
2131                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2132                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2133                                 if (r) {
2134                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2135                                                           adev->ip_blocks[i].version->funcs->name, r);
2136                                         return r;
2137                                 }
2138                         } else {
2139                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2140                                 if (r) {
2141                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2142                                                           adev->ip_blocks[i].version->funcs->name, r);
2143                                         return r;
2144                                 }
2145                         }
2146
2147                         adev->ip_blocks[i].status.hw = true;
2148                         break;
2149                 }
2150         }
2151
2152         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2153                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2154
2155         return r;
2156 }
2157
2158 /**
2159  * amdgpu_device_ip_init - run init for hardware IPs
2160  *
2161  * @adev: amdgpu_device pointer
2162  *
2163  * Main initialization pass for hardware IPs.  The list of all the hardware
2164  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2165  * are run.  sw_init initializes the software state associated with each IP
2166  * and hw_init initializes the hardware associated with each IP.
2167  * Returns 0 on success, negative error code on failure.
2168  */
2169 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2170 {
2171         int i, r;
2172
2173         r = amdgpu_ras_init(adev);
2174         if (r)
2175                 return r;
2176
2177         for (i = 0; i < adev->num_ip_blocks; i++) {
2178                 if (!adev->ip_blocks[i].status.valid)
2179                         continue;
2180                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2181                 if (r) {
2182                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2183                                   adev->ip_blocks[i].version->funcs->name, r);
2184                         goto init_failed;
2185                 }
2186                 adev->ip_blocks[i].status.sw = true;
2187
2188                 /* need to do gmc hw init early so we can allocate gpu mem */
2189                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2190                         r = amdgpu_device_vram_scratch_init(adev);
2191                         if (r) {
2192                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2193                                 goto init_failed;
2194                         }
2195                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2196                         if (r) {
2197                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2198                                 goto init_failed;
2199                         }
2200                         r = amdgpu_device_wb_init(adev);
2201                         if (r) {
2202                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2203                                 goto init_failed;
2204                         }
2205                         adev->ip_blocks[i].status.hw = true;
2206
2207                         /* right after GMC hw init, we create CSA */
2208                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2209                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2210                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2211                                                                 AMDGPU_CSA_SIZE);
2212                                 if (r) {
2213                                         DRM_ERROR("allocate CSA failed %d\n", r);
2214                                         goto init_failed;
2215                                 }
2216                         }
2217                 }
2218         }
2219
2220         if (amdgpu_sriov_vf(adev))
2221                 amdgpu_virt_init_data_exchange(adev);
2222
2223         r = amdgpu_ib_pool_init(adev);
2224         if (r) {
2225                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2226                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2227                 goto init_failed;
2228         }
2229
2230         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2231         if (r)
2232                 goto init_failed;
2233
2234         r = amdgpu_device_ip_hw_init_phase1(adev);
2235         if (r)
2236                 goto init_failed;
2237
2238         r = amdgpu_device_fw_loading(adev);
2239         if (r)
2240                 goto init_failed;
2241
2242         r = amdgpu_device_ip_hw_init_phase2(adev);
2243         if (r)
2244                 goto init_failed;
2245
2246         /*
2247          * retired pages will be loaded from eeprom and reserved here,
2248          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2249          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2250          * for I2C communication which only true at this point.
2251          *
2252          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2253          * failure from bad gpu situation and stop amdgpu init process
2254          * accordingly. For other failed cases, it will still release all
2255          * the resource and print error message, rather than returning one
2256          * negative value to upper level.
2257          *
2258          * Note: theoretically, this should be called before all vram allocations
2259          * to protect retired page from abusing
2260          */
2261         r = amdgpu_ras_recovery_init(adev);
2262         if (r)
2263                 goto init_failed;
2264
2265         if (adev->gmc.xgmi.num_physical_nodes > 1)
2266                 amdgpu_xgmi_add_device(adev);
2267         amdgpu_amdkfd_device_init(adev);
2268
2269         amdgpu_fru_get_product_info(adev);
2270
2271 init_failed:
2272         if (amdgpu_sriov_vf(adev))
2273                 amdgpu_virt_release_full_gpu(adev, true);
2274
2275         return r;
2276 }
2277
2278 /**
2279  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2280  *
2281  * @adev: amdgpu_device pointer
2282  *
2283  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2284  * this function before a GPU reset.  If the value is retained after a
2285  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2286  */
2287 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2288 {
2289         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2290 }
2291
2292 /**
2293  * amdgpu_device_check_vram_lost - check if vram is valid
2294  *
2295  * @adev: amdgpu_device pointer
2296  *
2297  * Checks the reset magic value written to the gart pointer in VRAM.
2298  * The driver calls this after a GPU reset to see if the contents of
2299  * VRAM is lost or now.
2300  * returns true if vram is lost, false if not.
2301  */
2302 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2303 {
2304         if (memcmp(adev->gart.ptr, adev->reset_magic,
2305                         AMDGPU_RESET_MAGIC_NUM))
2306                 return true;
2307
2308         if (!amdgpu_in_reset(adev))
2309                 return false;
2310
2311         /*
2312          * For all ASICs with baco/mode1 reset, the VRAM is
2313          * always assumed to be lost.
2314          */
2315         switch (amdgpu_asic_reset_method(adev)) {
2316         case AMD_RESET_METHOD_BACO:
2317         case AMD_RESET_METHOD_MODE1:
2318                 return true;
2319         default:
2320                 return false;
2321         }
2322 }
2323
2324 /**
2325  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2326  *
2327  * @adev: amdgpu_device pointer
2328  * @state: clockgating state (gate or ungate)
2329  *
2330  * The list of all the hardware IPs that make up the asic is walked and the
2331  * set_clockgating_state callbacks are run.
2332  * Late initialization pass enabling clockgating for hardware IPs.
2333  * Fini or suspend, pass disabling clockgating for hardware IPs.
2334  * Returns 0 on success, negative error code on failure.
2335  */
2336
2337 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2338                                                 enum amd_clockgating_state state)
2339 {
2340         int i, j, r;
2341
2342         if (amdgpu_emu_mode == 1)
2343                 return 0;
2344
2345         for (j = 0; j < adev->num_ip_blocks; j++) {
2346                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2347                 if (!adev->ip_blocks[i].status.late_initialized)
2348                         continue;
2349                 /* skip CG for VCE/UVD, it's handled specially */
2350                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2351                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2352                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2353                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2354                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2355                         /* enable clockgating to save power */
2356                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2357                                                                                      state);
2358                         if (r) {
2359                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2360                                           adev->ip_blocks[i].version->funcs->name, r);
2361                                 return r;
2362                         }
2363                 }
2364         }
2365
2366         return 0;
2367 }
2368
2369 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2370 {
2371         int i, j, r;
2372
2373         if (amdgpu_emu_mode == 1)
2374                 return 0;
2375
2376         for (j = 0; j < adev->num_ip_blocks; j++) {
2377                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2378                 if (!adev->ip_blocks[i].status.late_initialized)
2379                         continue;
2380                 /* skip CG for VCE/UVD, it's handled specially */
2381                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2382                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2383                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2384                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2385                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2386                         /* enable powergating to save power */
2387                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2388                                                                                         state);
2389                         if (r) {
2390                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2391                                           adev->ip_blocks[i].version->funcs->name, r);
2392                                 return r;
2393                         }
2394                 }
2395         }
2396         return 0;
2397 }
2398
2399 static int amdgpu_device_enable_mgpu_fan_boost(void)
2400 {
2401         struct amdgpu_gpu_instance *gpu_ins;
2402         struct amdgpu_device *adev;
2403         int i, ret = 0;
2404
2405         mutex_lock(&mgpu_info.mutex);
2406
2407         /*
2408          * MGPU fan boost feature should be enabled
2409          * only when there are two or more dGPUs in
2410          * the system
2411          */
2412         if (mgpu_info.num_dgpu < 2)
2413                 goto out;
2414
2415         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2416                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2417                 adev = gpu_ins->adev;
2418                 if (!(adev->flags & AMD_IS_APU) &&
2419                     !gpu_ins->mgpu_fan_enabled) {
2420                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2421                         if (ret)
2422                                 break;
2423
2424                         gpu_ins->mgpu_fan_enabled = 1;
2425                 }
2426         }
2427
2428 out:
2429         mutex_unlock(&mgpu_info.mutex);
2430
2431         return ret;
2432 }
2433
2434 /**
2435  * amdgpu_device_ip_late_init - run late init for hardware IPs
2436  *
2437  * @adev: amdgpu_device pointer
2438  *
2439  * Late initialization pass for hardware IPs.  The list of all the hardware
2440  * IPs that make up the asic is walked and the late_init callbacks are run.
2441  * late_init covers any special initialization that an IP requires
2442  * after all of the have been initialized or something that needs to happen
2443  * late in the init process.
2444  * Returns 0 on success, negative error code on failure.
2445  */
2446 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2447 {
2448         struct amdgpu_gpu_instance *gpu_instance;
2449         int i = 0, r;
2450
2451         for (i = 0; i < adev->num_ip_blocks; i++) {
2452                 if (!adev->ip_blocks[i].status.hw)
2453                         continue;
2454                 if (adev->ip_blocks[i].version->funcs->late_init) {
2455                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2456                         if (r) {
2457                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2458                                           adev->ip_blocks[i].version->funcs->name, r);
2459                                 return r;
2460                         }
2461                 }
2462                 adev->ip_blocks[i].status.late_initialized = true;
2463         }
2464
2465         amdgpu_ras_set_error_query_ready(adev, true);
2466
2467         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2468         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2469
2470         amdgpu_device_fill_reset_magic(adev);
2471
2472         r = amdgpu_device_enable_mgpu_fan_boost();
2473         if (r)
2474                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2475
2476
2477         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2478                 mutex_lock(&mgpu_info.mutex);
2479
2480                 /*
2481                  * Reset device p-state to low as this was booted with high.
2482                  *
2483                  * This should be performed only after all devices from the same
2484                  * hive get initialized.
2485                  *
2486                  * However, it's unknown how many device in the hive in advance.
2487                  * As this is counted one by one during devices initializations.
2488                  *
2489                  * So, we wait for all XGMI interlinked devices initialized.
2490                  * This may bring some delays as those devices may come from
2491                  * different hives. But that should be OK.
2492                  */
2493                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2494                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2495                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2496                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2497                                         continue;
2498
2499                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2500                                                 AMDGPU_XGMI_PSTATE_MIN);
2501                                 if (r) {
2502                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2503                                         break;
2504                                 }
2505                         }
2506                 }
2507
2508                 mutex_unlock(&mgpu_info.mutex);
2509         }
2510
2511         return 0;
2512 }
2513
2514 /**
2515  * amdgpu_device_ip_fini - run fini for hardware IPs
2516  *
2517  * @adev: amdgpu_device pointer
2518  *
2519  * Main teardown pass for hardware IPs.  The list of all the hardware
2520  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2521  * are run.  hw_fini tears down the hardware associated with each IP
2522  * and sw_fini tears down any software state associated with each IP.
2523  * Returns 0 on success, negative error code on failure.
2524  */
2525 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2526 {
2527         int i, r;
2528
2529         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2530                 amdgpu_virt_release_ras_err_handler_data(adev);
2531
2532         amdgpu_ras_pre_fini(adev);
2533
2534         if (adev->gmc.xgmi.num_physical_nodes > 1)
2535                 amdgpu_xgmi_remove_device(adev);
2536
2537         amdgpu_amdkfd_device_fini(adev);
2538
2539         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2540         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2541
2542         /* need to disable SMC first */
2543         for (i = 0; i < adev->num_ip_blocks; i++) {
2544                 if (!adev->ip_blocks[i].status.hw)
2545                         continue;
2546                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2547                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2548                         /* XXX handle errors */
2549                         if (r) {
2550                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2551                                           adev->ip_blocks[i].version->funcs->name, r);
2552                         }
2553                         adev->ip_blocks[i].status.hw = false;
2554                         break;
2555                 }
2556         }
2557
2558         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2559                 if (!adev->ip_blocks[i].status.hw)
2560                         continue;
2561
2562                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2563                 /* XXX handle errors */
2564                 if (r) {
2565                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2566                                   adev->ip_blocks[i].version->funcs->name, r);
2567                 }
2568
2569                 adev->ip_blocks[i].status.hw = false;
2570         }
2571
2572
2573         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2574                 if (!adev->ip_blocks[i].status.sw)
2575                         continue;
2576
2577                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2578                         amdgpu_ucode_free_bo(adev);
2579                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2580                         amdgpu_device_wb_fini(adev);
2581                         amdgpu_device_vram_scratch_fini(adev);
2582                         amdgpu_ib_pool_fini(adev);
2583                 }
2584
2585                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2586                 /* XXX handle errors */
2587                 if (r) {
2588                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2589                                   adev->ip_blocks[i].version->funcs->name, r);
2590                 }
2591                 adev->ip_blocks[i].status.sw = false;
2592                 adev->ip_blocks[i].status.valid = false;
2593         }
2594
2595         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2596                 if (!adev->ip_blocks[i].status.late_initialized)
2597                         continue;
2598                 if (adev->ip_blocks[i].version->funcs->late_fini)
2599                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2600                 adev->ip_blocks[i].status.late_initialized = false;
2601         }
2602
2603         amdgpu_ras_fini(adev);
2604
2605         if (amdgpu_sriov_vf(adev))
2606                 if (amdgpu_virt_release_full_gpu(adev, false))
2607                         DRM_ERROR("failed to release exclusive mode on fini\n");
2608
2609         return 0;
2610 }
2611
2612 /**
2613  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2614  *
2615  * @work: work_struct.
2616  */
2617 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2618 {
2619         struct amdgpu_device *adev =
2620                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2621         int r;
2622
2623         r = amdgpu_ib_ring_tests(adev);
2624         if (r)
2625                 DRM_ERROR("ib ring test failed (%d).\n", r);
2626 }
2627
2628 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2629 {
2630         struct amdgpu_device *adev =
2631                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2632
2633         mutex_lock(&adev->gfx.gfx_off_mutex);
2634         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2635                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2636                         adev->gfx.gfx_off_state = true;
2637         }
2638         mutex_unlock(&adev->gfx.gfx_off_mutex);
2639 }
2640
2641 /**
2642  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2643  *
2644  * @adev: amdgpu_device pointer
2645  *
2646  * Main suspend function for hardware IPs.  The list of all the hardware
2647  * IPs that make up the asic is walked, clockgating is disabled and the
2648  * suspend callbacks are run.  suspend puts the hardware and software state
2649  * in each IP into a state suitable for suspend.
2650  * Returns 0 on success, negative error code on failure.
2651  */
2652 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2653 {
2654         int i, r;
2655
2656         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2657         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2658
2659         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2660                 if (!adev->ip_blocks[i].status.valid)
2661                         continue;
2662
2663                 /* displays are handled separately */
2664                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2665                         continue;
2666
2667                 /* XXX handle errors */
2668                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2669                 /* XXX handle errors */
2670                 if (r) {
2671                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2672                                   adev->ip_blocks[i].version->funcs->name, r);
2673                         return r;
2674                 }
2675
2676                 adev->ip_blocks[i].status.hw = false;
2677         }
2678
2679         return 0;
2680 }
2681
2682 /**
2683  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2684  *
2685  * @adev: amdgpu_device pointer
2686  *
2687  * Main suspend function for hardware IPs.  The list of all the hardware
2688  * IPs that make up the asic is walked, clockgating is disabled and the
2689  * suspend callbacks are run.  suspend puts the hardware and software state
2690  * in each IP into a state suitable for suspend.
2691  * Returns 0 on success, negative error code on failure.
2692  */
2693 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2694 {
2695         int i, r;
2696
2697         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2698                 if (!adev->ip_blocks[i].status.valid)
2699                         continue;
2700                 /* displays are handled in phase1 */
2701                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2702                         continue;
2703                 /* PSP lost connection when err_event_athub occurs */
2704                 if (amdgpu_ras_intr_triggered() &&
2705                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2706                         adev->ip_blocks[i].status.hw = false;
2707                         continue;
2708                 }
2709                 /* XXX handle errors */
2710                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2711                 /* XXX handle errors */
2712                 if (r) {
2713                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2714                                   adev->ip_blocks[i].version->funcs->name, r);
2715                 }
2716                 adev->ip_blocks[i].status.hw = false;
2717                 /* handle putting the SMC in the appropriate state */
2718                 if(!amdgpu_sriov_vf(adev)){
2719                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2720                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2721                                 if (r) {
2722                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2723                                                         adev->mp1_state, r);
2724                                         return r;
2725                                 }
2726                         }
2727                 }
2728                 adev->ip_blocks[i].status.hw = false;
2729         }
2730
2731         return 0;
2732 }
2733
2734 /**
2735  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2736  *
2737  * @adev: amdgpu_device pointer
2738  *
2739  * Main suspend function for hardware IPs.  The list of all the hardware
2740  * IPs that make up the asic is walked, clockgating is disabled and the
2741  * suspend callbacks are run.  suspend puts the hardware and software state
2742  * in each IP into a state suitable for suspend.
2743  * Returns 0 on success, negative error code on failure.
2744  */
2745 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2746 {
2747         int r;
2748
2749         if (amdgpu_sriov_vf(adev))
2750                 amdgpu_virt_request_full_gpu(adev, false);
2751
2752         r = amdgpu_device_ip_suspend_phase1(adev);
2753         if (r)
2754                 return r;
2755         r = amdgpu_device_ip_suspend_phase2(adev);
2756
2757         if (amdgpu_sriov_vf(adev))
2758                 amdgpu_virt_release_full_gpu(adev, false);
2759
2760         return r;
2761 }
2762
2763 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2764 {
2765         int i, r;
2766
2767         static enum amd_ip_block_type ip_order[] = {
2768                 AMD_IP_BLOCK_TYPE_GMC,
2769                 AMD_IP_BLOCK_TYPE_COMMON,
2770                 AMD_IP_BLOCK_TYPE_PSP,
2771                 AMD_IP_BLOCK_TYPE_IH,
2772         };
2773
2774         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2775                 int j;
2776                 struct amdgpu_ip_block *block;
2777
2778                 block = &adev->ip_blocks[i];
2779                 block->status.hw = false;
2780
2781                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2782
2783                         if (block->version->type != ip_order[j] ||
2784                                 !block->status.valid)
2785                                 continue;
2786
2787                         r = block->version->funcs->hw_init(adev);
2788                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2789                         if (r)
2790                                 return r;
2791                         block->status.hw = true;
2792                 }
2793         }
2794
2795         return 0;
2796 }
2797
2798 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2799 {
2800         int i, r;
2801
2802         static enum amd_ip_block_type ip_order[] = {
2803                 AMD_IP_BLOCK_TYPE_SMC,
2804                 AMD_IP_BLOCK_TYPE_DCE,
2805                 AMD_IP_BLOCK_TYPE_GFX,
2806                 AMD_IP_BLOCK_TYPE_SDMA,
2807                 AMD_IP_BLOCK_TYPE_UVD,
2808                 AMD_IP_BLOCK_TYPE_VCE,
2809                 AMD_IP_BLOCK_TYPE_VCN
2810         };
2811
2812         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2813                 int j;
2814                 struct amdgpu_ip_block *block;
2815
2816                 for (j = 0; j < adev->num_ip_blocks; j++) {
2817                         block = &adev->ip_blocks[j];
2818
2819                         if (block->version->type != ip_order[i] ||
2820                                 !block->status.valid ||
2821                                 block->status.hw)
2822                                 continue;
2823
2824                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2825                                 r = block->version->funcs->resume(adev);
2826                         else
2827                                 r = block->version->funcs->hw_init(adev);
2828
2829                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2830                         if (r)
2831                                 return r;
2832                         block->status.hw = true;
2833                 }
2834         }
2835
2836         return 0;
2837 }
2838
2839 /**
2840  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2841  *
2842  * @adev: amdgpu_device pointer
2843  *
2844  * First resume function for hardware IPs.  The list of all the hardware
2845  * IPs that make up the asic is walked and the resume callbacks are run for
2846  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2847  * after a suspend and updates the software state as necessary.  This
2848  * function is also used for restoring the GPU after a GPU reset.
2849  * Returns 0 on success, negative error code on failure.
2850  */
2851 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2852 {
2853         int i, r;
2854
2855         for (i = 0; i < adev->num_ip_blocks; i++) {
2856                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2857                         continue;
2858                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2859                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2860                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2861
2862                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2863                         if (r) {
2864                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2865                                           adev->ip_blocks[i].version->funcs->name, r);
2866                                 return r;
2867                         }
2868                         adev->ip_blocks[i].status.hw = true;
2869                 }
2870         }
2871
2872         return 0;
2873 }
2874
2875 /**
2876  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2877  *
2878  * @adev: amdgpu_device pointer
2879  *
2880  * First resume function for hardware IPs.  The list of all the hardware
2881  * IPs that make up the asic is walked and the resume callbacks are run for
2882  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2883  * functional state after a suspend and updates the software state as
2884  * necessary.  This function is also used for restoring the GPU after a GPU
2885  * reset.
2886  * Returns 0 on success, negative error code on failure.
2887  */
2888 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2889 {
2890         int i, r;
2891
2892         for (i = 0; i < adev->num_ip_blocks; i++) {
2893                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2894                         continue;
2895                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2896                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2897                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2898                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2899                         continue;
2900                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2901                 if (r) {
2902                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2903                                   adev->ip_blocks[i].version->funcs->name, r);
2904                         return r;
2905                 }
2906                 adev->ip_blocks[i].status.hw = true;
2907         }
2908
2909         return 0;
2910 }
2911
2912 /**
2913  * amdgpu_device_ip_resume - run resume for hardware IPs
2914  *
2915  * @adev: amdgpu_device pointer
2916  *
2917  * Main resume function for hardware IPs.  The hardware IPs
2918  * are split into two resume functions because they are
2919  * are also used in in recovering from a GPU reset and some additional
2920  * steps need to be take between them.  In this case (S3/S4) they are
2921  * run sequentially.
2922  * Returns 0 on success, negative error code on failure.
2923  */
2924 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2925 {
2926         int r;
2927
2928         r = amdgpu_device_ip_resume_phase1(adev);
2929         if (r)
2930                 return r;
2931
2932         r = amdgpu_device_fw_loading(adev);
2933         if (r)
2934                 return r;
2935
2936         r = amdgpu_device_ip_resume_phase2(adev);
2937
2938         return r;
2939 }
2940
2941 /**
2942  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2943  *
2944  * @adev: amdgpu_device pointer
2945  *
2946  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2947  */
2948 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2949 {
2950         if (amdgpu_sriov_vf(adev)) {
2951                 if (adev->is_atom_fw) {
2952                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2953                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2954                 } else {
2955                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2956                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2957                 }
2958
2959                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2960                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2961         }
2962 }
2963
2964 /**
2965  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2966  *
2967  * @asic_type: AMD asic type
2968  *
2969  * Check if there is DC (new modesetting infrastructre) support for an asic.
2970  * returns true if DC has support, false if not.
2971  */
2972 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2973 {
2974         switch (asic_type) {
2975 #if defined(CONFIG_DRM_AMD_DC)
2976 #if defined(CONFIG_DRM_AMD_DC_SI)
2977         case CHIP_TAHITI:
2978         case CHIP_PITCAIRN:
2979         case CHIP_VERDE:
2980         case CHIP_OLAND:
2981 #endif
2982         case CHIP_BONAIRE:
2983         case CHIP_KAVERI:
2984         case CHIP_KABINI:
2985         case CHIP_MULLINS:
2986                 /*
2987                  * We have systems in the wild with these ASICs that require
2988                  * LVDS and VGA support which is not supported with DC.
2989                  *
2990                  * Fallback to the non-DC driver here by default so as not to
2991                  * cause regressions.
2992                  */
2993                 return amdgpu_dc > 0;
2994         case CHIP_HAWAII:
2995         case CHIP_CARRIZO:
2996         case CHIP_STONEY:
2997         case CHIP_POLARIS10:
2998         case CHIP_POLARIS11:
2999         case CHIP_POLARIS12:
3000         case CHIP_VEGAM:
3001         case CHIP_TONGA:
3002         case CHIP_FIJI:
3003         case CHIP_VEGA10:
3004         case CHIP_VEGA12:
3005         case CHIP_VEGA20:
3006 #if defined(CONFIG_DRM_AMD_DC_DCN)
3007         case CHIP_RAVEN:
3008         case CHIP_NAVI10:
3009         case CHIP_NAVI14:
3010         case CHIP_NAVI12:
3011         case CHIP_RENOIR:
3012 #endif
3013 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3014         case CHIP_SIENNA_CICHLID:
3015         case CHIP_NAVY_FLOUNDER:
3016         case CHIP_DIMGREY_CAVEFISH:
3017 #endif
3018                 return amdgpu_dc != 0;
3019 #endif
3020         default:
3021                 if (amdgpu_dc > 0)
3022                         DRM_INFO("Display Core has been requested via kernel parameter "
3023                                          "but isn't supported by ASIC, ignoring\n");
3024                 return false;
3025         }
3026 }
3027
3028 /**
3029  * amdgpu_device_has_dc_support - check if dc is supported
3030  *
3031  * @adev: amdgpu_device_pointer
3032  *
3033  * Returns true for supported, false for not supported
3034  */
3035 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3036 {
3037         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3038                 return false;
3039
3040         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3041 }
3042
3043
3044 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3045 {
3046         struct amdgpu_device *adev =
3047                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3048         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3049
3050         /* It's a bug to not have a hive within this function */
3051         if (WARN_ON(!hive))
3052                 return;
3053
3054         /*
3055          * Use task barrier to synchronize all xgmi reset works across the
3056          * hive. task_barrier_enter and task_barrier_exit will block
3057          * until all the threads running the xgmi reset works reach
3058          * those points. task_barrier_full will do both blocks.
3059          */
3060         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3061
3062                 task_barrier_enter(&hive->tb);
3063                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3064
3065                 if (adev->asic_reset_res)
3066                         goto fail;
3067
3068                 task_barrier_exit(&hive->tb);
3069                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3070
3071                 if (adev->asic_reset_res)
3072                         goto fail;
3073
3074                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3075                         adev->mmhub.funcs->reset_ras_error_count(adev);
3076         } else {
3077
3078                 task_barrier_full(&hive->tb);
3079                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3080         }
3081
3082 fail:
3083         if (adev->asic_reset_res)
3084                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3085                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3086         amdgpu_put_xgmi_hive(hive);
3087 }
3088
3089 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3090 {
3091         char *input = amdgpu_lockup_timeout;
3092         char *timeout_setting = NULL;
3093         int index = 0;
3094         long timeout;
3095         int ret = 0;
3096
3097         /*
3098          * By default timeout for non compute jobs is 10000.
3099          * And there is no timeout enforced on compute jobs.
3100          * In SR-IOV or passthrough mode, timeout for compute
3101          * jobs are 60000 by default.
3102          */
3103         adev->gfx_timeout = msecs_to_jiffies(10000);
3104         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3105         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3106                 adev->compute_timeout =  msecs_to_jiffies(60000);
3107         else
3108                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3109
3110         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3111                 while ((timeout_setting = strsep(&input, ",")) &&
3112                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3113                         ret = kstrtol(timeout_setting, 0, &timeout);
3114                         if (ret)
3115                                 return ret;
3116
3117                         if (timeout == 0) {
3118                                 index++;
3119                                 continue;
3120                         } else if (timeout < 0) {
3121                                 timeout = MAX_SCHEDULE_TIMEOUT;
3122                         } else {
3123                                 timeout = msecs_to_jiffies(timeout);
3124                         }
3125
3126                         switch (index++) {
3127                         case 0:
3128                                 adev->gfx_timeout = timeout;
3129                                 break;
3130                         case 1:
3131                                 adev->compute_timeout = timeout;
3132                                 break;
3133                         case 2:
3134                                 adev->sdma_timeout = timeout;
3135                                 break;
3136                         case 3:
3137                                 adev->video_timeout = timeout;
3138                                 break;
3139                         default:
3140                                 break;
3141                         }
3142                 }
3143                 /*
3144                  * There is only one value specified and
3145                  * it should apply to all non-compute jobs.
3146                  */
3147                 if (index == 1) {
3148                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3149                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3150                                 adev->compute_timeout = adev->gfx_timeout;
3151                 }
3152         }
3153
3154         return ret;
3155 }
3156
3157 static const struct attribute *amdgpu_dev_attributes[] = {
3158         &dev_attr_product_name.attr,
3159         &dev_attr_product_number.attr,
3160         &dev_attr_serial_number.attr,
3161         &dev_attr_pcie_replay_count.attr,
3162         NULL
3163 };
3164
3165
3166 /**
3167  * amdgpu_device_init - initialize the driver
3168  *
3169  * @adev: amdgpu_device pointer
3170  * @flags: driver flags
3171  *
3172  * Initializes the driver info and hw (all asics).
3173  * Returns 0 for success or an error on failure.
3174  * Called at driver startup.
3175  */
3176 int amdgpu_device_init(struct amdgpu_device *adev,
3177                        uint32_t flags)
3178 {
3179         struct drm_device *ddev = adev_to_drm(adev);
3180         struct pci_dev *pdev = adev->pdev;
3181         int r, i;
3182         bool boco = false;
3183         u32 max_MBps;
3184
3185         adev->shutdown = false;
3186         adev->flags = flags;
3187
3188         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3189                 adev->asic_type = amdgpu_force_asic_type;
3190         else
3191                 adev->asic_type = flags & AMD_ASIC_MASK;
3192
3193         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3194         if (amdgpu_emu_mode == 1)
3195                 adev->usec_timeout *= 10;
3196         adev->gmc.gart_size = 512 * 1024 * 1024;
3197         adev->accel_working = false;
3198         adev->num_rings = 0;
3199         adev->mman.buffer_funcs = NULL;
3200         adev->mman.buffer_funcs_ring = NULL;
3201         adev->vm_manager.vm_pte_funcs = NULL;
3202         adev->vm_manager.vm_pte_num_scheds = 0;
3203         adev->gmc.gmc_funcs = NULL;
3204         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3205         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3206
3207         adev->smc_rreg = &amdgpu_invalid_rreg;
3208         adev->smc_wreg = &amdgpu_invalid_wreg;
3209         adev->pcie_rreg = &amdgpu_invalid_rreg;
3210         adev->pcie_wreg = &amdgpu_invalid_wreg;
3211         adev->pciep_rreg = &amdgpu_invalid_rreg;
3212         adev->pciep_wreg = &amdgpu_invalid_wreg;
3213         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3214         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3215         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3216         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3217         adev->didt_rreg = &amdgpu_invalid_rreg;
3218         adev->didt_wreg = &amdgpu_invalid_wreg;
3219         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3220         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3221         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3222         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3223
3224         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3225                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3226                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3227
3228         /* mutex initialization are all done here so we
3229          * can recall function without having locking issues */
3230         atomic_set(&adev->irq.ih.lock, 0);
3231         mutex_init(&adev->firmware.mutex);
3232         mutex_init(&adev->pm.mutex);
3233         mutex_init(&adev->gfx.gpu_clock_mutex);
3234         mutex_init(&adev->srbm_mutex);
3235         mutex_init(&adev->gfx.pipe_reserve_mutex);
3236         mutex_init(&adev->gfx.gfx_off_mutex);
3237         mutex_init(&adev->grbm_idx_mutex);
3238         mutex_init(&adev->mn_lock);
3239         mutex_init(&adev->virt.vf_errors.lock);
3240         hash_init(adev->mn_hash);
3241         atomic_set(&adev->in_gpu_reset, 0);
3242         init_rwsem(&adev->reset_sem);
3243         mutex_init(&adev->psp.mutex);
3244         mutex_init(&adev->notifier_lock);
3245
3246         r = amdgpu_device_check_arguments(adev);
3247         if (r)
3248                 return r;
3249
3250         spin_lock_init(&adev->mmio_idx_lock);
3251         spin_lock_init(&adev->smc_idx_lock);
3252         spin_lock_init(&adev->pcie_idx_lock);
3253         spin_lock_init(&adev->uvd_ctx_idx_lock);
3254         spin_lock_init(&adev->didt_idx_lock);
3255         spin_lock_init(&adev->gc_cac_idx_lock);
3256         spin_lock_init(&adev->se_cac_idx_lock);
3257         spin_lock_init(&adev->audio_endpt_idx_lock);
3258         spin_lock_init(&adev->mm_stats.lock);
3259
3260         INIT_LIST_HEAD(&adev->shadow_list);
3261         mutex_init(&adev->shadow_list_lock);
3262
3263         INIT_DELAYED_WORK(&adev->delayed_init_work,
3264                           amdgpu_device_delayed_init_work_handler);
3265         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3266                           amdgpu_device_delay_enable_gfx_off);
3267
3268         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3269
3270         adev->gfx.gfx_off_req_count = 1;
3271         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3272
3273         atomic_set(&adev->throttling_logging_enabled, 1);
3274         /*
3275          * If throttling continues, logging will be performed every minute
3276          * to avoid log flooding. "-1" is subtracted since the thermal
3277          * throttling interrupt comes every second. Thus, the total logging
3278          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3279          * for throttling interrupt) = 60 seconds.
3280          */
3281         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3282         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3283
3284         /* Registers mapping */
3285         /* TODO: block userspace mapping of io register */
3286         if (adev->asic_type >= CHIP_BONAIRE) {
3287                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3288                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3289         } else {
3290                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3291                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3292         }
3293
3294         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3295         if (adev->rmmio == NULL) {
3296                 return -ENOMEM;
3297         }
3298         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3299         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3300
3301         /* io port mapping */
3302         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3303                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3304                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3305                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3306                         break;
3307                 }
3308         }
3309         if (adev->rio_mem == NULL)
3310                 DRM_INFO("PCI I/O BAR is not found.\n");
3311
3312         /* enable PCIE atomic ops */
3313         r = pci_enable_atomic_ops_to_root(adev->pdev,
3314                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3315                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3316         if (r) {
3317                 adev->have_atomics_support = false;
3318                 DRM_INFO("PCIE atomic ops is not supported\n");
3319         } else {
3320                 adev->have_atomics_support = true;
3321         }
3322
3323         amdgpu_device_get_pcie_info(adev);
3324
3325         if (amdgpu_mcbp)
3326                 DRM_INFO("MCBP is enabled\n");
3327
3328         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3329                 adev->enable_mes = true;
3330
3331         /* detect hw virtualization here */
3332         amdgpu_detect_virtualization(adev);
3333
3334         r = amdgpu_device_get_job_timeout_settings(adev);
3335         if (r) {
3336                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3337                 goto failed_unmap;
3338         }
3339
3340         /* early init functions */
3341         r = amdgpu_device_ip_early_init(adev);
3342         if (r)
3343                 goto failed_unmap;
3344
3345         /* doorbell bar mapping and doorbell index init*/
3346         amdgpu_device_doorbell_init(adev);
3347
3348         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3349         /* this will fail for cards that aren't VGA class devices, just
3350          * ignore it */
3351         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3352
3353         if (amdgpu_device_supports_boco(ddev))
3354                 boco = true;
3355         if (amdgpu_has_atpx() &&
3356             (amdgpu_is_atpx_hybrid() ||
3357              amdgpu_has_atpx_dgpu_power_cntl()) &&
3358             !pci_is_thunderbolt_attached(adev->pdev))
3359                 vga_switcheroo_register_client(adev->pdev,
3360                                                &amdgpu_switcheroo_ops, boco);
3361         if (boco)
3362                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3363
3364         if (amdgpu_emu_mode == 1) {
3365                 /* post the asic on emulation mode */
3366                 emu_soc_asic_init(adev);
3367                 goto fence_driver_init;
3368         }
3369
3370         /* detect if we are with an SRIOV vbios */
3371         amdgpu_device_detect_sriov_bios(adev);
3372
3373         /* check if we need to reset the asic
3374          *  E.g., driver was not cleanly unloaded previously, etc.
3375          */
3376         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3377                 r = amdgpu_asic_reset(adev);
3378                 if (r) {
3379                         dev_err(adev->dev, "asic reset on init failed\n");
3380                         goto failed;
3381                 }
3382         }
3383
3384         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3385
3386         /* Post card if necessary */
3387         if (amdgpu_device_need_post(adev)) {
3388                 if (!adev->bios) {
3389                         dev_err(adev->dev, "no vBIOS found\n");
3390                         r = -EINVAL;
3391                         goto failed;
3392                 }
3393                 DRM_INFO("GPU posting now...\n");
3394                 r = amdgpu_device_asic_init(adev);
3395                 if (r) {
3396                         dev_err(adev->dev, "gpu post error!\n");
3397                         goto failed;
3398                 }
3399         }
3400
3401         if (adev->is_atom_fw) {
3402                 /* Initialize clocks */
3403                 r = amdgpu_atomfirmware_get_clock_info(adev);
3404                 if (r) {
3405                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3406                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3407                         goto failed;
3408                 }
3409         } else {
3410                 /* Initialize clocks */
3411                 r = amdgpu_atombios_get_clock_info(adev);
3412                 if (r) {
3413                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3414                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3415                         goto failed;
3416                 }
3417                 /* init i2c buses */
3418                 if (!amdgpu_device_has_dc_support(adev))
3419                         amdgpu_atombios_i2c_init(adev);
3420         }
3421
3422 fence_driver_init:
3423         /* Fence driver */
3424         r = amdgpu_fence_driver_init(adev);
3425         if (r) {
3426                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3427                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3428                 goto failed;
3429         }
3430
3431         /* init the mode config */
3432         drm_mode_config_init(adev_to_drm(adev));
3433
3434         r = amdgpu_device_ip_init(adev);
3435         if (r) {
3436                 /* failed in exclusive mode due to timeout */
3437                 if (amdgpu_sriov_vf(adev) &&
3438                     !amdgpu_sriov_runtime(adev) &&
3439                     amdgpu_virt_mmio_blocked(adev) &&
3440                     !amdgpu_virt_wait_reset(adev)) {
3441                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3442                         /* Don't send request since VF is inactive. */
3443                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3444                         adev->virt.ops = NULL;
3445                         r = -EAGAIN;
3446                         goto failed;
3447                 }
3448                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3449                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3450                 goto failed;
3451         }
3452
3453         dev_info(adev->dev,
3454                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3455                         adev->gfx.config.max_shader_engines,
3456                         adev->gfx.config.max_sh_per_se,
3457                         adev->gfx.config.max_cu_per_sh,
3458                         adev->gfx.cu_info.number);
3459
3460         adev->accel_working = true;
3461
3462         amdgpu_vm_check_compute_bug(adev);
3463
3464         /* Initialize the buffer migration limit. */
3465         if (amdgpu_moverate >= 0)
3466                 max_MBps = amdgpu_moverate;
3467         else
3468                 max_MBps = 8; /* Allow 8 MB/s. */
3469         /* Get a log2 for easy divisions. */
3470         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3471
3472         amdgpu_fbdev_init(adev);
3473
3474         r = amdgpu_pm_sysfs_init(adev);
3475         if (r) {
3476                 adev->pm_sysfs_en = false;
3477                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3478         } else
3479                 adev->pm_sysfs_en = true;
3480
3481         r = amdgpu_ucode_sysfs_init(adev);
3482         if (r) {
3483                 adev->ucode_sysfs_en = false;
3484                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3485         } else
3486                 adev->ucode_sysfs_en = true;
3487
3488         if ((amdgpu_testing & 1)) {
3489                 if (adev->accel_working)
3490                         amdgpu_test_moves(adev);
3491                 else
3492                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3493         }
3494         if (amdgpu_benchmarking) {
3495                 if (adev->accel_working)
3496                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3497                 else
3498                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3499         }
3500
3501         /*
3502          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3503          * Otherwise the mgpu fan boost feature will be skipped due to the
3504          * gpu instance is counted less.
3505          */
3506         amdgpu_register_gpu_instance(adev);
3507
3508         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3509          * explicit gating rather than handling it automatically.
3510          */
3511         r = amdgpu_device_ip_late_init(adev);
3512         if (r) {
3513                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3514                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3515                 goto failed;
3516         }
3517
3518         /* must succeed. */
3519         amdgpu_ras_resume(adev);
3520
3521         queue_delayed_work(system_wq, &adev->delayed_init_work,
3522                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3523
3524         if (amdgpu_sriov_vf(adev))
3525                 flush_delayed_work(&adev->delayed_init_work);
3526
3527         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3528         if (r)
3529                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3530
3531         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3532                 r = amdgpu_pmu_init(adev);
3533         if (r)
3534                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3535
3536         /* Have stored pci confspace at hand for restore in sudden PCI error */
3537         if (amdgpu_device_cache_pci_state(adev->pdev))
3538                 pci_restore_state(pdev);
3539
3540         return 0;
3541
3542 failed:
3543         amdgpu_vf_error_trans_all(adev);
3544         if (boco)
3545                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3546
3547 failed_unmap:
3548         iounmap(adev->rmmio);
3549         adev->rmmio = NULL;
3550
3551         return r;
3552 }
3553
3554 /**
3555  * amdgpu_device_fini - tear down the driver
3556  *
3557  * @adev: amdgpu_device pointer
3558  *
3559  * Tear down the driver info (all asics).
3560  * Called at driver shutdown.
3561  */
3562 void amdgpu_device_fini(struct amdgpu_device *adev)
3563 {
3564         dev_info(adev->dev, "amdgpu: finishing device.\n");
3565         flush_delayed_work(&adev->delayed_init_work);
3566         adev->shutdown = true;
3567
3568         kfree(adev->pci_state);
3569
3570         /* make sure IB test finished before entering exclusive mode
3571          * to avoid preemption on IB test
3572          * */
3573         if (amdgpu_sriov_vf(adev)) {
3574                 amdgpu_virt_request_full_gpu(adev, false);
3575                 amdgpu_virt_fini_data_exchange(adev);
3576         }
3577
3578         /* disable all interrupts */
3579         amdgpu_irq_disable_all(adev);
3580         if (adev->mode_info.mode_config_initialized){
3581                 if (!amdgpu_device_has_dc_support(adev))
3582                         drm_helper_force_disable_all(adev_to_drm(adev));
3583                 else
3584                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3585         }
3586         amdgpu_fence_driver_fini(adev);
3587         if (adev->pm_sysfs_en)
3588                 amdgpu_pm_sysfs_fini(adev);
3589         amdgpu_fbdev_fini(adev);
3590         amdgpu_device_ip_fini(adev);
3591         release_firmware(adev->firmware.gpu_info_fw);
3592         adev->firmware.gpu_info_fw = NULL;
3593         adev->accel_working = false;
3594         /* free i2c buses */
3595         if (!amdgpu_device_has_dc_support(adev))
3596                 amdgpu_i2c_fini(adev);
3597
3598         if (amdgpu_emu_mode != 1)
3599                 amdgpu_atombios_fini(adev);
3600
3601         kfree(adev->bios);
3602         adev->bios = NULL;
3603         if (amdgpu_has_atpx() &&
3604             (amdgpu_is_atpx_hybrid() ||
3605              amdgpu_has_atpx_dgpu_power_cntl()) &&
3606             !pci_is_thunderbolt_attached(adev->pdev))
3607                 vga_switcheroo_unregister_client(adev->pdev);
3608         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3609                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3610         vga_client_register(adev->pdev, NULL, NULL, NULL);
3611         if (adev->rio_mem)
3612                 pci_iounmap(adev->pdev, adev->rio_mem);
3613         adev->rio_mem = NULL;
3614         iounmap(adev->rmmio);
3615         adev->rmmio = NULL;
3616         amdgpu_device_doorbell_fini(adev);
3617
3618         if (adev->ucode_sysfs_en)
3619                 amdgpu_ucode_sysfs_fini(adev);
3620
3621         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3622         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3623                 amdgpu_pmu_fini(adev);
3624         if (adev->mman.discovery_bin)
3625                 amdgpu_discovery_fini(adev);
3626 }
3627
3628
3629 /*
3630  * Suspend & resume.
3631  */
3632 /**
3633  * amdgpu_device_suspend - initiate device suspend
3634  *
3635  * @dev: drm dev pointer
3636  * @fbcon : notify the fbdev of suspend
3637  *
3638  * Puts the hw in the suspend state (all asics).
3639  * Returns 0 for success or an error on failure.
3640  * Called at driver suspend.
3641  */
3642 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3643 {
3644         struct amdgpu_device *adev;
3645         struct drm_crtc *crtc;
3646         struct drm_connector *connector;
3647         struct drm_connector_list_iter iter;
3648         int r;
3649
3650         adev = drm_to_adev(dev);
3651
3652         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3653                 return 0;
3654
3655         adev->in_suspend = true;
3656         drm_kms_helper_poll_disable(dev);
3657
3658         if (fbcon)
3659                 amdgpu_fbdev_set_suspend(adev, 1);
3660
3661         cancel_delayed_work_sync(&adev->delayed_init_work);
3662
3663         if (!amdgpu_device_has_dc_support(adev)) {
3664                 /* turn off display hw */
3665                 drm_modeset_lock_all(dev);
3666                 drm_connector_list_iter_begin(dev, &iter);
3667                 drm_for_each_connector_iter(connector, &iter)
3668                         drm_helper_connector_dpms(connector,
3669                                                   DRM_MODE_DPMS_OFF);
3670                 drm_connector_list_iter_end(&iter);
3671                 drm_modeset_unlock_all(dev);
3672                         /* unpin the front buffers and cursors */
3673                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3674                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3675                         struct drm_framebuffer *fb = crtc->primary->fb;
3676                         struct amdgpu_bo *robj;
3677
3678                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3679                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3680                                 r = amdgpu_bo_reserve(aobj, true);
3681                                 if (r == 0) {
3682                                         amdgpu_bo_unpin(aobj);
3683                                         amdgpu_bo_unreserve(aobj);
3684                                 }
3685                         }
3686
3687                         if (fb == NULL || fb->obj[0] == NULL) {
3688                                 continue;
3689                         }
3690                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3691                         /* don't unpin kernel fb objects */
3692                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3693                                 r = amdgpu_bo_reserve(robj, true);
3694                                 if (r == 0) {
3695                                         amdgpu_bo_unpin(robj);
3696                                         amdgpu_bo_unreserve(robj);
3697                                 }
3698                         }
3699                 }
3700         }
3701
3702         amdgpu_ras_suspend(adev);
3703
3704         r = amdgpu_device_ip_suspend_phase1(adev);
3705
3706         amdgpu_amdkfd_suspend(adev, !fbcon);
3707
3708         /* evict vram memory */
3709         amdgpu_bo_evict_vram(adev);
3710
3711         amdgpu_fence_driver_suspend(adev);
3712
3713         r = amdgpu_device_ip_suspend_phase2(adev);
3714
3715         /* evict remaining vram memory
3716          * This second call to evict vram is to evict the gart page table
3717          * using the CPU.
3718          */
3719         amdgpu_bo_evict_vram(adev);
3720
3721         return 0;
3722 }
3723
3724 /**
3725  * amdgpu_device_resume - initiate device resume
3726  *
3727  * @dev: drm dev pointer
3728  * @fbcon : notify the fbdev of resume
3729  *
3730  * Bring the hw back to operating state (all asics).
3731  * Returns 0 for success or an error on failure.
3732  * Called at driver resume.
3733  */
3734 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3735 {
3736         struct drm_connector *connector;
3737         struct drm_connector_list_iter iter;
3738         struct amdgpu_device *adev = drm_to_adev(dev);
3739         struct drm_crtc *crtc;
3740         int r = 0;
3741
3742         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3743                 return 0;
3744
3745         /* post card */
3746         if (amdgpu_device_need_post(adev)) {
3747                 r = amdgpu_device_asic_init(adev);
3748                 if (r)
3749                         dev_err(adev->dev, "amdgpu asic init failed\n");
3750         }
3751
3752         r = amdgpu_device_ip_resume(adev);
3753         if (r) {
3754                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3755                 return r;
3756         }
3757         amdgpu_fence_driver_resume(adev);
3758
3759
3760         r = amdgpu_device_ip_late_init(adev);
3761         if (r)
3762                 return r;
3763
3764         queue_delayed_work(system_wq, &adev->delayed_init_work,
3765                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3766
3767         if (!amdgpu_device_has_dc_support(adev)) {
3768                 /* pin cursors */
3769                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3770                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3771
3772                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3773                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3774                                 r = amdgpu_bo_reserve(aobj, true);
3775                                 if (r == 0) {
3776                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3777                                         if (r != 0)
3778                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3779                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3780                                         amdgpu_bo_unreserve(aobj);
3781                                 }
3782                         }
3783                 }
3784         }
3785         r = amdgpu_amdkfd_resume(adev, !fbcon);
3786         if (r)
3787                 return r;
3788
3789         /* Make sure IB tests flushed */
3790         flush_delayed_work(&adev->delayed_init_work);
3791
3792         /* blat the mode back in */
3793         if (fbcon) {
3794                 if (!amdgpu_device_has_dc_support(adev)) {
3795                         /* pre DCE11 */
3796                         drm_helper_resume_force_mode(dev);
3797
3798                         /* turn on display hw */
3799                         drm_modeset_lock_all(dev);
3800
3801                         drm_connector_list_iter_begin(dev, &iter);
3802                         drm_for_each_connector_iter(connector, &iter)
3803                                 drm_helper_connector_dpms(connector,
3804                                                           DRM_MODE_DPMS_ON);
3805                         drm_connector_list_iter_end(&iter);
3806
3807                         drm_modeset_unlock_all(dev);
3808                 }
3809                 amdgpu_fbdev_set_suspend(adev, 0);
3810         }
3811
3812         drm_kms_helper_poll_enable(dev);
3813
3814         amdgpu_ras_resume(adev);
3815
3816         /*
3817          * Most of the connector probing functions try to acquire runtime pm
3818          * refs to ensure that the GPU is powered on when connector polling is
3819          * performed. Since we're calling this from a runtime PM callback,
3820          * trying to acquire rpm refs will cause us to deadlock.
3821          *
3822          * Since we're guaranteed to be holding the rpm lock, it's safe to
3823          * temporarily disable the rpm helpers so this doesn't deadlock us.
3824          */
3825 #ifdef CONFIG_PM
3826         dev->dev->power.disable_depth++;
3827 #endif
3828         if (!amdgpu_device_has_dc_support(adev))
3829                 drm_helper_hpd_irq_event(dev);
3830         else
3831                 drm_kms_helper_hotplug_event(dev);
3832 #ifdef CONFIG_PM
3833         dev->dev->power.disable_depth--;
3834 #endif
3835         adev->in_suspend = false;
3836
3837         return 0;
3838 }
3839
3840 /**
3841  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3842  *
3843  * @adev: amdgpu_device pointer
3844  *
3845  * The list of all the hardware IPs that make up the asic is walked and
3846  * the check_soft_reset callbacks are run.  check_soft_reset determines
3847  * if the asic is still hung or not.
3848  * Returns true if any of the IPs are still in a hung state, false if not.
3849  */
3850 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3851 {
3852         int i;
3853         bool asic_hang = false;
3854
3855         if (amdgpu_sriov_vf(adev))
3856                 return true;
3857
3858         if (amdgpu_asic_need_full_reset(adev))
3859                 return true;
3860
3861         for (i = 0; i < adev->num_ip_blocks; i++) {
3862                 if (!adev->ip_blocks[i].status.valid)
3863                         continue;
3864                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3865                         adev->ip_blocks[i].status.hang =
3866                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3867                 if (adev->ip_blocks[i].status.hang) {
3868                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3869                         asic_hang = true;
3870                 }
3871         }
3872         return asic_hang;
3873 }
3874
3875 /**
3876  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3877  *
3878  * @adev: amdgpu_device pointer
3879  *
3880  * The list of all the hardware IPs that make up the asic is walked and the
3881  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3882  * handles any IP specific hardware or software state changes that are
3883  * necessary for a soft reset to succeed.
3884  * Returns 0 on success, negative error code on failure.
3885  */
3886 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3887 {
3888         int i, r = 0;
3889
3890         for (i = 0; i < adev->num_ip_blocks; i++) {
3891                 if (!adev->ip_blocks[i].status.valid)
3892                         continue;
3893                 if (adev->ip_blocks[i].status.hang &&
3894                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3895                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3896                         if (r)
3897                                 return r;
3898                 }
3899         }
3900
3901         return 0;
3902 }
3903
3904 /**
3905  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3906  *
3907  * @adev: amdgpu_device pointer
3908  *
3909  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3910  * reset is necessary to recover.
3911  * Returns true if a full asic reset is required, false if not.
3912  */
3913 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3914 {
3915         int i;
3916
3917         if (amdgpu_asic_need_full_reset(adev))
3918                 return true;
3919
3920         for (i = 0; i < adev->num_ip_blocks; i++) {
3921                 if (!adev->ip_blocks[i].status.valid)
3922                         continue;
3923                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3924                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3925                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3926                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3927                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3928                         if (adev->ip_blocks[i].status.hang) {
3929                                 dev_info(adev->dev, "Some block need full reset!\n");
3930                                 return true;
3931                         }
3932                 }
3933         }
3934         return false;
3935 }
3936
3937 /**
3938  * amdgpu_device_ip_soft_reset - do a soft reset
3939  *
3940  * @adev: amdgpu_device pointer
3941  *
3942  * The list of all the hardware IPs that make up the asic is walked and the
3943  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3944  * IP specific hardware or software state changes that are necessary to soft
3945  * reset the IP.
3946  * Returns 0 on success, negative error code on failure.
3947  */
3948 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3949 {
3950         int i, r = 0;
3951
3952         for (i = 0; i < adev->num_ip_blocks; i++) {
3953                 if (!adev->ip_blocks[i].status.valid)
3954                         continue;
3955                 if (adev->ip_blocks[i].status.hang &&
3956                     adev->ip_blocks[i].version->funcs->soft_reset) {
3957                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3958                         if (r)
3959                                 return r;
3960                 }
3961         }
3962
3963         return 0;
3964 }
3965
3966 /**
3967  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3968  *
3969  * @adev: amdgpu_device pointer
3970  *
3971  * The list of all the hardware IPs that make up the asic is walked and the
3972  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3973  * handles any IP specific hardware or software state changes that are
3974  * necessary after the IP has been soft reset.
3975  * Returns 0 on success, negative error code on failure.
3976  */
3977 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3978 {
3979         int i, r = 0;
3980
3981         for (i = 0; i < adev->num_ip_blocks; i++) {
3982                 if (!adev->ip_blocks[i].status.valid)
3983                         continue;
3984                 if (adev->ip_blocks[i].status.hang &&
3985                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3986                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3987                 if (r)
3988                         return r;
3989         }
3990
3991         return 0;
3992 }
3993
3994 /**
3995  * amdgpu_device_recover_vram - Recover some VRAM contents
3996  *
3997  * @adev: amdgpu_device pointer
3998  *
3999  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4000  * restore things like GPUVM page tables after a GPU reset where
4001  * the contents of VRAM might be lost.
4002  *
4003  * Returns:
4004  * 0 on success, negative error code on failure.
4005  */
4006 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4007 {
4008         struct dma_fence *fence = NULL, *next = NULL;
4009         struct amdgpu_bo *shadow;
4010         long r = 1, tmo;
4011
4012         if (amdgpu_sriov_runtime(adev))
4013                 tmo = msecs_to_jiffies(8000);
4014         else
4015                 tmo = msecs_to_jiffies(100);
4016
4017         dev_info(adev->dev, "recover vram bo from shadow start\n");
4018         mutex_lock(&adev->shadow_list_lock);
4019         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4020
4021                 /* No need to recover an evicted BO */
4022                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4023                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4024                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4025                         continue;
4026
4027                 r = amdgpu_bo_restore_shadow(shadow, &next);
4028                 if (r)
4029                         break;
4030
4031                 if (fence) {
4032                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4033                         dma_fence_put(fence);
4034                         fence = next;
4035                         if (tmo == 0) {
4036                                 r = -ETIMEDOUT;
4037                                 break;
4038                         } else if (tmo < 0) {
4039                                 r = tmo;
4040                                 break;
4041                         }
4042                 } else {
4043                         fence = next;
4044                 }
4045         }
4046         mutex_unlock(&adev->shadow_list_lock);
4047
4048         if (fence)
4049                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4050         dma_fence_put(fence);
4051
4052         if (r < 0 || tmo <= 0) {
4053                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4054                 return -EIO;
4055         }
4056
4057         dev_info(adev->dev, "recover vram bo from shadow done\n");
4058         return 0;
4059 }
4060
4061
4062 /**
4063  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4064  *
4065  * @adev: amdgpu device pointer
4066  * @from_hypervisor: request from hypervisor
4067  *
4068  * do VF FLR and reinitialize Asic
4069  * return 0 means succeeded otherwise failed
4070  */
4071 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4072                                      bool from_hypervisor)
4073 {
4074         int r;
4075
4076         if (from_hypervisor)
4077                 r = amdgpu_virt_request_full_gpu(adev, true);
4078         else
4079                 r = amdgpu_virt_reset_gpu(adev);
4080         if (r)
4081                 return r;
4082
4083         amdgpu_amdkfd_pre_reset(adev);
4084
4085         /* Resume IP prior to SMC */
4086         r = amdgpu_device_ip_reinit_early_sriov(adev);
4087         if (r)
4088                 goto error;
4089
4090         amdgpu_virt_init_data_exchange(adev);
4091         /* we need recover gart prior to run SMC/CP/SDMA resume */
4092         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4093
4094         r = amdgpu_device_fw_loading(adev);
4095         if (r)
4096                 return r;
4097
4098         /* now we are okay to resume SMC/CP/SDMA */
4099         r = amdgpu_device_ip_reinit_late_sriov(adev);
4100         if (r)
4101                 goto error;
4102
4103         amdgpu_irq_gpu_reset_resume_helper(adev);
4104         r = amdgpu_ib_ring_tests(adev);
4105         amdgpu_amdkfd_post_reset(adev);
4106
4107 error:
4108         amdgpu_virt_release_full_gpu(adev, true);
4109         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4110                 amdgpu_inc_vram_lost(adev);
4111                 r = amdgpu_device_recover_vram(adev);
4112         }
4113
4114         return r;
4115 }
4116
4117 /**
4118  * amdgpu_device_has_job_running - check if there is any job in mirror list
4119  *
4120  * @adev: amdgpu device pointer
4121  *
4122  * check if there is any job in mirror list
4123  */
4124 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4125 {
4126         int i;
4127         struct drm_sched_job *job;
4128
4129         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4130                 struct amdgpu_ring *ring = adev->rings[i];
4131
4132                 if (!ring || !ring->sched.thread)
4133                         continue;
4134
4135                 spin_lock(&ring->sched.job_list_lock);
4136                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4137                                 struct drm_sched_job, node);
4138                 spin_unlock(&ring->sched.job_list_lock);
4139                 if (job)
4140                         return true;
4141         }
4142         return false;
4143 }
4144
4145 /**
4146  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4147  *
4148  * @adev: amdgpu device pointer
4149  *
4150  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4151  * a hung GPU.
4152  */
4153 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4154 {
4155         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4156                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4157                 return false;
4158         }
4159
4160         if (amdgpu_gpu_recovery == 0)
4161                 goto disabled;
4162
4163         if (amdgpu_sriov_vf(adev))
4164                 return true;
4165
4166         if (amdgpu_gpu_recovery == -1) {
4167                 switch (adev->asic_type) {
4168                 case CHIP_BONAIRE:
4169                 case CHIP_HAWAII:
4170                 case CHIP_TOPAZ:
4171                 case CHIP_TONGA:
4172                 case CHIP_FIJI:
4173                 case CHIP_POLARIS10:
4174                 case CHIP_POLARIS11:
4175                 case CHIP_POLARIS12:
4176                 case CHIP_VEGAM:
4177                 case CHIP_VEGA20:
4178                 case CHIP_VEGA10:
4179                 case CHIP_VEGA12:
4180                 case CHIP_RAVEN:
4181                 case CHIP_ARCTURUS:
4182                 case CHIP_RENOIR:
4183                 case CHIP_NAVI10:
4184                 case CHIP_NAVI14:
4185                 case CHIP_NAVI12:
4186                 case CHIP_SIENNA_CICHLID:
4187                         break;
4188                 default:
4189                         goto disabled;
4190                 }
4191         }
4192
4193         return true;
4194
4195 disabled:
4196                 dev_info(adev->dev, "GPU recovery disabled.\n");
4197                 return false;
4198 }
4199
4200
4201 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4202                                         struct amdgpu_job *job,
4203                                         bool *need_full_reset_arg)
4204 {
4205         int i, r = 0;
4206         bool need_full_reset  = *need_full_reset_arg;
4207
4208         amdgpu_debugfs_wait_dump(adev);
4209
4210         if (amdgpu_sriov_vf(adev)) {
4211                 /* stop the data exchange thread */
4212                 amdgpu_virt_fini_data_exchange(adev);
4213         }
4214
4215         /* block all schedulers and reset given job's ring */
4216         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4217                 struct amdgpu_ring *ring = adev->rings[i];
4218
4219                 if (!ring || !ring->sched.thread)
4220                         continue;
4221
4222                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4223                 amdgpu_fence_driver_force_completion(ring);
4224         }
4225
4226         if(job)
4227                 drm_sched_increase_karma(&job->base);
4228
4229         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4230         if (!amdgpu_sriov_vf(adev)) {
4231
4232                 if (!need_full_reset)
4233                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4234
4235                 if (!need_full_reset) {
4236                         amdgpu_device_ip_pre_soft_reset(adev);
4237                         r = amdgpu_device_ip_soft_reset(adev);
4238                         amdgpu_device_ip_post_soft_reset(adev);
4239                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4240                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4241                                 need_full_reset = true;
4242                         }
4243                 }
4244
4245                 if (need_full_reset)
4246                         r = amdgpu_device_ip_suspend(adev);
4247
4248                 *need_full_reset_arg = need_full_reset;
4249         }
4250
4251         return r;
4252 }
4253
4254 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4255                                struct list_head *device_list_handle,
4256                                bool *need_full_reset_arg,
4257                                bool skip_hw_reset)
4258 {
4259         struct amdgpu_device *tmp_adev = NULL;
4260         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4261         int r = 0;
4262
4263         /*
4264          * ASIC reset has to be done on all HGMI hive nodes ASAP
4265          * to allow proper links negotiation in FW (within 1 sec)
4266          */
4267         if (!skip_hw_reset && need_full_reset) {
4268                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4269                         /* For XGMI run all resets in parallel to speed up the process */
4270                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4271                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4272                                         r = -EALREADY;
4273                         } else
4274                                 r = amdgpu_asic_reset(tmp_adev);
4275
4276                         if (r) {
4277                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4278                                          r, adev_to_drm(tmp_adev)->unique);
4279                                 break;
4280                         }
4281                 }
4282
4283                 /* For XGMI wait for all resets to complete before proceed */
4284                 if (!r) {
4285                         list_for_each_entry(tmp_adev, device_list_handle,
4286                                             gmc.xgmi.head) {
4287                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4288                                         flush_work(&tmp_adev->xgmi_reset_work);
4289                                         r = tmp_adev->asic_reset_res;
4290                                         if (r)
4291                                                 break;
4292                                 }
4293                         }
4294                 }
4295         }
4296
4297         if (!r && amdgpu_ras_intr_triggered()) {
4298                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4299                         if (tmp_adev->mmhub.funcs &&
4300                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4301                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4302                 }
4303
4304                 amdgpu_ras_intr_cleared();
4305         }
4306
4307         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4308                 if (need_full_reset) {
4309                         /* post card */
4310                         if (amdgpu_device_asic_init(tmp_adev))
4311                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4312
4313                         if (!r) {
4314                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4315                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4316                                 if (r)
4317                                         goto out;
4318
4319                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4320                                 if (vram_lost) {
4321                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4322                                         amdgpu_inc_vram_lost(tmp_adev);
4323                                 }
4324
4325                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4326                                 if (r)
4327                                         goto out;
4328
4329                                 r = amdgpu_device_fw_loading(tmp_adev);
4330                                 if (r)
4331                                         return r;
4332
4333                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4334                                 if (r)
4335                                         goto out;
4336
4337                                 if (vram_lost)
4338                                         amdgpu_device_fill_reset_magic(tmp_adev);
4339
4340                                 /*
4341                                  * Add this ASIC as tracked as reset was already
4342                                  * complete successfully.
4343                                  */
4344                                 amdgpu_register_gpu_instance(tmp_adev);
4345
4346                                 r = amdgpu_device_ip_late_init(tmp_adev);
4347                                 if (r)
4348                                         goto out;
4349
4350                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4351
4352                                 /*
4353                                  * The GPU enters bad state once faulty pages
4354                                  * by ECC has reached the threshold, and ras
4355                                  * recovery is scheduled next. So add one check
4356                                  * here to break recovery if it indeed exceeds
4357                                  * bad page threshold, and remind user to
4358                                  * retire this GPU or setting one bigger
4359                                  * bad_page_threshold value to fix this once
4360                                  * probing driver again.
4361                                  */
4362                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4363                                         /* must succeed. */
4364                                         amdgpu_ras_resume(tmp_adev);
4365                                 } else {
4366                                         r = -EINVAL;
4367                                         goto out;
4368                                 }
4369
4370                                 /* Update PSP FW topology after reset */
4371                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4372                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4373                         }
4374                 }
4375
4376 out:
4377                 if (!r) {
4378                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4379                         r = amdgpu_ib_ring_tests(tmp_adev);
4380                         if (r) {
4381                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4382                                 r = amdgpu_device_ip_suspend(tmp_adev);
4383                                 need_full_reset = true;
4384                                 r = -EAGAIN;
4385                                 goto end;
4386                         }
4387                 }
4388
4389                 if (!r)
4390                         r = amdgpu_device_recover_vram(tmp_adev);
4391                 else
4392                         tmp_adev->asic_reset_res = r;
4393         }
4394
4395 end:
4396         *need_full_reset_arg = need_full_reset;
4397         return r;
4398 }
4399
4400 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4401                                 struct amdgpu_hive_info *hive)
4402 {
4403         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4404                 return false;
4405
4406         if (hive) {
4407                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4408         } else {
4409                 down_write(&adev->reset_sem);
4410         }
4411
4412         atomic_inc(&adev->gpu_reset_counter);
4413         switch (amdgpu_asic_reset_method(adev)) {
4414         case AMD_RESET_METHOD_MODE1:
4415                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4416                 break;
4417         case AMD_RESET_METHOD_MODE2:
4418                 adev->mp1_state = PP_MP1_STATE_RESET;
4419                 break;
4420         default:
4421                 adev->mp1_state = PP_MP1_STATE_NONE;
4422                 break;
4423         }
4424
4425         return true;
4426 }
4427
4428 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4429 {
4430         amdgpu_vf_error_trans_all(adev);
4431         adev->mp1_state = PP_MP1_STATE_NONE;
4432         atomic_set(&adev->in_gpu_reset, 0);
4433         up_write(&adev->reset_sem);
4434 }
4435
4436 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4437 {
4438         struct pci_dev *p = NULL;
4439
4440         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4441                         adev->pdev->bus->number, 1);
4442         if (p) {
4443                 pm_runtime_enable(&(p->dev));
4444                 pm_runtime_resume(&(p->dev));
4445         }
4446 }
4447
4448 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4449 {
4450         enum amd_reset_method reset_method;
4451         struct pci_dev *p = NULL;
4452         u64 expires;
4453
4454         /*
4455          * For now, only BACO and mode1 reset are confirmed
4456          * to suffer the audio issue without proper suspended.
4457          */
4458         reset_method = amdgpu_asic_reset_method(adev);
4459         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4460              (reset_method != AMD_RESET_METHOD_MODE1))
4461                 return -EINVAL;
4462
4463         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4464                         adev->pdev->bus->number, 1);
4465         if (!p)
4466                 return -ENODEV;
4467
4468         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4469         if (!expires)
4470                 /*
4471                  * If we cannot get the audio device autosuspend delay,
4472                  * a fixed 4S interval will be used. Considering 3S is
4473                  * the audio controller default autosuspend delay setting.
4474                  * 4S used here is guaranteed to cover that.
4475                  */
4476                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4477
4478         while (!pm_runtime_status_suspended(&(p->dev))) {
4479                 if (!pm_runtime_suspend(&(p->dev)))
4480                         break;
4481
4482                 if (expires < ktime_get_mono_fast_ns()) {
4483                         dev_warn(adev->dev, "failed to suspend display audio\n");
4484                         /* TODO: abort the succeeding gpu reset? */
4485                         return -ETIMEDOUT;
4486                 }
4487         }
4488
4489         pm_runtime_disable(&(p->dev));
4490
4491         return 0;
4492 }
4493
4494 /**
4495  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4496  *
4497  * @adev: amdgpu device pointer
4498  * @job: which job trigger hang
4499  *
4500  * Attempt to reset the GPU if it has hung (all asics).
4501  * Attempt to do soft-reset or full-reset and reinitialize Asic
4502  * Returns 0 for success or an error on failure.
4503  */
4504
4505 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4506                               struct amdgpu_job *job)
4507 {
4508         struct list_head device_list, *device_list_handle =  NULL;
4509         bool need_full_reset = false;
4510         bool job_signaled = false;
4511         struct amdgpu_hive_info *hive = NULL;
4512         struct amdgpu_device *tmp_adev = NULL;
4513         int i, r = 0;
4514         bool need_emergency_restart = false;
4515         bool audio_suspended = false;
4516
4517         /**
4518          * Special case: RAS triggered and full reset isn't supported
4519          */
4520         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4521
4522         /*
4523          * Flush RAM to disk so that after reboot
4524          * the user can read log and see why the system rebooted.
4525          */
4526         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4527                 DRM_WARN("Emergency reboot.");
4528
4529                 ksys_sync_helper();
4530                 emergency_restart();
4531         }
4532
4533         dev_info(adev->dev, "GPU %s begin!\n",
4534                 need_emergency_restart ? "jobs stop":"reset");
4535
4536         /*
4537          * Here we trylock to avoid chain of resets executing from
4538          * either trigger by jobs on different adevs in XGMI hive or jobs on
4539          * different schedulers for same device while this TO handler is running.
4540          * We always reset all schedulers for device and all devices for XGMI
4541          * hive so that should take care of them too.
4542          */
4543         hive = amdgpu_get_xgmi_hive(adev);
4544         if (hive) {
4545                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4546                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4547                                 job ? job->base.id : -1, hive->hive_id);
4548                         amdgpu_put_xgmi_hive(hive);
4549                         return 0;
4550                 }
4551                 mutex_lock(&hive->hive_lock);
4552         }
4553
4554         /*
4555          * Build list of devices to reset.
4556          * In case we are in XGMI hive mode, resort the device list
4557          * to put adev in the 1st position.
4558          */
4559         INIT_LIST_HEAD(&device_list);
4560         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4561                 if (!hive)
4562                         return -ENODEV;
4563                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4564                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4565                 device_list_handle = &hive->device_list;
4566         } else {
4567                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4568                 device_list_handle = &device_list;
4569         }
4570
4571         /* block all schedulers and reset given job's ring */
4572         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4573                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4574                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4575                                   job ? job->base.id : -1);
4576                         r = 0;
4577                         goto skip_recovery;
4578                 }
4579
4580                 /*
4581                  * Try to put the audio codec into suspend state
4582                  * before gpu reset started.
4583                  *
4584                  * Due to the power domain of the graphics device
4585                  * is shared with AZ power domain. Without this,
4586                  * we may change the audio hardware from behind
4587                  * the audio driver's back. That will trigger
4588                  * some audio codec errors.
4589                  */
4590                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4591                         audio_suspended = true;
4592
4593                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4594
4595                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4596
4597                 if (!amdgpu_sriov_vf(tmp_adev))
4598                         amdgpu_amdkfd_pre_reset(tmp_adev);
4599
4600                 /*
4601                  * Mark these ASICs to be reseted as untracked first
4602                  * And add them back after reset completed
4603                  */
4604                 amdgpu_unregister_gpu_instance(tmp_adev);
4605
4606                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4607
4608                 /* disable ras on ALL IPs */
4609                 if (!need_emergency_restart &&
4610                       amdgpu_device_ip_need_full_reset(tmp_adev))
4611                         amdgpu_ras_suspend(tmp_adev);
4612
4613                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4614                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4615
4616                         if (!ring || !ring->sched.thread)
4617                                 continue;
4618
4619                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4620
4621                         if (need_emergency_restart)
4622                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4623                 }
4624         }
4625
4626         if (need_emergency_restart)
4627                 goto skip_sched_resume;
4628
4629         /*
4630          * Must check guilty signal here since after this point all old
4631          * HW fences are force signaled.
4632          *
4633          * job->base holds a reference to parent fence
4634          */
4635         if (job && job->base.s_fence->parent &&
4636             dma_fence_is_signaled(job->base.s_fence->parent)) {
4637                 job_signaled = true;
4638                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4639                 goto skip_hw_reset;
4640         }
4641
4642 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4643         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4644                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4645                                                  (tmp_adev == adev) ? job : NULL,
4646                                                  &need_full_reset);
4647                 /*TODO Should we stop ?*/
4648                 if (r) {
4649                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4650                                   r, adev_to_drm(tmp_adev)->unique);
4651                         tmp_adev->asic_reset_res = r;
4652                 }
4653         }
4654
4655         /* Actual ASIC resets if needed.*/
4656         /* TODO Implement XGMI hive reset logic for SRIOV */
4657         if (amdgpu_sriov_vf(adev)) {
4658                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4659                 if (r)
4660                         adev->asic_reset_res = r;
4661         } else {
4662                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4663                 if (r && r == -EAGAIN)
4664                         goto retry;
4665         }
4666
4667 skip_hw_reset:
4668
4669         /* Post ASIC reset for all devs .*/
4670         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4671
4672                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4673                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4674
4675                         if (!ring || !ring->sched.thread)
4676                                 continue;
4677
4678                         /* No point to resubmit jobs if we didn't HW reset*/
4679                         if (!tmp_adev->asic_reset_res && !job_signaled)
4680                                 drm_sched_resubmit_jobs(&ring->sched);
4681
4682                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4683                 }
4684
4685                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4686                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4687                 }
4688
4689                 tmp_adev->asic_reset_res = 0;
4690
4691                 if (r) {
4692                         /* bad news, how to tell it to userspace ? */
4693                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4694                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4695                 } else {
4696                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4697                 }
4698         }
4699
4700 skip_sched_resume:
4701         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4702                 /*unlock kfd: SRIOV would do it separately */
4703                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4704                         amdgpu_amdkfd_post_reset(tmp_adev);
4705                 if (audio_suspended)
4706                         amdgpu_device_resume_display_audio(tmp_adev);
4707                 amdgpu_device_unlock_adev(tmp_adev);
4708         }
4709
4710 skip_recovery:
4711         if (hive) {
4712                 atomic_set(&hive->in_reset, 0);
4713                 mutex_unlock(&hive->hive_lock);
4714                 amdgpu_put_xgmi_hive(hive);
4715         }
4716
4717         if (r)
4718                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4719         return r;
4720 }
4721
4722 /**
4723  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4724  *
4725  * @adev: amdgpu_device pointer
4726  *
4727  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4728  * and lanes) of the slot the device is in. Handles APUs and
4729  * virtualized environments where PCIE config space may not be available.
4730  */
4731 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4732 {
4733         struct pci_dev *pdev;
4734         enum pci_bus_speed speed_cap, platform_speed_cap;
4735         enum pcie_link_width platform_link_width;
4736
4737         if (amdgpu_pcie_gen_cap)
4738                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4739
4740         if (amdgpu_pcie_lane_cap)
4741                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4742
4743         /* covers APUs as well */
4744         if (pci_is_root_bus(adev->pdev->bus)) {
4745                 if (adev->pm.pcie_gen_mask == 0)
4746                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4747                 if (adev->pm.pcie_mlw_mask == 0)
4748                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4749                 return;
4750         }
4751
4752         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4753                 return;
4754
4755         pcie_bandwidth_available(adev->pdev, NULL,
4756                                  &platform_speed_cap, &platform_link_width);
4757
4758         if (adev->pm.pcie_gen_mask == 0) {
4759                 /* asic caps */
4760                 pdev = adev->pdev;
4761                 speed_cap = pcie_get_speed_cap(pdev);
4762                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4763                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4764                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4765                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4766                 } else {
4767                         if (speed_cap == PCIE_SPEED_16_0GT)
4768                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4769                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4770                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4771                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4772                         else if (speed_cap == PCIE_SPEED_8_0GT)
4773                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4775                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4776                         else if (speed_cap == PCIE_SPEED_5_0GT)
4777                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4779                         else
4780                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4781                 }
4782                 /* platform caps */
4783                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4784                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4785                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4786                 } else {
4787                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4788                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4789                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4790                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4791                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4792                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4793                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4794                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4795                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4796                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4797                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4798                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4799                         else
4800                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4801
4802                 }
4803         }
4804         if (adev->pm.pcie_mlw_mask == 0) {
4805                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4806                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4807                 } else {
4808                         switch (platform_link_width) {
4809                         case PCIE_LNK_X32:
4810                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4811                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4812                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4813                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4814                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4815                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4817                                 break;
4818                         case PCIE_LNK_X16:
4819                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4820                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4821                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4822                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4823                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4824                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4825                                 break;
4826                         case PCIE_LNK_X12:
4827                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4828                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4829                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4830                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4831                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4832                                 break;
4833                         case PCIE_LNK_X8:
4834                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4835                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4836                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4837                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4838                                 break;
4839                         case PCIE_LNK_X4:
4840                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4841                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4842                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4843                                 break;
4844                         case PCIE_LNK_X2:
4845                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4846                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4847                                 break;
4848                         case PCIE_LNK_X1:
4849                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4850                                 break;
4851                         default:
4852                                 break;
4853                         }
4854                 }
4855         }
4856 }
4857
4858 int amdgpu_device_baco_enter(struct drm_device *dev)
4859 {
4860         struct amdgpu_device *adev = drm_to_adev(dev);
4861         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4862
4863         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4864                 return -ENOTSUPP;
4865
4866         if (ras && ras->supported)
4867                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4868
4869         return amdgpu_dpm_baco_enter(adev);
4870 }
4871
4872 int amdgpu_device_baco_exit(struct drm_device *dev)
4873 {
4874         struct amdgpu_device *adev = drm_to_adev(dev);
4875         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4876         int ret = 0;
4877
4878         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4879                 return -ENOTSUPP;
4880
4881         ret = amdgpu_dpm_baco_exit(adev);
4882         if (ret)
4883                 return ret;
4884
4885         if (ras && ras->supported)
4886                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4887
4888         return 0;
4889 }
4890
4891 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4892 {
4893         int i;
4894
4895         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4896                 struct amdgpu_ring *ring = adev->rings[i];
4897
4898                 if (!ring || !ring->sched.thread)
4899                         continue;
4900
4901                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4902         }
4903 }
4904
4905 /**
4906  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4907  * @pdev: PCI device struct
4908  * @state: PCI channel state
4909  *
4910  * Description: Called when a PCI error is detected.
4911  *
4912  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4913  */
4914 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4915 {
4916         struct drm_device *dev = pci_get_drvdata(pdev);
4917         struct amdgpu_device *adev = drm_to_adev(dev);
4918         int i;
4919
4920         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4921
4922         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4923                 DRM_WARN("No support for XGMI hive yet...");
4924                 return PCI_ERS_RESULT_DISCONNECT;
4925         }
4926
4927         switch (state) {
4928         case pci_channel_io_normal:
4929                 return PCI_ERS_RESULT_CAN_RECOVER;
4930         /* Fatal error, prepare for slot reset */
4931         case pci_channel_io_frozen:
4932                 /*
4933                  * Cancel and wait for all TDRs in progress if failing to
4934                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4935                  *
4936                  * Locking adev->reset_sem will prevent any external access
4937                  * to GPU during PCI error recovery
4938                  */
4939                 while (!amdgpu_device_lock_adev(adev, NULL))
4940                         amdgpu_cancel_all_tdr(adev);
4941
4942                 /*
4943                  * Block any work scheduling as we do for regular GPU reset
4944                  * for the duration of the recovery
4945                  */
4946                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4947                         struct amdgpu_ring *ring = adev->rings[i];
4948
4949                         if (!ring || !ring->sched.thread)
4950                                 continue;
4951
4952                         drm_sched_stop(&ring->sched, NULL);
4953                 }
4954                 return PCI_ERS_RESULT_NEED_RESET;
4955         case pci_channel_io_perm_failure:
4956                 /* Permanent error, prepare for device removal */
4957                 return PCI_ERS_RESULT_DISCONNECT;
4958         }
4959
4960         return PCI_ERS_RESULT_NEED_RESET;
4961 }
4962
4963 /**
4964  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4965  * @pdev: pointer to PCI device
4966  */
4967 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4968 {
4969
4970         DRM_INFO("PCI error: mmio enabled callback!!\n");
4971
4972         /* TODO - dump whatever for debugging purposes */
4973
4974         /* This called only if amdgpu_pci_error_detected returns
4975          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4976          * works, no need to reset slot.
4977          */
4978
4979         return PCI_ERS_RESULT_RECOVERED;
4980 }
4981
4982 /**
4983  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4984  * @pdev: PCI device struct
4985  *
4986  * Description: This routine is called by the pci error recovery
4987  * code after the PCI slot has been reset, just before we
4988  * should resume normal operations.
4989  */
4990 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4991 {
4992         struct drm_device *dev = pci_get_drvdata(pdev);
4993         struct amdgpu_device *adev = drm_to_adev(dev);
4994         int r, i;
4995         bool need_full_reset = true;
4996         u32 memsize;
4997         struct list_head device_list;
4998
4999         DRM_INFO("PCI error: slot reset callback!!\n");
5000
5001         INIT_LIST_HEAD(&device_list);
5002         list_add_tail(&adev->gmc.xgmi.head, &device_list);
5003
5004         /* wait for asic to come out of reset */
5005         msleep(500);
5006
5007         /* Restore PCI confspace */
5008         amdgpu_device_load_pci_state(pdev);
5009
5010         /* confirm  ASIC came out of reset */
5011         for (i = 0; i < adev->usec_timeout; i++) {
5012                 memsize = amdgpu_asic_get_config_memsize(adev);
5013
5014                 if (memsize != 0xffffffff)
5015                         break;
5016                 udelay(1);
5017         }
5018         if (memsize == 0xffffffff) {
5019                 r = -ETIME;
5020                 goto out;
5021         }
5022
5023         adev->in_pci_err_recovery = true;
5024         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5025         adev->in_pci_err_recovery = false;
5026         if (r)
5027                 goto out;
5028
5029         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5030
5031 out:
5032         if (!r) {
5033                 if (amdgpu_device_cache_pci_state(adev->pdev))
5034                         pci_restore_state(adev->pdev);
5035
5036                 DRM_INFO("PCIe error recovery succeeded\n");
5037         } else {
5038                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5039                 amdgpu_device_unlock_adev(adev);
5040         }
5041
5042         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5043 }
5044
5045 /**
5046  * amdgpu_pci_resume() - resume normal ops after PCI reset
5047  * @pdev: pointer to PCI device
5048  *
5049  * Called when the error recovery driver tells us that its
5050  * OK to resume normal operation. Use completion to allow
5051  * halted scsi ops to resume.
5052  */
5053 void amdgpu_pci_resume(struct pci_dev *pdev)
5054 {
5055         struct drm_device *dev = pci_get_drvdata(pdev);
5056         struct amdgpu_device *adev = drm_to_adev(dev);
5057         int i;
5058
5059
5060         DRM_INFO("PCI error: resume callback!!\n");
5061
5062         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5063                 struct amdgpu_ring *ring = adev->rings[i];
5064
5065                 if (!ring || !ring->sched.thread)
5066                         continue;
5067
5068
5069                 drm_sched_resubmit_jobs(&ring->sched);
5070                 drm_sched_start(&ring->sched, true);
5071         }
5072
5073         amdgpu_device_unlock_adev(adev);
5074 }
5075
5076 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5077 {
5078         struct drm_device *dev = pci_get_drvdata(pdev);
5079         struct amdgpu_device *adev = drm_to_adev(dev);
5080         int r;
5081
5082         r = pci_save_state(pdev);
5083         if (!r) {
5084                 kfree(adev->pci_state);
5085
5086                 adev->pci_state = pci_store_saved_state(pdev);
5087
5088                 if (!adev->pci_state) {
5089                         DRM_ERROR("Failed to store PCI saved state");
5090                         return false;
5091                 }
5092         } else {
5093                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5094                 return false;
5095         }
5096
5097         return true;
5098 }
5099
5100 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5101 {
5102         struct drm_device *dev = pci_get_drvdata(pdev);
5103         struct amdgpu_device *adev = drm_to_adev(dev);
5104         int r;
5105
5106         if (!adev->pci_state)
5107                 return false;
5108
5109         r = pci_load_saved_state(pdev, adev->pci_state);
5110
5111         if (!r) {
5112                 pci_restore_state(pdev);
5113         } else {
5114                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5115                 return false;
5116         }
5117
5118         return true;
5119 }
5120
5121