drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  84
  85 #define AMDGPU_RESUME_MS                2000
  86
  87 const char *amdgpu_asic_name[] = {
  88         "TAHITI",
  89         "PITCAIRN",
  90         "VERDE",
  91         "OLAND",
  92         "HAINAN",
  93         "BONAIRE",
  94         "KAVERI",
  95         "KABINI",
  96         "HAWAII",
  97         "MULLINS",
  98         "TOPAZ",
  99         "TONGA",
 100         "FIJI",
 101         "CARRIZO",
 102         "STONEY",
 103         "POLARIS10",
 104         "POLARIS11",
 105         "POLARIS12",
 106         "VEGAM",
 107         "VEGA10",
 108         "VEGA12",
 109         "VEGA20",
 110         "RAVEN",
 111         "ARCTURUS",
 112         "RENOIR",
 113         "ALDEBARAN",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "VANGOGH",
 120         "DIMGREY_CAVEFISH",
 121         "LAST",
 122 };
 123
 124 /**
 125  * DOC: pcie_replay_count
 126  *
 127  * The amdgpu driver provides a sysfs API for reporting the total number
 128  * of PCIe replays (NAKs)
 129  * The file pcie_replay_count is used for this and returns the total
 130  * number of replays as a sum of the NAKs generated and NAKs received
 131  */
 132
 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 134                 struct device_attribute *attr, char *buf)
 135 {
 136         struct drm_device *ddev = dev_get_drvdata(dev);
 137         struct amdgpu_device *adev = drm_to_adev(ddev);
 138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 139
 140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 141 }
 142
 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 144                 amdgpu_device_get_pcie_replay_count, NULL);
 145
 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 147
 148 /**
 149  * DOC: product_name
 150  *
 151  * The amdgpu driver provides a sysfs API for reporting the product name
 152  * for the device
 153  * The file serial_number is used for this and returns the product name
 154  * as returned from the FRU.
 155  * NOTE: This is only available for certain server cards
 156  */
 157
 158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 159                 struct device_attribute *attr, char *buf)
 160 {
 161         struct drm_device *ddev = dev_get_drvdata(dev);
 162         struct amdgpu_device *adev = drm_to_adev(ddev);
 163
 164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 165 }
 166
 167 static DEVICE_ATTR(product_name, S_IRUGO,
 168                 amdgpu_device_get_product_name, NULL);
 169
 170 /**
 171  * DOC: product_number
 172  *
 173  * The amdgpu driver provides a sysfs API for reporting the part number
 174  * for the device
 175  * The file serial_number is used for this and returns the part number
 176  * as returned from the FRU.
 177  * NOTE: This is only available for certain server cards
 178  */
 179
 180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 181                 struct device_attribute *attr, char *buf)
 182 {
 183         struct drm_device *ddev = dev_get_drvdata(dev);
 184         struct amdgpu_device *adev = drm_to_adev(ddev);
 185
 186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 187 }
 188
 189 static DEVICE_ATTR(product_number, S_IRUGO,
 190                 amdgpu_device_get_product_number, NULL);
 191
 192 /**
 193  * DOC: serial_number
 194  *
 195  * The amdgpu driver provides a sysfs API for reporting the serial number
 196  * for the device
 197  * The file serial_number is used for this and returns the serial number
 198  * as returned from the FRU.
 199  * NOTE: This is only available for certain server cards
 200  */
 201
 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 203                 struct device_attribute *attr, char *buf)
 204 {
 205         struct drm_device *ddev = dev_get_drvdata(dev);
 206         struct amdgpu_device *adev = drm_to_adev(ddev);
 207
 208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 209 }
 210
 211 static DEVICE_ATTR(serial_number, S_IRUGO,
 212                 amdgpu_device_get_serial_number, NULL);
 213
 214 /**
 215  * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
 216  *
 217  * @dev: drm_device pointer
 218  *
 219  * Returns true if the device is a dGPU with HG/PX power control,
 220  * otherwise return false.
 221  */
 222 bool amdgpu_device_supports_atpx(struct drm_device *dev)
 223 {
 224         struct amdgpu_device *adev = drm_to_adev(dev);
 225
 226         if (adev->flags & AMD_IS_PX)
 227                 return true;
 228         return false;
 229 }
 230
 231 /**
 232  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 233  *
 234  * @dev: drm_device pointer
 235  *
 236  * Returns true if the device is a dGPU with HG/PX power control,
 237  * otherwise return false.
 238  */
 239 bool amdgpu_device_supports_boco(struct drm_device *dev)
 240 {
 241         struct amdgpu_device *adev = drm_to_adev(dev);
 242
 243         if (adev->has_pr3)
 244                 return true;
 245         return false;
 246 }
 247
 248 /**
 249  * amdgpu_device_supports_baco - Does the device support BACO
 250  *
 251  * @dev: drm_device pointer
 252  *
 253  * Returns true if the device supporte BACO,
 254  * otherwise return false.
 255  */
 256 bool amdgpu_device_supports_baco(struct drm_device *dev)
 257 {
 258         struct amdgpu_device *adev = drm_to_adev(dev);
 259
 260         return amdgpu_asic_supports_baco(adev);
 261 }
 262
 263 /*
 264  * VRAM access helper functions
 265  */
 266
 267 /**
 268  * amdgpu_device_vram_access - read/write a buffer in vram
 269  *
 270  * @adev: amdgpu_device pointer
 271  * @pos: offset of the buffer in vram
 272  * @buf: virtual address of the buffer in system memory
 273  * @size: read/write size, sizeof(@buf) must > @size
 274  * @write: true - write to vram, otherwise - read from vram
 275  */
 276 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 277                                uint32_t *buf, size_t size, bool write)
 278 {
 279         unsigned long flags;
 280         uint32_t hi = ~0;
 281         uint64_t last;
 282
 283
 284 #ifdef CONFIG_64BIT
 285         last = min(pos + size, adev->gmc.visible_vram_size);
 286         if (last > pos) {
 287                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 288                 size_t count = last - pos;
 289
 290                 if (write) {
 291                         memcpy_toio(addr, buf, count);
 292                         mb();
 293                         amdgpu_asic_flush_hdp(adev, NULL);
 294                 } else {
 295                         amdgpu_asic_invalidate_hdp(adev, NULL);
 296                         mb();
 297                         memcpy_fromio(buf, addr, count);
 298                 }
 299
 300                 if (count == size)
 301                         return;
 302
 303                 pos += count;
 304                 buf += count / 4;
 305                 size -= count;
 306         }
 307 #endif
 308
 309         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 310         for (last = pos + size; pos < last; pos += 4) {
 311                 uint32_t tmp = pos >> 31;
 312
 313                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 314                 if (tmp != hi) {
 315                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 316                         hi = tmp;
 317                 }
 318                 if (write)
 319                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 320                 else
 321                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 322         }
 323         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 324 }
 325
 326 /*
 327  * register access helper functions.
 328  */
 329 /**
 330  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 331  *
 332  * @adev: amdgpu_device pointer
 333  * @reg: dword aligned register offset
 334  * @acc_flags: access flags which require special behavior
 335  *
 336  * Returns the 32 bit value from the offset specified.
 337  */
 338 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 339                             uint32_t reg, uint32_t acc_flags)
 340 {
 341         uint32_t ret;
 342
 343         if (adev->in_pci_err_recovery)
 344                 return 0;
 345
 346         if ((reg * 4) < adev->rmmio_size) {
 347                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 348                     amdgpu_sriov_runtime(adev) &&
 349                     down_read_trylock(&adev->reset_sem)) {
 350                         ret = amdgpu_kiq_rreg(adev, reg);
 351                         up_read(&adev->reset_sem);
 352                 } else {
 353                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 354                 }
 355         } else {
 356                 ret = adev->pcie_rreg(adev, reg * 4);
 357         }
 358
 359         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 360
 361         return ret;
 362 }
 363
 364 /*
 365  * MMIO register read with bytes helper functions
 366  * @offset:bytes offset from MMIO start
 367  *
 368 */
 369
 370 /**
 371  * amdgpu_mm_rreg8 - read a memory mapped IO register
 372  *
 373  * @adev: amdgpu_device pointer
 374  * @offset: byte aligned register offset
 375  *
 376  * Returns the 8 bit value from the offset specified.
 377  */
 378 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 379 {
 380         if (adev->in_pci_err_recovery)
 381                 return 0;
 382
 383         if (offset < adev->rmmio_size)
 384                 return (readb(adev->rmmio + offset));
 385         BUG();
 386 }
 387
 388 /*
 389  * MMIO register write with bytes helper functions
 390  * @offset:bytes offset from MMIO start
 391  * @value: the value want to be written to the register
 392  *
 393 */
 394 /**
 395  * amdgpu_mm_wreg8 - read a memory mapped IO register
 396  *
 397  * @adev: amdgpu_device pointer
 398  * @offset: byte aligned register offset
 399  * @value: 8 bit value to write
 400  *
 401  * Writes the value specified to the offset specified.
 402  */
 403 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 404 {
 405         if (adev->in_pci_err_recovery)
 406                 return;
 407
 408         if (offset < adev->rmmio_size)
 409                 writeb(value, adev->rmmio + offset);
 410         else
 411                 BUG();
 412 }
 413
 414 /**
 415  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 416  *
 417  * @adev: amdgpu_device pointer
 418  * @reg: dword aligned register offset
 419  * @v: 32 bit value to write to the register
 420  * @acc_flags: access flags which require special behavior
 421  *
 422  * Writes the value specified to the offset specified.
 423  */
 424 void amdgpu_device_wreg(struct amdgpu_device *adev,
 425                         uint32_t reg, uint32_t v,
 426                         uint32_t acc_flags)
 427 {
 428         if (adev->in_pci_err_recovery)
 429                 return;
 430
 431         if ((reg * 4) < adev->rmmio_size) {
 432                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 433                     amdgpu_sriov_runtime(adev) &&
 434                     down_read_trylock(&adev->reset_sem)) {
 435                         amdgpu_kiq_wreg(adev, reg, v);
 436                         up_read(&adev->reset_sem);
 437                 } else {
 438                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 439                 }
 440         } else {
 441                 adev->pcie_wreg(adev, reg * 4, v);
 442         }
 443
 444         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 445 }
 446
 447 /*
 448  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 449  *
 450  * this function is invoked only the debugfs register access
 451  * */
 452 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 453                              uint32_t reg, uint32_t v)
 454 {
 455         if (adev->in_pci_err_recovery)
 456                 return;
 457
 458         if (amdgpu_sriov_fullaccess(adev) &&
 459             adev->gfx.rlc.funcs &&
 460             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 461                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 462                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 463         } else {
 464                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 465         }
 466 }
 467
 468 /**
 469  * amdgpu_io_rreg - read an IO register
 470  *
 471  * @adev: amdgpu_device pointer
 472  * @reg: dword aligned register offset
 473  *
 474  * Returns the 32 bit value from the offset specified.
 475  */
 476 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 477 {
 478         if (adev->in_pci_err_recovery)
 479                 return 0;
 480
 481         if ((reg * 4) < adev->rio_mem_size)
 482                 return ioread32(adev->rio_mem + (reg * 4));
 483         else {
 484                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 485                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 486         }
 487 }
 488
 489 /**
 490  * amdgpu_io_wreg - write to an IO register
 491  *
 492  * @adev: amdgpu_device pointer
 493  * @reg: dword aligned register offset
 494  * @v: 32 bit value to write to the register
 495  *
 496  * Writes the value specified to the offset specified.
 497  */
 498 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 499 {
 500         if (adev->in_pci_err_recovery)
 501                 return;
 502
 503         if ((reg * 4) < adev->rio_mem_size)
 504                 iowrite32(v, adev->rio_mem + (reg * 4));
 505         else {
 506                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 507                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 508         }
 509 }
 510
 511 /**
 512  * amdgpu_mm_rdoorbell - read a doorbell dword
 513  *
 514  * @adev: amdgpu_device pointer
 515  * @index: doorbell index
 516  *
 517  * Returns the value in the doorbell aperture at the
 518  * requested doorbell index (CIK).
 519  */
 520 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 521 {
 522         if (adev->in_pci_err_recovery)
 523                 return 0;
 524
 525         if (index < adev->doorbell.num_doorbells) {
 526                 return readl(adev->doorbell.ptr + index);
 527         } else {
 528                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 529                 return 0;
 530         }
 531 }
 532
 533 /**
 534  * amdgpu_mm_wdoorbell - write a doorbell dword
 535  *
 536  * @adev: amdgpu_device pointer
 537  * @index: doorbell index
 538  * @v: value to write
 539  *
 540  * Writes @v to the doorbell aperture at the
 541  * requested doorbell index (CIK).
 542  */
 543 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 544 {
 545         if (adev->in_pci_err_recovery)
 546                 return;
 547
 548         if (index < adev->doorbell.num_doorbells) {
 549                 writel(v, adev->doorbell.ptr + index);
 550         } else {
 551                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 552         }
 553 }
 554
 555 /**
 556  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 557  *
 558  * @adev: amdgpu_device pointer
 559  * @index: doorbell index
 560  *
 561  * Returns the value in the doorbell aperture at the
 562  * requested doorbell index (VEGA10+).
 563  */
 564 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 565 {
 566         if (adev->in_pci_err_recovery)
 567                 return 0;
 568
 569         if (index < adev->doorbell.num_doorbells) {
 570                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 571         } else {
 572                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 573                 return 0;
 574         }
 575 }
 576
 577 /**
 578  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 579  *
 580  * @adev: amdgpu_device pointer
 581  * @index: doorbell index
 582  * @v: value to write
 583  *
 584  * Writes @v to the doorbell aperture at the
 585  * requested doorbell index (VEGA10+).
 586  */
 587 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 588 {
 589         if (adev->in_pci_err_recovery)
 590                 return;
 591
 592         if (index < adev->doorbell.num_doorbells) {
 593                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 594         } else {
 595                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 596         }
 597 }
 598
 599 /**
 600  * amdgpu_device_indirect_rreg - read an indirect register
 601  *
 602  * @adev: amdgpu_device pointer
 603  * @pcie_index: mmio register offset
 604  * @pcie_data: mmio register offset
 605  * @reg_addr: indirect register address to read from
 606  *
 607  * Returns the value of indirect register @reg_addr
 608  */
 609 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 610                                 u32 pcie_index, u32 pcie_data,
 611                                 u32 reg_addr)
 612 {
 613         unsigned long flags;
 614         u32 r;
 615         void __iomem *pcie_index_offset;
 616         void __iomem *pcie_data_offset;
 617
 618         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 619         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 620         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 621
 622         writel(reg_addr, pcie_index_offset);
 623         readl(pcie_index_offset);
 624         r = readl(pcie_data_offset);
 625         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 626
 627         return r;
 628 }
 629
 630 /**
 631  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 632  *
 633  * @adev: amdgpu_device pointer
 634  * @pcie_index: mmio register offset
 635  * @pcie_data: mmio register offset
 636  * @reg_addr: indirect register address to read from
 637  *
 638  * Returns the value of indirect register @reg_addr
 639  */
 640 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 641                                   u32 pcie_index, u32 pcie_data,
 642                                   u32 reg_addr)
 643 {
 644         unsigned long flags;
 645         u64 r;
 646         void __iomem *pcie_index_offset;
 647         void __iomem *pcie_data_offset;
 648
 649         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 650         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 651         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 652
 653         /* read low 32 bits */
 654         writel(reg_addr, pcie_index_offset);
 655         readl(pcie_index_offset);
 656         r = readl(pcie_data_offset);
 657         /* read high 32 bits */
 658         writel(reg_addr + 4, pcie_index_offset);
 659         readl(pcie_index_offset);
 660         r |= ((u64)readl(pcie_data_offset) << 32);
 661         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 662
 663         return r;
 664 }
 665
 666 /**
 667  * amdgpu_device_indirect_wreg - write an indirect register address
 668  *
 669  * @adev: amdgpu_device pointer
 670  * @pcie_index: mmio register offset
 671  * @pcie_data: mmio register offset
 672  * @reg_addr: indirect register offset
 673  * @reg_data: indirect register data
 674  *
 675  */
 676 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 677                                  u32 pcie_index, u32 pcie_data,
 678                                  u32 reg_addr, u32 reg_data)
 679 {
 680         unsigned long flags;
 681         void __iomem *pcie_index_offset;
 682         void __iomem *pcie_data_offset;
 683
 684         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 685         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 686         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 687
 688         writel(reg_addr, pcie_index_offset);
 689         readl(pcie_index_offset);
 690         writel(reg_data, pcie_data_offset);
 691         readl(pcie_data_offset);
 692         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 693 }
 694
 695 /**
 696  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 697  *
 698  * @adev: amdgpu_device pointer
 699  * @pcie_index: mmio register offset
 700  * @pcie_data: mmio register offset
 701  * @reg_addr: indirect register offset
 702  * @reg_data: indirect register data
 703  *
 704  */
 705 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 706                                    u32 pcie_index, u32 pcie_data,
 707                                    u32 reg_addr, u64 reg_data)
 708 {
 709         unsigned long flags;
 710         void __iomem *pcie_index_offset;
 711         void __iomem *pcie_data_offset;
 712
 713         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 714         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 715         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 716
 717         /* write low 32 bits */
 718         writel(reg_addr, pcie_index_offset);
 719         readl(pcie_index_offset);
 720         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 721         readl(pcie_data_offset);
 722         /* write high 32 bits */
 723         writel(reg_addr + 4, pcie_index_offset);
 724         readl(pcie_index_offset);
 725         writel((u32)(reg_data >> 32), pcie_data_offset);
 726         readl(pcie_data_offset);
 727         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 728 }
 729
 730 /**
 731  * amdgpu_invalid_rreg - dummy reg read function
 732  *
 733  * @adev: amdgpu_device pointer
 734  * @reg: offset of register
 735  *
 736  * Dummy register read function.  Used for register blocks
 737  * that certain asics don't have (all asics).
 738  * Returns the value in the register.
 739  */
 740 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 741 {
 742         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 743         BUG();
 744         return 0;
 745 }
 746
 747 /**
 748  * amdgpu_invalid_wreg - dummy reg write function
 749  *
 750  * @adev: amdgpu_device pointer
 751  * @reg: offset of register
 752  * @v: value to write to the register
 753  *
 754  * Dummy register read function.  Used for register blocks
 755  * that certain asics don't have (all asics).
 756  */
 757 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 758 {
 759         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 760                   reg, v);
 761         BUG();
 762 }
 763
 764 /**
 765  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 766  *
 767  * @adev: amdgpu_device pointer
 768  * @reg: offset of register
 769  *
 770  * Dummy register read function.  Used for register blocks
 771  * that certain asics don't have (all asics).
 772  * Returns the value in the register.
 773  */
 774 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 775 {
 776         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 777         BUG();
 778         return 0;
 779 }
 780
 781 /**
 782  * amdgpu_invalid_wreg64 - dummy reg write function
 783  *
 784  * @adev: amdgpu_device pointer
 785  * @reg: offset of register
 786  * @v: value to write to the register
 787  *
 788  * Dummy register read function.  Used for register blocks
 789  * that certain asics don't have (all asics).
 790  */
 791 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 792 {
 793         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 794                   reg, v);
 795         BUG();
 796 }
 797
 798 /**
 799  * amdgpu_block_invalid_rreg - dummy reg read function
 800  *
 801  * @adev: amdgpu_device pointer
 802  * @block: offset of instance
 803  * @reg: offset of register
 804  *
 805  * Dummy register read function.  Used for register blocks
 806  * that certain asics don't have (all asics).
 807  * Returns the value in the register.
 808  */
 809 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 810                                           uint32_t block, uint32_t reg)
 811 {
 812         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 813                   reg, block);
 814         BUG();
 815         return 0;
 816 }
 817
 818 /**
 819  * amdgpu_block_invalid_wreg - dummy reg write function
 820  *
 821  * @adev: amdgpu_device pointer
 822  * @block: offset of instance
 823  * @reg: offset of register
 824  * @v: value to write to the register
 825  *
 826  * Dummy register read function.  Used for register blocks
 827  * that certain asics don't have (all asics).
 828  */
 829 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 830                                       uint32_t block,
 831                                       uint32_t reg, uint32_t v)
 832 {
 833         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 834                   reg, block, v);
 835         BUG();
 836 }
 837
 838 /**
 839  * amdgpu_device_asic_init - Wrapper for atom asic_init
 840  *
 841  * @adev: amdgpu_device pointer
 842  *
 843  * Does any asic specific work and then calls atom asic init.
 844  */
 845 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 846 {
 847         amdgpu_asic_pre_asic_init(adev);
 848
 849         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 850 }
 851
 852 /**
 853  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 854  *
 855  * @adev: amdgpu_device pointer
 856  *
 857  * Allocates a scratch page of VRAM for use by various things in the
 858  * driver.
 859  */
 860 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 861 {
 862         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 863                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 864                                        &adev->vram_scratch.robj,
 865                                        &adev->vram_scratch.gpu_addr,
 866                                        (void **)&adev->vram_scratch.ptr);
 867 }
 868
 869 /**
 870  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 871  *
 872  * @adev: amdgpu_device pointer
 873  *
 874  * Frees the VRAM scratch page.
 875  */
 876 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 877 {
 878         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 879 }
 880
 881 /**
 882  * amdgpu_device_program_register_sequence - program an array of registers.
 883  *
 884  * @adev: amdgpu_device pointer
 885  * @registers: pointer to the register array
 886  * @array_size: size of the register array
 887  *
 888  * Programs an array or registers with and and or masks.
 889  * This is a helper for setting golden registers.
 890  */
 891 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 892                                              const u32 *registers,
 893                                              const u32 array_size)
 894 {
 895         u32 tmp, reg, and_mask, or_mask;
 896         int i;
 897
 898         if (array_size % 3)
 899                 return;
 900
 901         for (i = 0; i < array_size; i +=3) {
 902                 reg = registers[i + 0];
 903                 and_mask = registers[i + 1];
 904                 or_mask = registers[i + 2];
 905
 906                 if (and_mask == 0xffffffff) {
 907                         tmp = or_mask;
 908                 } else {
 909                         tmp = RREG32(reg);
 910                         tmp &= ~and_mask;
 911                         if (adev->family >= AMDGPU_FAMILY_AI)
 912                                 tmp |= (or_mask & and_mask);
 913                         else
 914                                 tmp |= or_mask;
 915                 }
 916                 WREG32(reg, tmp);
 917         }
 918 }
 919
 920 /**
 921  * amdgpu_device_pci_config_reset - reset the GPU
 922  *
 923  * @adev: amdgpu_device pointer
 924  *
 925  * Resets the GPU using the pci config reset sequence.
 926  * Only applicable to asics prior to vega10.
 927  */
 928 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 929 {
 930         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 931 }
 932
 933 /**
 934  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 935  *
 936  * @adev: amdgpu_device pointer
 937  *
 938  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 939  */
 940 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
 941 {
 942         return pci_reset_function(adev->pdev);
 943 }
 944
 945 /*
 * GPU doorbell aperture helper functions.
 947  */
 948 /**
 949  * amdgpu_device_doorbell_init - Init doorbell driver information.
 950  *
 951  * @adev: amdgpu_device pointer
 952  *
 953  * Init doorbell driver information (CIK)
 954  * Returns 0 on success, error on failure.
 955  */
 956 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 957 {
 958
 959         /* No doorbell on SI hardware generation */
 960         if (adev->asic_type < CHIP_BONAIRE) {
 961                 adev->doorbell.base = 0;
 962                 adev->doorbell.size = 0;
 963                 adev->doorbell.num_doorbells = 0;
 964                 adev->doorbell.ptr = NULL;
 965                 return 0;
 966         }
 967
 968         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 969                 return -EINVAL;
 970
 971         amdgpu_asic_init_doorbell_index(adev);
 972
 973         /* doorbell bar mapping */
 974         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 975         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 976
 977         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 978                                              adev->doorbell_index.max_assignment+1);
 979         if (adev->doorbell.num_doorbells == 0)
 980                 return -EINVAL;
 981
 982         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 983          * paging queue doorbell use the second page. The
 984          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 985          * doorbells are in the first page. So with paging queue enabled,
 986          * the max num_doorbells should + 1 page (0x400 in dword)
 987          */
 988         if (adev->asic_type >= CHIP_VEGA10)
 989                 adev->doorbell.num_doorbells += 0x400;
 990
 991         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 992                                      adev->doorbell.num_doorbells *
 993                                      sizeof(u32));
 994         if (adev->doorbell.ptr == NULL)
 995                 return -ENOMEM;
 996
 997         return 0;
 998 }
 999
1000 /**
1001  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1002  *
1003  * @adev: amdgpu_device pointer
1004  *
1005  * Tear down doorbell driver information (CIK)
1006  */
1007 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1008 {
1009         iounmap(adev->doorbell.ptr);
1010         adev->doorbell.ptr = NULL;
1011 }
1012
1013
1014
1015 /*
1016  * amdgpu_device_wb_*()
1017  * Writeback is the method by which the GPU updates special pages in memory
1018  * with the status of certain GPU events (fences, ring pointers,etc.).
1019  */
1020
1021 /**
1022  * amdgpu_device_wb_fini - Disable Writeback and free memory
1023  *
1024  * @adev: amdgpu_device pointer
1025  *
1026  * Disables Writeback and frees the Writeback memory (all asics).
1027  * Used at driver shutdown.
1028  */
1029 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1030 {
1031         if (adev->wb.wb_obj) {
1032                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1033                                       &adev->wb.gpu_addr,
1034                                       (void **)&adev->wb.wb);
1035                 adev->wb.wb_obj = NULL;
1036         }
1037 }
1038
1039 /**
1040  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1041  *
1042  * @adev: amdgpu_device pointer
1043  *
1044  * Initializes writeback and allocates writeback memory (all asics).
1045  * Used at driver startup.
1046  * Returns 0 on success or an -error on failure.
1047  */
1048 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1049 {
1050         int r;
1051
1052         if (adev->wb.wb_obj == NULL) {
1053                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1054                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1055                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1056                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1057                                             (void **)&adev->wb.wb);
1058                 if (r) {
1059                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1060                         return r;
1061                 }
1062
1063                 adev->wb.num_wb = AMDGPU_MAX_WB;
1064                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1065
1066                 /* clear wb memory */
1067                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1068         }
1069
1070         return 0;
1071 }
1072
1073 /**
1074  * amdgpu_device_wb_get - Allocate a wb entry
1075  *
1076  * @adev: amdgpu_device pointer
1077  * @wb: wb index
1078  *
1079  * Allocate a wb slot for use by the driver (all asics).
1080  * Returns 0 on success or -EINVAL on failure.
1081  */
1082 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1083 {
1084         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1085
1086         if (offset < adev->wb.num_wb) {
1087                 __set_bit(offset, adev->wb.used);
1088                 *wb = offset << 3; /* convert to dw offset */
1089                 return 0;
1090         } else {
1091                 return -EINVAL;
1092         }
1093 }
1094
1095 /**
1096  * amdgpu_device_wb_free - Free a wb entry
1097  *
1098  * @adev: amdgpu_device pointer
1099  * @wb: wb index
1100  *
1101  * Free a wb slot allocated for use by the driver (all asics)
1102  */
1103 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1104 {
1105         wb >>= 3;
1106         if (wb < adev->wb.num_wb)
1107                 __clear_bit(wb, adev->wb.used);
1108 }
1109
1110 /**
1111  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1112  *
1113  * @adev: amdgpu_device pointer
1114  *
1115  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1116  * to fail, but if any of the BARs is not accessible after the size we abort
1117  * driver loading by returning -ENODEV.
1118  */
1119 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1120 {
1121         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1122         struct pci_bus *root;
1123         struct resource *res;
1124         unsigned i;
1125         u16 cmd;
1126         int r;
1127
1128         /* Bypass for VF */
1129         if (amdgpu_sriov_vf(adev))
1130                 return 0;
1131
1132         /* skip if the bios has already enabled large BAR */
1133         if (adev->gmc.real_vram_size &&
1134             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1135                 return 0;
1136
1137         /* Check if the root BUS has 64bit memory resources */
1138         root = adev->pdev->bus;
1139         while (root->parent)
1140                 root = root->parent;
1141
1142         pci_bus_for_each_resource(root, res, i) {
1143                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1144                     res->start > 0x100000000ull)
1145                         break;
1146         }
1147
1148         /* Trying to resize is pointless without a root hub window above 4GB */
1149         if (!res)
1150                 return 0;
1151
1152         /* Limit the BAR size to what is available */
1153         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1154                         rbar_size);
1155
1156         /* Disable memory decoding while we change the BAR addresses and size */
1157         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1158         pci_write_config_word(adev->pdev, PCI_COMMAND,
1159                               cmd & ~PCI_COMMAND_MEMORY);
1160
1161         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1162         amdgpu_device_doorbell_fini(adev);
1163         if (adev->asic_type >= CHIP_BONAIRE)
1164                 pci_release_resource(adev->pdev, 2);
1165
1166         pci_release_resource(adev->pdev, 0);
1167
1168         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1169         if (r == -ENOSPC)
1170                 DRM_INFO("Not enough PCI address space for a large BAR.");
1171         else if (r && r != -ENOTSUPP)
1172                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1173
1174         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1175
1176         /* When the doorbell or fb BAR isn't available we have no chance of
1177          * using the device.
1178          */
1179         r = amdgpu_device_doorbell_init(adev);
1180         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1181                 return -ENODEV;
1182
1183         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1184
1185         return 0;
1186 }
1187
1188 /*
 * GPU helper functions.
1190  */
1191 /**
1192  * amdgpu_device_need_post - check if the hw need post or not
1193  *
1194  * @adev: amdgpu_device pointer
1195  *
1196  * Check if the asic has been initialized (all asics) at driver startup
1197  * or post is needed if  hw reset is performed.
1198  * Returns true if need or false if not.
1199  */
1200 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1201 {
1202         uint32_t reg;
1203
1204         if (amdgpu_sriov_vf(adev))
1205                 return false;
1206
1207         if (amdgpu_passthrough(adev)) {
1208                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1209                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1210                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1211                  * vpost executed for smc version below 22.15
1212                  */
1213                 if (adev->asic_type == CHIP_FIJI) {
1214                         int err;
1215                         uint32_t fw_ver;
1216                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1217                         /* force vPost if error occured */
1218                         if (err)
1219                                 return true;
1220
1221                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1222                         if (fw_ver < 0x00160e00)
1223                                 return true;
1224                 }
1225         }
1226
1227         if (adev->has_hw_reset) {
1228                 adev->has_hw_reset = false;
1229                 return true;
1230         }
1231
1232         /* bios scratch used on CIK+ */
1233         if (adev->asic_type >= CHIP_BONAIRE)
1234                 return amdgpu_atombios_scratch_need_asic_init(adev);
1235
1236         /* check MEM_SIZE for older asics */
1237         reg = amdgpu_asic_get_config_memsize(adev);
1238
1239         if ((reg != 0) && (reg != 0xffffffff))
1240                 return false;
1241
1242         return true;
1243 }
1244
1245 /* if we get transitioned to only one device, take VGA back */
1246 /**
1247  * amdgpu_device_vga_set_decode - enable/disable vga decode
1248  *
1249  * @cookie: amdgpu_device pointer
1250  * @state: enable/disable vga decode
1251  *
1252  * Enable/disable vga decode (all asics).
1253  * Returns VGA resource flags.
1254  */
1255 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1256 {
1257         struct amdgpu_device *adev = cookie;
1258         amdgpu_asic_set_vga_state(adev, state);
1259         if (state)
1260                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1261                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1262         else
1263                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1264 }
1265
1266 /**
1267  * amdgpu_device_check_block_size - validate the vm block size
1268  *
1269  * @adev: amdgpu_device pointer
1270  *
1271  * Validates the vm block size specified via module parameter.
1272  * The vm block size defines number of bits in page table versus page directory,
1273  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1274  * page table and the remaining bits are in the page directory.
1275  */
1276 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1277 {
1278         /* defines number of bits in page table versus page directory,
1279          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1280          * page table and the remaining bits are in the page directory */
1281         if (amdgpu_vm_block_size == -1)
1282                 return;
1283
1284         if (amdgpu_vm_block_size < 9) {
1285                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1286                          amdgpu_vm_block_size);
1287                 amdgpu_vm_block_size = -1;
1288         }
1289 }
1290
1291 /**
1292  * amdgpu_device_check_vm_size - validate the vm size
1293  *
1294  * @adev: amdgpu_device pointer
1295  *
1296  * Validates the vm size in GB specified via module parameter.
1297  * The VM size is the size of the GPU virtual memory space in GB.
1298  */
1299 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1300 {
1301         /* no need to check the default value */
1302         if (amdgpu_vm_size == -1)
1303                 return;
1304
1305         if (amdgpu_vm_size < 1) {
1306                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1307                          amdgpu_vm_size);
1308                 amdgpu_vm_size = -1;
1309         }
1310 }
1311
1312 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1313 {
1314         struct sysinfo si;
1315         bool is_os_64 = (sizeof(void *) == 8);
1316         uint64_t total_memory;
1317         uint64_t dram_size_seven_GB = 0x1B8000000;
1318         uint64_t dram_size_three_GB = 0xB8000000;
1319
1320         if (amdgpu_smu_memory_pool_size == 0)
1321                 return;
1322
1323         if (!is_os_64) {
1324                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1325                 goto def_value;
1326         }
1327         si_meminfo(&si);
1328         total_memory = (uint64_t)si.totalram * si.mem_unit;
1329
1330         if ((amdgpu_smu_memory_pool_size == 1) ||
1331                 (amdgpu_smu_memory_pool_size == 2)) {
1332                 if (total_memory < dram_size_three_GB)
1333                         goto def_value1;
1334         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1335                 (amdgpu_smu_memory_pool_size == 8)) {
1336                 if (total_memory < dram_size_seven_GB)
1337                         goto def_value1;
1338         } else {
1339                 DRM_WARN("Smu memory pool size not supported\n");
1340                 goto def_value;
1341         }
1342         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1343
1344         return;
1345
1346 def_value1:
1347         DRM_WARN("No enough system memory\n");
1348 def_value:
1349         adev->pm.smu_prv_buffer_size = 0;
1350 }
1351
1352 /**
1353  * amdgpu_device_check_arguments - validate module params
1354  *
1355  * @adev: amdgpu_device pointer
1356  *
1357  * Validates certain module parameters and updates
1358  * the associated values used by the driver (all asics).
1359  */
1360 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1361 {
1362         if (amdgpu_sched_jobs < 4) {
1363                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1364                          amdgpu_sched_jobs);
1365                 amdgpu_sched_jobs = 4;
1366         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1367                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1368                          amdgpu_sched_jobs);
1369                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1370         }
1371
1372         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1373                 /* gart size must be greater or equal to 32M */
1374                 dev_warn(adev->dev, "gart size (%d) too small\n",
1375                          amdgpu_gart_size);
1376                 amdgpu_gart_size = -1;
1377         }
1378
1379         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1380                 /* gtt size must be greater or equal to 32M */
1381                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1382                                  amdgpu_gtt_size);
1383                 amdgpu_gtt_size = -1;
1384         }
1385
1386         /* valid range is between 4 and 9 inclusive */
1387         if (amdgpu_vm_fragment_size != -1 &&
1388             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1389                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1390                 amdgpu_vm_fragment_size = -1;
1391         }
1392
1393         if (amdgpu_sched_hw_submission < 2) {
1394                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1395                          amdgpu_sched_hw_submission);
1396                 amdgpu_sched_hw_submission = 2;
1397         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1398                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1399                          amdgpu_sched_hw_submission);
1400                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1401         }
1402
1403         amdgpu_device_check_smu_prv_buffer_size(adev);
1404
1405         amdgpu_device_check_vm_size(adev);
1406
1407         amdgpu_device_check_block_size(adev);
1408
1409         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1410
1411         amdgpu_gmc_tmz_set(adev);
1412
1413         amdgpu_gmc_noretry_set(adev);
1414
1415         return 0;
1416 }
1417
1418 /**
1419  * amdgpu_switcheroo_set_state - set switcheroo state
1420  *
1421  * @pdev: pci dev pointer
1422  * @state: vga_switcheroo state
1423  *
1424  * Callback for the switcheroo driver.  Suspends or resumes the
1425  * the asics before or after it is powered up using ACPI methods.
1426  */
1427 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1428                                         enum vga_switcheroo_state state)
1429 {
1430         struct drm_device *dev = pci_get_drvdata(pdev);
1431         int r;
1432
1433         if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF)
1434                 return;
1435
1436         if (state == VGA_SWITCHEROO_ON) {
1437                 pr_info("switched on\n");
1438                 /* don't suspend or resume card normally */
1439                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1440
1441                 pci_set_power_state(pdev, PCI_D0);
1442                 amdgpu_device_load_pci_state(pdev);
1443                 r = pci_enable_device(pdev);
1444                 if (r)
1445                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1446                 amdgpu_device_resume(dev, true);
1447
1448                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1449         } else {
1450                 pr_info("switched off\n");
1451                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1452                 amdgpu_device_suspend(dev, true);
1453                 amdgpu_device_cache_pci_state(pdev);
1454                 /* Shut down the device */
1455                 pci_disable_device(pdev);
1456                 pci_set_power_state(pdev, PCI_D3cold);
1457                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1458         }
1459 }
1460
1461 /**
1462  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1463  *
1464  * @pdev: pci dev pointer
1465  *
1466  * Callback for the switcheroo driver.  Check of the switcheroo
1467  * state can be changed.
1468  * Returns true if the state can be changed, false if not.
1469  */
1470 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1471 {
1472         struct drm_device *dev = pci_get_drvdata(pdev);
1473
1474         /*
1475         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1476         * locking inversion with the driver load path. And the access here is
1477         * completely racy anyway. So don't bother with locking for now.
1478         */
1479         return atomic_read(&dev->open_count) == 0;
1480 }
1481
1482 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1483         .set_gpu_state = amdgpu_switcheroo_set_state,
1484         .reprobe = NULL,
1485         .can_switch = amdgpu_switcheroo_can_switch,
1486 };
1487
1488 /**
1489  * amdgpu_device_ip_set_clockgating_state - set the CG state
1490  *
1491  * @dev: amdgpu_device pointer
1492  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1493  * @state: clockgating state (gate or ungate)
1494  *
1495  * Sets the requested clockgating state for all instances of
1496  * the hardware IP specified.
1497  * Returns the error code from the last instance.
1498  */
1499 int amdgpu_device_ip_set_clockgating_state(void *dev,
1500                                            enum amd_ip_block_type block_type,
1501                                            enum amd_clockgating_state state)
1502 {
1503         struct amdgpu_device *adev = dev;
1504         int i, r = 0;
1505
1506         for (i = 0; i < adev->num_ip_blocks; i++) {
1507                 if (!adev->ip_blocks[i].status.valid)
1508                         continue;
1509                 if (adev->ip_blocks[i].version->type != block_type)
1510                         continue;
1511                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1512                         continue;
1513                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1514                         (void *)adev, state);
1515                 if (r)
1516                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1517                                   adev->ip_blocks[i].version->funcs->name, r);
1518         }
1519         return r;
1520 }
1521
1522 /**
1523  * amdgpu_device_ip_set_powergating_state - set the PG state
1524  *
1525  * @dev: amdgpu_device pointer
1526  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1527  * @state: powergating state (gate or ungate)
1528  *
1529  * Sets the requested powergating state for all instances of
1530  * the hardware IP specified.
1531  * Returns the error code from the last instance.
1532  */
1533 int amdgpu_device_ip_set_powergating_state(void *dev,
1534                                            enum amd_ip_block_type block_type,
1535                                            enum amd_powergating_state state)
1536 {
1537         struct amdgpu_device *adev = dev;
1538         int i, r = 0;
1539
1540         for (i = 0; i < adev->num_ip_blocks; i++) {
1541                 if (!adev->ip_blocks[i].status.valid)
1542                         continue;
1543                 if (adev->ip_blocks[i].version->type != block_type)
1544                         continue;
1545                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1546                         continue;
1547                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1548                         (void *)adev, state);
1549                 if (r)
1550                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1551                                   adev->ip_blocks[i].version->funcs->name, r);
1552         }
1553         return r;
1554 }
1555
1556 /**
1557  * amdgpu_device_ip_get_clockgating_state - get the CG state
1558  *
1559  * @adev: amdgpu_device pointer
1560  * @flags: clockgating feature flags
1561  *
1562  * Walks the list of IPs on the device and updates the clockgating
1563  * flags for each IP.
1564  * Updates @flags with the feature flags for each hardware IP where
1565  * clockgating is enabled.
1566  */
1567 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1568                                             u32 *flags)
1569 {
1570         int i;
1571
1572         for (i = 0; i < adev->num_ip_blocks; i++) {
1573                 if (!adev->ip_blocks[i].status.valid)
1574                         continue;
1575                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1576                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1577         }
1578 }
1579
1580 /**
1581  * amdgpu_device_ip_wait_for_idle - wait for idle
1582  *
1583  * @adev: amdgpu_device pointer
1584  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1585  *
1586  * Waits for the request hardware IP to be idle.
1587  * Returns 0 for success or a negative error code on failure.
1588  */
1589 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1590                                    enum amd_ip_block_type block_type)
1591 {
1592         int i, r;
1593
1594         for (i = 0; i < adev->num_ip_blocks; i++) {
1595                 if (!adev->ip_blocks[i].status.valid)
1596                         continue;
1597                 if (adev->ip_blocks[i].version->type == block_type) {
1598                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1599                         if (r)
1600                                 return r;
1601                         break;
1602                 }
1603         }
1604         return 0;
1605
1606 }
1607
1608 /**
1609  * amdgpu_device_ip_is_idle - is the hardware IP idle
1610  *
1611  * @adev: amdgpu_device pointer
1612  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1613  *
1614  * Check if the hardware IP is idle or not.
1615  * Returns true if it the IP is idle, false if not.
1616  */
1617 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1618                               enum amd_ip_block_type block_type)
1619 {
1620         int i;
1621
1622         for (i = 0; i < adev->num_ip_blocks; i++) {
1623                 if (!adev->ip_blocks[i].status.valid)
1624                         continue;
1625                 if (adev->ip_blocks[i].version->type == block_type)
1626                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1627         }
1628         return true;
1629
1630 }
1631
1632 /**
1633  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1634  *
1635  * @adev: amdgpu_device pointer
1636  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1637  *
1638  * Returns a pointer to the hardware IP block structure
1639  * if it exists for the asic, otherwise NULL.
1640  */
1641 struct amdgpu_ip_block *
1642 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1643                               enum amd_ip_block_type type)
1644 {
1645         int i;
1646
1647         for (i = 0; i < adev->num_ip_blocks; i++)
1648                 if (adev->ip_blocks[i].version->type == type)
1649                         return &adev->ip_blocks[i];
1650
1651         return NULL;
1652 }
1653
1654 /**
1655  * amdgpu_device_ip_block_version_cmp
1656  *
1657  * @adev: amdgpu_device pointer
1658  * @type: enum amd_ip_block_type
1659  * @major: major version
1660  * @minor: minor version
1661  *
1662  * return 0 if equal or greater
1663  * return 1 if smaller or the ip_block doesn't exist
1664  */
1665 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1666                                        enum amd_ip_block_type type,
1667                                        u32 major, u32 minor)
1668 {
1669         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1670
1671         if (ip_block && ((ip_block->version->major > major) ||
1672                         ((ip_block->version->major == major) &&
1673                         (ip_block->version->minor >= minor))))
1674                 return 0;
1675
1676         return 1;
1677 }
1678
1679 /**
1680  * amdgpu_device_ip_block_add
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @ip_block_version: pointer to the IP to add
1684  *
1685  * Adds the IP block driver information to the collection of IPs
1686  * on the asic.
1687  */
1688 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1689                                const struct amdgpu_ip_block_version *ip_block_version)
1690 {
1691         if (!ip_block_version)
1692                 return -EINVAL;
1693
1694         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1695                   ip_block_version->funcs->name);
1696
1697         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1698
1699         return 0;
1700 }
1701
1702 /**
1703  * amdgpu_device_enable_virtual_display - enable virtual display feature
1704  *
1705  * @adev: amdgpu_device pointer
1706  *
1707  * Enabled the virtual display feature if the user has enabled it via
1708  * the module parameter virtual_display.  This feature provides a virtual
1709  * display hardware on headless boards or in virtualized environments.
1710  * This function parses and validates the configuration string specified by
1711  * the user and configues the virtual display configuration (number of
1712  * virtual connectors, crtcs, etc.) specified.
1713  */
1714 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1715 {
1716         adev->enable_virtual_display = false;
1717
1718         if (amdgpu_virtual_display) {
1719                 const char *pci_address_name = pci_name(adev->pdev);
1720                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1721
1722                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1723                 pciaddstr_tmp = pciaddstr;
1724                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1725                         pciaddname = strsep(&pciaddname_tmp, ",");
1726                         if (!strcmp("all", pciaddname)
1727                             || !strcmp(pci_address_name, pciaddname)) {
1728                                 long num_crtc;
1729                                 int res = -1;
1730
1731                                 adev->enable_virtual_display = true;
1732
1733                                 if (pciaddname_tmp)
1734                                         res = kstrtol(pciaddname_tmp, 10,
1735                                                       &num_crtc);
1736
1737                                 if (!res) {
1738                                         if (num_crtc < 1)
1739                                                 num_crtc = 1;
1740                                         if (num_crtc > 6)
1741                                                 num_crtc = 6;
1742                                         adev->mode_info.num_crtc = num_crtc;
1743                                 } else {
1744                                         adev->mode_info.num_crtc = 1;
1745                                 }
1746                                 break;
1747                         }
1748                 }
1749
1750                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1751                          amdgpu_virtual_display, pci_address_name,
1752                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1753
1754                 kfree(pciaddstr);
1755         }
1756 }
1757
1758 /**
1759  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1760  *
1761  * @adev: amdgpu_device pointer
1762  *
1763  * Parses the asic configuration parameters specified in the gpu info
1764  * firmware and makes them availale to the driver for use in configuring
1765  * the asic.
1766  * Returns 0 on success, -EINVAL on failure.
1767  */
1768 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1769 {
1770         const char *chip_name;
1771         char fw_name[40];
1772         int err;
1773         const struct gpu_info_firmware_header_v1_0 *hdr;
1774
1775         adev->firmware.gpu_info_fw = NULL;
1776
1777         if (adev->mman.discovery_bin) {
1778                 amdgpu_discovery_get_gfx_info(adev);
1779
1780                 /*
1781                  * FIXME: The bounding box is still needed by Navi12, so
1782                  * temporarily read it from gpu_info firmware. Should be droped
1783                  * when DAL no longer needs it.
1784                  */
1785                 if (adev->asic_type != CHIP_NAVI12)
1786                         return 0;
1787         }
1788
1789         switch (adev->asic_type) {
1790 #ifdef CONFIG_DRM_AMDGPU_SI
1791         case CHIP_VERDE:
1792         case CHIP_TAHITI:
1793         case CHIP_PITCAIRN:
1794         case CHIP_OLAND:
1795         case CHIP_HAINAN:
1796 #endif
1797 #ifdef CONFIG_DRM_AMDGPU_CIK
1798         case CHIP_BONAIRE:
1799         case CHIP_HAWAII:
1800         case CHIP_KAVERI:
1801         case CHIP_KABINI:
1802         case CHIP_MULLINS:
1803 #endif
1804         case CHIP_TOPAZ:
1805         case CHIP_TONGA:
1806         case CHIP_FIJI:
1807         case CHIP_POLARIS10:
1808         case CHIP_POLARIS11:
1809         case CHIP_POLARIS12:
1810         case CHIP_VEGAM:
1811         case CHIP_CARRIZO:
1812         case CHIP_STONEY:
1813         case CHIP_VEGA20:
1814         case CHIP_ALDEBARAN:
1815         case CHIP_SIENNA_CICHLID:
1816         case CHIP_NAVY_FLOUNDER:
1817         case CHIP_DIMGREY_CAVEFISH:
1818         default:
1819                 return 0;
1820         case CHIP_VEGA10:
1821                 chip_name = "vega10";
1822                 break;
1823         case CHIP_VEGA12:
1824                 chip_name = "vega12";
1825                 break;
1826         case CHIP_RAVEN:
1827                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1828                         chip_name = "raven2";
1829                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1830                         chip_name = "picasso";
1831                 else
1832                         chip_name = "raven";
1833                 break;
1834         case CHIP_ARCTURUS:
1835                 chip_name = "arcturus";
1836                 break;
1837         case CHIP_RENOIR:
1838                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1839                         chip_name = "renoir";
1840                 else
1841                         chip_name = "green_sardine";
1842                 break;
1843         case CHIP_NAVI10:
1844                 chip_name = "navi10";
1845                 break;
1846         case CHIP_NAVI14:
1847                 chip_name = "navi14";
1848                 break;
1849         case CHIP_NAVI12:
1850                 chip_name = "navi12";
1851                 break;
1852         case CHIP_VANGOGH:
1853                 chip_name = "vangogh";
1854                 break;
1855         }
1856
1857         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1858         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1859         if (err) {
1860                 dev_err(adev->dev,
1861                         "Failed to load gpu_info firmware \"%s\"\n",
1862                         fw_name);
1863                 goto out;
1864         }
1865         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1866         if (err) {
1867                 dev_err(adev->dev,
1868                         "Failed to validate gpu_info firmware \"%s\"\n",
1869                         fw_name);
1870                 goto out;
1871         }
1872
1873         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1874         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1875
1876         switch (hdr->version_major) {
1877         case 1:
1878         {
1879                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1880                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1881                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1882
1883                 /*
1884                  * Should be droped when DAL no longer needs it.
1885                  */
1886                 if (adev->asic_type == CHIP_NAVI12)
1887                         goto parse_soc_bounding_box;
1888
1889                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1890                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1891                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1892                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1893                 adev->gfx.config.max_texture_channel_caches =
1894                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1895                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1896                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1897                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1898                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1899                 adev->gfx.config.double_offchip_lds_buf =
1900                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1901                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1902                 adev->gfx.cu_info.max_waves_per_simd =
1903                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1904                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1905                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1906                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1907                 if (hdr->version_minor >= 1) {
1908                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1909                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1910                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1911                         adev->gfx.config.num_sc_per_sh =
1912                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1913                         adev->gfx.config.num_packer_per_sc =
1914                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1915                 }
1916
1917 parse_soc_bounding_box:
1918                 /*
1919                  * soc bounding box info is not integrated in disocovery table,
1920                  * we always need to parse it from gpu info firmware if needed.
1921                  */
1922                 if (hdr->version_minor == 2) {
1923                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1924                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1925                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1926                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1927                 }
1928                 break;
1929         }
1930         default:
1931                 dev_err(adev->dev,
1932                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1933                 err = -EINVAL;
1934                 goto out;
1935         }
1936 out:
1937         return err;
1938 }
1939
1940 /**
1941  * amdgpu_device_ip_early_init - run early init for hardware IPs
1942  *
1943  * @adev: amdgpu_device pointer
1944  *
1945  * Early initialization pass for hardware IPs.  The hardware IPs that make
1946  * up each asic are discovered each IP's early_init callback is run.  This
1947  * is the first stage in initializing the asic.
1948  * Returns 0 on success, negative error code on failure.
1949  */
1950 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1951 {
1952         int i, r;
1953
1954         amdgpu_device_enable_virtual_display(adev);
1955
1956         if (amdgpu_sriov_vf(adev)) {
1957                 r = amdgpu_virt_request_full_gpu(adev, true);
1958                 if (r)
1959                         return r;
1960         }
1961
1962         switch (adev->asic_type) {
1963 #ifdef CONFIG_DRM_AMDGPU_SI
1964         case CHIP_VERDE:
1965         case CHIP_TAHITI:
1966         case CHIP_PITCAIRN:
1967         case CHIP_OLAND:
1968         case CHIP_HAINAN:
1969                 adev->family = AMDGPU_FAMILY_SI;
1970                 r = si_set_ip_blocks(adev);
1971                 if (r)
1972                         return r;
1973                 break;
1974 #endif
1975 #ifdef CONFIG_DRM_AMDGPU_CIK
1976         case CHIP_BONAIRE:
1977         case CHIP_HAWAII:
1978         case CHIP_KAVERI:
1979         case CHIP_KABINI:
1980         case CHIP_MULLINS:
1981                 if (adev->flags & AMD_IS_APU)
1982                         adev->family = AMDGPU_FAMILY_KV;
1983                 else
1984                         adev->family = AMDGPU_FAMILY_CI;
1985
1986                 r = cik_set_ip_blocks(adev);
1987                 if (r)
1988                         return r;
1989                 break;
1990 #endif
1991         case CHIP_TOPAZ:
1992         case CHIP_TONGA:
1993         case CHIP_FIJI:
1994         case CHIP_POLARIS10:
1995         case CHIP_POLARIS11:
1996         case CHIP_POLARIS12:
1997         case CHIP_VEGAM:
1998         case CHIP_CARRIZO:
1999         case CHIP_STONEY:
2000                 if (adev->flags & AMD_IS_APU)
2001                         adev->family = AMDGPU_FAMILY_CZ;
2002                 else
2003                         adev->family = AMDGPU_FAMILY_VI;
2004
2005                 r = vi_set_ip_blocks(adev);
2006                 if (r)
2007                         return r;
2008                 break;
2009         case CHIP_VEGA10:
2010         case CHIP_VEGA12:
2011         case CHIP_VEGA20:
2012         case CHIP_RAVEN:
2013         case CHIP_ARCTURUS:
2014         case CHIP_RENOIR:
2015         case CHIP_ALDEBARAN:
2016                 if (adev->flags & AMD_IS_APU)
2017                         adev->family = AMDGPU_FAMILY_RV;
2018                 else
2019                         adev->family = AMDGPU_FAMILY_AI;
2020
2021                 r = soc15_set_ip_blocks(adev);
2022                 if (r)
2023                         return r;
2024                 break;
2025         case  CHIP_NAVI10:
2026         case  CHIP_NAVI14:
2027         case  CHIP_NAVI12:
2028         case  CHIP_SIENNA_CICHLID:
2029         case  CHIP_NAVY_FLOUNDER:
2030         case  CHIP_DIMGREY_CAVEFISH:
2031         case CHIP_VANGOGH:
2032                 if (adev->asic_type == CHIP_VANGOGH)
2033                         adev->family = AMDGPU_FAMILY_VGH;
2034                 else
2035                         adev->family = AMDGPU_FAMILY_NV;
2036
2037                 r = nv_set_ip_blocks(adev);
2038                 if (r)
2039                         return r;
2040                 break;
2041         default:
2042                 /* FIXME: not supported yet */
2043                 return -EINVAL;
2044         }
2045
2046         amdgpu_amdkfd_device_probe(adev);
2047
2048         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2049         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2050                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2051         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2052                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2053
2054         for (i = 0; i < adev->num_ip_blocks; i++) {
2055                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2056                         DRM_ERROR("disabled ip block: %d <%s>\n",
2057                                   i, adev->ip_blocks[i].version->funcs->name);
2058                         adev->ip_blocks[i].status.valid = false;
2059                 } else {
2060                         if (adev->ip_blocks[i].version->funcs->early_init) {
2061                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2062                                 if (r == -ENOENT) {
2063                                         adev->ip_blocks[i].status.valid = false;
2064                                 } else if (r) {
2065                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2066                                                   adev->ip_blocks[i].version->funcs->name, r);
2067                                         return r;
2068                                 } else {
2069                                         adev->ip_blocks[i].status.valid = true;
2070                                 }
2071                         } else {
2072                                 adev->ip_blocks[i].status.valid = true;
2073                         }
2074                 }
2075                 /* get the vbios after the asic_funcs are set up */
2076                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2077                         r = amdgpu_device_parse_gpu_info_fw(adev);
2078                         if (r)
2079                                 return r;
2080
2081                         /* Read BIOS */
2082                         if (!amdgpu_get_bios(adev))
2083                                 return -EINVAL;
2084
2085                         r = amdgpu_atombios_init(adev);
2086                         if (r) {
2087                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2088                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2089                                 return r;
2090                         }
2091                 }
2092         }
2093
2094         adev->cg_flags &= amdgpu_cg_mask;
2095         adev->pg_flags &= amdgpu_pg_mask;
2096
2097         return 0;
2098 }
2099
2100 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2101 {
2102         int i, r;
2103
2104         for (i = 0; i < adev->num_ip_blocks; i++) {
2105                 if (!adev->ip_blocks[i].status.sw)
2106                         continue;
2107                 if (adev->ip_blocks[i].status.hw)
2108                         continue;
2109                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2110                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2111                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2112                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2113                         if (r) {
2114                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2115                                           adev->ip_blocks[i].version->funcs->name, r);
2116                                 return r;
2117                         }
2118                         adev->ip_blocks[i].status.hw = true;
2119                 }
2120         }
2121
2122         return 0;
2123 }
2124
2125 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2126 {
2127         int i, r;
2128
2129         for (i = 0; i < adev->num_ip_blocks; i++) {
2130                 if (!adev->ip_blocks[i].status.sw)
2131                         continue;
2132                 if (adev->ip_blocks[i].status.hw)
2133                         continue;
2134                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2135                 if (r) {
2136                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2137                                   adev->ip_blocks[i].version->funcs->name, r);
2138                         return r;
2139                 }
2140                 adev->ip_blocks[i].status.hw = true;
2141         }
2142
2143         return 0;
2144 }
2145
2146 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2147 {
2148         int r = 0;
2149         int i;
2150         uint32_t smu_version;
2151
2152         if (adev->asic_type >= CHIP_VEGA10) {
2153                 for (i = 0; i < adev->num_ip_blocks; i++) {
2154                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2155                                 continue;
2156
2157                         /* no need to do the fw loading again if already done*/
2158                         if (adev->ip_blocks[i].status.hw == true)
2159                                 break;
2160
2161                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2162                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2163                                 if (r) {
2164                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2165                                                           adev->ip_blocks[i].version->funcs->name, r);
2166                                         return r;
2167                                 }
2168                         } else {
2169                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2170                                 if (r) {
2171                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2172                                                           adev->ip_blocks[i].version->funcs->name, r);
2173                                         return r;
2174                                 }
2175                         }
2176
2177                         adev->ip_blocks[i].status.hw = true;
2178                         break;
2179                 }
2180         }
2181
2182         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2183                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2184
2185         return r;
2186 }
2187
2188 /**
2189  * amdgpu_device_ip_init - run init for hardware IPs
2190  *
2191  * @adev: amdgpu_device pointer
2192  *
2193  * Main initialization pass for hardware IPs.  The list of all the hardware
2194  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2195  * are run.  sw_init initializes the software state associated with each IP
2196  * and hw_init initializes the hardware associated with each IP.
2197  * Returns 0 on success, negative error code on failure.
2198  */
2199 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2200 {
2201         int i, r;
2202
2203         r = amdgpu_ras_init(adev);
2204         if (r)
2205                 return r;
2206
2207         for (i = 0; i < adev->num_ip_blocks; i++) {
2208                 if (!adev->ip_blocks[i].status.valid)
2209                         continue;
2210                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2211                 if (r) {
2212                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2213                                   adev->ip_blocks[i].version->funcs->name, r);
2214                         goto init_failed;
2215                 }
2216                 adev->ip_blocks[i].status.sw = true;
2217
2218                 /* need to do gmc hw init early so we can allocate gpu mem */
2219                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2220                         r = amdgpu_device_vram_scratch_init(adev);
2221                         if (r) {
2222                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2223                                 goto init_failed;
2224                         }
2225                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2226                         if (r) {
2227                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2228                                 goto init_failed;
2229                         }
2230                         r = amdgpu_device_wb_init(adev);
2231                         if (r) {
2232                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2233                                 goto init_failed;
2234                         }
2235                         adev->ip_blocks[i].status.hw = true;
2236
2237                         /* right after GMC hw init, we create CSA */
2238                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2239                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2240                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2241                                                                 AMDGPU_CSA_SIZE);
2242                                 if (r) {
2243                                         DRM_ERROR("allocate CSA failed %d\n", r);
2244                                         goto init_failed;
2245                                 }
2246                         }
2247                 }
2248         }
2249
2250         if (amdgpu_sriov_vf(adev))
2251                 amdgpu_virt_init_data_exchange(adev);
2252
2253         r = amdgpu_ib_pool_init(adev);
2254         if (r) {
2255                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2256                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2257                 goto init_failed;
2258         }
2259
2260         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2261         if (r)
2262                 goto init_failed;
2263
2264         r = amdgpu_device_ip_hw_init_phase1(adev);
2265         if (r)
2266                 goto init_failed;
2267
2268         r = amdgpu_device_fw_loading(adev);
2269         if (r)
2270                 goto init_failed;
2271
2272         r = amdgpu_device_ip_hw_init_phase2(adev);
2273         if (r)
2274                 goto init_failed;
2275
2276         /*
2277          * retired pages will be loaded from eeprom and reserved here,
2278          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2279          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2280          * for I2C communication which only true at this point.
2281          *
2282          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2283          * failure from bad gpu situation and stop amdgpu init process
2284          * accordingly. For other failed cases, it will still release all
2285          * the resource and print error message, rather than returning one
2286          * negative value to upper level.
2287          *
2288          * Note: theoretically, this should be called before all vram allocations
2289          * to protect retired page from abusing
2290          */
2291         r = amdgpu_ras_recovery_init(adev);
2292         if (r)
2293                 goto init_failed;
2294
2295         if (adev->gmc.xgmi.num_physical_nodes > 1)
2296                 amdgpu_xgmi_add_device(adev);
2297         amdgpu_amdkfd_device_init(adev);
2298
2299         amdgpu_fru_get_product_info(adev);
2300
2301 init_failed:
2302         if (amdgpu_sriov_vf(adev))
2303                 amdgpu_virt_release_full_gpu(adev, true);
2304
2305         return r;
2306 }
2307
2308 /**
2309  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2310  *
2311  * @adev: amdgpu_device pointer
2312  *
2313  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2314  * this function before a GPU reset.  If the value is retained after a
2315  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2316  */
2317 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2318 {
2319         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2320 }
2321
2322 /**
2323  * amdgpu_device_check_vram_lost - check if vram is valid
2324  *
2325  * @adev: amdgpu_device pointer
2326  *
2327  * Checks the reset magic value written to the gart pointer in VRAM.
2328  * The driver calls this after a GPU reset to see if the contents of
2329  * VRAM is lost or now.
2330  * returns true if vram is lost, false if not.
2331  */
2332 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2333 {
2334         if (memcmp(adev->gart.ptr, adev->reset_magic,
2335                         AMDGPU_RESET_MAGIC_NUM))
2336                 return true;
2337
2338         if (!amdgpu_in_reset(adev))
2339                 return false;
2340
2341         /*
2342          * For all ASICs with baco/mode1 reset, the VRAM is
2343          * always assumed to be lost.
2344          */
2345         switch (amdgpu_asic_reset_method(adev)) {
2346         case AMD_RESET_METHOD_BACO:
2347         case AMD_RESET_METHOD_MODE1:
2348                 return true;
2349         default:
2350                 return false;
2351         }
2352 }
2353
2354 /**
2355  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2356  *
2357  * @adev: amdgpu_device pointer
2358  * @state: clockgating state (gate or ungate)
2359  *
2360  * The list of all the hardware IPs that make up the asic is walked and the
2361  * set_clockgating_state callbacks are run.
2362  * Late initialization pass enabling clockgating for hardware IPs.
2363  * Fini or suspend, pass disabling clockgating for hardware IPs.
2364  * Returns 0 on success, negative error code on failure.
2365  */
2366
2367 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2368                                                 enum amd_clockgating_state state)
2369 {
2370         int i, j, r;
2371
2372         if (amdgpu_emu_mode == 1)
2373                 return 0;
2374
2375         for (j = 0; j < adev->num_ip_blocks; j++) {
2376                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2377                 if (!adev->ip_blocks[i].status.late_initialized)
2378                         continue;
2379                 /* skip CG for VCE/UVD, it's handled specially */
2380                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2381                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2382                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2383                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2384                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2385                         /* enable clockgating to save power */
2386                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2387                                                                                      state);
2388                         if (r) {
2389                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2390                                           adev->ip_blocks[i].version->funcs->name, r);
2391                                 return r;
2392                         }
2393                 }
2394         }
2395
2396         return 0;
2397 }
2398
2399 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2400 {
2401         int i, j, r;
2402
2403         if (amdgpu_emu_mode == 1)
2404                 return 0;
2405
2406         for (j = 0; j < adev->num_ip_blocks; j++) {
2407                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2408                 if (!adev->ip_blocks[i].status.late_initialized)
2409                         continue;
2410                 /* skip CG for VCE/UVD, it's handled specially */
2411                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2412                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2413                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2414                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2415                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2416                         /* enable powergating to save power */
2417                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2418                                                                                         state);
2419                         if (r) {
2420                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2421                                           adev->ip_blocks[i].version->funcs->name, r);
2422                                 return r;
2423                         }
2424                 }
2425         }
2426         return 0;
2427 }
2428
2429 static int amdgpu_device_enable_mgpu_fan_boost(void)
2430 {
2431         struct amdgpu_gpu_instance *gpu_ins;
2432         struct amdgpu_device *adev;
2433         int i, ret = 0;
2434
2435         mutex_lock(&mgpu_info.mutex);
2436
2437         /*
2438          * MGPU fan boost feature should be enabled
2439          * only when there are two or more dGPUs in
2440          * the system
2441          */
2442         if (mgpu_info.num_dgpu < 2)
2443                 goto out;
2444
2445         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2446                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2447                 adev = gpu_ins->adev;
2448                 if (!(adev->flags & AMD_IS_APU) &&
2449                     !gpu_ins->mgpu_fan_enabled) {
2450                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2451                         if (ret)
2452                                 break;
2453
2454                         gpu_ins->mgpu_fan_enabled = 1;
2455                 }
2456         }
2457
2458 out:
2459         mutex_unlock(&mgpu_info.mutex);
2460
2461         return ret;
2462 }
2463
2464 /**
2465  * amdgpu_device_ip_late_init - run late init for hardware IPs
2466  *
2467  * @adev: amdgpu_device pointer
2468  *
2469  * Late initialization pass for hardware IPs.  The list of all the hardware
2470  * IPs that make up the asic is walked and the late_init callbacks are run.
2471  * late_init covers any special initialization that an IP requires
2472  * after all of the IPs have been initialized or something that needs to happen
2473  * late in the init process.
2474  * Returns 0 on success, negative error code on failure.
2475  */
2476 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2477 {
2478         struct amdgpu_gpu_instance *gpu_instance;
2479         int i = 0, r;
2480
2481         for (i = 0; i < adev->num_ip_blocks; i++) {
2482                 if (!adev->ip_blocks[i].status.hw)
2483                         continue;
2484                 if (adev->ip_blocks[i].version->funcs->late_init) {
2485                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2486                         if (r) {
2487                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2488                                           adev->ip_blocks[i].version->funcs->name, r);
2489                                 return r;
2490                         }
2491                 }
2492                 adev->ip_blocks[i].status.late_initialized = true;
2493         }
2494
2495         amdgpu_ras_set_error_query_ready(adev, true);
2496
2497         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2498         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2499
2500         amdgpu_device_fill_reset_magic(adev);
2501
2502         r = amdgpu_device_enable_mgpu_fan_boost();
2503         if (r)
2504                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2505
2506
2507         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2508                 mutex_lock(&mgpu_info.mutex);
2509
2510                 /*
2511                  * Reset device p-state to low as this was booted with high.
2512                  *
2513                  * This should be performed only after all devices from the same
2514                  * hive get initialized.
2515                  *
2516                  * However, it's unknown how many device in the hive in advance.
2517                  * As this is counted one by one during devices initializations.
2518                  *
2519                  * So, we wait for all XGMI interlinked devices initialized.
2520                  * This may bring some delays as those devices may come from
2521                  * different hives. But that should be OK.
2522                  */
2523                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2524                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2525                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2526                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2527                                         continue;
2528
2529                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2530                                                 AMDGPU_XGMI_PSTATE_MIN);
2531                                 if (r) {
2532                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2533                                         break;
2534                                 }
2535                         }
2536                 }
2537
2538                 mutex_unlock(&mgpu_info.mutex);
2539         }
2540
2541         return 0;
2542 }
2543
2544 /**
2545  * amdgpu_device_ip_fini - run fini for hardware IPs
2546  *
2547  * @adev: amdgpu_device pointer
2548  *
2549  * Main teardown pass for hardware IPs.  The list of all the hardware
2550  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2551  * are run.  hw_fini tears down the hardware associated with each IP
2552  * and sw_fini tears down any software state associated with each IP.
2553  * Returns 0 on success, negative error code on failure.
2554  */
2555 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2556 {
2557         int i, r;
2558
2559         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2560                 amdgpu_virt_release_ras_err_handler_data(adev);
2561
2562         amdgpu_ras_pre_fini(adev);
2563
2564         if (adev->gmc.xgmi.num_physical_nodes > 1)
2565                 amdgpu_xgmi_remove_device(adev);
2566
2567         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2568         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2569
2570         amdgpu_amdkfd_device_fini(adev);
2571
2572         /* need to disable SMC first */
2573         for (i = 0; i < adev->num_ip_blocks; i++) {
2574                 if (!adev->ip_blocks[i].status.hw)
2575                         continue;
2576                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2577                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2578                         /* XXX handle errors */
2579                         if (r) {
2580                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2581                                           adev->ip_blocks[i].version->funcs->name, r);
2582                         }
2583                         adev->ip_blocks[i].status.hw = false;
2584                         break;
2585                 }
2586         }
2587
2588         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2589                 if (!adev->ip_blocks[i].status.hw)
2590                         continue;
2591
2592                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2593                 /* XXX handle errors */
2594                 if (r) {
2595                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2596                                   adev->ip_blocks[i].version->funcs->name, r);
2597                 }
2598
2599                 adev->ip_blocks[i].status.hw = false;
2600         }
2601
2602
2603         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2604                 if (!adev->ip_blocks[i].status.sw)
2605                         continue;
2606
2607                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2608                         amdgpu_ucode_free_bo(adev);
2609                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2610                         amdgpu_device_wb_fini(adev);
2611                         amdgpu_device_vram_scratch_fini(adev);
2612                         amdgpu_ib_pool_fini(adev);
2613                 }
2614
2615                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2616                 /* XXX handle errors */
2617                 if (r) {
2618                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2619                                   adev->ip_blocks[i].version->funcs->name, r);
2620                 }
2621                 adev->ip_blocks[i].status.sw = false;
2622                 adev->ip_blocks[i].status.valid = false;
2623         }
2624
2625         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2626                 if (!adev->ip_blocks[i].status.late_initialized)
2627                         continue;
2628                 if (adev->ip_blocks[i].version->funcs->late_fini)
2629                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2630                 adev->ip_blocks[i].status.late_initialized = false;
2631         }
2632
2633         amdgpu_ras_fini(adev);
2634
2635         if (amdgpu_sriov_vf(adev))
2636                 if (amdgpu_virt_release_full_gpu(adev, false))
2637                         DRM_ERROR("failed to release exclusive mode on fini\n");
2638
2639         return 0;
2640 }
2641
2642 /**
2643  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2644  *
2645  * @work: work_struct.
2646  */
2647 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2648 {
2649         struct amdgpu_device *adev =
2650                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2651         int r;
2652
2653         r = amdgpu_ib_ring_tests(adev);
2654         if (r)
2655                 DRM_ERROR("ib ring test failed (%d).\n", r);
2656 }
2657
2658 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2659 {
2660         struct amdgpu_device *adev =
2661                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2662
2663         mutex_lock(&adev->gfx.gfx_off_mutex);
2664         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2665                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2666                         adev->gfx.gfx_off_state = true;
2667         }
2668         mutex_unlock(&adev->gfx.gfx_off_mutex);
2669 }
2670
2671 /**
2672  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2673  *
2674  * @adev: amdgpu_device pointer
2675  *
2676  * Main suspend function for hardware IPs.  The list of all the hardware
2677  * IPs that make up the asic is walked, clockgating is disabled and the
2678  * suspend callbacks are run.  suspend puts the hardware and software state
2679  * in each IP into a state suitable for suspend.
2680  * Returns 0 on success, negative error code on failure.
2681  */
2682 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2683 {
2684         int i, r;
2685
2686         if (adev->in_poweroff_reboot_com ||
2687             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
2688                 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2689                 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2690         }
2691
2692         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2693                 if (!adev->ip_blocks[i].status.valid)
2694                         continue;
2695
2696                 /* displays are handled separately */
2697                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2698                         continue;
2699
2700                 /* XXX handle errors */
2701                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2702                 /* XXX handle errors */
2703                 if (r) {
2704                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2705                                   adev->ip_blocks[i].version->funcs->name, r);
2706                         return r;
2707                 }
2708
2709                 adev->ip_blocks[i].status.hw = false;
2710         }
2711
2712         return 0;
2713 }
2714
2715 /**
2716  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2717  *
2718  * @adev: amdgpu_device pointer
2719  *
2720  * Main suspend function for hardware IPs.  The list of all the hardware
2721  * IPs that make up the asic is walked, clockgating is disabled and the
2722  * suspend callbacks are run.  suspend puts the hardware and software state
2723  * in each IP into a state suitable for suspend.
2724  * Returns 0 on success, negative error code on failure.
2725  */
2726 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2727 {
2728         int i, r;
2729
2730         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2731                 if (!adev->ip_blocks[i].status.valid)
2732                         continue;
2733                 /* displays are handled in phase1 */
2734                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2735                         continue;
2736                 /* PSP lost connection when err_event_athub occurs */
2737                 if (amdgpu_ras_intr_triggered() &&
2738                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2739                         adev->ip_blocks[i].status.hw = false;
2740                         continue;
2741                 }
2742                 /* XXX handle errors */
2743                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2744                 /* XXX handle errors */
2745                 if (r) {
2746                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2747                                   adev->ip_blocks[i].version->funcs->name, r);
2748                 }
2749                 adev->ip_blocks[i].status.hw = false;
2750                 /* handle putting the SMC in the appropriate state */
2751                 if(!amdgpu_sriov_vf(adev)){
2752                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2753                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2754                                 if (r) {
2755                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2756                                                         adev->mp1_state, r);
2757                                         return r;
2758                                 }
2759                         }
2760                 }
2761                 adev->ip_blocks[i].status.hw = false;
2762         }
2763
2764         return 0;
2765 }
2766
2767 /**
2768  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2769  *
2770  * @adev: amdgpu_device pointer
2771  *
2772  * Main suspend function for hardware IPs.  The list of all the hardware
2773  * IPs that make up the asic is walked, clockgating is disabled and the
2774  * suspend callbacks are run.  suspend puts the hardware and software state
2775  * in each IP into a state suitable for suspend.
2776  * Returns 0 on success, negative error code on failure.
2777  */
2778 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2779 {
2780         int r;
2781
2782         if (amdgpu_sriov_vf(adev)) {
2783                 amdgpu_virt_fini_data_exchange(adev);
2784                 amdgpu_virt_request_full_gpu(adev, false);
2785         }
2786
2787         r = amdgpu_device_ip_suspend_phase1(adev);
2788         if (r)
2789                 return r;
2790         r = amdgpu_device_ip_suspend_phase2(adev);
2791
2792         if (amdgpu_sriov_vf(adev))
2793                 amdgpu_virt_release_full_gpu(adev, false);
2794
2795         return r;
2796 }
2797
2798 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2799 {
2800         int i, r;
2801
2802         static enum amd_ip_block_type ip_order[] = {
2803                 AMD_IP_BLOCK_TYPE_GMC,
2804                 AMD_IP_BLOCK_TYPE_COMMON,
2805                 AMD_IP_BLOCK_TYPE_PSP,
2806                 AMD_IP_BLOCK_TYPE_IH,
2807         };
2808
2809         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2810                 int j;
2811                 struct amdgpu_ip_block *block;
2812
2813                 block = &adev->ip_blocks[i];
2814                 block->status.hw = false;
2815
2816                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2817
2818                         if (block->version->type != ip_order[j] ||
2819                                 !block->status.valid)
2820                                 continue;
2821
2822                         r = block->version->funcs->hw_init(adev);
2823                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2824                         if (r)
2825                                 return r;
2826                         block->status.hw = true;
2827                 }
2828         }
2829
2830         return 0;
2831 }
2832
2833 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2834 {
2835         int i, r;
2836
2837         static enum amd_ip_block_type ip_order[] = {
2838                 AMD_IP_BLOCK_TYPE_SMC,
2839                 AMD_IP_BLOCK_TYPE_DCE,
2840                 AMD_IP_BLOCK_TYPE_GFX,
2841                 AMD_IP_BLOCK_TYPE_SDMA,
2842                 AMD_IP_BLOCK_TYPE_UVD,
2843                 AMD_IP_BLOCK_TYPE_VCE,
2844                 AMD_IP_BLOCK_TYPE_VCN
2845         };
2846
2847         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2848                 int j;
2849                 struct amdgpu_ip_block *block;
2850
2851                 for (j = 0; j < adev->num_ip_blocks; j++) {
2852                         block = &adev->ip_blocks[j];
2853
2854                         if (block->version->type != ip_order[i] ||
2855                                 !block->status.valid ||
2856                                 block->status.hw)
2857                                 continue;
2858
2859                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2860                                 r = block->version->funcs->resume(adev);
2861                         else
2862                                 r = block->version->funcs->hw_init(adev);
2863
2864                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2865                         if (r)
2866                                 return r;
2867                         block->status.hw = true;
2868                 }
2869         }
2870
2871         return 0;
2872 }
2873
2874 /**
2875  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2876  *
2877  * @adev: amdgpu_device pointer
2878  *
2879  * First resume function for hardware IPs.  The list of all the hardware
2880  * IPs that make up the asic is walked and the resume callbacks are run for
2881  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2882  * after a suspend and updates the software state as necessary.  This
2883  * function is also used for restoring the GPU after a GPU reset.
2884  * Returns 0 on success, negative error code on failure.
2885  */
2886 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2887 {
2888         int i, r;
2889
2890         for (i = 0; i < adev->num_ip_blocks; i++) {
2891                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2892                         continue;
2893                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2894                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2895                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2896
2897                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2898                         if (r) {
2899                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2900                                           adev->ip_blocks[i].version->funcs->name, r);
2901                                 return r;
2902                         }
2903                         adev->ip_blocks[i].status.hw = true;
2904                 }
2905         }
2906
2907         return 0;
2908 }
2909
2910 /**
2911  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2912  *
2913  * @adev: amdgpu_device pointer
2914  *
2915  * First resume function for hardware IPs.  The list of all the hardware
2916  * IPs that make up the asic is walked and the resume callbacks are run for
2917  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2918  * functional state after a suspend and updates the software state as
2919  * necessary.  This function is also used for restoring the GPU after a GPU
2920  * reset.
2921  * Returns 0 on success, negative error code on failure.
2922  */
2923 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2924 {
2925         int i, r;
2926
2927         for (i = 0; i < adev->num_ip_blocks; i++) {
2928                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2929                         continue;
2930                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2931                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2932                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2933                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2934                         continue;
2935                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2936                 if (r) {
2937                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2938                                   adev->ip_blocks[i].version->funcs->name, r);
2939                         return r;
2940                 }
2941                 adev->ip_blocks[i].status.hw = true;
2942         }
2943
2944         return 0;
2945 }
2946
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The resume work is split into
 * two phases because the phases are also used when recovering from a GPU
 * reset, where additional steps need to be taken between them.  In this
 * case (S3/S4) they are run back to back, with the firmware loading step
 * in between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int err;

	err = amdgpu_device_ip_resume_phase1(adev);
	if (err)
		return err;

	err = amdgpu_device_fw_loading(adev);
	if (err)
		return err;

	return amdgpu_device_ip_resume_phase2(adev);
}
2975
2976 /**
2977  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2978  *
2979  * @adev: amdgpu_device pointer
2980  *
2981  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2982  */
2983 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2984 {
2985         if (amdgpu_sriov_vf(adev)) {
2986                 if (adev->is_atom_fw) {
2987                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2988                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2989                 } else {
2990                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2991                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2992                 }
2993
2994                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2995                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2996         }
2997 }
2998
2999 /**
3000  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3001  *
3002  * @asic_type: AMD asic type
3003  *
3004  * Check if there is DC (new modesetting infrastructre) support for an asic.
3005  * returns true if DC has support, false if not.
3006  */
3007 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3008 {
3009         switch (asic_type) {
3010 #if defined(CONFIG_DRM_AMD_DC)
3011 #if defined(CONFIG_DRM_AMD_DC_SI)
3012         case CHIP_TAHITI:
3013         case CHIP_PITCAIRN:
3014         case CHIP_VERDE:
3015         case CHIP_OLAND:
3016 #endif
3017         case CHIP_BONAIRE:
3018         case CHIP_KAVERI:
3019         case CHIP_KABINI:
3020         case CHIP_MULLINS:
3021                 /*
3022                  * We have systems in the wild with these ASICs that require
3023                  * LVDS and VGA support which is not supported with DC.
3024                  *
3025                  * Fallback to the non-DC driver here by default so as not to
3026                  * cause regressions.
3027                  */
3028                 return amdgpu_dc > 0;
3029         case CHIP_HAWAII:
3030         case CHIP_CARRIZO:
3031         case CHIP_STONEY:
3032         case CHIP_POLARIS10:
3033         case CHIP_POLARIS11:
3034         case CHIP_POLARIS12:
3035         case CHIP_VEGAM:
3036         case CHIP_TONGA:
3037         case CHIP_FIJI:
3038         case CHIP_VEGA10:
3039         case CHIP_VEGA12:
3040         case CHIP_VEGA20:
3041 #if defined(CONFIG_DRM_AMD_DC_DCN)
3042         case CHIP_RAVEN:
3043         case CHIP_NAVI10:
3044         case CHIP_NAVI14:
3045         case CHIP_NAVI12:
3046         case CHIP_RENOIR:
3047         case CHIP_SIENNA_CICHLID:
3048         case CHIP_NAVY_FLOUNDER:
3049         case CHIP_DIMGREY_CAVEFISH:
3050         case CHIP_VANGOGH:
3051 #endif
3052                 return amdgpu_dc != 0;
3053 #endif
3054         default:
3055                 if (amdgpu_dc > 0)
3056                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3057                                          "but isn't supported by ASIC, ignoring\n");
3058                 return false;
3059         }
3060 }
3061
3062 /**
3063  * amdgpu_device_has_dc_support - check if dc is supported
3064  *
3065  * @adev: amdgpu_device pointer
3066  *
3067  * Returns true for supported, false for not supported
3068  */
3069 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3070 {
3071         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3072                 return false;
3073
3074         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3075 }
3076
3077
3078 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3079 {
3080         struct amdgpu_device *adev =
3081                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3082         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3083
3084         /* It's a bug to not have a hive within this function */
3085         if (WARN_ON(!hive))
3086                 return;
3087
3088         /*
3089          * Use task barrier to synchronize all xgmi reset works across the
3090          * hive. task_barrier_enter and task_barrier_exit will block
3091          * until all the threads running the xgmi reset works reach
3092          * those points. task_barrier_full will do both blocks.
3093          */
3094         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3095
3096                 task_barrier_enter(&hive->tb);
3097                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3098
3099                 if (adev->asic_reset_res)
3100                         goto fail;
3101
3102                 task_barrier_exit(&hive->tb);
3103                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3104
3105                 if (adev->asic_reset_res)
3106                         goto fail;
3107
3108                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3109                         adev->mmhub.funcs->reset_ras_error_count(adev);
3110         } else {
3111
3112                 task_barrier_full(&hive->tb);
3113                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3114         }
3115
3116 fail:
3117         if (adev->asic_reset_res)
3118                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3119                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3120         amdgpu_put_xgmi_hive(hive);
3121 }
3122
3123 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3124 {
3125         char *input = amdgpu_lockup_timeout;
3126         char *timeout_setting = NULL;
3127         int index = 0;
3128         long timeout;
3129         int ret = 0;
3130
3131         /*
3132          * By default timeout for non compute jobs is 10000.
3133          * And there is no timeout enforced on compute jobs.
3134          * In SR-IOV or passthrough mode, timeout for compute
3135          * jobs are 60000 by default.
3136          */
3137         adev->gfx_timeout = msecs_to_jiffies(10000);
3138         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3139         if (amdgpu_sriov_vf(adev))
3140                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3141                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3142         else if (amdgpu_passthrough(adev))
3143                 adev->compute_timeout =  msecs_to_jiffies(60000);
3144         else
3145                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3146
3147         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3148                 while ((timeout_setting = strsep(&input, ",")) &&
3149                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3150                         ret = kstrtol(timeout_setting, 0, &timeout);
3151                         if (ret)
3152                                 return ret;
3153
3154                         if (timeout == 0) {
3155                                 index++;
3156                                 continue;
3157                         } else if (timeout < 0) {
3158                                 timeout = MAX_SCHEDULE_TIMEOUT;
3159                         } else {
3160                                 timeout = msecs_to_jiffies(timeout);
3161                         }
3162
3163                         switch (index++) {
3164                         case 0:
3165                                 adev->gfx_timeout = timeout;
3166                                 break;
3167                         case 1:
3168                                 adev->compute_timeout = timeout;
3169                                 break;
3170                         case 2:
3171                                 adev->sdma_timeout = timeout;
3172                                 break;
3173                         case 3:
3174                                 adev->video_timeout = timeout;
3175                                 break;
3176                         default:
3177                                 break;
3178                         }
3179                 }
3180                 /*
3181                  * There is only one value specified and
3182                  * it should apply to all non-compute jobs.
3183                  */
3184                 if (index == 1) {
3185                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3186                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3187                                 adev->compute_timeout = adev->gfx_timeout;
3188                 }
3189         }
3190
3191         return ret;
3192 }
3193
/*
 * NULL-terminated table of the device attributes: product name, product
 * number, serial number and PCIe replay count.
 * NOTE(review): presumably registered as sysfs files during device init;
 * the registration call is not visible here -- confirm against the caller.
 */
static const struct attribute *amdgpu_dev_attributes[] = {
        &dev_attr_product_name.attr,
        &dev_attr_product_number.attr,
        &dev_attr_serial_number.attr,
        &dev_attr_pcie_replay_count.attr,
        NULL
};
3201
3202
3203 /**
3204  * amdgpu_device_init - initialize the driver
3205  *
3206  * @adev: amdgpu_device pointer
3207  * @flags: driver flags
3208  *
3209  * Initializes the driver info and hw (all asics).
3210  * Returns 0 for success or an error on failure.
3211  * Called at driver startup.
3212  */
3213 int amdgpu_device_init(struct amdgpu_device *adev,
3214                        uint32_t flags)
3215 {
             /*
              * Order of work in this function:
              *  1. reset software state and point the indirect register
              *     accessors at the amdgpu_invalid_* handlers;
              *  2. initialize locks, lists and work items;
              *  3. map the register BAR and, if one exists, an I/O port BAR;
              *  4. IP early init, doorbell init, vgaarb/vga_switcheroo setup;
              *  5. unless in emulation mode: optional asic reset, vBIOS post
              *     and clock info;
              *  6. fence driver, mode config and IP init;
              *  7. sysfs entries, optional tests/benchmarks, IP late init,
              *     delayed init work and PCI state caching.
              * Returns 0 on success, otherwise the error code of the step
              * that failed.
              */
3216         struct drm_device *ddev = adev_to_drm(adev);
3217         struct pci_dev *pdev = adev->pdev;
3218         int r, i;
3219         bool atpx = false;
3220         u32 max_MBps;
3221
3222         adev->shutdown = false;
3223         adev->flags = flags;
3224
             /* an in-range amdgpu_force_asic_type overrides the type encoded in flags */
3225         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3226                 adev->asic_type = amdgpu_force_asic_type;
3227         else
3228                 adev->asic_type = flags & AMD_ASIC_MASK;
3229
3230         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
             /* emulation mode gets a ten times longer timeout */
3231         if (amdgpu_emu_mode == 1)
3232                 adev->usec_timeout *= 10;
3233         adev->gmc.gart_size = 512 * 1024 * 1024;
3234         adev->accel_working = false;
3235         adev->num_rings = 0;
3236         adev->mman.buffer_funcs = NULL;
3237         adev->mman.buffer_funcs_ring = NULL;
3238         adev->vm_manager.vm_pte_funcs = NULL;
3239         adev->vm_manager.vm_pte_num_scheds = 0;
3240         adev->gmc.gmc_funcs = NULL;
3241         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3242         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3243
             /*
              * Default every indirect register accessor to the amdgpu_invalid_*
              * handlers.  Presumably the asic specific code installs the real
              * accessors later; that is not visible in this function.
              */
3244         adev->smc_rreg = &amdgpu_invalid_rreg;
3245         adev->smc_wreg = &amdgpu_invalid_wreg;
3246         adev->pcie_rreg = &amdgpu_invalid_rreg;
3247         adev->pcie_wreg = &amdgpu_invalid_wreg;
3248         adev->pciep_rreg = &amdgpu_invalid_rreg;
3249         adev->pciep_wreg = &amdgpu_invalid_wreg;
3250         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3251         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3252         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3253         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3254         adev->didt_rreg = &amdgpu_invalid_rreg;
3255         adev->didt_wreg = &amdgpu_invalid_wreg;
3256         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3257         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3258         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3259         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3260
3261         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3262                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3263                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3264
3265         /* mutex initialization are all done here so we
3266          * can recall function without having locking issues */
3267         atomic_set(&adev->irq.ih.lock, 0);
3268         mutex_init(&adev->firmware.mutex);
3269         mutex_init(&adev->pm.mutex);
3270         mutex_init(&adev->gfx.gpu_clock_mutex);
3271         mutex_init(&adev->srbm_mutex);
3272         mutex_init(&adev->gfx.pipe_reserve_mutex);
3273         mutex_init(&adev->gfx.gfx_off_mutex);
3274         mutex_init(&adev->grbm_idx_mutex);
3275         mutex_init(&adev->mn_lock);
3276         mutex_init(&adev->virt.vf_errors.lock);
3277         hash_init(adev->mn_hash);
3278         atomic_set(&adev->in_gpu_reset, 0);
3279         init_rwsem(&adev->reset_sem);
3280         mutex_init(&adev->psp.mutex);
3281         mutex_init(&adev->notifier_lock);
3282
             /* no mapping exists yet, so a failure here can return directly */
3283         r = amdgpu_device_check_arguments(adev);
3284         if (r)
3285                 return r;
3286
3287         spin_lock_init(&adev->mmio_idx_lock);
3288         spin_lock_init(&adev->smc_idx_lock);
3289         spin_lock_init(&adev->pcie_idx_lock);
3290         spin_lock_init(&adev->uvd_ctx_idx_lock);
3291         spin_lock_init(&adev->didt_idx_lock);
3292         spin_lock_init(&adev->gc_cac_idx_lock);
3293         spin_lock_init(&adev->se_cac_idx_lock);
3294         spin_lock_init(&adev->audio_endpt_idx_lock);
3295         spin_lock_init(&adev->mm_stats.lock);
3296
3297         INIT_LIST_HEAD(&adev->shadow_list);
3298         mutex_init(&adev->shadow_list_lock);
3299
3300         INIT_LIST_HEAD(&adev->reset_list);
3301
3302         INIT_DELAYED_WORK(&adev->delayed_init_work,
3303                           amdgpu_device_delayed_init_work_handler);
3304         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3305                           amdgpu_device_delay_enable_gfx_off);
3306
3307         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3308
3309         adev->gfx.gfx_off_req_count = 1;
3310         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3311
3312         atomic_set(&adev->throttling_logging_enabled, 1);
3313         /*
3314          * If throttling continues, logging will be performed every minute
3315          * to avoid log flooding. "-1" is subtracted since the thermal
3316          * throttling interrupt comes every second. Thus, the total logging
3317          * interval is 59 seconds(ratelimited printk interval) + 1(waiting
3318          * for throttling interrupt) = 60 seconds.
3319          */
3320         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3321         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3322
3323         /* Registers mapping */
3324         /* TODO: block userspace mapping of io register */
             /* the register aperture is PCI BAR 5 from CHIP_BONAIRE on, BAR 2 before */
3325         if (adev->asic_type >= CHIP_BONAIRE) {
3326                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3327                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3328         } else {
3329                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3330                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3331         }
3332
3333         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3334         if (adev->rmmio == NULL) {
3335                 return -ENOMEM;
3336         }
3337         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3338         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3339
3340         /* io port mapping */
             /*
              * Only the first BAR flagged IORESOURCE_IO is mapped.
              * NOTE(review): nothing on the failed/failed_unmap paths below
              * calls pci_iounmap() on rio_mem, so the mapping appears to stay
              * behind when init fails later on - confirm against the caller.
              */
3341         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3342                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3343                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3344                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3345                         break;
3346                 }
3347         }
3348         if (adev->rio_mem == NULL)
3349                 DRM_INFO("PCI I/O BAR is not found.\n");
3350
3351         /* enable PCIE atomic ops */
             /* a failure is not fatal, it is only recorded in have_atomics_support */
3352         r = pci_enable_atomic_ops_to_root(adev->pdev,
3353                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3354                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3355         if (r) {
3356                 adev->have_atomics_support = false;
3357                 DRM_INFO("PCIE atomic ops is not supported\n");
3358         } else {
3359                 adev->have_atomics_support = true;
3360         }
3361
3362         amdgpu_device_get_pcie_info(adev);
3363
3364         if (amdgpu_mcbp)
3365                 DRM_INFO("MCBP is enabled\n");
3366
             /* MES is only turned on when requested and the asic is CHIP_NAVI10 or later */
3367         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3368                 adev->enable_mes = true;
3369
3370         /* detect hw virtualization here */
3371         amdgpu_detect_virtualization(adev);
3372
             /* rmmio is mapped from here on, so errors unwind through failed_unmap */
3373         r = amdgpu_device_get_job_timeout_settings(adev);
3374         if (r) {
3375                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3376                 goto failed_unmap;
3377         }
3378
3379         /* early init functions */
3380         r = amdgpu_device_ip_early_init(adev);
3381         if (r)
3382                 goto failed_unmap;
3383
3384         /* doorbell bar mapping and doorbell index init*/
3385         amdgpu_device_doorbell_init(adev);
3386
3387         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3388         /* this will fail for cards that aren't VGA class devices, just
3389          * ignore it */
3390         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3391                 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3392
             /*
              * atpx remembers whether the vga_switcheroo PM domain ops get
              * installed below, so the "failed" path knows to remove them.
              * NOTE(review): the vgaarb and vga_switcheroo client
              * registrations themselves are not undone on the error paths
              * of this function.
              */
3393         if (amdgpu_device_supports_atpx(ddev))
3394                 atpx = true;
3395         if (amdgpu_has_atpx() &&
3396             (amdgpu_is_atpx_hybrid() ||
3397              amdgpu_has_atpx_dgpu_power_cntl()) &&
3398             !pci_is_thunderbolt_attached(adev->pdev))
3399                 vga_switcheroo_register_client(adev->pdev,
3400                                                &amdgpu_switcheroo_ops, atpx);
3401         if (atpx)
3402                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3403
             /*
              * Emulation mode jumps straight to the fence driver setup and
              * thereby skips SR-IOV vBIOS detection, the reset-on-init check,
              * PCIe error reporting, vBIOS posting and the clock info query.
              */
3404         if (amdgpu_emu_mode == 1) {
3405                 /* post the asic on emulation mode */
3406                 emu_soc_asic_init(adev);
3407                 goto fence_driver_init;
3408         }
3409
3410         /* detect if we are with an SRIOV vbios */
3411         amdgpu_device_detect_sriov_bios(adev);
3412
3413         /* check if we need to reset the asic
3414          *  E.g., driver was not cleanly unloaded previously, etc.
3415          */
3416         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3417                 r = amdgpu_asic_reset(adev);
3418                 if (r) {
3419                         dev_err(adev->dev, "asic reset on init failed\n");
3420                         goto failed;
3421                 }
3422         }
3423
3424         pci_enable_pcie_error_reporting(adev->pdev);
3425
3426         /* Post card if necessary */
             /* posting needs a vBIOS image; without one this fails with -EINVAL */
3427         if (amdgpu_device_need_post(adev)) {
3428                 if (!adev->bios) {
3429                         dev_err(adev->dev, "no vBIOS found\n");
3430                         r = -EINVAL;
3431                         goto failed;
3432                 }
3433                 DRM_INFO("GPU posting now...\n");
3434                 r = amdgpu_device_asic_init(adev);
3435                 if (r) {
3436                         dev_err(adev->dev, "gpu post error!\n");
3437                         goto failed;
3438                 }
3439         }
3440
             /*
              * Clock info comes from atomfirmware or atombios depending on
              * is_atom_fw.  Only the atombios branch sets up the i2c buses,
              * and only when DC is not in use.
              */
3441         if (adev->is_atom_fw) {
3442                 /* Initialize clocks */
3443                 r = amdgpu_atomfirmware_get_clock_info(adev);
3444                 if (r) {
3445                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3446                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3447                         goto failed;
3448                 }
3449         } else {
3450                 /* Initialize clocks */
3451                 r = amdgpu_atombios_get_clock_info(adev);
3452                 if (r) {
3453                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3454                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3455                         goto failed;
3456                 }
3457                 /* init i2c buses */
3458                 if (!amdgpu_device_has_dc_support(adev))
3459                         amdgpu_atombios_i2c_init(adev);
3460         }
3461
3462 fence_driver_init:
3463         /* Fence driver */
3464         r = amdgpu_fence_driver_init(adev);
3465         if (r) {
3466                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3467                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3468                 goto failed;
3469         }
3470
3471         /* init the mode config */
3472         drm_mode_config_init(adev_to_drm(adev));
3473
3474         r = amdgpu_device_ip_init(adev);
3475         if (r) {
3476                 /* failed in exclusive mode due to timeout */
                     /*
                      * This special case clears the SR-IOV runtime cap and
                      * virt.ops and reports -EAGAIN instead of the error
                      * returned by amdgpu_device_ip_init().
                      */
3477                 if (amdgpu_sriov_vf(adev) &&
3478                     !amdgpu_sriov_runtime(adev) &&
3479                     amdgpu_virt_mmio_blocked(adev) &&
3480                     !amdgpu_virt_wait_reset(adev)) {
3481                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3482                         /* Don't send request since VF is inactive. */
3483                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3484                         adev->virt.ops = NULL;
3485                         r = -EAGAIN;
3486                         goto failed;
3487                 }
3488                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3489                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3490                 goto failed;
3491         }
3492
3493         dev_info(adev->dev,
3494                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3495                         adev->gfx.config.max_shader_engines,
3496                         adev->gfx.config.max_sh_per_se,
3497                         adev->gfx.config.max_cu_per_sh,
3498                         adev->gfx.cu_info.number);
3499
3500         adev->accel_working = true;
3501
3502         amdgpu_vm_check_compute_bug(adev);
3503
3504         /* Initialize the buffer migration limit. */
             /* a negative amdgpu_moverate selects the built-in default */
3505         if (amdgpu_moverate >= 0)
3506                 max_MBps = amdgpu_moverate;
3507         else
3508                 max_MBps = 8; /* Allow 8 MB/s. */
3509         /* Get a log2 for easy divisions. */
             /* max(1u, ...) keeps a rate of 0 from being passed to ilog2() */
3510         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3511
3512         amdgpu_fbdev_init(adev);
3513
             /*
              * The two sysfs setups below are best effort: a failure is
              * logged and remembered in pm_sysfs_en / ucode_sysfs_en, and
              * init carries on.
              */
3514         r = amdgpu_pm_sysfs_init(adev);
3515         if (r) {
3516                 adev->pm_sysfs_en = false;
3517                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3518         } else
3519                 adev->pm_sysfs_en = true;
3520
3521         r = amdgpu_ucode_sysfs_init(adev);
3522         if (r) {
3523                 adev->ucode_sysfs_en = false;
3524                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3525         } else
3526                 adev->ucode_sysfs_en = true;
3527
             /* optional self tests, selected by bit 0 of amdgpu_testing and by amdgpu_benchmarking */
3528         if ((amdgpu_testing & 1)) {
3529                 if (adev->accel_working)
3530                         amdgpu_test_moves(adev);
3531                 else
3532                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3533         }
3534         if (amdgpu_benchmarking) {
3535                 if (adev->accel_working)
3536                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3537                 else
3538                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3539         }
3540
3541         /*
3542          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3543          * Otherwise the mgpu fan boost feature will be skipped due to the
3544          * gpu instance is counted less.
3545          */
3546         amdgpu_register_gpu_instance(adev);
3547
3548         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3549          * explicit gating rather than handling it automatically.
3550          */
3551         r = amdgpu_device_ip_late_init(adev);
3552         if (r) {
3553                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3554                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3555                 goto failed;
3556         }
3557
3558         /* must succeed. */
3559         amdgpu_ras_resume(adev);
3560
             /*
              * The delayed init work is scheduled AMDGPU_RESUME_MS from now;
              * for an SR-IOV VF it is flushed right away instead of being
              * left pending.
              */
3561         queue_delayed_work(system_wq, &adev->delayed_init_work,
3562                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3563
3564         if (amdgpu_sriov_vf(adev))
3565                 flush_delayed_work(&adev->delayed_init_work);
3566
             /* failing to create the device attributes is logged but not fatal */
3567         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3568         if (r)
3569                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3570
             /*
              * NOTE(review): when CONFIG_PERF_EVENTS is disabled, r still
              * holds the sysfs_create_files() result at the check below, so
              * "amdgpu_pmu_init failed" can be printed for a sysfs failure.
              */
3571         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3572                 r = amdgpu_pmu_init(adev);
3573         if (r)
3574                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3575
3576         /* Have stored pci confspace at hand for restore in sudden PCI error */
3577         if (amdgpu_device_cache_pci_state(adev->pdev))
3578                 pci_restore_state(pdev);
3579
             /* success is reported even if the best-effort steps above logged errors */
3580         return 0;
3581
             /*
              * Error unwinding.  "failed" hands the collected VF errors to
              * amdgpu_vf_error_trans_all() and removes the vga_switcheroo PM
              * domain ops if they were installed, then falls through to
              * "failed_unmap", which drops the register mapping.
              */
3582 failed:
3583         amdgpu_vf_error_trans_all(adev);
3584         if (atpx)
3585                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3586
3587 failed_unmap:
3588         iounmap(adev->rmmio);
3589         adev->rmmio = NULL;
3590
3591         return r;
3592 }
3593
3594 /**
3595  * amdgpu_device_fini - tear down the driver
3596  *
3597  * @adev: amdgpu_device pointer
3598  *
3599  * Tear down the driver info (all asics).
3600  * Called at driver shutdown.
      *
      * The teardown waits for the delayed init work, marks the device as
      * shutting down, disables interrupts and displays, tears down the fence
      * driver, fbdev and the IP blocks, releases firmware/vBIOS data, undoes
      * the vga_switcheroo/vgaarb setup, unmaps the I/O and register BARs and
      * finally removes the sysfs/PMU/discovery state.
3601  */
3602 void amdgpu_device_fini(struct amdgpu_device *adev)
3603 {
3604         dev_info(adev->dev, "amdgpu: finishing device.\n");
             /* let a pending delayed init work item finish before tearing anything down */
3605         flush_delayed_work(&adev->delayed_init_work);
3606         ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3607         adev->shutdown = true;
3608
             /* NOTE(review): pci_state is freed here but the pointer is not cleared */
3609         kfree(adev->pci_state);
3610
3611         /* make sure IB test finished before entering exclusive mode
3612          * to avoid preemption on IB test
3613          * */
3614         if (amdgpu_sriov_vf(adev)) {
3615                 amdgpu_virt_request_full_gpu(adev, false);
3616                 amdgpu_virt_fini_data_exchange(adev);
3617         }
3618
3619         /* disable all interrupts */
3620         amdgpu_irq_disable_all(adev);
             /* displays are shut down via the legacy helper without DC, the atomic helper with DC */
3621         if (adev->mode_info.mode_config_initialized){
3622                 if (!amdgpu_device_has_dc_support(adev))
3623                         drm_helper_force_disable_all(adev_to_drm(adev));
3624                 else
3625                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3626         }
3627         amdgpu_fence_driver_fini(adev);
             /* the pm sysfs entries are only removed if pm_sysfs_en says they exist */
3628         if (adev->pm_sysfs_en)
3629                 amdgpu_pm_sysfs_fini(adev);
3630         amdgpu_fbdev_fini(adev);
3631         amdgpu_device_ip_fini(adev);
3632         release_firmware(adev->firmware.gpu_info_fw);
3633         adev->firmware.gpu_info_fw = NULL;
3634         adev->accel_working = false;
3635         /* free i2c buses */
3636         if (!amdgpu_device_has_dc_support(adev))
3637                 amdgpu_i2c_fini(adev);
3638
             /* atombios teardown is skipped in emulation mode */
3639         if (amdgpu_emu_mode != 1)
3640                 amdgpu_atombios_fini(adev);
3641
3642         kfree(adev->bios);
3643         adev->bios = NULL;
             /* same condition as the vga_switcheroo_register_client() call in amdgpu_device_init() */
3644         if (amdgpu_has_atpx() &&
3645             (amdgpu_is_atpx_hybrid() ||
3646              amdgpu_has_atpx_dgpu_power_cntl()) &&
3647             !pci_is_thunderbolt_attached(adev->pdev))
3648                 vga_switcheroo_unregister_client(adev->pdev);
3649         if (amdgpu_device_supports_atpx(adev_to_drm(adev)))
3650                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
             /* presumably registering all-NULL callbacks drops the vgaarb client - confirm against vgaarb */
3651         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3652                 vga_client_register(adev->pdev, NULL, NULL, NULL);
             /* rio_mem is NULL when no I/O BAR was mapped */
3653         if (adev->rio_mem)
3654                 pci_iounmap(adev->pdev, adev->rio_mem);
3655         adev->rio_mem = NULL;
3656         iounmap(adev->rmmio);
3657         adev->rmmio = NULL;
3658         amdgpu_device_doorbell_fini(adev);
3659
             /* the firmware sysfs entries are only removed if ucode_sysfs_en says they exist */
3660         if (adev->ucode_sysfs_en)
3661                 amdgpu_ucode_sysfs_fini(adev);
3662
3663         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3664         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3665                 amdgpu_pmu_fini(adev);
3666         if (adev->mman.discovery_bin)
3667                 amdgpu_discovery_fini(adev);
3668 }
3669
3670
3671 /*
3672  * Suspend & resume.
3673  */
3674 /**
3675  * amdgpu_device_suspend - initiate device suspend
3676  *
3677  * @dev: drm dev pointer
3678  * @fbcon : notify the fbdev of suspend
3679  *
3680  * Puts the hw in the suspend state (all asics).
3681  * Returns 0 for success or an error on failure.
3682  * Called at driver suspend.
      *
      * NOTE(review): the results of the two IP suspend phases are stored in
      * a local but never returned; as written every path returns 0.
3683  */
3684 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3685 {
3686         struct amdgpu_device *adev;
3687         struct drm_crtc *crtc;
3688         struct drm_connector *connector;
3689         struct drm_connector_list_iter iter;
3690         int r;
3691
3692         adev = drm_to_adev(dev);
3693
             /* nothing to do when the device is already in DRM_SWITCH_POWER_OFF */
3694         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3695                 return 0;
3696
3697         adev->in_suspend = true;
3698         drm_kms_helper_poll_disable(dev);
3699
3700         if (fbcon)
3701                 amdgpu_fbdev_set_suspend(adev, 1);
3702
             /* cancel the delayed init work and wait for a running instance to finish */
3703         cancel_delayed_work_sync(&adev->delayed_init_work);
3704
             /* the manual display shutdown and BO unpinning below is only done without DC */
3705         if (!amdgpu_device_has_dc_support(adev)) {
3706                 /* turn off display hw */
3707                 drm_modeset_lock_all(dev);
3708                 drm_connector_list_iter_begin(dev, &iter);
3709                 drm_for_each_connector_iter(connector, &iter)
3710                         drm_helper_connector_dpms(connector,
3711                                                   DRM_MODE_DPMS_OFF);
3712                 drm_connector_list_iter_end(&iter);
3713                 drm_modeset_unlock_all(dev);
3714                         /* unpin the front buffers and cursors */
3715                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3716                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3717                         struct drm_framebuffer *fb = crtc->primary->fb;
3718                         struct amdgpu_bo *robj;
3719
                             /* a BO whose reserve fails is skipped and stays pinned */
3720                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3721                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3722                                 r = amdgpu_bo_reserve(aobj, true);
3723                                 if (r == 0) {
3724                                         amdgpu_bo_unpin(aobj);
3725                                         amdgpu_bo_unreserve(aobj);
3726                                 }
3727                         }
3728
                             /* CRTCs without a framebuffer object have nothing more to unpin */
3729                         if (fb == NULL || fb->obj[0] == NULL) {
3730                                 continue;
3731                         }
3732                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3733                         /* don't unpin kernel fb objects */
3734                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3735                                 r = amdgpu_bo_reserve(robj, true);
3736                                 if (r == 0) {
3737                                         amdgpu_bo_unpin(robj);
3738                                         amdgpu_bo_unreserve(robj);
3739                                 }
3740                         }
3741                 }
3742         }
3743
3744         amdgpu_ras_suspend(adev);
3745
             /* NOTE(review): this result is never checked; r is reassigned or ignored below */
3746         r = amdgpu_device_ip_suspend_phase1(adev);
3747
3748         amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3749
3750         /* evict vram memory */
3751         amdgpu_bo_evict_vram(adev);
3752
3753         amdgpu_fence_driver_suspend(adev);
3754
             /*
              * Phase 2 of the IP suspend runs when in_poweroff_reboot_com is
              * set, when S0ix is not supported, or during a GPU reset.  In
              * the remaining case only the D3Entry state change is signalled.
              */
3755         if (adev->in_poweroff_reboot_com ||
3756             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
3757                 r = amdgpu_device_ip_suspend_phase2(adev);
3758         else
3759                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
3760         /* evict remaining vram memory
3761          * This second call to evict vram is to evict the gart page table
3762          * using the CPU.
3763          */
3764         amdgpu_bo_evict_vram(adev);
3765
3766         return 0;
3767 }
3768
3769 /**
3770  * amdgpu_device_resume - initiate device resume
3771  *
3772  * @dev: drm dev pointer
3773  * @fbcon : notify the fbdev of resume
3774  *
3775  * Bring the hw back to operating state (all asics).
3776  * Returns 0 for success or an error on failure.
3777  * Called at driver resume.
      *
      * Errors are returned from the IP resume, the IP late init and the
      * amdkfd resume steps; failures elsewhere are at most logged.
3778  */
3779 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3780 {
3781         struct drm_connector *connector;
3782         struct drm_connector_list_iter iter;
3783         struct amdgpu_device *adev = drm_to_adev(dev);
3784         struct drm_crtc *crtc;
3785         int r = 0;
3786
             /* nothing to do when the device is in DRM_SWITCH_POWER_OFF */
3787         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3788                 return 0;
3789
             /* counterpart of the D3Entry state change issued by amdgpu_device_suspend() */
3790         if (amdgpu_acpi_is_s0ix_supported(adev))
3791                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3792
3793         /* post card */
             /*
              * NOTE(review): a posting failure is only logged; r is then
              * overwritten by amdgpu_device_ip_resume() below.
              */
3794         if (amdgpu_device_need_post(adev)) {
3795                 r = amdgpu_device_asic_init(adev);
3796                 if (r)
3797                         dev_err(adev->dev, "amdgpu asic init failed\n");
3798         }
3799
3800         r = amdgpu_device_ip_resume(adev);
3801         if (r) {
3802                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3803                 return r;
3804         }
3805         amdgpu_fence_driver_resume(adev);
3806
3807
3808         r = amdgpu_device_ip_late_init(adev);
3809         if (r)
3810                 return r;
3811
             /* schedule the delayed init work; it is flushed further down */
3812         queue_delayed_work(system_wq, &adev->delayed_init_work,
3813                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3814
3815         if (!amdgpu_device_has_dc_support(adev)) {
3816                 /* pin cursors */
3817                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3818                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3819
                             /*
                              * NOTE(review): cursor_addr is refreshed from
                              * the BO even when the pin just failed; a
                              * failed reserve skips the cursor silently.
                              */
3820                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3821                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3822                                 r = amdgpu_bo_reserve(aobj, true);
3823                                 if (r == 0) {
3824                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3825                                         if (r != 0)
3826                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3827                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3828                                         amdgpu_bo_unreserve(aobj);
3829                                 }
3830                         }
3831                 }
3832         }
             /* any cursor pin error left in r is overwritten here */
3833         r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3834         if (r)
3835                 return r;
3836
3837         /* Make sure IB tests flushed */
3838         flush_delayed_work(&adev->delayed_init_work);
3839
3840         /* blat the mode back in */
             /* display state is only restored here when the caller asked for fbcon handling */
3841         if (fbcon) {
3842                 if (!amdgpu_device_has_dc_support(adev)) {
3843                         /* pre DCE11 */
3844                         drm_helper_resume_force_mode(dev);
3845
3846                         /* turn on display hw */
3847                         drm_modeset_lock_all(dev);
3848
3849                         drm_connector_list_iter_begin(dev, &iter);
3850                         drm_for_each_connector_iter(connector, &iter)
3851                                 drm_helper_connector_dpms(connector,
3852                                                           DRM_MODE_DPMS_ON);
3853                         drm_connector_list_iter_end(&iter);
3854
3855                         drm_modeset_unlock_all(dev);
3856                 }
3857                 amdgpu_fbdev_set_suspend(adev, 0);
3858         }
3859
3860         drm_kms_helper_poll_enable(dev);
3861
3862         amdgpu_ras_resume(adev);
3863
3864         /*
3865          * Most of the connector probing functions try to acquire runtime pm
3866          * refs to ensure that the GPU is powered on when connector polling is
3867          * performed. Since we're calling this from a runtime PM callback,
3868          * trying to acquire rpm refs will cause us to deadlock.
3869          *
3870          * Since we're guaranteed to be holding the rpm lock, it's safe to
3871          * temporarily disable the rpm helpers so this doesn't deadlock us.
3872          */
3873 #ifdef CONFIG_PM
3874         dev->dev->power.disable_depth++;
3875 #endif
3876         if (!amdgpu_device_has_dc_support(adev))
3877                 drm_helper_hpd_irq_event(dev);
3878         else
3879                 drm_kms_helper_hotplug_event(dev);
3880 #ifdef CONFIG_PM
3881         dev->dev->power.disable_depth--;
3882 #endif
             /*
              * NOTE(review): the early returns above leave in_suspend
              * untouched; it is only cleared on this success path.
              */
3883         adev->in_suspend = false;
3884
3885         return 0;
3886 }
3887
3888 /**
3889  * amdgpu_device_ip_check_soft_reset - poll the IP blocks for a hang
3890  *
3891  * @adev: amdgpu_device pointer
3892  *
3893  * Visits each valid IP block, refreshes its status.hang flag through the
3894  * block's check_soft_reset callback (when it has one) and logs hung blocks.
3895  * SR-IOV VFs and asics that need a full reset report true unconditionally.
3896  * Returns true if any block is flagged as hung, false otherwise.
3897  */
3898 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3899 {
3900         bool any_hung = false;
3901         int idx;
3902
3903         if (amdgpu_sriov_vf(adev) || amdgpu_asic_need_full_reset(adev))
3904                 return true;
3905
3906         for (idx = 0; idx < adev->num_ip_blocks; idx++) {
3907                 struct amdgpu_ip_block *blk = &adev->ip_blocks[idx];
3908                 const struct amd_ip_funcs *funcs;
3909
3910                 if (!blk->status.valid)
3911                         continue;
3912                 funcs = blk->version->funcs;
3913                 if (funcs->check_soft_reset)
3914                         blk->status.hang = funcs->check_soft_reset(adev);
3915                 if (!blk->status.hang)
3916                         continue;
3917                 dev_info(adev->dev, "IP block:%s is hung!\n", funcs->name);
3918                 any_hung = true;
3919         }
3920         return any_hung;
3921 }
3922
3923 /**
3924  * amdgpu_device_ip_pre_soft_reset - run the pre-soft-reset hooks
3925  *
3926  * @adev: amdgpu_device pointer
3927  *
3928  * For every valid IP block that is flagged as hung and implements a
3929  * pre_soft_reset callback, invoke that callback so the block can get its
3930  * hardware and software state ready for the soft reset.  The walk stops at
3931  * the first callback that reports an error.
3932  * Returns 0 on success, otherwise the first non-zero callback result.
3933  */
3934 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3935 {
3936         int idx, err;
3937
3938         for (idx = 0; idx < adev->num_ip_blocks; idx++) {
3939                 struct amdgpu_ip_block *blk = &adev->ip_blocks[idx];
3940
3941                 if (!blk->status.valid || !blk->status.hang)
3942                         continue;
3943                 if (!blk->version->funcs->pre_soft_reset)
3944                         continue;
3945                 err = blk->version->funcs->pre_soft_reset(adev);
3946                 if (err)
3947                         return err;
3948         }
3949         return 0;
3950 }
3951
3952 /**
3953  * amdgpu_device_ip_need_full_reset - is a soft reset insufficient?
3954  *
3955  * @adev: amdgpu_device pointer
3956  *
3957  * A hung GMC, SMC, ACP, DCE or PSP block cannot be recovered by a soft
3958  * reset, and the asic itself may also ask for a full reset.
3959  * Returns true if a full asic reset is required, false if not.
3960  */
3961 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3962 {
3963         int idx;
3964
3965         if (amdgpu_asic_need_full_reset(adev))
3966                 return true;
3967
3968         for (idx = 0; idx < adev->num_ip_blocks; idx++) {
3969                 struct amdgpu_ip_block *blk = &adev->ip_blocks[idx];
3970                 enum amd_ip_block_type type;
3971
3972                 if (!blk->status.valid || !blk->status.hang)
3973                         continue;
3974                 type = blk->version->type;
3975                 if (type != AMD_IP_BLOCK_TYPE_GMC && type != AMD_IP_BLOCK_TYPE_SMC &&
3976                     type != AMD_IP_BLOCK_TYPE_ACP && type != AMD_IP_BLOCK_TYPE_DCE &&
3977                     type != AMD_IP_BLOCK_TYPE_PSP)
3978                         continue;
3979                 dev_info(adev->dev, "Some block need full reset!\n");
3980                 return true;
3981         }
3982         return false;
3983 }
3984
3985 /**
3986  * amdgpu_device_ip_soft_reset - soft reset the hung IP blocks
3987  *
3988  * @adev: amdgpu_device pointer
3989  *
3990  * Every valid IP block that is flagged as hung and provides a soft_reset
3991  * callback has that callback invoked, which performs the IP specific
3992  * hardware or software state changes needed to soft reset the block.
3993  * The walk ends at the first callback that reports an error.
3994  * Returns 0 on success, otherwise the first non-zero callback result.
3995  */
3996 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3997 {
3998         int idx, err;
3999
4000         for (idx = 0; idx < adev->num_ip_blocks; idx++) {
4001                 struct amdgpu_ip_block *blk = &adev->ip_blocks[idx];
4002
4003                 if (!blk->status.valid || !blk->status.hang)
4004                         continue;
4005                 if (!blk->version->funcs->soft_reset)
4006                         continue;
4007                 err = blk->version->funcs->soft_reset(adev);
4008                 if (err)
4009                         return err;
4010         }
4011         return 0;
4012 }
4013
4014 /**
4015  * amdgpu_device_ip_post_soft_reset - finish up after a soft reset
4016  *
4017  * @adev: amdgpu_device pointer
4018  *
4019  * Each valid IP block that is flagged as hung and provides a
4020  * post_soft_reset callback has it invoked, so the block can apply the IP
4021  * specific hardware or software state changes that are needed once the
4022  * soft reset is done.  The first callback error ends the walk.
4023  * Returns 0 on success, otherwise the first non-zero callback result.
4024  */
4025 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4026 {
4027         int idx, err;
4028
4029         for (idx = 0; idx < adev->num_ip_blocks; idx++) {
4030                 struct amdgpu_ip_block *blk = &adev->ip_blocks[idx];
4031
4032                 if (!blk->status.valid || !blk->status.hang ||
4033                     !blk->version->funcs->post_soft_reset)
4034                         continue;
4035                 err = blk->version->funcs->post_soft_reset(adev);
4036                 if (err)
4037                         return err;
4038         }
4039         return 0;
4040 }
4041
4042 /**
4043  * amdgpu_device_recover_vram - Recover some VRAM contents
4044  *
4045  * @adev: amdgpu_device pointer
4046  *
4047  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4048  * restore things like GPUVM page tables after a GPU reset where
4049  * the contents of VRAM might be lost.
4050  *
4051  * Returns:
4052  * 0 on success, negative error code on failure.
4053  */
4054 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4055 {
4056         struct dma_fence *fence = NULL, *next = NULL;
4057         struct amdgpu_bo *shadow;
4058         long r = 1, tmo;
4059
4060         if (amdgpu_sriov_runtime(adev))
4061                 tmo = msecs_to_jiffies(8000);
4062         else
4063                 tmo = msecs_to_jiffies(100);
4064
4065         dev_info(adev->dev, "recover vram bo from shadow start\n");
4066         mutex_lock(&adev->shadow_list_lock);
4067         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4068
4069                 /* No need to recover an evicted BO */
4070                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4071                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4072                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4073                         continue;
4074
4075                 r = amdgpu_bo_restore_shadow(shadow, &next);
4076                 if (r)
4077                         break;
4078
4079                 if (fence) {
4080                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4081                         dma_fence_put(fence);
4082                         fence = next;
4083                         if (tmo == 0) {
4084                                 r = -ETIMEDOUT;
4085                                 break;
4086                         } else if (tmo < 0) {
4087                                 r = tmo;
4088                                 break;
4089                         }
4090                 } else {
4091                         fence = next;
4092                 }
4093         }
4094         mutex_unlock(&adev->shadow_list_lock);
4095
4096         if (fence)
4097                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4098         dma_fence_put(fence);
4099
4100         if (r < 0 || tmo <= 0) {
4101                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4102                 return -EIO;
4103         }
4104
4105         dev_info(adev->dev, "recover vram bo from shadow done\n");
4106         return 0;
4107 }
4108
4109
4110 /**
4111  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4112  *
4113  * @adev: amdgpu_device pointer
4114  * @from_hypervisor: request from hypervisor
4115  *
4116  * do VF FLR and reinitialize Asic
4117  * return 0 means succeeded otherwise failed
4118  */
4119 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4120                                      bool from_hypervisor)
4121 {
4122         int r;
4123
4124         if (from_hypervisor)
4125                 r = amdgpu_virt_request_full_gpu(adev, true);
4126         else
4127                 r = amdgpu_virt_reset_gpu(adev);
4128         if (r)
4129                 return r;
4130
4131         amdgpu_amdkfd_pre_reset(adev);
4132
4133         /* Resume IP prior to SMC */
4134         r = amdgpu_device_ip_reinit_early_sriov(adev);
4135         if (r)
4136                 goto error;
4137
4138         amdgpu_virt_init_data_exchange(adev);
4139         /* we need recover gart prior to run SMC/CP/SDMA resume */
4140         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4141
4142         r = amdgpu_device_fw_loading(adev);
4143         if (r)
4144                 return r;
4145
4146         /* now we are okay to resume SMC/CP/SDMA */
4147         r = amdgpu_device_ip_reinit_late_sriov(adev);
4148         if (r)
4149                 goto error;
4150
4151         amdgpu_irq_gpu_reset_resume_helper(adev);
4152         r = amdgpu_ib_ring_tests(adev);
4153         amdgpu_amdkfd_post_reset(adev);
4154
4155 error:
4156         amdgpu_virt_release_full_gpu(adev, true);
4157         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4158                 amdgpu_inc_vram_lost(adev);
4159                 r = amdgpu_device_recover_vram(adev);
4160         }
4161
4162         return r;
4163 }
4164
4165 /**
4166  * amdgpu_device_has_job_running - check if there is any job in mirror list
4167  *
4168  * @adev: amdgpu_device pointer
4169  *
4170  * check if there is any job in mirror list
4171  */
4172 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4173 {
4174         int i;
4175         struct drm_sched_job *job;
4176
4177         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4178                 struct amdgpu_ring *ring = adev->rings[i];
4179
4180                 if (!ring || !ring->sched.thread)
4181                         continue;
4182
4183                 spin_lock(&ring->sched.job_list_lock);
4184                 job = list_first_entry_or_null(&ring->sched.pending_list,
4185                                                struct drm_sched_job, list);
4186                 spin_unlock(&ring->sched.job_list_lock);
4187                 if (job)
4188                         return true;
4189         }
4190         return false;
4191 }
4192
4193 /**
4194  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4195  *
4196  * @adev: amdgpu_device pointer
4197  *
4198  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4199  * a hung GPU.
4200  */
4201 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4202 {
4203         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4204                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4205                 return false;
4206         }
4207
4208         if (amdgpu_gpu_recovery == 0)
4209                 goto disabled;
4210
4211         if (amdgpu_sriov_vf(adev))
4212                 return true;
4213
4214         if (amdgpu_gpu_recovery == -1) {
4215                 switch (adev->asic_type) {
4216                 case CHIP_BONAIRE:
4217                 case CHIP_HAWAII:
4218                 case CHIP_TOPAZ:
4219                 case CHIP_TONGA:
4220                 case CHIP_FIJI:
4221                 case CHIP_POLARIS10:
4222                 case CHIP_POLARIS11:
4223                 case CHIP_POLARIS12:
4224                 case CHIP_VEGAM:
4225                 case CHIP_VEGA20:
4226                 case CHIP_VEGA10:
4227                 case CHIP_VEGA12:
4228                 case CHIP_RAVEN:
4229                 case CHIP_ARCTURUS:
4230                 case CHIP_RENOIR:
4231                 case CHIP_NAVI10:
4232                 case CHIP_NAVI14:
4233                 case CHIP_NAVI12:
4234                 case CHIP_SIENNA_CICHLID:
4235                 case CHIP_NAVY_FLOUNDER:
4236                 case CHIP_DIMGREY_CAVEFISH:
4237                         break;
4238                 default:
4239                         goto disabled;
4240                 }
4241         }
4242
4243         return true;
4244
4245 disabled:
4246                 dev_info(adev->dev, "GPU recovery disabled.\n");
4247                 return false;
4248 }
4249
4250 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4251 {
4252         u32 i;
4253         int ret = 0;
4254
4255         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4256
4257         dev_info(adev->dev, "GPU mode1 reset\n");
4258
4259         /* disable BM */
4260         pci_clear_master(adev->pdev);
4261
4262         amdgpu_device_cache_pci_state(adev->pdev);
4263
4264         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4265                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4266                 ret = amdgpu_dpm_mode1_reset(adev);
4267         } else {
4268                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4269                 ret = psp_gpu_reset(adev);
4270         }
4271
4272         if (ret)
4273                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4274
4275         amdgpu_device_load_pci_state(adev->pdev);
4276
4277         /* wait for asic to come out of reset */
4278         for (i = 0; i < adev->usec_timeout; i++) {
4279                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4280
4281                 if (memsize != 0xffffffff)
4282                         break;
4283                 udelay(1);
4284         }
4285
4286         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4287         return ret;
4288 }
4289
4290 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4291                                         struct amdgpu_job *job,
4292                                         bool *need_full_reset_arg)
4293 {
4294         int i, r = 0;
4295         bool need_full_reset  = *need_full_reset_arg;
4296
4297         amdgpu_debugfs_wait_dump(adev);
4298
4299         if (amdgpu_sriov_vf(adev)) {
4300                 /* stop the data exchange thread */
4301                 amdgpu_virt_fini_data_exchange(adev);
4302         }
4303
4304         /* block all schedulers and reset given job's ring */
4305         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4306                 struct amdgpu_ring *ring = adev->rings[i];
4307
4308                 if (!ring || !ring->sched.thread)
4309                         continue;
4310
4311                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4312                 amdgpu_fence_driver_force_completion(ring);
4313         }
4314
4315         if(job)
4316                 drm_sched_increase_karma(&job->base);
4317
4318         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4319         if (!amdgpu_sriov_vf(adev)) {
4320
4321                 if (!need_full_reset)
4322                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4323
4324                 if (!need_full_reset) {
4325                         amdgpu_device_ip_pre_soft_reset(adev);
4326                         r = amdgpu_device_ip_soft_reset(adev);
4327                         amdgpu_device_ip_post_soft_reset(adev);
4328                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4329                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4330                                 need_full_reset = true;
4331                         }
4332                 }
4333
4334                 if (need_full_reset)
4335                         r = amdgpu_device_ip_suspend(adev);
4336
4337                 *need_full_reset_arg = need_full_reset;
4338         }
4339
4340         return r;
4341 }
4342
4343 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4344                                struct list_head *device_list_handle,
4345                                bool *need_full_reset_arg,
4346                                bool skip_hw_reset)
4347 {
4348         struct amdgpu_device *tmp_adev = NULL;
4349         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4350         int r = 0;
4351
4352         /*
4353          * ASIC reset has to be done on all XGMI hive nodes ASAP
4354          * to allow proper links negotiation in FW (within 1 sec)
4355          */
4356         if (!skip_hw_reset && need_full_reset) {
4357                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4358                         /* For XGMI run all resets in parallel to speed up the process */
4359                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4360                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4361                                         r = -EALREADY;
4362                         } else
4363                                 r = amdgpu_asic_reset(tmp_adev);
4364
4365                         if (r) {
4366                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4367                                          r, adev_to_drm(tmp_adev)->unique);
4368                                 break;
4369                         }
4370                 }
4371
4372                 /* For XGMI wait for all resets to complete before proceed */
4373                 if (!r) {
4374                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4375                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4376                                         flush_work(&tmp_adev->xgmi_reset_work);
4377                                         r = tmp_adev->asic_reset_res;
4378                                         if (r)
4379                                                 break;
4380                                 }
4381                         }
4382                 }
4383         }
4384
4385         if (!r && amdgpu_ras_intr_triggered()) {
4386                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4387                         if (tmp_adev->mmhub.funcs &&
4388                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4389                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4390                 }
4391
4392                 amdgpu_ras_intr_cleared();
4393         }
4394
4395         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4396                 if (need_full_reset) {
4397                         /* post card */
4398                         if (amdgpu_device_asic_init(tmp_adev))
4399                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4400
4401                         if (!r) {
4402                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4403                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4404                                 if (r)
4405                                         goto out;
4406
4407                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4408                                 if (vram_lost) {
4409                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4410                                         amdgpu_inc_vram_lost(tmp_adev);
4411                                 }
4412
4413                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4414                                 if (r)
4415                                         goto out;
4416
4417                                 r = amdgpu_device_fw_loading(tmp_adev);
4418                                 if (r)
4419                                         return r;
4420
4421                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4422                                 if (r)
4423                                         goto out;
4424
4425                                 if (vram_lost)
4426                                         amdgpu_device_fill_reset_magic(tmp_adev);
4427
4428                                 /*
4429                                  * Add this ASIC as tracked as reset was already
4430                                  * complete successfully.
4431                                  */
4432                                 amdgpu_register_gpu_instance(tmp_adev);
4433
4434                                 r = amdgpu_device_ip_late_init(tmp_adev);
4435                                 if (r)
4436                                         goto out;
4437
4438                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4439
4440                                 /*
4441                                  * The GPU enters bad state once faulty pages
4442                                  * by ECC has reached the threshold, and ras
4443                                  * recovery is scheduled next. So add one check
4444                                  * here to break recovery if it indeed exceeds
4445                                  * bad page threshold, and remind user to
4446                                  * retire this GPU or setting one bigger
4447                                  * bad_page_threshold value to fix this once
4448                                  * probing driver again.
4449                                  */
4450                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4451                                         /* must succeed. */
4452                                         amdgpu_ras_resume(tmp_adev);
4453                                 } else {
4454                                         r = -EINVAL;
4455                                         goto out;
4456                                 }
4457
4458                                 /* Update PSP FW topology after reset */
4459                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4460                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4461                         }
4462                 }
4463
4464 out:
4465                 if (!r) {
4466                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4467                         r = amdgpu_ib_ring_tests(tmp_adev);
4468                         if (r) {
4469                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4470                                 r = amdgpu_device_ip_suspend(tmp_adev);
4471                                 need_full_reset = true;
4472                                 r = -EAGAIN;
4473                                 goto end;
4474                         }
4475                 }
4476
4477                 if (!r)
4478                         r = amdgpu_device_recover_vram(tmp_adev);
4479                 else
4480                         tmp_adev->asic_reset_res = r;
4481         }
4482
4483 end:
4484         *need_full_reset_arg = need_full_reset;
4485         return r;
4486 }
4487
4488 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4489                                 struct amdgpu_hive_info *hive)
4490 {
4491         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4492                 return false;
4493
4494         if (hive) {
4495                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4496         } else {
4497                 down_write(&adev->reset_sem);
4498         }
4499
4500         switch (amdgpu_asic_reset_method(adev)) {
4501         case AMD_RESET_METHOD_MODE1:
4502                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4503                 break;
4504         case AMD_RESET_METHOD_MODE2:
4505                 adev->mp1_state = PP_MP1_STATE_RESET;
4506                 break;
4507         default:
4508                 adev->mp1_state = PP_MP1_STATE_NONE;
4509                 break;
4510         }
4511
4512         return true;
4513 }
4514
4515 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4516 {
4517         amdgpu_vf_error_trans_all(adev);
4518         adev->mp1_state = PP_MP1_STATE_NONE;
4519         atomic_set(&adev->in_gpu_reset, 0);
4520         up_write(&adev->reset_sem);
4521 }
4522
4523 /*
4524  * to lockup a list of amdgpu devices in a hive safely, if not a hive
4525  * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
4526  *
4527  * unlock won't require roll back.
4528  */
4529 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4530 {
4531         struct amdgpu_device *tmp_adev = NULL;
4532
4533         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4534                 if (!hive) {
4535                         dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4536                         return -ENODEV;
4537                 }
4538                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4539                         if (!amdgpu_device_lock_adev(tmp_adev, hive))
4540                                 goto roll_back;
4541                 }
4542         } else if (!amdgpu_device_lock_adev(adev, hive))
4543                 return -EAGAIN;
4544
4545         return 0;
4546 roll_back:
4547         if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4548                 /*
4549                  * if the lockup iteration break in the middle of a hive,
4550                  * it may means there may has a race issue,
4551                  * or a hive device locked up independently.
4552                  * we may be in trouble and may not, so will try to roll back
4553                  * the lock and give out a warnning.
4554                  */
4555                 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4556                 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4557                         amdgpu_device_unlock_adev(tmp_adev);
4558                 }
4559         }
4560         return -EAGAIN;
4561 }
4562
4563 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4564 {
4565         struct pci_dev *p = NULL;
4566
4567         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4568                         adev->pdev->bus->number, 1);
4569         if (p) {
4570                 pm_runtime_enable(&(p->dev));
4571                 pm_runtime_resume(&(p->dev));
4572         }
4573 }
4574
4575 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4576 {
4577         enum amd_reset_method reset_method;
4578         struct pci_dev *p = NULL;
4579         u64 expires;
4580
4581         /*
4582          * For now, only BACO and mode1 reset are confirmed
4583          * to suffer the audio issue without proper suspended.
4584          */
4585         reset_method = amdgpu_asic_reset_method(adev);
4586         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4587              (reset_method != AMD_RESET_METHOD_MODE1))
4588                 return -EINVAL;
4589
4590         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4591                         adev->pdev->bus->number, 1);
4592         if (!p)
4593                 return -ENODEV;
4594
4595         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4596         if (!expires)
4597                 /*
4598                  * If we cannot get the audio device autosuspend delay,
4599                  * a fixed 4S interval will be used. Considering 3S is
4600                  * the audio controller default autosuspend delay setting.
4601                  * 4S used here is guaranteed to cover that.
4602                  */
4603                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4604
4605         while (!pm_runtime_status_suspended(&(p->dev))) {
4606                 if (!pm_runtime_suspend(&(p->dev)))
4607                         break;
4608
4609                 if (expires < ktime_get_mono_fast_ns()) {
4610                         dev_warn(adev->dev, "failed to suspend display audio\n");
4611                         /* TODO: abort the succeeding gpu reset? */
4612                         return -ETIMEDOUT;
4613                 }
4614         }
4615
4616         pm_runtime_disable(&(p->dev));
4617
4618         return 0;
4619 }
4620
4621 /**
4622  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4623  *
4624  * @adev: amdgpu_device pointer
4625  * @job: which job trigger hang
4626  *
4627  * Attempt to reset the GPU if it has hung (all asics).
4628  * Attempt to do soft-reset or full-reset and reinitialize Asic
4629  * Returns 0 for success or an error on failure.
4630  */
4631
4632 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4633                               struct amdgpu_job *job)
4634 {
4635         struct list_head device_list, *device_list_handle =  NULL;
4636         bool need_full_reset = false;
4637         bool job_signaled = false;
4638         struct amdgpu_hive_info *hive = NULL;
4639         struct amdgpu_device *tmp_adev = NULL;
4640         int i, r = 0;
4641         bool need_emergency_restart = false;
4642         bool audio_suspended = false;
4643
4644         /*
4645          * Special case: RAS triggered and full reset isn't supported
4646          */
4647         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4648
4649         /*
4650          * Flush RAM to disk so that after reboot
4651          * the user can read log and see why the system rebooted.
4652          */
4653         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4654                 DRM_WARN("Emergency reboot.");
4655
4656                 ksys_sync_helper();
4657                 emergency_restart();
4658         }
4659
4660         dev_info(adev->dev, "GPU %s begin!\n",
4661                 need_emergency_restart ? "jobs stop":"reset");
4662
4663         /*
4664          * Here we trylock to avoid chain of resets executing from
4665          * either trigger by jobs on different adevs in XGMI hive or jobs on
4666          * different schedulers for same device while this TO handler is running.
4667          * We always reset all schedulers for device and all devices for XGMI
4668          * hive so that should take care of them too.
4669          */
4670         hive = amdgpu_get_xgmi_hive(adev);
4671         if (hive) {
4672                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4673                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4674                                 job ? job->base.id : -1, hive->hive_id);
4675                         amdgpu_put_xgmi_hive(hive);
4676                         if (job)
4677                                 drm_sched_increase_karma(&job->base);
4678                         return 0;
4679                 }
4680                 mutex_lock(&hive->hive_lock);
4681         }
4682
4683         /*
4684          * lock the device before we try to operate the linked list
4685          * if didn't get the device lock, don't touch the linked list since
4686          * others may iterating it.
4687          */
4688         r = amdgpu_device_lock_hive_adev(adev, hive);
4689         if (r) {
4690                 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4691                                         job ? job->base.id : -1);
4692
4693                 /* even we skipped this reset, still need to set the job to guilty */
4694                 if (job)
4695                         drm_sched_increase_karma(&job->base);
4696                 goto skip_recovery;
4697         }
4698
4699         /*
4700          * Build list of devices to reset.
4701          * In case we are in XGMI hive mode, resort the device list
4702          * to put adev in the 1st position.
4703          */
4704         INIT_LIST_HEAD(&device_list);
4705         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4706                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4707                         list_add_tail(&tmp_adev->reset_list, &device_list);
4708                 if (!list_is_first(&adev->reset_list, &device_list))
4709                         list_rotate_to_front(&adev->reset_list, &device_list);
4710                 device_list_handle = &device_list;
4711         } else {
4712                 list_add_tail(&adev->reset_list, &device_list);
4713                 device_list_handle = &device_list;
4714         }
4715
4716         /* block all schedulers and reset given job's ring */
4717         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4718                 /*
4719                  * Try to put the audio codec into suspend state
4720                  * before gpu reset started.
4721                  *
4722                  * Due to the power domain of the graphics device
4723                  * is shared with AZ power domain. Without this,
4724                  * we may change the audio hardware from behind
4725                  * the audio driver's back. That will trigger
4726                  * some audio codec errors.
4727                  */
4728                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4729                         audio_suspended = true;
4730
4731                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4732
4733                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4734
4735                 if (!amdgpu_sriov_vf(tmp_adev))
4736                         amdgpu_amdkfd_pre_reset(tmp_adev);
4737
4738                 /*
4739                  * Mark these ASICs to be reseted as untracked first
4740                  * And add them back after reset completed
4741                  */
4742                 amdgpu_unregister_gpu_instance(tmp_adev);
4743
4744                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4745
4746                 /* disable ras on ALL IPs */
4747                 if (!need_emergency_restart &&
4748                       amdgpu_device_ip_need_full_reset(tmp_adev))
4749                         amdgpu_ras_suspend(tmp_adev);
4750
4751                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4752                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4753
4754                         if (!ring || !ring->sched.thread)
4755                                 continue;
4756
4757                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4758
4759                         if (need_emergency_restart)
4760                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4761                 }
4762                 atomic_inc(&tmp_adev->gpu_reset_counter);
4763         }
4764
4765         if (need_emergency_restart)
4766                 goto skip_sched_resume;
4767
4768         /*
4769          * Must check guilty signal here since after this point all old
4770          * HW fences are force signaled.
4771          *
4772          * job->base holds a reference to parent fence
4773          */
4774         if (job && job->base.s_fence->parent &&
4775             dma_fence_is_signaled(job->base.s_fence->parent)) {
4776                 job_signaled = true;
4777                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4778                 goto skip_hw_reset;
4779         }
4780
4781 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4782         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4783                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4784                                                  (tmp_adev == adev) ? job : NULL,
4785                                                  &need_full_reset);
4786                 /*TODO Should we stop ?*/
4787                 if (r) {
4788                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4789                                   r, adev_to_drm(tmp_adev)->unique);
4790                         tmp_adev->asic_reset_res = r;
4791                 }
4792         }
4793
4794         /* Actual ASIC resets if needed.*/
4795         /* TODO Implement XGMI hive reset logic for SRIOV */
4796         if (amdgpu_sriov_vf(adev)) {
4797                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4798                 if (r)
4799                         adev->asic_reset_res = r;
4800         } else {
4801                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4802                 if (r && r == -EAGAIN)
4803                         goto retry;
4804         }
4805
4806 skip_hw_reset:
4807
4808         /* Post ASIC reset for all devs .*/
4809         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4810
4811                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4812                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4813
4814                         if (!ring || !ring->sched.thread)
4815                                 continue;
4816
4817                         /* No point to resubmit jobs if we didn't HW reset*/
4818                         if (!tmp_adev->asic_reset_res && !job_signaled)
4819                                 drm_sched_resubmit_jobs(&ring->sched);
4820
4821                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4822                 }
4823
4824                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4825                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4826                 }
4827
4828                 tmp_adev->asic_reset_res = 0;
4829
4830                 if (r) {
4831                         /* bad news, how to tell it to userspace ? */
4832                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4833                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4834                 } else {
4835                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4836                 }
4837         }
4838
4839 skip_sched_resume:
4840         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4841                 /* unlock kfd: SRIOV would do it separately */
4842                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4843                         amdgpu_amdkfd_post_reset(tmp_adev);
4844
4845                 /* kfd_post_reset will do nothing if kfd device is not initialized,
4846                  * need to bring up kfd here if it's not be initialized before
4847                  */
4848                 if (!adev->kfd.init_complete)
4849                         amdgpu_amdkfd_device_init(adev);
4850
4851                 if (audio_suspended)
4852                         amdgpu_device_resume_display_audio(tmp_adev);
4853                 amdgpu_device_unlock_adev(tmp_adev);
4854         }
4855
4856 skip_recovery:
4857         if (hive) {
4858                 atomic_set(&hive->in_reset, 0);
4859                 mutex_unlock(&hive->hive_lock);
4860                 amdgpu_put_xgmi_hive(hive);
4861         }
4862
4863         if (r && r != -EAGAIN)
4864                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4865         return r;
4866 }
4867
4868 /**
4869  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4870  *
4871  * @adev: amdgpu_device pointer
4872  *
4873  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4874  * and lanes) of the slot the device is in. Handles APUs and
4875  * virtualized environments where PCIE config space may not be available.
4876  */
4877 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4878 {
4879         struct pci_dev *pdev;
4880         enum pci_bus_speed speed_cap, platform_speed_cap;
4881         enum pcie_link_width platform_link_width;
4882
4883         if (amdgpu_pcie_gen_cap)
4884                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4885
4886         if (amdgpu_pcie_lane_cap)
4887                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4888
4889         /* covers APUs as well */
4890         if (pci_is_root_bus(adev->pdev->bus)) {
4891                 if (adev->pm.pcie_gen_mask == 0)
4892                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4893                 if (adev->pm.pcie_mlw_mask == 0)
4894                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4895                 return;
4896         }
4897
4898         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4899                 return;
4900
4901         pcie_bandwidth_available(adev->pdev, NULL,
4902                                  &platform_speed_cap, &platform_link_width);
4903
4904         if (adev->pm.pcie_gen_mask == 0) {
4905                 /* asic caps */
4906                 pdev = adev->pdev;
4907                 speed_cap = pcie_get_speed_cap(pdev);
4908                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4909                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4910                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4911                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4912                 } else {
4913                         if (speed_cap == PCIE_SPEED_32_0GT)
4914                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4915                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4916                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4917                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4918                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
4919                         else if (speed_cap == PCIE_SPEED_16_0GT)
4920                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4921                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4922                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4923                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4924                         else if (speed_cap == PCIE_SPEED_8_0GT)
4925                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4926                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4927                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4928                         else if (speed_cap == PCIE_SPEED_5_0GT)
4929                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4930                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4931                         else
4932                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4933                 }
4934                 /* platform caps */
4935                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4936                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4937                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4938                 } else {
4939                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
4940                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4941                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4942                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4943                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4944                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
4945                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
4946                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4947                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4948                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4949                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4950                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4951                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4952                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4953                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4954                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4955                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4956                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4957                         else
4958                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4959
4960                 }
4961         }
4962         if (adev->pm.pcie_mlw_mask == 0) {
4963                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4964                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4965                 } else {
4966                         switch (platform_link_width) {
4967                         case PCIE_LNK_X32:
4968                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4969                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4970                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4971                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4972                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4973                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4974                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4975                                 break;
4976                         case PCIE_LNK_X16:
4977                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4978                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4979                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4980                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4981                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4982                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4983                                 break;
4984                         case PCIE_LNK_X12:
4985                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4986                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4987                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4988                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4989                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4990                                 break;
4991                         case PCIE_LNK_X8:
4992                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4993                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4994                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4995                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4996                                 break;
4997                         case PCIE_LNK_X4:
4998                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4999                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5000                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5001                                 break;
5002                         case PCIE_LNK_X2:
5003                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5004                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5005                                 break;
5006                         case PCIE_LNK_X1:
5007                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5008                                 break;
5009                         default:
5010                                 break;
5011                         }
5012                 }
5013         }
5014 }
5015
5016 int amdgpu_device_baco_enter(struct drm_device *dev)
5017 {
5018         struct amdgpu_device *adev = drm_to_adev(dev);
5019         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5020
5021         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5022                 return -ENOTSUPP;
5023
5024         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5025                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5026
5027         return amdgpu_dpm_baco_enter(adev);
5028 }
5029
5030 int amdgpu_device_baco_exit(struct drm_device *dev)
5031 {
5032         struct amdgpu_device *adev = drm_to_adev(dev);
5033         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5034         int ret = 0;
5035
5036         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5037                 return -ENOTSUPP;
5038
5039         ret = amdgpu_dpm_baco_exit(adev);
5040         if (ret)
5041                 return ret;
5042
5043         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5044                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5045
5046         return 0;
5047 }
5048
5049 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5050 {
5051         int i;
5052
5053         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5054                 struct amdgpu_ring *ring = adev->rings[i];
5055
5056                 if (!ring || !ring->sched.thread)
5057                         continue;
5058
5059                 cancel_delayed_work_sync(&ring->sched.work_tdr);
5060         }
5061 }
5062
5063 /**
5064  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5065  * @pdev: PCI device struct
5066  * @state: PCI channel state
5067  *
5068  * Description: Called when a PCI error is detected.
5069  *
5070  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5071  */
5072 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5073 {
5074         struct drm_device *dev = pci_get_drvdata(pdev);
5075         struct amdgpu_device *adev = drm_to_adev(dev);
5076         int i;
5077
5078         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5079
5080         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5081                 DRM_WARN("No support for XGMI hive yet...");
5082                 return PCI_ERS_RESULT_DISCONNECT;
5083         }
5084
5085         switch (state) {
5086         case pci_channel_io_normal:
5087                 return PCI_ERS_RESULT_CAN_RECOVER;
5088         /* Fatal error, prepare for slot reset */
5089         case pci_channel_io_frozen:
5090                 /*
5091                  * Cancel and wait for all TDRs in progress if failing to
5092                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
5093                  *
5094                  * Locking adev->reset_sem will prevent any external access
5095                  * to GPU during PCI error recovery
5096                  */
5097                 while (!amdgpu_device_lock_adev(adev, NULL))
5098                         amdgpu_cancel_all_tdr(adev);
5099
5100                 /*
5101                  * Block any work scheduling as we do for regular GPU reset
5102                  * for the duration of the recovery
5103                  */
5104                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5105                         struct amdgpu_ring *ring = adev->rings[i];
5106
5107                         if (!ring || !ring->sched.thread)
5108                                 continue;
5109
5110                         drm_sched_stop(&ring->sched, NULL);
5111                 }
5112                 atomic_inc(&adev->gpu_reset_counter);
5113                 return PCI_ERS_RESULT_NEED_RESET;
5114         case pci_channel_io_perm_failure:
5115                 /* Permanent error, prepare for device removal */
5116                 return PCI_ERS_RESULT_DISCONNECT;
5117         }
5118
5119         return PCI_ERS_RESULT_NEED_RESET;
5120 }
5121
5122 /**
5123  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5124  * @pdev: pointer to PCI device
5125  */
5126 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5127 {
5128
5129         DRM_INFO("PCI error: mmio enabled callback!!\n");
5130
5131         /* TODO - dump whatever for debugging purposes */
5132
5133         /* This called only if amdgpu_pci_error_detected returns
5134          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5135          * works, no need to reset slot.
5136          */
5137
5138         return PCI_ERS_RESULT_RECOVERED;
5139 }
5140
5141 /**
5142  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5143  * @pdev: PCI device struct
5144  *
5145  * Description: This routine is called by the pci error recovery
5146  * code after the PCI slot has been reset, just before we
5147  * should resume normal operations.
5148  */
5149 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5150 {
5151         struct drm_device *dev = pci_get_drvdata(pdev);
5152         struct amdgpu_device *adev = drm_to_adev(dev);
5153         int r, i;
5154         bool need_full_reset = true;
5155         u32 memsize;
5156         struct list_head device_list;
5157
5158         DRM_INFO("PCI error: slot reset callback!!\n");
5159
5160         INIT_LIST_HEAD(&device_list);
5161         list_add_tail(&adev->reset_list, &device_list);
5162
5163         /* wait for asic to come out of reset */
5164         msleep(500);
5165
5166         /* Restore PCI confspace */
5167         amdgpu_device_load_pci_state(pdev);
5168
5169         /* confirm  ASIC came out of reset */
5170         for (i = 0; i < adev->usec_timeout; i++) {
5171                 memsize = amdgpu_asic_get_config_memsize(adev);
5172
5173                 if (memsize != 0xffffffff)
5174                         break;
5175                 udelay(1);
5176         }
5177         if (memsize == 0xffffffff) {
5178                 r = -ETIME;
5179                 goto out;
5180         }
5181
5182         adev->in_pci_err_recovery = true;
5183         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5184         adev->in_pci_err_recovery = false;
5185         if (r)
5186                 goto out;
5187
5188         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5189
5190 out:
5191         if (!r) {
5192                 if (amdgpu_device_cache_pci_state(adev->pdev))
5193                         pci_restore_state(adev->pdev);
5194
5195                 DRM_INFO("PCIe error recovery succeeded\n");
5196         } else {
5197                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5198                 amdgpu_device_unlock_adev(adev);
5199         }
5200
5201         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5202 }
5203
5204 /**
5205  * amdgpu_pci_resume() - resume normal ops after PCI reset
5206  * @pdev: pointer to PCI device
5207  *
5208  * Called when the error recovery driver tells us that its
5209  * OK to resume normal operation.
5210  */
5211 void amdgpu_pci_resume(struct pci_dev *pdev)
5212 {
5213         struct drm_device *dev = pci_get_drvdata(pdev);
5214         struct amdgpu_device *adev = drm_to_adev(dev);
5215         int i;
5216
5217
5218         DRM_INFO("PCI error: resume callback!!\n");
5219
5220         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5221                 struct amdgpu_ring *ring = adev->rings[i];
5222
5223                 if (!ring || !ring->sched.thread)
5224                         continue;
5225
5226
5227                 drm_sched_resubmit_jobs(&ring->sched);
5228                 drm_sched_start(&ring->sched, true);
5229         }
5230
5231         amdgpu_device_unlock_adev(adev);
5232 }
5233
5234 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5235 {
5236         struct drm_device *dev = pci_get_drvdata(pdev);
5237         struct amdgpu_device *adev = drm_to_adev(dev);
5238         int r;
5239
5240         r = pci_save_state(pdev);
5241         if (!r) {
5242                 kfree(adev->pci_state);
5243
5244                 adev->pci_state = pci_store_saved_state(pdev);
5245
5246                 if (!adev->pci_state) {
5247                         DRM_ERROR("Failed to store PCI saved state");
5248                         return false;
5249                 }
5250         } else {
5251                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5252                 return false;
5253         }
5254
5255         return true;
5256 }
5257
5258 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5259 {
5260         struct drm_device *dev = pci_get_drvdata(pdev);
5261         struct amdgpu_device *adev = drm_to_adev(dev);
5262         int r;
5263
5264         if (!adev->pci_state)
5265                 return false;
5266
5267         r = pci_load_saved_state(pdev, adev->pci_state);
5268
5269         if (!r) {
5270                 pci_restore_state(pdev);
5271         } else {
5272                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5273                 return false;
5274         }
5275
5276         return true;
5277 }
5278
5279