drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  84
  85 #define AMDGPU_RESUME_MS                2000
  86
  87 const char *amdgpu_asic_name[] = {
  88         "TAHITI",
  89         "PITCAIRN",
  90         "VERDE",
  91         "OLAND",
  92         "HAINAN",
  93         "BONAIRE",
  94         "KAVERI",
  95         "KABINI",
  96         "HAWAII",
  97         "MULLINS",
  98         "TOPAZ",
  99         "TONGA",
 100         "FIJI",
 101         "CARRIZO",
 102         "STONEY",
 103         "POLARIS10",
 104         "POLARIS11",
 105         "POLARIS12",
 106         "VEGAM",
 107         "VEGA10",
 108         "VEGA12",
 109         "VEGA20",
 110         "RAVEN",
 111         "ARCTURUS",
 112         "RENOIR",
 113         "ALDEBARAN",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "VANGOGH",
 120         "DIMGREY_CAVEFISH",
 121         "LAST",
 122 };
 123
 124 /**
 125  * DOC: pcie_replay_count
 126  *
 127  * The amdgpu driver provides a sysfs API for reporting the total number
 128  * of PCIe replays (NAKs)
 129  * The file pcie_replay_count is used for this and returns the total
 130  * number of replays as a sum of the NAKs generated and NAKs received
 131  */
 132
 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 134                 struct device_attribute *attr, char *buf)
 135 {
 136         struct drm_device *ddev = dev_get_drvdata(dev);
 137         struct amdgpu_device *adev = drm_to_adev(ddev);
 138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 139
 140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 141 }
 142
 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 144                 amdgpu_device_get_pcie_replay_count, NULL);
 145
 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 147
 148 /**
 149  * DOC: product_name
 150  *
 151  * The amdgpu driver provides a sysfs API for reporting the product name
 152  * for the device
 153  * The file serial_number is used for this and returns the product name
 154  * as returned from the FRU.
 155  * NOTE: This is only available for certain server cards
 156  */
 157
 158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 159                 struct device_attribute *attr, char *buf)
 160 {
 161         struct drm_device *ddev = dev_get_drvdata(dev);
 162         struct amdgpu_device *adev = drm_to_adev(ddev);
 163
 164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 165 }
 166
 167 static DEVICE_ATTR(product_name, S_IRUGO,
 168                 amdgpu_device_get_product_name, NULL);
 169
 170 /**
 171  * DOC: product_number
 172  *
 173  * The amdgpu driver provides a sysfs API for reporting the part number
 174  * for the device
 175  * The file serial_number is used for this and returns the part number
 176  * as returned from the FRU.
 177  * NOTE: This is only available for certain server cards
 178  */
 179
 180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 181                 struct device_attribute *attr, char *buf)
 182 {
 183         struct drm_device *ddev = dev_get_drvdata(dev);
 184         struct amdgpu_device *adev = drm_to_adev(ddev);
 185
 186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 187 }
 188
 189 static DEVICE_ATTR(product_number, S_IRUGO,
 190                 amdgpu_device_get_product_number, NULL);
 191
 192 /**
 193  * DOC: serial_number
 194  *
 195  * The amdgpu driver provides a sysfs API for reporting the serial number
 196  * for the device
 197  * The file serial_number is used for this and returns the serial number
 198  * as returned from the FRU.
 199  * NOTE: This is only available for certain server cards
 200  */
 201
 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 203                 struct device_attribute *attr, char *buf)
 204 {
 205         struct drm_device *ddev = dev_get_drvdata(dev);
 206         struct amdgpu_device *adev = drm_to_adev(ddev);
 207
 208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 209 }
 210
 211 static DEVICE_ATTR(serial_number, S_IRUGO,
 212                 amdgpu_device_get_serial_number, NULL);
 213
 214 /**
 215  * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
 216  *
 217  * @dev: drm_device pointer
 218  *
 219  * Returns true if the device is a dGPU with HG/PX power control,
 220  * otherwise return false.
 221  */
 222 bool amdgpu_device_supports_atpx(struct drm_device *dev)
 223 {
 224         struct amdgpu_device *adev = drm_to_adev(dev);
 225
 226         if (adev->flags & AMD_IS_PX)
 227                 return true;
 228         return false;
 229 }
 230
 231 /**
 232  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 233  *
 234  * @dev: drm_device pointer
 235  *
 236  * Returns true if the device is a dGPU with HG/PX power control,
 237  * otherwise return false.
 238  */
 239 bool amdgpu_device_supports_boco(struct drm_device *dev)
 240 {
 241         struct amdgpu_device *adev = drm_to_adev(dev);
 242
 243         if (adev->has_pr3)
 244                 return true;
 245         return false;
 246 }
 247
 248 /**
 249  * amdgpu_device_supports_baco - Does the device support BACO
 250  *
 251  * @dev: drm_device pointer
 252  *
 253  * Returns true if the device supporte BACO,
 254  * otherwise return false.
 255  */
 256 bool amdgpu_device_supports_baco(struct drm_device *dev)
 257 {
 258         struct amdgpu_device *adev = drm_to_adev(dev);
 259
 260         return amdgpu_asic_supports_baco(adev);
 261 }
 262
 263 /*
 264  * VRAM access helper functions
 265  */
 266
 267 /**
 268  * amdgpu_device_vram_access - read/write a buffer in vram
 269  *
 270  * @adev: amdgpu_device pointer
 271  * @pos: offset of the buffer in vram
 272  * @buf: virtual address of the buffer in system memory
 273  * @size: read/write size, sizeof(@buf) must > @size
 274  * @write: true - write to vram, otherwise - read from vram
 275  */
 276 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 277                                uint32_t *buf, size_t size, bool write)
 278 {
 279         unsigned long flags;
 280         uint32_t hi = ~0;
 281         uint64_t last;
 282
 283
 284 #ifdef CONFIG_64BIT
 285         last = min(pos + size, adev->gmc.visible_vram_size);
 286         if (last > pos) {
 287                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 288                 size_t count = last - pos;
 289
 290                 if (write) {
 291                         memcpy_toio(addr, buf, count);
 292                         mb();
 293                         amdgpu_asic_flush_hdp(adev, NULL);
 294                 } else {
 295                         amdgpu_asic_invalidate_hdp(adev, NULL);
 296                         mb();
 297                         memcpy_fromio(buf, addr, count);
 298                 }
 299
 300                 if (count == size)
 301                         return;
 302
 303                 pos += count;
 304                 buf += count / 4;
 305                 size -= count;
 306         }
 307 #endif
 308
 309         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 310         for (last = pos + size; pos < last; pos += 4) {
 311                 uint32_t tmp = pos >> 31;
 312
 313                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 314                 if (tmp != hi) {
 315                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 316                         hi = tmp;
 317                 }
 318                 if (write)
 319                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 320                 else
 321                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 322         }
 323         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 324 }
 325
 326 /*
 327  * register access helper functions.
 328  */
 329 /**
 330  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 331  *
 332  * @adev: amdgpu_device pointer
 333  * @reg: dword aligned register offset
 334  * @acc_flags: access flags which require special behavior
 335  *
 336  * Returns the 32 bit value from the offset specified.
 337  */
 338 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 339                             uint32_t reg, uint32_t acc_flags)
 340 {
 341         uint32_t ret;
 342
 343         if (adev->in_pci_err_recovery)
 344                 return 0;
 345
 346         if ((reg * 4) < adev->rmmio_size) {
 347                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 348                     amdgpu_sriov_runtime(adev) &&
 349                     down_read_trylock(&adev->reset_sem)) {
 350                         ret = amdgpu_kiq_rreg(adev, reg);
 351                         up_read(&adev->reset_sem);
 352                 } else {
 353                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 354                 }
 355         } else {
 356                 ret = adev->pcie_rreg(adev, reg * 4);
 357         }
 358
 359         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 360
 361         return ret;
 362 }
 363
 364 /*
 365  * MMIO register read with bytes helper functions
 366  * @offset:bytes offset from MMIO start
 367  *
 368 */
 369
 370 /**
 371  * amdgpu_mm_rreg8 - read a memory mapped IO register
 372  *
 373  * @adev: amdgpu_device pointer
 374  * @offset: byte aligned register offset
 375  *
 376  * Returns the 8 bit value from the offset specified.
 377  */
 378 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 379 {
 380         if (adev->in_pci_err_recovery)
 381                 return 0;
 382
 383         if (offset < adev->rmmio_size)
 384                 return (readb(adev->rmmio + offset));
 385         BUG();
 386 }
 387
 388 /*
 389  * MMIO register write with bytes helper functions
 390  * @offset:bytes offset from MMIO start
 391  * @value: the value want to be written to the register
 392  *
 393 */
 394 /**
 395  * amdgpu_mm_wreg8 - read a memory mapped IO register
 396  *
 397  * @adev: amdgpu_device pointer
 398  * @offset: byte aligned register offset
 399  * @value: 8 bit value to write
 400  *
 401  * Writes the value specified to the offset specified.
 402  */
 403 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 404 {
 405         if (adev->in_pci_err_recovery)
 406                 return;
 407
 408         if (offset < adev->rmmio_size)
 409                 writeb(value, adev->rmmio + offset);
 410         else
 411                 BUG();
 412 }
 413
 414 /**
 415  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 416  *
 417  * @adev: amdgpu_device pointer
 418  * @reg: dword aligned register offset
 419  * @v: 32 bit value to write to the register
 420  * @acc_flags: access flags which require special behavior
 421  *
 422  * Writes the value specified to the offset specified.
 423  */
 424 void amdgpu_device_wreg(struct amdgpu_device *adev,
 425                         uint32_t reg, uint32_t v,
 426                         uint32_t acc_flags)
 427 {
 428         if (adev->in_pci_err_recovery)
 429                 return;
 430
 431         if ((reg * 4) < adev->rmmio_size) {
 432                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 433                     amdgpu_sriov_runtime(adev) &&
 434                     down_read_trylock(&adev->reset_sem)) {
 435                         amdgpu_kiq_wreg(adev, reg, v);
 436                         up_read(&adev->reset_sem);
 437                 } else {
 438                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 439                 }
 440         } else {
 441                 adev->pcie_wreg(adev, reg * 4, v);
 442         }
 443
 444         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 445 }
 446
 447 /*
 448  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 449  *
 450  * this function is invoked only the debugfs register access
 451  * */
 452 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 453                              uint32_t reg, uint32_t v)
 454 {
 455         if (adev->in_pci_err_recovery)
 456                 return;
 457
 458         if (amdgpu_sriov_fullaccess(adev) &&
 459             adev->gfx.rlc.funcs &&
 460             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 461                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 462                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 463         } else {
 464                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 465         }
 466 }
 467
 468 /**
 469  * amdgpu_io_rreg - read an IO register
 470  *
 471  * @adev: amdgpu_device pointer
 472  * @reg: dword aligned register offset
 473  *
 474  * Returns the 32 bit value from the offset specified.
 475  */
 476 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 477 {
 478         if (adev->in_pci_err_recovery)
 479                 return 0;
 480
 481         if ((reg * 4) < adev->rio_mem_size)
 482                 return ioread32(adev->rio_mem + (reg * 4));
 483         else {
 484                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 485                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 486         }
 487 }
 488
 489 /**
 490  * amdgpu_io_wreg - write to an IO register
 491  *
 492  * @adev: amdgpu_device pointer
 493  * @reg: dword aligned register offset
 494  * @v: 32 bit value to write to the register
 495  *
 496  * Writes the value specified to the offset specified.
 497  */
 498 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 499 {
 500         if (adev->in_pci_err_recovery)
 501                 return;
 502
 503         if ((reg * 4) < adev->rio_mem_size)
 504                 iowrite32(v, adev->rio_mem + (reg * 4));
 505         else {
 506                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 507                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 508         }
 509 }
 510
 511 /**
 512  * amdgpu_mm_rdoorbell - read a doorbell dword
 513  *
 514  * @adev: amdgpu_device pointer
 515  * @index: doorbell index
 516  *
 517  * Returns the value in the doorbell aperture at the
 518  * requested doorbell index (CIK).
 519  */
 520 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 521 {
 522         if (adev->in_pci_err_recovery)
 523                 return 0;
 524
 525         if (index < adev->doorbell.num_doorbells) {
 526                 return readl(adev->doorbell.ptr + index);
 527         } else {
 528                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 529                 return 0;
 530         }
 531 }
 532
 533 /**
 534  * amdgpu_mm_wdoorbell - write a doorbell dword
 535  *
 536  * @adev: amdgpu_device pointer
 537  * @index: doorbell index
 538  * @v: value to write
 539  *
 540  * Writes @v to the doorbell aperture at the
 541  * requested doorbell index (CIK).
 542  */
 543 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 544 {
 545         if (adev->in_pci_err_recovery)
 546                 return;
 547
 548         if (index < adev->doorbell.num_doorbells) {
 549                 writel(v, adev->doorbell.ptr + index);
 550         } else {
 551                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 552         }
 553 }
 554
 555 /**
 556  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 557  *
 558  * @adev: amdgpu_device pointer
 559  * @index: doorbell index
 560  *
 561  * Returns the value in the doorbell aperture at the
 562  * requested doorbell index (VEGA10+).
 563  */
 564 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 565 {
 566         if (adev->in_pci_err_recovery)
 567                 return 0;
 568
 569         if (index < adev->doorbell.num_doorbells) {
 570                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 571         } else {
 572                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 573                 return 0;
 574         }
 575 }
 576
 577 /**
 578  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 579  *
 580  * @adev: amdgpu_device pointer
 581  * @index: doorbell index
 582  * @v: value to write
 583  *
 584  * Writes @v to the doorbell aperture at the
 585  * requested doorbell index (VEGA10+).
 586  */
 587 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 588 {
 589         if (adev->in_pci_err_recovery)
 590                 return;
 591
 592         if (index < adev->doorbell.num_doorbells) {
 593                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 594         } else {
 595                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 596         }
 597 }
 598
 599 /**
 600  * amdgpu_device_indirect_rreg - read an indirect register
 601  *
 602  * @adev: amdgpu_device pointer
 603  * @pcie_index: mmio register offset
 604  * @pcie_data: mmio register offset
 605  * @reg_addr: indirect register address to read from
 606  *
 607  * Returns the value of indirect register @reg_addr
 608  */
 609 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 610                                 u32 pcie_index, u32 pcie_data,
 611                                 u32 reg_addr)
 612 {
 613         unsigned long flags;
 614         u32 r;
 615         void __iomem *pcie_index_offset;
 616         void __iomem *pcie_data_offset;
 617
 618         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 619         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 620         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 621
 622         writel(reg_addr, pcie_index_offset);
 623         readl(pcie_index_offset);
 624         r = readl(pcie_data_offset);
 625         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 626
 627         return r;
 628 }
 629
 630 /**
 631  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 632  *
 633  * @adev: amdgpu_device pointer
 634  * @pcie_index: mmio register offset
 635  * @pcie_data: mmio register offset
 636  * @reg_addr: indirect register address to read from
 637  *
 638  * Returns the value of indirect register @reg_addr
 639  */
 640 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 641                                   u32 pcie_index, u32 pcie_data,
 642                                   u32 reg_addr)
 643 {
 644         unsigned long flags;
 645         u64 r;
 646         void __iomem *pcie_index_offset;
 647         void __iomem *pcie_data_offset;
 648
 649         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 650         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 651         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 652
 653         /* read low 32 bits */
 654         writel(reg_addr, pcie_index_offset);
 655         readl(pcie_index_offset);
 656         r = readl(pcie_data_offset);
 657         /* read high 32 bits */
 658         writel(reg_addr + 4, pcie_index_offset);
 659         readl(pcie_index_offset);
 660         r |= ((u64)readl(pcie_data_offset) << 32);
 661         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 662
 663         return r;
 664 }
 665
 666 /**
 667  * amdgpu_device_indirect_wreg - write an indirect register address
 668  *
 669  * @adev: amdgpu_device pointer
 670  * @pcie_index: mmio register offset
 671  * @pcie_data: mmio register offset
 672  * @reg_addr: indirect register offset
 673  * @reg_data: indirect register data
 674  *
 675  */
 676 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 677                                  u32 pcie_index, u32 pcie_data,
 678                                  u32 reg_addr, u32 reg_data)
 679 {
 680         unsigned long flags;
 681         void __iomem *pcie_index_offset;
 682         void __iomem *pcie_data_offset;
 683
 684         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 685         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 686         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 687
 688         writel(reg_addr, pcie_index_offset);
 689         readl(pcie_index_offset);
 690         writel(reg_data, pcie_data_offset);
 691         readl(pcie_data_offset);
 692         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 693 }
 694
 695 /**
 696  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 697  *
 698  * @adev: amdgpu_device pointer
 699  * @pcie_index: mmio register offset
 700  * @pcie_data: mmio register offset
 701  * @reg_addr: indirect register offset
 702  * @reg_data: indirect register data
 703  *
 704  */
 705 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 706                                    u32 pcie_index, u32 pcie_data,
 707                                    u32 reg_addr, u64 reg_data)
 708 {
 709         unsigned long flags;
 710         void __iomem *pcie_index_offset;
 711         void __iomem *pcie_data_offset;
 712
 713         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 714         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 715         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 716
 717         /* write low 32 bits */
 718         writel(reg_addr, pcie_index_offset);
 719         readl(pcie_index_offset);
 720         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 721         readl(pcie_data_offset);
 722         /* write high 32 bits */
 723         writel(reg_addr + 4, pcie_index_offset);
 724         readl(pcie_index_offset);
 725         writel((u32)(reg_data >> 32), pcie_data_offset);
 726         readl(pcie_data_offset);
 727         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 728 }
 729
 730 /**
 731  * amdgpu_invalid_rreg - dummy reg read function
 732  *
 733  * @adev: amdgpu_device pointer
 734  * @reg: offset of register
 735  *
 736  * Dummy register read function.  Used for register blocks
 737  * that certain asics don't have (all asics).
 738  * Returns the value in the register.
 739  */
 740 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 741 {
 742         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 743         BUG();
 744         return 0;
 745 }
 746
 747 /**
 748  * amdgpu_invalid_wreg - dummy reg write function
 749  *
 750  * @adev: amdgpu_device pointer
 751  * @reg: offset of register
 752  * @v: value to write to the register
 753  *
 754  * Dummy register read function.  Used for register blocks
 755  * that certain asics don't have (all asics).
 756  */
 757 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 758 {
 759         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 760                   reg, v);
 761         BUG();
 762 }
 763
 764 /**
 765  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 766  *
 767  * @adev: amdgpu_device pointer
 768  * @reg: offset of register
 769  *
 770  * Dummy register read function.  Used for register blocks
 771  * that certain asics don't have (all asics).
 772  * Returns the value in the register.
 773  */
 774 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 775 {
 776         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 777         BUG();
 778         return 0;
 779 }
 780
 781 /**
 782  * amdgpu_invalid_wreg64 - dummy reg write function
 783  *
 784  * @adev: amdgpu_device pointer
 785  * @reg: offset of register
 786  * @v: value to write to the register
 787  *
 788  * Dummy register read function.  Used for register blocks
 789  * that certain asics don't have (all asics).
 790  */
 791 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 792 {
 793         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 794                   reg, v);
 795         BUG();
 796 }
 797
 798 /**
 799  * amdgpu_block_invalid_rreg - dummy reg read function
 800  *
 801  * @adev: amdgpu_device pointer
 802  * @block: offset of instance
 803  * @reg: offset of register
 804  *
 805  * Dummy register read function.  Used for register blocks
 806  * that certain asics don't have (all asics).
 807  * Returns the value in the register.
 808  */
 809 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 810                                           uint32_t block, uint32_t reg)
 811 {
 812         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 813                   reg, block);
 814         BUG();
 815         return 0;
 816 }
 817
 818 /**
 819  * amdgpu_block_invalid_wreg - dummy reg write function
 820  *
 821  * @adev: amdgpu_device pointer
 822  * @block: offset of instance
 823  * @reg: offset of register
 824  * @v: value to write to the register
 825  *
 826  * Dummy register read function.  Used for register blocks
 827  * that certain asics don't have (all asics).
 828  */
 829 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 830                                       uint32_t block,
 831                                       uint32_t reg, uint32_t v)
 832 {
 833         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 834                   reg, block, v);
 835         BUG();
 836 }
 837
 838 /**
 839  * amdgpu_device_asic_init - Wrapper for atom asic_init
 840  *
 841  * @adev: amdgpu_device pointer
 842  *
 843  * Does any asic specific work and then calls atom asic init.
 844  */
 845 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 846 {
 847         amdgpu_asic_pre_asic_init(adev);
 848
 849         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 850 }
 851
 852 /**
 853  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 854  *
 855  * @adev: amdgpu_device pointer
 856  *
 857  * Allocates a scratch page of VRAM for use by various things in the
 858  * driver.
 859  */
 860 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 861 {
 862         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 863                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 864                                        &adev->vram_scratch.robj,
 865                                        &adev->vram_scratch.gpu_addr,
 866                                        (void **)&adev->vram_scratch.ptr);
 867 }
 868
 869 /**
 870  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 871  *
 872  * @adev: amdgpu_device pointer
 873  *
 874  * Frees the VRAM scratch page.
 875  */
 876 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 877 {
 878         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 879 }
 880
 881 /**
 882  * amdgpu_device_program_register_sequence - program an array of registers.
 883  *
 884  * @adev: amdgpu_device pointer
 885  * @registers: pointer to the register array
 886  * @array_size: size of the register array
 887  *
 888  * Programs an array or registers with and and or masks.
 889  * This is a helper for setting golden registers.
 890  */
 891 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 892                                              const u32 *registers,
 893                                              const u32 array_size)
 894 {
 895         u32 tmp, reg, and_mask, or_mask;
 896         int i;
 897
 898         if (array_size % 3)
 899                 return;
 900
 901         for (i = 0; i < array_size; i +=3) {
 902                 reg = registers[i + 0];
 903                 and_mask = registers[i + 1];
 904                 or_mask = registers[i + 2];
 905
 906                 if (and_mask == 0xffffffff) {
 907                         tmp = or_mask;
 908                 } else {
 909                         tmp = RREG32(reg);
 910                         tmp &= ~and_mask;
 911                         if (adev->family >= AMDGPU_FAMILY_AI)
 912                                 tmp |= (or_mask & and_mask);
 913                         else
 914                                 tmp |= or_mask;
 915                 }
 916                 WREG32(reg, tmp);
 917         }
 918 }
 919
 920 /**
 921  * amdgpu_device_pci_config_reset - reset the GPU
 922  *
 923  * @adev: amdgpu_device pointer
 924  *
 925  * Resets the GPU using the pci config reset sequence.
 926  * Only applicable to asics prior to vega10.
 927  */
 928 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 929 {
 930         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 931 }
 932
 933 /**
 934  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 935  *
 936  * @adev: amdgpu_device pointer
 937  *
 938  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 939  */
 940 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
 941 {
 942         return pci_reset_function(adev->pdev);
 943 }
 944
 945 /*
 946  * GPU doorbell aperture helpers function.
 947  */
 948 /**
 949  * amdgpu_device_doorbell_init - Init doorbell driver information.
 950  *
 951  * @adev: amdgpu_device pointer
 952  *
 953  * Init doorbell driver information (CIK)
 954  * Returns 0 on success, error on failure.
 955  */
 956 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 957 {
 958
 959         /* No doorbell on SI hardware generation */
 960         if (adev->asic_type < CHIP_BONAIRE) {
 961                 adev->doorbell.base = 0;
 962                 adev->doorbell.size = 0;
 963                 adev->doorbell.num_doorbells = 0;
 964                 adev->doorbell.ptr = NULL;
 965                 return 0;
 966         }
 967
 968         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 969                 return -EINVAL;
 970
 971         amdgpu_asic_init_doorbell_index(adev);
 972
 973         /* doorbell bar mapping */
 974         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 975         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 976
 977         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 978                                              adev->doorbell_index.max_assignment+1);
 979         if (adev->doorbell.num_doorbells == 0)
 980                 return -EINVAL;
 981
 982         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 983          * paging queue doorbell use the second page. The
 984          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 985          * doorbells are in the first page. So with paging queue enabled,
 986          * the max num_doorbells should + 1 page (0x400 in dword)
 987          */
 988         if (adev->asic_type >= CHIP_VEGA10)
 989                 adev->doorbell.num_doorbells += 0x400;
 990
 991         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 992                                      adev->doorbell.num_doorbells *
 993                                      sizeof(u32));
 994         if (adev->doorbell.ptr == NULL)
 995                 return -ENOMEM;
 996
 997         return 0;
 998 }
 999
1000 /**
1001  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1002  *
1003  * @adev: amdgpu_device pointer
1004  *
1005  * Tear down doorbell driver information (CIK)
1006  */
1007 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1008 {
1009         iounmap(adev->doorbell.ptr);
1010         adev->doorbell.ptr = NULL;
1011 }
1012
1013
1014
1015 /*
1016  * amdgpu_device_wb_*()
1017  * Writeback is the method by which the GPU updates special pages in memory
1018  * with the status of certain GPU events (fences, ring pointers,etc.).
1019  */
1020
1021 /**
1022  * amdgpu_device_wb_fini - Disable Writeback and free memory
1023  *
1024  * @adev: amdgpu_device pointer
1025  *
1026  * Disables Writeback and frees the Writeback memory (all asics).
1027  * Used at driver shutdown.
1028  */
1029 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1030 {
1031         if (adev->wb.wb_obj) {
1032                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1033                                       &adev->wb.gpu_addr,
1034                                       (void **)&adev->wb.wb);
1035                 adev->wb.wb_obj = NULL;
1036         }
1037 }
1038
1039 /**
1040  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1041  *
1042  * @adev: amdgpu_device pointer
1043  *
1044  * Initializes writeback and allocates writeback memory (all asics).
1045  * Used at driver startup.
1046  * Returns 0 on success or an -error on failure.
1047  */
1048 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1049 {
1050         int r;
1051
1052         if (adev->wb.wb_obj == NULL) {
1053                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1054                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1055                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1056                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1057                                             (void **)&adev->wb.wb);
1058                 if (r) {
1059                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1060                         return r;
1061                 }
1062
1063                 adev->wb.num_wb = AMDGPU_MAX_WB;
1064                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1065
1066                 /* clear wb memory */
1067                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1068         }
1069
1070         return 0;
1071 }
1072
1073 /**
1074  * amdgpu_device_wb_get - Allocate a wb entry
1075  *
1076  * @adev: amdgpu_device pointer
1077  * @wb: wb index
1078  *
1079  * Allocate a wb slot for use by the driver (all asics).
1080  * Returns 0 on success or -EINVAL on failure.
1081  */
1082 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1083 {
1084         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1085
1086         if (offset < adev->wb.num_wb) {
1087                 __set_bit(offset, adev->wb.used);
1088                 *wb = offset << 3; /* convert to dw offset */
1089                 return 0;
1090         } else {
1091                 return -EINVAL;
1092         }
1093 }
1094
1095 /**
1096  * amdgpu_device_wb_free - Free a wb entry
1097  *
1098  * @adev: amdgpu_device pointer
1099  * @wb: wb index
1100  *
1101  * Free a wb slot allocated for use by the driver (all asics)
1102  */
1103 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1104 {
1105         wb >>= 3;
1106         if (wb < adev->wb.num_wb)
1107                 __clear_bit(wb, adev->wb.used);
1108 }
1109
1110 /**
1111  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1112  *
1113  * @adev: amdgpu_device pointer
1114  *
1115  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1116  * to fail, but if any of the BARs is not accessible after the size we abort
1117  * driver loading by returning -ENODEV.
1118  */
1119 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1120 {
1121         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1122         struct pci_bus *root;
1123         struct resource *res;
1124         unsigned i;
1125         u16 cmd;
1126         int r;
1127
1128         /* Bypass for VF */
1129         if (amdgpu_sriov_vf(adev))
1130                 return 0;
1131
1132         /* skip if the bios has already enabled large BAR */
1133         if (adev->gmc.real_vram_size &&
1134             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1135                 return 0;
1136
1137         /* Check if the root BUS has 64bit memory resources */
1138         root = adev->pdev->bus;
1139         while (root->parent)
1140                 root = root->parent;
1141
1142         pci_bus_for_each_resource(root, res, i) {
1143                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1144                     res->start > 0x100000000ull)
1145                         break;
1146         }
1147
1148         /* Trying to resize is pointless without a root hub window above 4GB */
1149         if (!res)
1150                 return 0;
1151
1152         /* Limit the BAR size to what is available */
1153         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1154                         rbar_size);
1155
1156         /* Disable memory decoding while we change the BAR addresses and size */
1157         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1158         pci_write_config_word(adev->pdev, PCI_COMMAND,
1159                               cmd & ~PCI_COMMAND_MEMORY);
1160
1161         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1162         amdgpu_device_doorbell_fini(adev);
1163         if (adev->asic_type >= CHIP_BONAIRE)
1164                 pci_release_resource(adev->pdev, 2);
1165
1166         pci_release_resource(adev->pdev, 0);
1167
1168         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1169         if (r == -ENOSPC)
1170                 DRM_INFO("Not enough PCI address space for a large BAR.");
1171         else if (r && r != -ENOTSUPP)
1172                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1173
1174         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1175
1176         /* When the doorbell or fb BAR isn't available we have no chance of
1177          * using the device.
1178          */
1179         r = amdgpu_device_doorbell_init(adev);
1180         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1181                 return -ENODEV;
1182
1183         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1184
1185         return 0;
1186 }
1187
1188 /*
1189  * GPU helpers function.
1190  */
1191 /**
1192  * amdgpu_device_need_post - check if the hw need post or not
1193  *
1194  * @adev: amdgpu_device pointer
1195  *
1196  * Check if the asic has been initialized (all asics) at driver startup
1197  * or post is needed if  hw reset is performed.
1198  * Returns true if need or false if not.
1199  */
1200 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1201 {
1202         uint32_t reg;
1203
1204         if (amdgpu_sriov_vf(adev))
1205                 return false;
1206
1207         if (amdgpu_passthrough(adev)) {
1208                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1209                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1210                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1211                  * vpost executed for smc version below 22.15
1212                  */
1213                 if (adev->asic_type == CHIP_FIJI) {
1214                         int err;
1215                         uint32_t fw_ver;
1216                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1217                         /* force vPost if error occured */
1218                         if (err)
1219                                 return true;
1220
1221                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1222                         if (fw_ver < 0x00160e00)
1223                                 return true;
1224                 }
1225         }
1226
1227         if (adev->has_hw_reset) {
1228                 adev->has_hw_reset = false;
1229                 return true;
1230         }
1231
1232         /* bios scratch used on CIK+ */
1233         if (adev->asic_type >= CHIP_BONAIRE)
1234                 return amdgpu_atombios_scratch_need_asic_init(adev);
1235
1236         /* check MEM_SIZE for older asics */
1237         reg = amdgpu_asic_get_config_memsize(adev);
1238
1239         if ((reg != 0) && (reg != 0xffffffff))
1240                 return false;
1241
1242         return true;
1243 }
1244
1245 /* if we get transitioned to only one device, take VGA back */
1246 /**
1247  * amdgpu_device_vga_set_decode - enable/disable vga decode
1248  *
1249  * @cookie: amdgpu_device pointer
1250  * @state: enable/disable vga decode
1251  *
1252  * Enable/disable vga decode (all asics).
1253  * Returns VGA resource flags.
1254  */
1255 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1256 {
1257         struct amdgpu_device *adev = cookie;
1258         amdgpu_asic_set_vga_state(adev, state);
1259         if (state)
1260                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1261                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1262         else
1263                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1264 }
1265
1266 /**
1267  * amdgpu_device_check_block_size - validate the vm block size
1268  *
1269  * @adev: amdgpu_device pointer
1270  *
1271  * Validates the vm block size specified via module parameter.
1272  * The vm block size defines number of bits in page table versus page directory,
1273  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1274  * page table and the remaining bits are in the page directory.
1275  */
1276 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1277 {
1278         /* defines number of bits in page table versus page directory,
1279          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1280          * page table and the remaining bits are in the page directory */
1281         if (amdgpu_vm_block_size == -1)
1282                 return;
1283
1284         if (amdgpu_vm_block_size < 9) {
1285                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1286                          amdgpu_vm_block_size);
1287                 amdgpu_vm_block_size = -1;
1288         }
1289 }
1290
1291 /**
1292  * amdgpu_device_check_vm_size - validate the vm size
1293  *
1294  * @adev: amdgpu_device pointer
1295  *
1296  * Validates the vm size in GB specified via module parameter.
1297  * The VM size is the size of the GPU virtual memory space in GB.
1298  */
1299 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1300 {
1301         /* no need to check the default value */
1302         if (amdgpu_vm_size == -1)
1303                 return;
1304
1305         if (amdgpu_vm_size < 1) {
1306                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1307                          amdgpu_vm_size);
1308                 amdgpu_vm_size = -1;
1309         }
1310 }
1311
1312 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1313 {
1314         struct sysinfo si;
1315         bool is_os_64 = (sizeof(void *) == 8);
1316         uint64_t total_memory;
1317         uint64_t dram_size_seven_GB = 0x1B8000000;
1318         uint64_t dram_size_three_GB = 0xB8000000;
1319
1320         if (amdgpu_smu_memory_pool_size == 0)
1321                 return;
1322
1323         if (!is_os_64) {
1324                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1325                 goto def_value;
1326         }
1327         si_meminfo(&si);
1328         total_memory = (uint64_t)si.totalram * si.mem_unit;
1329
1330         if ((amdgpu_smu_memory_pool_size == 1) ||
1331                 (amdgpu_smu_memory_pool_size == 2)) {
1332                 if (total_memory < dram_size_three_GB)
1333                         goto def_value1;
1334         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1335                 (amdgpu_smu_memory_pool_size == 8)) {
1336                 if (total_memory < dram_size_seven_GB)
1337                         goto def_value1;
1338         } else {
1339                 DRM_WARN("Smu memory pool size not supported\n");
1340                 goto def_value;
1341         }
1342         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1343
1344         return;
1345
1346 def_value1:
1347         DRM_WARN("No enough system memory\n");
1348 def_value:
1349         adev->pm.smu_prv_buffer_size = 0;
1350 }
1351
1352 /**
1353  * amdgpu_device_check_arguments - validate module params
1354  *
1355  * @adev: amdgpu_device pointer
1356  *
1357  * Validates certain module parameters and updates
1358  * the associated values used by the driver (all asics).
1359  */
1360 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1361 {
1362         if (amdgpu_sched_jobs < 4) {
1363                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1364                          amdgpu_sched_jobs);
1365                 amdgpu_sched_jobs = 4;
1366         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1367                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1368                          amdgpu_sched_jobs);
1369                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1370         }
1371
1372         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1373                 /* gart size must be greater or equal to 32M */
1374                 dev_warn(adev->dev, "gart size (%d) too small\n",
1375                          amdgpu_gart_size);
1376                 amdgpu_gart_size = -1;
1377         }
1378
1379         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1380                 /* gtt size must be greater or equal to 32M */
1381                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1382                                  amdgpu_gtt_size);
1383                 amdgpu_gtt_size = -1;
1384         }
1385
1386         /* valid range is between 4 and 9 inclusive */
1387         if (amdgpu_vm_fragment_size != -1 &&
1388             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1389                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1390                 amdgpu_vm_fragment_size = -1;
1391         }
1392
1393         if (amdgpu_sched_hw_submission < 2) {
1394                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1395                          amdgpu_sched_hw_submission);
1396                 amdgpu_sched_hw_submission = 2;
1397         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1398                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1399                          amdgpu_sched_hw_submission);
1400                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1401         }
1402
1403         amdgpu_device_check_smu_prv_buffer_size(adev);
1404
1405         amdgpu_device_check_vm_size(adev);
1406
1407         amdgpu_device_check_block_size(adev);
1408
1409         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1410
1411         amdgpu_gmc_tmz_set(adev);
1412
1413         amdgpu_gmc_noretry_set(adev);
1414
1415         return 0;
1416 }
1417
1418 /**
1419  * amdgpu_switcheroo_set_state - set switcheroo state
1420  *
1421  * @pdev: pci dev pointer
1422  * @state: vga_switcheroo state
1423  *
1424  * Callback for the switcheroo driver.  Suspends or resumes the
1425  * the asics before or after it is powered up using ACPI methods.
1426  */
1427 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1428                                         enum vga_switcheroo_state state)
1429 {
1430         struct drm_device *dev = pci_get_drvdata(pdev);
1431         int r;
1432
1433         if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF)
1434                 return;
1435
1436         if (state == VGA_SWITCHEROO_ON) {
1437                 pr_info("switched on\n");
1438                 /* don't suspend or resume card normally */
1439                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1440
1441                 pci_set_power_state(pdev, PCI_D0);
1442                 amdgpu_device_load_pci_state(pdev);
1443                 r = pci_enable_device(pdev);
1444                 if (r)
1445                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1446                 amdgpu_device_resume(dev, true);
1447
1448                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1449         } else {
1450                 pr_info("switched off\n");
1451                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1452                 amdgpu_device_suspend(dev, true);
1453                 amdgpu_device_cache_pci_state(pdev);
1454                 /* Shut down the device */
1455                 pci_disable_device(pdev);
1456                 pci_set_power_state(pdev, PCI_D3cold);
1457                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1458         }
1459 }
1460
1461 /**
1462  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1463  *
1464  * @pdev: pci dev pointer
1465  *
1466  * Callback for the switcheroo driver.  Check of the switcheroo
1467  * state can be changed.
1468  * Returns true if the state can be changed, false if not.
1469  */
1470 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1471 {
1472         struct drm_device *dev = pci_get_drvdata(pdev);
1473
1474         /*
1475         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1476         * locking inversion with the driver load path. And the access here is
1477         * completely racy anyway. So don't bother with locking for now.
1478         */
1479         return atomic_read(&dev->open_count) == 0;
1480 }
1481
1482 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1483         .set_gpu_state = amdgpu_switcheroo_set_state,
1484         .reprobe = NULL,
1485         .can_switch = amdgpu_switcheroo_can_switch,
1486 };
1487
1488 /**
1489  * amdgpu_device_ip_set_clockgating_state - set the CG state
1490  *
1491  * @dev: amdgpu_device pointer
1492  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1493  * @state: clockgating state (gate or ungate)
1494  *
1495  * Sets the requested clockgating state for all instances of
1496  * the hardware IP specified.
1497  * Returns the error code from the last instance.
1498  */
1499 int amdgpu_device_ip_set_clockgating_state(void *dev,
1500                                            enum amd_ip_block_type block_type,
1501                                            enum amd_clockgating_state state)
1502 {
1503         struct amdgpu_device *adev = dev;
1504         int i, r = 0;
1505
1506         for (i = 0; i < adev->num_ip_blocks; i++) {
1507                 if (!adev->ip_blocks[i].status.valid)
1508                         continue;
1509                 if (adev->ip_blocks[i].version->type != block_type)
1510                         continue;
1511                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1512                         continue;
1513                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1514                         (void *)adev, state);
1515                 if (r)
1516                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1517                                   adev->ip_blocks[i].version->funcs->name, r);
1518         }
1519         return r;
1520 }
1521
1522 /**
1523  * amdgpu_device_ip_set_powergating_state - set the PG state
1524  *
1525  * @dev: amdgpu_device pointer
1526  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1527  * @state: powergating state (gate or ungate)
1528  *
1529  * Sets the requested powergating state for all instances of
1530  * the hardware IP specified.
1531  * Returns the error code from the last instance.
1532  */
1533 int amdgpu_device_ip_set_powergating_state(void *dev,
1534                                            enum amd_ip_block_type block_type,
1535                                            enum amd_powergating_state state)
1536 {
1537         struct amdgpu_device *adev = dev;
1538         int i, r = 0;
1539
1540         for (i = 0; i < adev->num_ip_blocks; i++) {
1541                 if (!adev->ip_blocks[i].status.valid)
1542                         continue;
1543                 if (adev->ip_blocks[i].version->type != block_type)
1544                         continue;
1545                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1546                         continue;
1547                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1548                         (void *)adev, state);
1549                 if (r)
1550                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1551                                   adev->ip_blocks[i].version->funcs->name, r);
1552         }
1553         return r;
1554 }
1555
1556 /**
1557  * amdgpu_device_ip_get_clockgating_state - get the CG state
1558  *
1559  * @adev: amdgpu_device pointer
1560  * @flags: clockgating feature flags
1561  *
1562  * Walks the list of IPs on the device and updates the clockgating
1563  * flags for each IP.
1564  * Updates @flags with the feature flags for each hardware IP where
1565  * clockgating is enabled.
1566  */
1567 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1568                                             u32 *flags)
1569 {
1570         int i;
1571
1572         for (i = 0; i < adev->num_ip_blocks; i++) {
1573                 if (!adev->ip_blocks[i].status.valid)
1574                         continue;
1575                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1576                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1577         }
1578 }
1579
1580 /**
1581  * amdgpu_device_ip_wait_for_idle - wait for idle
1582  *
1583  * @adev: amdgpu_device pointer
1584  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1585  *
1586  * Waits for the request hardware IP to be idle.
1587  * Returns 0 for success or a negative error code on failure.
1588  */
1589 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1590                                    enum amd_ip_block_type block_type)
1591 {
1592         int i, r;
1593
1594         for (i = 0; i < adev->num_ip_blocks; i++) {
1595                 if (!adev->ip_blocks[i].status.valid)
1596                         continue;
1597                 if (adev->ip_blocks[i].version->type == block_type) {
1598                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1599                         if (r)
1600                                 return r;
1601                         break;
1602                 }
1603         }
1604         return 0;
1605
1606 }
1607
1608 /**
1609  * amdgpu_device_ip_is_idle - is the hardware IP idle
1610  *
1611  * @adev: amdgpu_device pointer
1612  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1613  *
1614  * Check if the hardware IP is idle or not.
1615  * Returns true if it the IP is idle, false if not.
1616  */
1617 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1618                               enum amd_ip_block_type block_type)
1619 {
1620         int i;
1621
1622         for (i = 0; i < adev->num_ip_blocks; i++) {
1623                 if (!adev->ip_blocks[i].status.valid)
1624                         continue;
1625                 if (adev->ip_blocks[i].version->type == block_type)
1626                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1627         }
1628         return true;
1629
1630 }
1631
1632 /**
1633  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1634  *
1635  * @adev: amdgpu_device pointer
1636  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1637  *
1638  * Returns a pointer to the hardware IP block structure
1639  * if it exists for the asic, otherwise NULL.
1640  */
1641 struct amdgpu_ip_block *
1642 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1643                               enum amd_ip_block_type type)
1644 {
1645         int i;
1646
1647         for (i = 0; i < adev->num_ip_blocks; i++)
1648                 if (adev->ip_blocks[i].version->type == type)
1649                         return &adev->ip_blocks[i];
1650
1651         return NULL;
1652 }
1653
1654 /**
1655  * amdgpu_device_ip_block_version_cmp
1656  *
1657  * @adev: amdgpu_device pointer
1658  * @type: enum amd_ip_block_type
1659  * @major: major version
1660  * @minor: minor version
1661  *
1662  * return 0 if equal or greater
1663  * return 1 if smaller or the ip_block doesn't exist
1664  */
1665 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1666                                        enum amd_ip_block_type type,
1667                                        u32 major, u32 minor)
1668 {
1669         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1670
1671         if (ip_block && ((ip_block->version->major > major) ||
1672                         ((ip_block->version->major == major) &&
1673                         (ip_block->version->minor >= minor))))
1674                 return 0;
1675
1676         return 1;
1677 }
1678
1679 /**
1680  * amdgpu_device_ip_block_add
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @ip_block_version: pointer to the IP to add
1684  *
1685  * Adds the IP block driver information to the collection of IPs
1686  * on the asic.
1687  */
1688 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1689                                const struct amdgpu_ip_block_version *ip_block_version)
1690 {
1691         if (!ip_block_version)
1692                 return -EINVAL;
1693
1694         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1695                   ip_block_version->funcs->name);
1696
1697         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1698
1699         return 0;
1700 }
1701
1702 /**
1703  * amdgpu_device_enable_virtual_display - enable virtual display feature
1704  *
1705  * @adev: amdgpu_device pointer
1706  *
1707  * Enabled the virtual display feature if the user has enabled it via
1708  * the module parameter virtual_display.  This feature provides a virtual
1709  * display hardware on headless boards or in virtualized environments.
1710  * This function parses and validates the configuration string specified by
1711  * the user and configues the virtual display configuration (number of
1712  * virtual connectors, crtcs, etc.) specified.
1713  */
1714 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1715 {
1716         adev->enable_virtual_display = false;
1717
1718         if (amdgpu_virtual_display) {
1719                 const char *pci_address_name = pci_name(adev->pdev);
1720                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1721
1722                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1723                 pciaddstr_tmp = pciaddstr;
1724                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1725                         pciaddname = strsep(&pciaddname_tmp, ",");
1726                         if (!strcmp("all", pciaddname)
1727                             || !strcmp(pci_address_name, pciaddname)) {
1728                                 long num_crtc;
1729                                 int res = -1;
1730
1731                                 adev->enable_virtual_display = true;
1732
1733                                 if (pciaddname_tmp)
1734                                         res = kstrtol(pciaddname_tmp, 10,
1735                                                       &num_crtc);
1736
1737                                 if (!res) {
1738                                         if (num_crtc < 1)
1739                                                 num_crtc = 1;
1740                                         if (num_crtc > 6)
1741                                                 num_crtc = 6;
1742                                         adev->mode_info.num_crtc = num_crtc;
1743                                 } else {
1744                                         adev->mode_info.num_crtc = 1;
1745                                 }
1746                                 break;
1747                         }
1748                 }
1749
1750                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1751                          amdgpu_virtual_display, pci_address_name,
1752                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1753
1754                 kfree(pciaddstr);
1755         }
1756 }
1757
1758 /**
1759  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1760  *
1761  * @adev: amdgpu_device pointer
1762  *
1763  * Parses the asic configuration parameters specified in the gpu info
1764  * firmware and makes them availale to the driver for use in configuring
1765  * the asic.
1766  * Returns 0 on success, -EINVAL on failure.
1767  */
1768 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1769 {
1770         const char *chip_name;
1771         char fw_name[40];
1772         int err;
1773         const struct gpu_info_firmware_header_v1_0 *hdr;
1774
1775         adev->firmware.gpu_info_fw = NULL;
1776
1777         if (adev->mman.discovery_bin) {
1778                 amdgpu_discovery_get_gfx_info(adev);
1779
1780                 /*
1781                  * FIXME: The bounding box is still needed by Navi12, so
1782                  * temporarily read it from gpu_info firmware. Should be droped
1783                  * when DAL no longer needs it.
1784                  */
1785                 if (adev->asic_type != CHIP_NAVI12)
1786                         return 0;
1787         }
1788
1789         switch (adev->asic_type) {
1790 #ifdef CONFIG_DRM_AMDGPU_SI
1791         case CHIP_VERDE:
1792         case CHIP_TAHITI:
1793         case CHIP_PITCAIRN:
1794         case CHIP_OLAND:
1795         case CHIP_HAINAN:
1796 #endif
1797 #ifdef CONFIG_DRM_AMDGPU_CIK
1798         case CHIP_BONAIRE:
1799         case CHIP_HAWAII:
1800         case CHIP_KAVERI:
1801         case CHIP_KABINI:
1802         case CHIP_MULLINS:
1803 #endif
1804         case CHIP_TOPAZ:
1805         case CHIP_TONGA:
1806         case CHIP_FIJI:
1807         case CHIP_POLARIS10:
1808         case CHIP_POLARIS11:
1809         case CHIP_POLARIS12:
1810         case CHIP_VEGAM:
1811         case CHIP_CARRIZO:
1812         case CHIP_STONEY:
1813         case CHIP_VEGA20:
1814         case CHIP_ALDEBARAN:
1815         case CHIP_SIENNA_CICHLID:
1816         case CHIP_NAVY_FLOUNDER:
1817         case CHIP_DIMGREY_CAVEFISH:
1818         default:
1819                 return 0;
1820         case CHIP_VEGA10:
1821                 chip_name = "vega10";
1822                 break;
1823         case CHIP_VEGA12:
1824                 chip_name = "vega12";
1825                 break;
1826         case CHIP_RAVEN:
1827                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1828                         chip_name = "raven2";
1829                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1830                         chip_name = "picasso";
1831                 else
1832                         chip_name = "raven";
1833                 break;
1834         case CHIP_ARCTURUS:
1835                 chip_name = "arcturus";
1836                 break;
1837         case CHIP_RENOIR:
1838                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1839                         chip_name = "renoir";
1840                 else
1841                         chip_name = "green_sardine";
1842                 break;
1843         case CHIP_NAVI10:
1844                 chip_name = "navi10";
1845                 break;
1846         case CHIP_NAVI14:
1847                 chip_name = "navi14";
1848                 break;
1849         case CHIP_NAVI12:
1850                 chip_name = "navi12";
1851                 break;
1852         case CHIP_VANGOGH:
1853                 chip_name = "vangogh";
1854                 break;
1855         }
1856
1857         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1858         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1859         if (err) {
1860                 dev_err(adev->dev,
1861                         "Failed to load gpu_info firmware \"%s\"\n",
1862                         fw_name);
1863                 goto out;
1864         }
1865         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1866         if (err) {
1867                 dev_err(adev->dev,
1868                         "Failed to validate gpu_info firmware \"%s\"\n",
1869                         fw_name);
1870                 goto out;
1871         }
1872
1873         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1874         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1875
1876         switch (hdr->version_major) {
1877         case 1:
1878         {
1879                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1880                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1881                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1882
1883                 /*
1884                  * Should be droped when DAL no longer needs it.
1885                  */
1886                 if (adev->asic_type == CHIP_NAVI12)
1887                         goto parse_soc_bounding_box;
1888
1889                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1890                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1891                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1892                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1893                 adev->gfx.config.max_texture_channel_caches =
1894                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1895                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1896                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1897                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1898                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1899                 adev->gfx.config.double_offchip_lds_buf =
1900                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1901                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1902                 adev->gfx.cu_info.max_waves_per_simd =
1903                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1904                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1905                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1906                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1907                 if (hdr->version_minor >= 1) {
1908                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1909                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1910                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1911                         adev->gfx.config.num_sc_per_sh =
1912                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1913                         adev->gfx.config.num_packer_per_sc =
1914                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1915                 }
1916
1917 parse_soc_bounding_box:
1918                 /*
1919                  * soc bounding box info is not integrated in disocovery table,
1920                  * we always need to parse it from gpu info firmware if needed.
1921                  */
1922                 if (hdr->version_minor == 2) {
1923                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1924                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1925                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1926                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1927                 }
1928                 break;
1929         }
1930         default:
1931                 dev_err(adev->dev,
1932                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1933                 err = -EINVAL;
1934                 goto out;
1935         }
1936 out:
1937         return err;
1938 }
1939
1940 /**
1941  * amdgpu_device_ip_early_init - run early init for hardware IPs
1942  *
1943  * @adev: amdgpu_device pointer
1944  *
1945  * Early initialization pass for hardware IPs.  The hardware IPs that make
1946  * up each asic are discovered each IP's early_init callback is run.  This
1947  * is the first stage in initializing the asic.
1948  * Returns 0 on success, negative error code on failure.
1949  */
1950 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1951 {
1952         int i, r;
1953
1954         amdgpu_device_enable_virtual_display(adev);
1955
1956         if (amdgpu_sriov_vf(adev)) {
1957                 r = amdgpu_virt_request_full_gpu(adev, true);
1958                 if (r)
1959                         return r;
1960         }
1961
1962         switch (adev->asic_type) {
1963 #ifdef CONFIG_DRM_AMDGPU_SI
1964         case CHIP_VERDE:
1965         case CHIP_TAHITI:
1966         case CHIP_PITCAIRN:
1967         case CHIP_OLAND:
1968         case CHIP_HAINAN:
1969                 adev->family = AMDGPU_FAMILY_SI;
1970                 r = si_set_ip_blocks(adev);
1971                 if (r)
1972                         return r;
1973                 break;
1974 #endif
1975 #ifdef CONFIG_DRM_AMDGPU_CIK
1976         case CHIP_BONAIRE:
1977         case CHIP_HAWAII:
1978         case CHIP_KAVERI:
1979         case CHIP_KABINI:
1980         case CHIP_MULLINS:
1981                 if (adev->flags & AMD_IS_APU)
1982                         adev->family = AMDGPU_FAMILY_KV;
1983                 else
1984                         adev->family = AMDGPU_FAMILY_CI;
1985
1986                 r = cik_set_ip_blocks(adev);
1987                 if (r)
1988                         return r;
1989                 break;
1990 #endif
1991         case CHIP_TOPAZ:
1992         case CHIP_TONGA:
1993         case CHIP_FIJI:
1994         case CHIP_POLARIS10:
1995         case CHIP_POLARIS11:
1996         case CHIP_POLARIS12:
1997         case CHIP_VEGAM:
1998         case CHIP_CARRIZO:
1999         case CHIP_STONEY:
2000                 if (adev->flags & AMD_IS_APU)
2001                         adev->family = AMDGPU_FAMILY_CZ;
2002                 else
2003                         adev->family = AMDGPU_FAMILY_VI;
2004
2005                 r = vi_set_ip_blocks(adev);
2006                 if (r)
2007                         return r;
2008                 break;
2009         case CHIP_VEGA10:
2010         case CHIP_VEGA12:
2011         case CHIP_VEGA20:
2012         case CHIP_RAVEN:
2013         case CHIP_ARCTURUS:
2014         case CHIP_RENOIR:
2015         case CHIP_ALDEBARAN:
2016                 if (adev->flags & AMD_IS_APU)
2017                         adev->family = AMDGPU_FAMILY_RV;
2018                 else
2019                         adev->family = AMDGPU_FAMILY_AI;
2020
2021                 r = soc15_set_ip_blocks(adev);
2022                 if (r)
2023                         return r;
2024                 break;
2025         case  CHIP_NAVI10:
2026         case  CHIP_NAVI14:
2027         case  CHIP_NAVI12:
2028         case  CHIP_SIENNA_CICHLID:
2029         case  CHIP_NAVY_FLOUNDER:
2030         case  CHIP_DIMGREY_CAVEFISH:
2031         case CHIP_VANGOGH:
2032                 if (adev->asic_type == CHIP_VANGOGH)
2033                         adev->family = AMDGPU_FAMILY_VGH;
2034                 else
2035                         adev->family = AMDGPU_FAMILY_NV;
2036
2037                 r = nv_set_ip_blocks(adev);
2038                 if (r)
2039                         return r;
2040                 break;
2041         default:
2042                 /* FIXME: not supported yet */
2043                 return -EINVAL;
2044         }
2045
2046         amdgpu_amdkfd_device_probe(adev);
2047
2048         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2049         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2050                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2051         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2052                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2053
2054         for (i = 0; i < adev->num_ip_blocks; i++) {
2055                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2056                         DRM_ERROR("disabled ip block: %d <%s>\n",
2057                                   i, adev->ip_blocks[i].version->funcs->name);
2058                         adev->ip_blocks[i].status.valid = false;
2059                 } else {
2060                         if (adev->ip_blocks[i].version->funcs->early_init) {
2061                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2062                                 if (r == -ENOENT) {
2063                                         adev->ip_blocks[i].status.valid = false;
2064                                 } else if (r) {
2065                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2066                                                   adev->ip_blocks[i].version->funcs->name, r);
2067                                         return r;
2068                                 } else {
2069                                         adev->ip_blocks[i].status.valid = true;
2070                                 }
2071                         } else {
2072                                 adev->ip_blocks[i].status.valid = true;
2073                         }
2074                 }
2075                 /* get the vbios after the asic_funcs are set up */
2076                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2077                         r = amdgpu_device_parse_gpu_info_fw(adev);
2078                         if (r)
2079                                 return r;
2080
2081                         /* Read BIOS */
2082                         if (!amdgpu_get_bios(adev))
2083                                 return -EINVAL;
2084
2085                         r = amdgpu_atombios_init(adev);
2086                         if (r) {
2087                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2088                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2089                                 return r;
2090                         }
2091                 }
2092         }
2093
2094         adev->cg_flags &= amdgpu_cg_mask;
2095         adev->pg_flags &= amdgpu_pg_mask;
2096
2097         return 0;
2098 }
2099
2100 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2101 {
2102         int i, r;
2103
2104         for (i = 0; i < adev->num_ip_blocks; i++) {
2105                 if (!adev->ip_blocks[i].status.sw)
2106                         continue;
2107                 if (adev->ip_blocks[i].status.hw)
2108                         continue;
2109                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2110                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2111                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2112                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2113                         if (r) {
2114                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2115                                           adev->ip_blocks[i].version->funcs->name, r);
2116                                 return r;
2117                         }
2118                         adev->ip_blocks[i].status.hw = true;
2119                 }
2120         }
2121
2122         return 0;
2123 }
2124
2125 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2126 {
2127         int i, r;
2128
2129         for (i = 0; i < adev->num_ip_blocks; i++) {
2130                 if (!adev->ip_blocks[i].status.sw)
2131                         continue;
2132                 if (adev->ip_blocks[i].status.hw)
2133                         continue;
2134                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2135                 if (r) {
2136                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2137                                   adev->ip_blocks[i].version->funcs->name, r);
2138                         return r;
2139                 }
2140                 adev->ip_blocks[i].status.hw = true;
2141         }
2142
2143         return 0;
2144 }
2145
2146 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2147 {
2148         int r = 0;
2149         int i;
2150         uint32_t smu_version;
2151
2152         if (adev->asic_type >= CHIP_VEGA10) {
2153                 for (i = 0; i < adev->num_ip_blocks; i++) {
2154                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2155                                 continue;
2156
2157                         /* no need to do the fw loading again if already done*/
2158                         if (adev->ip_blocks[i].status.hw == true)
2159                                 break;
2160
2161                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2162                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2163                                 if (r) {
2164                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2165                                                           adev->ip_blocks[i].version->funcs->name, r);
2166                                         return r;
2167                                 }
2168                         } else {
2169                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2170                                 if (r) {
2171                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2172                                                           adev->ip_blocks[i].version->funcs->name, r);
2173                                         return r;
2174                                 }
2175                         }
2176
2177                         adev->ip_blocks[i].status.hw = true;
2178                         break;
2179                 }
2180         }
2181
2182         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2183                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2184
2185         return r;
2186 }
2187
2188 /**
2189  * amdgpu_device_ip_init - run init for hardware IPs
2190  *
2191  * @adev: amdgpu_device pointer
2192  *
2193  * Main initialization pass for hardware IPs.  The list of all the hardware
2194  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2195  * are run.  sw_init initializes the software state associated with each IP
2196  * and hw_init initializes the hardware associated with each IP.
2197  * Returns 0 on success, negative error code on failure.
2198  */
2199 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2200 {
2201         int i, r;
2202
2203         r = amdgpu_ras_init(adev);
2204         if (r)
2205                 return r;
2206
2207         for (i = 0; i < adev->num_ip_blocks; i++) {
2208                 if (!adev->ip_blocks[i].status.valid)
2209                         continue;
2210                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2211                 if (r) {
2212                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2213                                   adev->ip_blocks[i].version->funcs->name, r);
2214                         goto init_failed;
2215                 }
2216                 adev->ip_blocks[i].status.sw = true;
2217
2218                 /* need to do gmc hw init early so we can allocate gpu mem */
2219                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2220                         r = amdgpu_device_vram_scratch_init(adev);
2221                         if (r) {
2222                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2223                                 goto init_failed;
2224                         }
2225                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2226                         if (r) {
2227                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2228                                 goto init_failed;
2229                         }
2230                         r = amdgpu_device_wb_init(adev);
2231                         if (r) {
2232                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2233                                 goto init_failed;
2234                         }
2235                         adev->ip_blocks[i].status.hw = true;
2236
2237                         /* right after GMC hw init, we create CSA */
2238                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2239                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2240                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2241                                                                 AMDGPU_CSA_SIZE);
2242                                 if (r) {
2243                                         DRM_ERROR("allocate CSA failed %d\n", r);
2244                                         goto init_failed;
2245                                 }
2246                         }
2247                 }
2248         }
2249
2250         if (amdgpu_sriov_vf(adev))
2251                 amdgpu_virt_init_data_exchange(adev);
2252
2253         r = amdgpu_ib_pool_init(adev);
2254         if (r) {
2255                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2256                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2257                 goto init_failed;
2258         }
2259
2260         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2261         if (r)
2262                 goto init_failed;
2263
2264         r = amdgpu_device_ip_hw_init_phase1(adev);
2265         if (r)
2266                 goto init_failed;
2267
2268         r = amdgpu_device_fw_loading(adev);
2269         if (r)
2270                 goto init_failed;
2271
2272         r = amdgpu_device_ip_hw_init_phase2(adev);
2273         if (r)
2274                 goto init_failed;
2275
2276         /*
2277          * retired pages will be loaded from eeprom and reserved here,
2278          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2279          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2280          * for I2C communication which only true at this point.
2281          *
2282          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2283          * failure from bad gpu situation and stop amdgpu init process
2284          * accordingly. For other failed cases, it will still release all
2285          * the resource and print error message, rather than returning one
2286          * negative value to upper level.
2287          *
2288          * Note: theoretically, this should be called before all vram allocations
2289          * to protect retired page from abusing
2290          */
2291         r = amdgpu_ras_recovery_init(adev);
2292         if (r)
2293                 goto init_failed;
2294
2295         if (adev->gmc.xgmi.num_physical_nodes > 1)
2296                 amdgpu_xgmi_add_device(adev);
2297         amdgpu_amdkfd_device_init(adev);
2298
2299         amdgpu_fru_get_product_info(adev);
2300
2301 init_failed:
2302         if (amdgpu_sriov_vf(adev))
2303                 amdgpu_virt_release_full_gpu(adev, true);
2304
2305         return r;
2306 }
2307
2308 /**
2309  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2310  *
2311  * @adev: amdgpu_device pointer
2312  *
2313  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2314  * this function before a GPU reset.  If the value is retained after a
2315  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2316  */
2317 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2318 {
2319         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2320 }
2321
2322 /**
2323  * amdgpu_device_check_vram_lost - check if vram is valid
2324  *
2325  * @adev: amdgpu_device pointer
2326  *
2327  * Checks the reset magic value written to the gart pointer in VRAM.
2328  * The driver calls this after a GPU reset to see if the contents of
2329  * VRAM is lost or now.
2330  * returns true if vram is lost, false if not.
2331  */
2332 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2333 {
2334         if (memcmp(adev->gart.ptr, adev->reset_magic,
2335                         AMDGPU_RESET_MAGIC_NUM))
2336                 return true;
2337
2338         if (!amdgpu_in_reset(adev))
2339                 return false;
2340
2341         /*
2342          * For all ASICs with baco/mode1 reset, the VRAM is
2343          * always assumed to be lost.
2344          */
2345         switch (amdgpu_asic_reset_method(adev)) {
2346         case AMD_RESET_METHOD_BACO:
2347         case AMD_RESET_METHOD_MODE1:
2348                 return true;
2349         default:
2350                 return false;
2351         }
2352 }
2353
2354 /**
2355  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2356  *
2357  * @adev: amdgpu_device pointer
2358  * @state: clockgating state (gate or ungate)
2359  *
2360  * The list of all the hardware IPs that make up the asic is walked and the
2361  * set_clockgating_state callbacks are run.
2362  * Late initialization pass enabling clockgating for hardware IPs.
2363  * Fini or suspend, pass disabling clockgating for hardware IPs.
2364  * Returns 0 on success, negative error code on failure.
2365  */
2366
2367 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2368                                                 enum amd_clockgating_state state)
2369 {
2370         int i, j, r;
2371
2372         if (amdgpu_emu_mode == 1)
2373                 return 0;
2374
2375         for (j = 0; j < adev->num_ip_blocks; j++) {
2376                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2377                 if (!adev->ip_blocks[i].status.late_initialized)
2378                         continue;
2379                 /* skip CG for VCE/UVD, it's handled specially */
2380                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2381                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2382                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2383                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2384                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2385                         /* enable clockgating to save power */
2386                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2387                                                                                      state);
2388                         if (r) {
2389                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2390                                           adev->ip_blocks[i].version->funcs->name, r);
2391                                 return r;
2392                         }
2393                 }
2394         }
2395
2396         return 0;
2397 }
2398
2399 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2400 {
2401         int i, j, r;
2402
2403         if (amdgpu_emu_mode == 1)
2404                 return 0;
2405
2406         for (j = 0; j < adev->num_ip_blocks; j++) {
2407                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2408                 if (!adev->ip_blocks[i].status.late_initialized)
2409                         continue;
2410                 /* skip CG for VCE/UVD, it's handled specially */
2411                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2412                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2413                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2414                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2415                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2416                         /* enable powergating to save power */
2417                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2418                                                                                         state);
2419                         if (r) {
2420                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2421                                           adev->ip_blocks[i].version->funcs->name, r);
2422                                 return r;
2423                         }
2424                 }
2425         }
2426         return 0;
2427 }
2428
2429 static int amdgpu_device_enable_mgpu_fan_boost(void)
2430 {
2431         struct amdgpu_gpu_instance *gpu_ins;
2432         struct amdgpu_device *adev;
2433         int i, ret = 0;
2434
2435         mutex_lock(&mgpu_info.mutex);
2436
2437         /*
2438          * MGPU fan boost feature should be enabled
2439          * only when there are two or more dGPUs in
2440          * the system
2441          */
2442         if (mgpu_info.num_dgpu < 2)
2443                 goto out;
2444
2445         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2446                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2447                 adev = gpu_ins->adev;
2448                 if (!(adev->flags & AMD_IS_APU) &&
2449                     !gpu_ins->mgpu_fan_enabled) {
2450                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2451                         if (ret)
2452                                 break;
2453
2454                         gpu_ins->mgpu_fan_enabled = 1;
2455                 }
2456         }
2457
2458 out:
2459         mutex_unlock(&mgpu_info.mutex);
2460
2461         return ret;
2462 }
2463
2464 /**
2465  * amdgpu_device_ip_late_init - run late init for hardware IPs
2466  *
2467  * @adev: amdgpu_device pointer
2468  *
2469  * Late initialization pass for hardware IPs.  The list of all the hardware
2470  * IPs that make up the asic is walked and the late_init callbacks are run.
2471  * late_init covers any special initialization that an IP requires
2472  * after all of the have been initialized or something that needs to happen
2473  * late in the init process.
2474  * Returns 0 on success, negative error code on failure.
2475  */
2476 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2477 {
2478         struct amdgpu_gpu_instance *gpu_instance;
2479         int i = 0, r;
2480
2481         for (i = 0; i < adev->num_ip_blocks; i++) {
2482                 if (!adev->ip_blocks[i].status.hw)
2483                         continue;
2484                 if (adev->ip_blocks[i].version->funcs->late_init) {
2485                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2486                         if (r) {
2487                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2488                                           adev->ip_blocks[i].version->funcs->name, r);
2489                                 return r;
2490                         }
2491                 }
2492                 adev->ip_blocks[i].status.late_initialized = true;
2493         }
2494
2495         amdgpu_ras_set_error_query_ready(adev, true);
2496
2497         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2498         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2499
2500         amdgpu_device_fill_reset_magic(adev);
2501
2502         r = amdgpu_device_enable_mgpu_fan_boost();
2503         if (r)
2504                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2505
2506
2507         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2508                 mutex_lock(&mgpu_info.mutex);
2509
2510                 /*
2511                  * Reset device p-state to low as this was booted with high.
2512                  *
2513                  * This should be performed only after all devices from the same
2514                  * hive get initialized.
2515                  *
2516                  * However, it's unknown how many device in the hive in advance.
2517                  * As this is counted one by one during devices initializations.
2518                  *
2519                  * So, we wait for all XGMI interlinked devices initialized.
2520                  * This may bring some delays as those devices may come from
2521                  * different hives. But that should be OK.
2522                  */
2523                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2524                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2525                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2526                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2527                                         continue;
2528
2529                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2530                                                 AMDGPU_XGMI_PSTATE_MIN);
2531                                 if (r) {
2532                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2533                                         break;
2534                                 }
2535                         }
2536                 }
2537
2538                 mutex_unlock(&mgpu_info.mutex);
2539         }
2540
2541         return 0;
2542 }
2543
2544 /**
2545  * amdgpu_device_ip_fini - run fini for hardware IPs
2546  *
2547  * @adev: amdgpu_device pointer
2548  *
2549  * Main teardown pass for hardware IPs.  The list of all the hardware
2550  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2551  * are run.  hw_fini tears down the hardware associated with each IP
2552  * and sw_fini tears down any software state associated with each IP.
2553  * Returns 0 on success, negative error code on failure.
2554  */
2555 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2556 {
2557         int i, r;
2558
2559         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2560                 amdgpu_virt_release_ras_err_handler_data(adev);
2561
2562         amdgpu_ras_pre_fini(adev);
2563
2564         if (adev->gmc.xgmi.num_physical_nodes > 1)
2565                 amdgpu_xgmi_remove_device(adev);
2566
2567         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2568         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2569
2570         amdgpu_amdkfd_device_fini(adev);
2571
2572         /* need to disable SMC first */
2573         for (i = 0; i < adev->num_ip_blocks; i++) {
2574                 if (!adev->ip_blocks[i].status.hw)
2575                         continue;
2576                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2577                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2578                         /* XXX handle errors */
2579                         if (r) {
2580                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2581                                           adev->ip_blocks[i].version->funcs->name, r);
2582                         }
2583                         adev->ip_blocks[i].status.hw = false;
2584                         break;
2585                 }
2586         }
2587
2588         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2589                 if (!adev->ip_blocks[i].status.hw)
2590                         continue;
2591
2592                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2593                 /* XXX handle errors */
2594                 if (r) {
2595                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2596                                   adev->ip_blocks[i].version->funcs->name, r);
2597                 }
2598
2599                 adev->ip_blocks[i].status.hw = false;
2600         }
2601
2602
2603         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2604                 if (!adev->ip_blocks[i].status.sw)
2605                         continue;
2606
2607                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2608                         amdgpu_ucode_free_bo(adev);
2609                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2610                         amdgpu_device_wb_fini(adev);
2611                         amdgpu_device_vram_scratch_fini(adev);
2612                         amdgpu_ib_pool_fini(adev);
2613                 }
2614
2615                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2616                 /* XXX handle errors */
2617                 if (r) {
2618                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2619                                   adev->ip_blocks[i].version->funcs->name, r);
2620                 }
2621                 adev->ip_blocks[i].status.sw = false;
2622                 adev->ip_blocks[i].status.valid = false;
2623         }
2624
2625         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2626                 if (!adev->ip_blocks[i].status.late_initialized)
2627                         continue;
2628                 if (adev->ip_blocks[i].version->funcs->late_fini)
2629                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2630                 adev->ip_blocks[i].status.late_initialized = false;
2631         }
2632
2633         amdgpu_ras_fini(adev);
2634
2635         if (amdgpu_sriov_vf(adev))
2636                 if (amdgpu_virt_release_full_gpu(adev, false))
2637                         DRM_ERROR("failed to release exclusive mode on fini\n");
2638
2639         return 0;
2640 }
2641
2642 /**
2643  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2644  *
2645  * @work: work_struct.
2646  */
2647 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2648 {
2649         struct amdgpu_device *adev =
2650                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2651         int r;
2652
2653         r = amdgpu_ib_ring_tests(adev);
2654         if (r)
2655                 DRM_ERROR("ib ring test failed (%d).\n", r);
2656 }
2657
2658 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2659 {
2660         struct amdgpu_device *adev =
2661                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2662
2663         mutex_lock(&adev->gfx.gfx_off_mutex);
2664         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2665                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2666                         adev->gfx.gfx_off_state = true;
2667         }
2668         mutex_unlock(&adev->gfx.gfx_off_mutex);
2669 }
2670
2671 /**
2672  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2673  *
2674  * @adev: amdgpu_device pointer
2675  *
2676  * Main suspend function for hardware IPs.  The list of all the hardware
2677  * IPs that make up the asic is walked, clockgating is disabled and the
2678  * suspend callbacks are run.  suspend puts the hardware and software state
2679  * in each IP into a state suitable for suspend.
2680  * Returns 0 on success, negative error code on failure.
2681  */
2682 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2683 {
2684         int i, r;
2685
2686         if (adev->in_poweroff_reboot_com ||
2687             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
2688                 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2689                 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2690         }
2691
2692         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2693                 if (!adev->ip_blocks[i].status.valid)
2694                         continue;
2695
2696                 /* displays are handled separately */
2697                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2698                         continue;
2699
2700                 /* XXX handle errors */
2701                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2702                 /* XXX handle errors */
2703                 if (r) {
2704                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2705                                   adev->ip_blocks[i].version->funcs->name, r);
2706                         return r;
2707                 }
2708
2709                 adev->ip_blocks[i].status.hw = false;
2710         }
2711
2712         return 0;
2713 }
2714
2715 /**
2716  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2717  *
2718  * @adev: amdgpu_device pointer
2719  *
2720  * Main suspend function for hardware IPs.  The list of all the hardware
2721  * IPs that make up the asic is walked, clockgating is disabled and the
2722  * suspend callbacks are run.  suspend puts the hardware and software state
2723  * in each IP into a state suitable for suspend.
2724  * Returns 0 on success, negative error code on failure.
2725  */
2726 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2727 {
2728         int i, r;
2729
2730         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2731                 if (!adev->ip_blocks[i].status.valid)
2732                         continue;
2733                 /* displays are handled in phase1 */
2734                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2735                         continue;
2736                 /* PSP lost connection when err_event_athub occurs */
2737                 if (amdgpu_ras_intr_triggered() &&
2738                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2739                         adev->ip_blocks[i].status.hw = false;
2740                         continue;
2741                 }
2742                 /* XXX handle errors */
2743                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2744                 /* XXX handle errors */
2745                 if (r) {
2746                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2747                                   adev->ip_blocks[i].version->funcs->name, r);
2748                 }
2749                 adev->ip_blocks[i].status.hw = false;
2750                 /* handle putting the SMC in the appropriate state */
2751                 if(!amdgpu_sriov_vf(adev)){
2752                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2753                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2754                                 if (r) {
2755                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2756                                                         adev->mp1_state, r);
2757                                         return r;
2758                                 }
2759                         }
2760                 }
2761                 adev->ip_blocks[i].status.hw = false;
2762         }
2763
2764         return 0;
2765 }
2766
2767 /**
2768  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2769  *
2770  * @adev: amdgpu_device pointer
2771  *
2772  * Main suspend function for hardware IPs.  The list of all the hardware
2773  * IPs that make up the asic is walked, clockgating is disabled and the
2774  * suspend callbacks are run.  suspend puts the hardware and software state
2775  * in each IP into a state suitable for suspend.
2776  * Returns 0 on success, negative error code on failure.
2777  */
2778 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2779 {
2780         int r;
2781
2782         if (amdgpu_sriov_vf(adev)) {
2783                 amdgpu_virt_fini_data_exchange(adev);
2784                 amdgpu_virt_request_full_gpu(adev, false);
2785         }
2786
2787         r = amdgpu_device_ip_suspend_phase1(adev);
2788         if (r)
2789                 return r;
2790         r = amdgpu_device_ip_suspend_phase2(adev);
2791
2792         if (amdgpu_sriov_vf(adev))
2793                 amdgpu_virt_release_full_gpu(adev, false);
2794
2795         return r;
2796 }
2797
2798 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2799 {
2800         int i, r;
2801
2802         static enum amd_ip_block_type ip_order[] = {
2803                 AMD_IP_BLOCK_TYPE_GMC,
2804                 AMD_IP_BLOCK_TYPE_COMMON,
2805                 AMD_IP_BLOCK_TYPE_PSP,
2806                 AMD_IP_BLOCK_TYPE_IH,
2807         };
2808
2809         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2810                 int j;
2811                 struct amdgpu_ip_block *block;
2812
2813                 block = &adev->ip_blocks[i];
2814                 block->status.hw = false;
2815
2816                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2817
2818                         if (block->version->type != ip_order[j] ||
2819                                 !block->status.valid)
2820                                 continue;
2821
2822                         r = block->version->funcs->hw_init(adev);
2823                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2824                         if (r)
2825                                 return r;
2826                         block->status.hw = true;
2827                 }
2828         }
2829
2830         return 0;
2831 }
2832
2833 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2834 {
2835         int i, r;
2836
2837         static enum amd_ip_block_type ip_order[] = {
2838                 AMD_IP_BLOCK_TYPE_SMC,
2839                 AMD_IP_BLOCK_TYPE_DCE,
2840                 AMD_IP_BLOCK_TYPE_GFX,
2841                 AMD_IP_BLOCK_TYPE_SDMA,
2842                 AMD_IP_BLOCK_TYPE_UVD,
2843                 AMD_IP_BLOCK_TYPE_VCE,
2844                 AMD_IP_BLOCK_TYPE_VCN
2845         };
2846
2847         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2848                 int j;
2849                 struct amdgpu_ip_block *block;
2850
2851                 for (j = 0; j < adev->num_ip_blocks; j++) {
2852                         block = &adev->ip_blocks[j];
2853
2854                         if (block->version->type != ip_order[i] ||
2855                                 !block->status.valid ||
2856                                 block->status.hw)
2857                                 continue;
2858
2859                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2860                                 r = block->version->funcs->resume(adev);
2861                         else
2862                                 r = block->version->funcs->hw_init(adev);
2863
2864                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2865                         if (r)
2866                                 return r;
2867                         block->status.hw = true;
2868                 }
2869         }
2870
2871         return 0;
2872 }
2873
2874 /**
2875  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2876  *
2877  * @adev: amdgpu_device pointer
2878  *
2879  * First resume function for hardware IPs.  The list of all the hardware
2880  * IPs that make up the asic is walked and the resume callbacks are run for
2881  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2882  * after a suspend and updates the software state as necessary.  This
2883  * function is also used for restoring the GPU after a GPU reset.
2884  * Returns 0 on success, negative error code on failure.
2885  */
2886 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2887 {
2888         int i, r;
2889
2890         for (i = 0; i < adev->num_ip_blocks; i++) {
2891                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2892                         continue;
2893                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2894                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2895                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2896
2897                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2898                         if (r) {
2899                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2900                                           adev->ip_blocks[i].version->funcs->name, r);
2901                                 return r;
2902                         }
2903                         adev->ip_blocks[i].status.hw = true;
2904                 }
2905         }
2906
2907         return 0;
2908 }
2909
2910 /**
2911  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2912  *
2913  * @adev: amdgpu_device pointer
2914  *
2915  * First resume function for hardware IPs.  The list of all the hardware
2916  * IPs that make up the asic is walked and the resume callbacks are run for
2917  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2918  * functional state after a suspend and updates the software state as
2919  * necessary.  This function is also used for restoring the GPU after a GPU
2920  * reset.
2921  * Returns 0 on success, negative error code on failure.
2922  */
2923 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2924 {
2925         int i, r;
2926
2927         for (i = 0; i < adev->num_ip_blocks; i++) {
2928                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2929                         continue;
2930                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2931                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2932                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2933                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2934                         continue;
2935                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2936                 if (r) {
2937                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2938                                   adev->ip_blocks[i].version->funcs->name, r);
2939                         return r;
2940                 }
2941                 adev->ip_blocks[i].status.hw = true;
2942         }
2943
2944         return 0;
2945 }
2946
2947 /**
2948  * amdgpu_device_ip_resume - run resume for hardware IPs
2949  *
2950  * @adev: amdgpu_device pointer
2951  *
2952  * Main resume function for hardware IPs.  The hardware IPs
2953  * are split into two resume functions because they are
2954  * are also used in in recovering from a GPU reset and some additional
2955  * steps need to be take between them.  In this case (S3/S4) they are
2956  * run sequentially.
2957  * Returns 0 on success, negative error code on failure.
2958  */
2959 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2960 {
2961         int r;
2962
2963         r = amdgpu_device_ip_resume_phase1(adev);
2964         if (r)
2965                 return r;
2966
2967         r = amdgpu_device_fw_loading(adev);
2968         if (r)
2969                 return r;
2970
2971         r = amdgpu_device_ip_resume_phase2(adev);
2972
2973         return r;
2974 }
2975
2976 /**
2977  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2978  *
2979  * @adev: amdgpu_device pointer
2980  *
2981  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2982  */
2983 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2984 {
2985         if (amdgpu_sriov_vf(adev)) {
2986                 if (adev->is_atom_fw) {
2987                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2988                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2989                 } else {
2990                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2991                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2992                 }
2993
2994                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2995                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2996         }
2997 }
2998
2999 /**
3000  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3001  *
3002  * @asic_type: AMD asic type
3003  *
3004  * Check if there is DC (new modesetting infrastructre) support for an asic.
3005  * returns true if DC has support, false if not.
3006  */
3007 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3008 {
3009         switch (asic_type) {
3010 #if defined(CONFIG_DRM_AMD_DC)
3011 #if defined(CONFIG_DRM_AMD_DC_SI)
3012         case CHIP_TAHITI:
3013         case CHIP_PITCAIRN:
3014         case CHIP_VERDE:
3015         case CHIP_OLAND:
3016 #endif
3017         case CHIP_BONAIRE:
3018         case CHIP_KAVERI:
3019         case CHIP_KABINI:
3020         case CHIP_MULLINS:
3021                 /*
3022                  * We have systems in the wild with these ASICs that require
3023                  * LVDS and VGA support which is not supported with DC.
3024                  *
3025                  * Fallback to the non-DC driver here by default so as not to
3026                  * cause regressions.
3027                  */
3028                 return amdgpu_dc > 0;
3029         case CHIP_HAWAII:
3030         case CHIP_CARRIZO:
3031         case CHIP_STONEY:
3032         case CHIP_POLARIS10:
3033         case CHIP_POLARIS11:
3034         case CHIP_POLARIS12:
3035         case CHIP_VEGAM:
3036         case CHIP_TONGA:
3037         case CHIP_FIJI:
3038         case CHIP_VEGA10:
3039         case CHIP_VEGA12:
3040         case CHIP_VEGA20:
3041 #if defined(CONFIG_DRM_AMD_DC_DCN)
3042         case CHIP_RAVEN:
3043         case CHIP_NAVI10:
3044         case CHIP_NAVI14:
3045         case CHIP_NAVI12:
3046         case CHIP_RENOIR:
3047         case CHIP_SIENNA_CICHLID:
3048         case CHIP_NAVY_FLOUNDER:
3049         case CHIP_DIMGREY_CAVEFISH:
3050         case CHIP_VANGOGH:
3051 #endif
3052                 return amdgpu_dc != 0;
3053 #endif
3054         default:
3055                 if (amdgpu_dc > 0)
3056                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3057                                          "but isn't supported by ASIC, ignoring\n");
3058                 return false;
3059         }
3060 }
3061
3062 /**
3063  * amdgpu_device_has_dc_support - check if dc is supported
3064  *
3065  * @adev: amdgpu_device pointer
3066  *
3067  * Returns true for supported, false for not supported
3068  */
3069 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3070 {
3071         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3072                 return false;
3073
3074         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3075 }
3076
3077
3078 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3079 {
3080         struct amdgpu_device *adev =
3081                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3082         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3083
3084         /* It's a bug to not have a hive within this function */
3085         if (WARN_ON(!hive))
3086                 return;
3087
3088         /*
3089          * Use task barrier to synchronize all xgmi reset works across the
3090          * hive. task_barrier_enter and task_barrier_exit will block
3091          * until all the threads running the xgmi reset works reach
3092          * those points. task_barrier_full will do both blocks.
3093          */
3094         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3095
3096                 task_barrier_enter(&hive->tb);
3097                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3098
3099                 if (adev->asic_reset_res)
3100                         goto fail;
3101
3102                 task_barrier_exit(&hive->tb);
3103                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3104
3105                 if (adev->asic_reset_res)
3106                         goto fail;
3107
3108                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3109                         adev->mmhub.funcs->reset_ras_error_count(adev);
3110         } else {
3111
3112                 task_barrier_full(&hive->tb);
3113                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3114         }
3115
3116 fail:
3117         if (adev->asic_reset_res)
3118                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3119                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3120         amdgpu_put_xgmi_hive(hive);
3121 }
3122
3123 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3124 {
3125         char *input = amdgpu_lockup_timeout;
3126         char *timeout_setting = NULL;
3127         int index = 0;
3128         long timeout;
3129         int ret = 0;
3130
3131         /*
3132          * By default timeout for non compute jobs is 10000.
3133          * And there is no timeout enforced on compute jobs.
3134          * In SR-IOV or passthrough mode, timeout for compute
3135          * jobs are 60000 by default.
3136          */
3137         adev->gfx_timeout = msecs_to_jiffies(10000);
3138         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3139         if (amdgpu_sriov_vf(adev))
3140                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3141                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3142         else if (amdgpu_passthrough(adev))
3143                 adev->compute_timeout =  msecs_to_jiffies(60000);
3144         else
3145                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3146
3147         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3148                 while ((timeout_setting = strsep(&input, ",")) &&
3149                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3150                         ret = kstrtol(timeout_setting, 0, &timeout);
3151                         if (ret)
3152                                 return ret;
3153
3154                         if (timeout == 0) {
3155                                 index++;
3156                                 continue;
3157                         } else if (timeout < 0) {
3158                                 timeout = MAX_SCHEDULE_TIMEOUT;
3159                         } else {
3160                                 timeout = msecs_to_jiffies(timeout);
3161                         }
3162
3163                         switch (index++) {
3164                         case 0:
3165                                 adev->gfx_timeout = timeout;
3166                                 break;
3167                         case 1:
3168                                 adev->compute_timeout = timeout;
3169                                 break;
3170                         case 2:
3171                                 adev->sdma_timeout = timeout;
3172                                 break;
3173                         case 3:
3174                                 adev->video_timeout = timeout;
3175                                 break;
3176                         default:
3177                                 break;
3178                         }
3179                 }
3180                 /*
3181                  * There is only one value specified and
3182                  * it should apply to all non-compute jobs.
3183                  */
3184                 if (index == 1) {
3185                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3186                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3187                                 adev->compute_timeout = adev->gfx_timeout;
3188                 }
3189         }
3190
3191         return ret;
3192 }
3193
3194 static const struct attribute *amdgpu_dev_attributes[] = {
3195         &dev_attr_product_name.attr,
3196         &dev_attr_product_number.attr,
3197         &dev_attr_serial_number.attr,
3198         &dev_attr_pcie_replay_count.attr,
3199         NULL
3200 };
3201
3202
3203 /**
3204  * amdgpu_device_init - initialize the driver
3205  *
3206  * @adev: amdgpu_device pointer
3207  * @flags: driver flags
3208  *
3209  * Initializes the driver info and hw (all asics).
3210  * Returns 0 for success or an error on failure.
3211  * Called at driver startup.
3212  */
3213 int amdgpu_device_init(struct amdgpu_device *adev,
3214                        uint32_t flags)
3215 {
3216         struct drm_device *ddev = adev_to_drm(adev);
3217         struct pci_dev *pdev = adev->pdev;
3218         int r, i;
3219         bool atpx = false;
3220         u32 max_MBps;
3221
3222         adev->shutdown = false;
3223         adev->flags = flags;
3224
3225         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3226                 adev->asic_type = amdgpu_force_asic_type;
3227         else
3228                 adev->asic_type = flags & AMD_ASIC_MASK;
3229
3230         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3231         if (amdgpu_emu_mode == 1)
3232                 adev->usec_timeout *= 10;
3233         adev->gmc.gart_size = 512 * 1024 * 1024;
3234         adev->accel_working = false;
3235         adev->num_rings = 0;
3236         adev->mman.buffer_funcs = NULL;
3237         adev->mman.buffer_funcs_ring = NULL;
3238         adev->vm_manager.vm_pte_funcs = NULL;
3239         adev->vm_manager.vm_pte_num_scheds = 0;
3240         adev->gmc.gmc_funcs = NULL;
3241         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3242         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3243
3244         adev->smc_rreg = &amdgpu_invalid_rreg;
3245         adev->smc_wreg = &amdgpu_invalid_wreg;
3246         adev->pcie_rreg = &amdgpu_invalid_rreg;
3247         adev->pcie_wreg = &amdgpu_invalid_wreg;
3248         adev->pciep_rreg = &amdgpu_invalid_rreg;
3249         adev->pciep_wreg = &amdgpu_invalid_wreg;
3250         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3251         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3252         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3253         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3254         adev->didt_rreg = &amdgpu_invalid_rreg;
3255         adev->didt_wreg = &amdgpu_invalid_wreg;
3256         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3257         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3258         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3259         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3260
3261         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3262                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3263                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3264
3265         /* mutex initialization are all done here so we
3266          * can recall function without having locking issues */
3267         atomic_set(&adev->irq.ih.lock, 0);
3268         mutex_init(&adev->firmware.mutex);
3269         mutex_init(&adev->pm.mutex);
3270         mutex_init(&adev->gfx.gpu_clock_mutex);
3271         mutex_init(&adev->srbm_mutex);
3272         mutex_init(&adev->gfx.pipe_reserve_mutex);
3273         mutex_init(&adev->gfx.gfx_off_mutex);
3274         mutex_init(&adev->grbm_idx_mutex);
3275         mutex_init(&adev->mn_lock);
3276         mutex_init(&adev->virt.vf_errors.lock);
3277         hash_init(adev->mn_hash);
3278         atomic_set(&adev->in_gpu_reset, 0);
3279         init_rwsem(&adev->reset_sem);
3280         mutex_init(&adev->psp.mutex);
3281         mutex_init(&adev->notifier_lock);
3282
3283         r = amdgpu_device_check_arguments(adev);
3284         if (r)
3285                 return r;
3286
3287         spin_lock_init(&adev->mmio_idx_lock);
3288         spin_lock_init(&adev->smc_idx_lock);
3289         spin_lock_init(&adev->pcie_idx_lock);
3290         spin_lock_init(&adev->uvd_ctx_idx_lock);
3291         spin_lock_init(&adev->didt_idx_lock);
3292         spin_lock_init(&adev->gc_cac_idx_lock);
3293         spin_lock_init(&adev->se_cac_idx_lock);
3294         spin_lock_init(&adev->audio_endpt_idx_lock);
3295         spin_lock_init(&adev->mm_stats.lock);
3296
3297         INIT_LIST_HEAD(&adev->shadow_list);
3298         mutex_init(&adev->shadow_list_lock);
3299
3300         INIT_DELAYED_WORK(&adev->delayed_init_work,
3301                           amdgpu_device_delayed_init_work_handler);
3302         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3303                           amdgpu_device_delay_enable_gfx_off);
3304
3305         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3306
3307         adev->gfx.gfx_off_req_count = 1;
3308         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3309
3310         atomic_set(&adev->throttling_logging_enabled, 1);
3311         /*
3312          * If throttling continues, logging will be performed every minute
3313          * to avoid log flooding. "-1" is subtracted since the thermal
3314          * throttling interrupt comes every second. Thus, the total logging
3315          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3316          * for throttling interrupt) = 60 seconds.
3317          */
3318         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3319         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3320
3321         /* Registers mapping */
3322         /* TODO: block userspace mapping of io register */
3323         if (adev->asic_type >= CHIP_BONAIRE) {
3324                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3325                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3326         } else {
3327                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3328                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3329         }
3330
3331         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3332         if (adev->rmmio == NULL) {
3333                 return -ENOMEM;
3334         }
3335         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3336         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3337
3338         /* io port mapping */
3339         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3340                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3341                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3342                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3343                         break;
3344                 }
3345         }
3346         if (adev->rio_mem == NULL)
3347                 DRM_INFO("PCI I/O BAR is not found.\n");
3348
3349         /* enable PCIE atomic ops */
3350         r = pci_enable_atomic_ops_to_root(adev->pdev,
3351                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3352                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3353         if (r) {
3354                 adev->have_atomics_support = false;
3355                 DRM_INFO("PCIE atomic ops is not supported\n");
3356         } else {
3357                 adev->have_atomics_support = true;
3358         }
3359
3360         amdgpu_device_get_pcie_info(adev);
3361
3362         if (amdgpu_mcbp)
3363                 DRM_INFO("MCBP is enabled\n");
3364
3365         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3366                 adev->enable_mes = true;
3367
3368         /* detect hw virtualization here */
3369         amdgpu_detect_virtualization(adev);
3370
3371         r = amdgpu_device_get_job_timeout_settings(adev);
3372         if (r) {
3373                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3374                 goto failed_unmap;
3375         }
3376
3377         /* early init functions */
3378         r = amdgpu_device_ip_early_init(adev);
3379         if (r)
3380                 goto failed_unmap;
3381
3382         /* doorbell bar mapping and doorbell index init*/
3383         amdgpu_device_doorbell_init(adev);
3384
3385         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3386         /* this will fail for cards that aren't VGA class devices, just
3387          * ignore it */
3388         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3389                 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3390
3391         if (amdgpu_device_supports_atpx(ddev))
3392                 atpx = true;
3393         if (amdgpu_has_atpx() &&
3394             (amdgpu_is_atpx_hybrid() ||
3395              amdgpu_has_atpx_dgpu_power_cntl()) &&
3396             !pci_is_thunderbolt_attached(adev->pdev))
3397                 vga_switcheroo_register_client(adev->pdev,
3398                                                &amdgpu_switcheroo_ops, atpx);
3399         if (atpx)
3400                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3401
3402         if (amdgpu_emu_mode == 1) {
3403                 /* post the asic on emulation mode */
3404                 emu_soc_asic_init(adev);
3405                 goto fence_driver_init;
3406         }
3407
3408         /* detect if we are with an SRIOV vbios */
3409         amdgpu_device_detect_sriov_bios(adev);
3410
3411         /* check if we need to reset the asic
3412          *  E.g., driver was not cleanly unloaded previously, etc.
3413          */
3414         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3415                 r = amdgpu_asic_reset(adev);
3416                 if (r) {
3417                         dev_err(adev->dev, "asic reset on init failed\n");
3418                         goto failed;
3419                 }
3420         }
3421
3422         pci_enable_pcie_error_reporting(adev->pdev);
3423
3424         /* Post card if necessary */
3425         if (amdgpu_device_need_post(adev)) {
3426                 if (!adev->bios) {
3427                         dev_err(adev->dev, "no vBIOS found\n");
3428                         r = -EINVAL;
3429                         goto failed;
3430                 }
3431                 DRM_INFO("GPU posting now...\n");
3432                 r = amdgpu_device_asic_init(adev);
3433                 if (r) {
3434                         dev_err(adev->dev, "gpu post error!\n");
3435                         goto failed;
3436                 }
3437         }
3438
3439         if (adev->is_atom_fw) {
3440                 /* Initialize clocks */
3441                 r = amdgpu_atomfirmware_get_clock_info(adev);
3442                 if (r) {
3443                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3444                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3445                         goto failed;
3446                 }
3447         } else {
3448                 /* Initialize clocks */
3449                 r = amdgpu_atombios_get_clock_info(adev);
3450                 if (r) {
3451                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3452                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3453                         goto failed;
3454                 }
3455                 /* init i2c buses */
3456                 if (!amdgpu_device_has_dc_support(adev))
3457                         amdgpu_atombios_i2c_init(adev);
3458         }
3459
3460 fence_driver_init:
3461         /* Fence driver */
3462         r = amdgpu_fence_driver_init(adev);
3463         if (r) {
3464                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3465                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3466                 goto failed;
3467         }
3468
3469         /* init the mode config */
3470         drm_mode_config_init(adev_to_drm(adev));
3471
3472         r = amdgpu_device_ip_init(adev);
3473         if (r) {
3474                 /* failed in exclusive mode due to timeout */
3475                 if (amdgpu_sriov_vf(adev) &&
3476                     !amdgpu_sriov_runtime(adev) &&
3477                     amdgpu_virt_mmio_blocked(adev) &&
3478                     !amdgpu_virt_wait_reset(adev)) {
3479                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3480                         /* Don't send request since VF is inactive. */
3481                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3482                         adev->virt.ops = NULL;
3483                         r = -EAGAIN;
3484                         goto failed;
3485                 }
3486                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3487                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3488                 goto failed;
3489         }
3490
3491         dev_info(adev->dev,
3492                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3493                         adev->gfx.config.max_shader_engines,
3494                         adev->gfx.config.max_sh_per_se,
3495                         adev->gfx.config.max_cu_per_sh,
3496                         adev->gfx.cu_info.number);
3497
3498         adev->accel_working = true;
3499
3500         amdgpu_vm_check_compute_bug(adev);
3501
3502         /* Initialize the buffer migration limit. */
3503         if (amdgpu_moverate >= 0)
3504                 max_MBps = amdgpu_moverate;
3505         else
3506                 max_MBps = 8; /* Allow 8 MB/s. */
3507         /* Get a log2 for easy divisions. */
3508         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3509
3510         amdgpu_fbdev_init(adev);
3511
3512         r = amdgpu_pm_sysfs_init(adev);
3513         if (r) {
3514                 adev->pm_sysfs_en = false;
3515                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3516         } else
3517                 adev->pm_sysfs_en = true;
3518
3519         r = amdgpu_ucode_sysfs_init(adev);
3520         if (r) {
3521                 adev->ucode_sysfs_en = false;
3522                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3523         } else
3524                 adev->ucode_sysfs_en = true;
3525
3526         if ((amdgpu_testing & 1)) {
3527                 if (adev->accel_working)
3528                         amdgpu_test_moves(adev);
3529                 else
3530                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3531         }
3532         if (amdgpu_benchmarking) {
3533                 if (adev->accel_working)
3534                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3535                 else
3536                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3537         }
3538
3539         /*
3540          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3541          * Otherwise the mgpu fan boost feature will be skipped due to the
3542          * gpu instance is counted less.
3543          */
3544         amdgpu_register_gpu_instance(adev);
3545
3546         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3547          * explicit gating rather than handling it automatically.
3548          */
3549         r = amdgpu_device_ip_late_init(adev);
3550         if (r) {
3551                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3552                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3553                 goto failed;
3554         }
3555
3556         /* must succeed. */
3557         amdgpu_ras_resume(adev);
3558
3559         queue_delayed_work(system_wq, &adev->delayed_init_work,
3560                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3561
3562         if (amdgpu_sriov_vf(adev))
3563                 flush_delayed_work(&adev->delayed_init_work);
3564
3565         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3566         if (r)
3567                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3568
3569         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3570                 r = amdgpu_pmu_init(adev);
3571         if (r)
3572                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3573
3574         /* Have stored pci confspace at hand for restore in sudden PCI error */
3575         if (amdgpu_device_cache_pci_state(adev->pdev))
3576                 pci_restore_state(pdev);
3577
3578         return 0;
3579
3580 failed:
3581         amdgpu_vf_error_trans_all(adev);
3582         if (atpx)
3583                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3584
3585 failed_unmap:
3586         iounmap(adev->rmmio);
3587         adev->rmmio = NULL;
3588
3589         return r;
3590 }
3591
3592 /**
3593  * amdgpu_device_fini - tear down the driver
3594  *
3595  * @adev: amdgpu_device pointer
3596  *
3597  * Tear down the driver info (all asics).
3598  * Called at driver shutdown.
3599  */
3600 void amdgpu_device_fini(struct amdgpu_device *adev)
3601 {
3602         dev_info(adev->dev, "amdgpu: finishing device.\n");
3603         flush_delayed_work(&adev->delayed_init_work);
3604         ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3605         adev->shutdown = true;
3606
3607         kfree(adev->pci_state);
3608
3609         /* make sure IB test finished before entering exclusive mode
3610          * to avoid preemption on IB test
3611          * */
3612         if (amdgpu_sriov_vf(adev)) {
3613                 amdgpu_virt_request_full_gpu(adev, false);
3614                 amdgpu_virt_fini_data_exchange(adev);
3615         }
3616
3617         /* disable all interrupts */
3618         amdgpu_irq_disable_all(adev);
3619         if (adev->mode_info.mode_config_initialized){
3620                 if (!amdgpu_device_has_dc_support(adev))
3621                         drm_helper_force_disable_all(adev_to_drm(adev));
3622                 else
3623                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3624         }
3625         amdgpu_fence_driver_fini(adev);
3626         if (adev->pm_sysfs_en)
3627                 amdgpu_pm_sysfs_fini(adev);
3628         amdgpu_fbdev_fini(adev);
3629         amdgpu_device_ip_fini(adev);
3630         release_firmware(adev->firmware.gpu_info_fw);
3631         adev->firmware.gpu_info_fw = NULL;
3632         adev->accel_working = false;
3633         /* free i2c buses */
3634         if (!amdgpu_device_has_dc_support(adev))
3635                 amdgpu_i2c_fini(adev);
3636
3637         if (amdgpu_emu_mode != 1)
3638                 amdgpu_atombios_fini(adev);
3639
3640         kfree(adev->bios);
3641         adev->bios = NULL;
3642         if (amdgpu_has_atpx() &&
3643             (amdgpu_is_atpx_hybrid() ||
3644              amdgpu_has_atpx_dgpu_power_cntl()) &&
3645             !pci_is_thunderbolt_attached(adev->pdev))
3646                 vga_switcheroo_unregister_client(adev->pdev);
3647         if (amdgpu_device_supports_atpx(adev_to_drm(adev)))
3648                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3649         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3650                 vga_client_register(adev->pdev, NULL, NULL, NULL);
3651         if (adev->rio_mem)
3652                 pci_iounmap(adev->pdev, adev->rio_mem);
3653         adev->rio_mem = NULL;
3654         iounmap(adev->rmmio);
3655         adev->rmmio = NULL;
3656         amdgpu_device_doorbell_fini(adev);
3657
3658         if (adev->ucode_sysfs_en)
3659                 amdgpu_ucode_sysfs_fini(adev);
3660
3661         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3662         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3663                 amdgpu_pmu_fini(adev);
3664         if (adev->mman.discovery_bin)
3665                 amdgpu_discovery_fini(adev);
3666 }
3667
3668
3669 /*
3670  * Suspend & resume.
3671  */
3672 /**
3673  * amdgpu_device_suspend - initiate device suspend
3674  *
3675  * @dev: drm dev pointer
3676  * @fbcon : notify the fbdev of suspend
3677  *
3678  * Puts the hw in the suspend state (all asics).
3679  * Returns 0 for success or an error on failure.
3680  * Called at driver suspend.
3681  */
3682 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3683 {
3684         struct amdgpu_device *adev;
3685         struct drm_crtc *crtc;
3686         struct drm_connector *connector;
3687         struct drm_connector_list_iter iter;
3688         int r;
3689
3690         adev = drm_to_adev(dev);
3691
3692         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3693                 return 0;
3694
3695         adev->in_suspend = true;
3696         drm_kms_helper_poll_disable(dev);
3697
3698         if (fbcon)
3699                 amdgpu_fbdev_set_suspend(adev, 1);
3700
3701         cancel_delayed_work_sync(&adev->delayed_init_work);
3702
3703         if (!amdgpu_device_has_dc_support(adev)) {
3704                 /* turn off display hw */
3705                 drm_modeset_lock_all(dev);
3706                 drm_connector_list_iter_begin(dev, &iter);
3707                 drm_for_each_connector_iter(connector, &iter)
3708                         drm_helper_connector_dpms(connector,
3709                                                   DRM_MODE_DPMS_OFF);
3710                 drm_connector_list_iter_end(&iter);
3711                 drm_modeset_unlock_all(dev);
3712                         /* unpin the front buffers and cursors */
3713                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3714                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3715                         struct drm_framebuffer *fb = crtc->primary->fb;
3716                         struct amdgpu_bo *robj;
3717
3718                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3719                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3720                                 r = amdgpu_bo_reserve(aobj, true);
3721                                 if (r == 0) {
3722                                         amdgpu_bo_unpin(aobj);
3723                                         amdgpu_bo_unreserve(aobj);
3724                                 }
3725                         }
3726
3727                         if (fb == NULL || fb->obj[0] == NULL) {
3728                                 continue;
3729                         }
3730                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3731                         /* don't unpin kernel fb objects */
3732                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3733                                 r = amdgpu_bo_reserve(robj, true);
3734                                 if (r == 0) {
3735                                         amdgpu_bo_unpin(robj);
3736                                         amdgpu_bo_unreserve(robj);
3737                                 }
3738                         }
3739                 }
3740         }
3741
3742         amdgpu_ras_suspend(adev);
3743
3744         r = amdgpu_device_ip_suspend_phase1(adev);
3745
3746         amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3747
3748         /* evict vram memory */
3749         amdgpu_bo_evict_vram(adev);
3750
3751         amdgpu_fence_driver_suspend(adev);
3752
3753         if (adev->in_poweroff_reboot_com ||
3754             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
3755                 r = amdgpu_device_ip_suspend_phase2(adev);
3756         else
3757                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
3758         /* evict remaining vram memory
3759          * This second call to evict vram is to evict the gart page table
3760          * using the CPU.
3761          */
3762         amdgpu_bo_evict_vram(adev);
3763
3764         return 0;
3765 }
3766
3767 /**
3768  * amdgpu_device_resume - initiate device resume
3769  *
3770  * @dev: drm dev pointer
3771  * @fbcon : notify the fbdev of resume
3772  *
3773  * Bring the hw back to operating state (all asics).
3774  * Returns 0 for success or an error on failure.
3775  * Called at driver resume.
3776  */
3777 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3778 {
3779         struct drm_connector *connector;
3780         struct drm_connector_list_iter iter;
3781         struct amdgpu_device *adev = drm_to_adev(dev);
3782         struct drm_crtc *crtc;
3783         int r = 0;
3784
3785         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3786                 return 0;
3787
3788         if (amdgpu_acpi_is_s0ix_supported(adev))
3789                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3790
3791         /* post card */
3792         if (amdgpu_device_need_post(adev)) {
3793                 r = amdgpu_device_asic_init(adev);
3794                 if (r)
3795                         dev_err(adev->dev, "amdgpu asic init failed\n");
3796         }
3797
3798         r = amdgpu_device_ip_resume(adev);
3799         if (r) {
3800                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3801                 return r;
3802         }
3803         amdgpu_fence_driver_resume(adev);
3804
3805
3806         r = amdgpu_device_ip_late_init(adev);
3807         if (r)
3808                 return r;
3809
3810         queue_delayed_work(system_wq, &adev->delayed_init_work,
3811                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3812
3813         if (!amdgpu_device_has_dc_support(adev)) {
3814                 /* pin cursors */
3815                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3816                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3817
3818                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3819                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3820                                 r = amdgpu_bo_reserve(aobj, true);
3821                                 if (r == 0) {
3822                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3823                                         if (r != 0)
3824                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3825                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3826                                         amdgpu_bo_unreserve(aobj);
3827                                 }
3828                         }
3829                 }
3830         }
3831         r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3832         if (r)
3833                 return r;
3834
3835         /* Make sure IB tests flushed */
3836         flush_delayed_work(&adev->delayed_init_work);
3837
3838         /* blat the mode back in */
3839         if (fbcon) {
3840                 if (!amdgpu_device_has_dc_support(adev)) {
3841                         /* pre DCE11 */
3842                         drm_helper_resume_force_mode(dev);
3843
3844                         /* turn on display hw */
3845                         drm_modeset_lock_all(dev);
3846
3847                         drm_connector_list_iter_begin(dev, &iter);
3848                         drm_for_each_connector_iter(connector, &iter)
3849                                 drm_helper_connector_dpms(connector,
3850                                                           DRM_MODE_DPMS_ON);
3851                         drm_connector_list_iter_end(&iter);
3852
3853                         drm_modeset_unlock_all(dev);
3854                 }
3855                 amdgpu_fbdev_set_suspend(adev, 0);
3856         }
3857
3858         drm_kms_helper_poll_enable(dev);
3859
3860         amdgpu_ras_resume(adev);
3861
3862         /*
3863          * Most of the connector probing functions try to acquire runtime pm
3864          * refs to ensure that the GPU is powered on when connector polling is
3865          * performed. Since we're calling this from a runtime PM callback,
3866          * trying to acquire rpm refs will cause us to deadlock.
3867          *
3868          * Since we're guaranteed to be holding the rpm lock, it's safe to
3869          * temporarily disable the rpm helpers so this doesn't deadlock us.
3870          */
3871 #ifdef CONFIG_PM
3872         dev->dev->power.disable_depth++;
3873 #endif
3874         if (!amdgpu_device_has_dc_support(adev))
3875                 drm_helper_hpd_irq_event(dev);
3876         else
3877                 drm_kms_helper_hotplug_event(dev);
3878 #ifdef CONFIG_PM
3879         dev->dev->power.disable_depth--;
3880 #endif
3881         adev->in_suspend = false;
3882
3883         return 0;
3884 }
3885
3886 /**
3887  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3888  *
3889  * @adev: amdgpu_device pointer
3890  *
3891  * The list of all the hardware IPs that make up the asic is walked and
3892  * the check_soft_reset callbacks are run.  check_soft_reset determines
3893  * if the asic is still hung or not.
3894  * Returns true if any of the IPs are still in a hung state, false if not.
3895  */
3896 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3897 {
3898         int i;
3899         bool asic_hang = false;
3900
3901         if (amdgpu_sriov_vf(adev))
3902                 return true;
3903
3904         if (amdgpu_asic_need_full_reset(adev))
3905                 return true;
3906
3907         for (i = 0; i < adev->num_ip_blocks; i++) {
3908                 if (!adev->ip_blocks[i].status.valid)
3909                         continue;
3910                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3911                         adev->ip_blocks[i].status.hang =
3912                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3913                 if (adev->ip_blocks[i].status.hang) {
3914                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3915                         asic_hang = true;
3916                 }
3917         }
3918         return asic_hang;
3919 }
3920
3921 /**
3922  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3923  *
3924  * @adev: amdgpu_device pointer
3925  *
3926  * The list of all the hardware IPs that make up the asic is walked and the
3927  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3928  * handles any IP specific hardware or software state changes that are
3929  * necessary for a soft reset to succeed.
3930  * Returns 0 on success, negative error code on failure.
3931  */
3932 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3933 {
3934         int i, r = 0;
3935
3936         for (i = 0; i < adev->num_ip_blocks; i++) {
3937                 if (!adev->ip_blocks[i].status.valid)
3938                         continue;
3939                 if (adev->ip_blocks[i].status.hang &&
3940                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3941                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3942                         if (r)
3943                                 return r;
3944                 }
3945         }
3946
3947         return 0;
3948 }
3949
3950 /**
3951  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3952  *
3953  * @adev: amdgpu_device pointer
3954  *
3955  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3956  * reset is necessary to recover.
3957  * Returns true if a full asic reset is required, false if not.
3958  */
3959 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3960 {
3961         int i;
3962
3963         if (amdgpu_asic_need_full_reset(adev))
3964                 return true;
3965
3966         for (i = 0; i < adev->num_ip_blocks; i++) {
3967                 if (!adev->ip_blocks[i].status.valid)
3968                         continue;
3969                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3970                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3971                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3972                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3973                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3974                         if (adev->ip_blocks[i].status.hang) {
3975                                 dev_info(adev->dev, "Some block need full reset!\n");
3976                                 return true;
3977                         }
3978                 }
3979         }
3980         return false;
3981 }
3982
3983 /**
3984  * amdgpu_device_ip_soft_reset - do a soft reset
3985  *
3986  * @adev: amdgpu_device pointer
3987  *
3988  * The list of all the hardware IPs that make up the asic is walked and the
3989  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3990  * IP specific hardware or software state changes that are necessary to soft
3991  * reset the IP.
3992  * Returns 0 on success, negative error code on failure.
3993  */
3994 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3995 {
3996         int i, r = 0;
3997
3998         for (i = 0; i < adev->num_ip_blocks; i++) {
3999                 if (!adev->ip_blocks[i].status.valid)
4000                         continue;
4001                 if (adev->ip_blocks[i].status.hang &&
4002                     adev->ip_blocks[i].version->funcs->soft_reset) {
4003                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4004                         if (r)
4005                                 return r;
4006                 }
4007         }
4008
4009         return 0;
4010 }
4011
4012 /**
4013  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4014  *
4015  * @adev: amdgpu_device pointer
4016  *
4017  * The list of all the hardware IPs that make up the asic is walked and the
4018  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4019  * handles any IP specific hardware or software state changes that are
4020  * necessary after the IP has been soft reset.
4021  * Returns 0 on success, negative error code on failure.
4022  */
4023 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4024 {
4025         int i, r = 0;
4026
4027         for (i = 0; i < adev->num_ip_blocks; i++) {
4028                 if (!adev->ip_blocks[i].status.valid)
4029                         continue;
4030                 if (adev->ip_blocks[i].status.hang &&
4031                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4032                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4033                 if (r)
4034                         return r;
4035         }
4036
4037         return 0;
4038 }
4039
4040 /**
4041  * amdgpu_device_recover_vram - Recover some VRAM contents
4042  *
4043  * @adev: amdgpu_device pointer
4044  *
4045  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4046  * restore things like GPUVM page tables after a GPU reset where
4047  * the contents of VRAM might be lost.
4048  *
4049  * Returns:
4050  * 0 on success, negative error code on failure.
4051  */
4052 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4053 {
4054         struct dma_fence *fence = NULL, *next = NULL;
4055         struct amdgpu_bo *shadow;
4056         long r = 1, tmo;
4057
4058         if (amdgpu_sriov_runtime(adev))
4059                 tmo = msecs_to_jiffies(8000);
4060         else
4061                 tmo = msecs_to_jiffies(100);
4062
4063         dev_info(adev->dev, "recover vram bo from shadow start\n");
4064         mutex_lock(&adev->shadow_list_lock);
4065         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4066
4067                 /* No need to recover an evicted BO */
4068                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4069                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4070                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4071                         continue;
4072
4073                 r = amdgpu_bo_restore_shadow(shadow, &next);
4074                 if (r)
4075                         break;
4076
4077                 if (fence) {
4078                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4079                         dma_fence_put(fence);
4080                         fence = next;
4081                         if (tmo == 0) {
4082                                 r = -ETIMEDOUT;
4083                                 break;
4084                         } else if (tmo < 0) {
4085                                 r = tmo;
4086                                 break;
4087                         }
4088                 } else {
4089                         fence = next;
4090                 }
4091         }
4092         mutex_unlock(&adev->shadow_list_lock);
4093
4094         if (fence)
4095                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4096         dma_fence_put(fence);
4097
4098         if (r < 0 || tmo <= 0) {
4099                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4100                 return -EIO;
4101         }
4102
4103         dev_info(adev->dev, "recover vram bo from shadow done\n");
4104         return 0;
4105 }
4106
4107
4108 /**
4109  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4110  *
4111  * @adev: amdgpu_device pointer
4112  * @from_hypervisor: request from hypervisor
4113  *
4114  * do VF FLR and reinitialize Asic
4115  * return 0 means succeeded otherwise failed
4116  */
4117 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4118                                      bool from_hypervisor)
4119 {
4120         int r;
4121
4122         if (from_hypervisor)
4123                 r = amdgpu_virt_request_full_gpu(adev, true);
4124         else
4125                 r = amdgpu_virt_reset_gpu(adev);
4126         if (r)
4127                 return r;
4128
4129         amdgpu_amdkfd_pre_reset(adev);
4130
4131         /* Resume IP prior to SMC */
4132         r = amdgpu_device_ip_reinit_early_sriov(adev);
4133         if (r)
4134                 goto error;
4135
4136         amdgpu_virt_init_data_exchange(adev);
4137         /* we need recover gart prior to run SMC/CP/SDMA resume */
4138         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4139
4140         r = amdgpu_device_fw_loading(adev);
4141         if (r)
4142                 return r;
4143
4144         /* now we are okay to resume SMC/CP/SDMA */
4145         r = amdgpu_device_ip_reinit_late_sriov(adev);
4146         if (r)
4147                 goto error;
4148
4149         amdgpu_irq_gpu_reset_resume_helper(adev);
4150         r = amdgpu_ib_ring_tests(adev);
4151         amdgpu_amdkfd_post_reset(adev);
4152
4153 error:
4154         amdgpu_virt_release_full_gpu(adev, true);
4155         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4156                 amdgpu_inc_vram_lost(adev);
4157                 r = amdgpu_device_recover_vram(adev);
4158         }
4159
4160         return r;
4161 }
4162
4163 /**
4164  * amdgpu_device_has_job_running - check if there is any job in mirror list
4165  *
4166  * @adev: amdgpu_device pointer
4167  *
4168  * check if there is any job in mirror list
4169  */
4170 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4171 {
4172         int i;
4173         struct drm_sched_job *job;
4174
4175         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4176                 struct amdgpu_ring *ring = adev->rings[i];
4177
4178                 if (!ring || !ring->sched.thread)
4179                         continue;
4180
4181                 spin_lock(&ring->sched.job_list_lock);
4182                 job = list_first_entry_or_null(&ring->sched.pending_list,
4183                                                struct drm_sched_job, list);
4184                 spin_unlock(&ring->sched.job_list_lock);
4185                 if (job)
4186                         return true;
4187         }
4188         return false;
4189 }
4190
4191 /**
4192  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4193  *
4194  * @adev: amdgpu_device pointer
4195  *
4196  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4197  * a hung GPU.
4198  */
4199 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4200 {
4201         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4202                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4203                 return false;
4204         }
4205
4206         if (amdgpu_gpu_recovery == 0)
4207                 goto disabled;
4208
4209         if (amdgpu_sriov_vf(adev))
4210                 return true;
4211
4212         if (amdgpu_gpu_recovery == -1) {
4213                 switch (adev->asic_type) {
4214                 case CHIP_BONAIRE:
4215                 case CHIP_HAWAII:
4216                 case CHIP_TOPAZ:
4217                 case CHIP_TONGA:
4218                 case CHIP_FIJI:
4219                 case CHIP_POLARIS10:
4220                 case CHIP_POLARIS11:
4221                 case CHIP_POLARIS12:
4222                 case CHIP_VEGAM:
4223                 case CHIP_VEGA20:
4224                 case CHIP_VEGA10:
4225                 case CHIP_VEGA12:
4226                 case CHIP_RAVEN:
4227                 case CHIP_ARCTURUS:
4228                 case CHIP_RENOIR:
4229                 case CHIP_NAVI10:
4230                 case CHIP_NAVI14:
4231                 case CHIP_NAVI12:
4232                 case CHIP_SIENNA_CICHLID:
4233                 case CHIP_NAVY_FLOUNDER:
4234                 case CHIP_DIMGREY_CAVEFISH:
4235                         break;
4236                 default:
4237                         goto disabled;
4238                 }
4239         }
4240
4241         return true;
4242
4243 disabled:
4244                 dev_info(adev->dev, "GPU recovery disabled.\n");
4245                 return false;
4246 }
4247
4248 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4249 {
4250         u32 i;
4251         int ret = 0;
4252
4253         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4254
4255         dev_info(adev->dev, "GPU mode1 reset\n");
4256
4257         /* disable BM */
4258         pci_clear_master(adev->pdev);
4259
4260         amdgpu_device_cache_pci_state(adev->pdev);
4261
4262         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4263                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4264                 ret = amdgpu_dpm_mode1_reset(adev);
4265         } else {
4266                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4267                 ret = psp_gpu_reset(adev);
4268         }
4269
4270         if (ret)
4271                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4272
4273         amdgpu_device_load_pci_state(adev->pdev);
4274
4275         /* wait for asic to come out of reset */
4276         for (i = 0; i < adev->usec_timeout; i++) {
4277                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4278
4279                 if (memsize != 0xffffffff)
4280                         break;
4281                 udelay(1);
4282         }
4283
4284         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4285         return ret;
4286 }
4287
4288 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4289                                         struct amdgpu_job *job,
4290                                         bool *need_full_reset_arg)
4291 {
4292         int i, r = 0;
4293         bool need_full_reset  = *need_full_reset_arg;
4294
4295         amdgpu_debugfs_wait_dump(adev);
4296
4297         if (amdgpu_sriov_vf(adev)) {
4298                 /* stop the data exchange thread */
4299                 amdgpu_virt_fini_data_exchange(adev);
4300         }
4301
4302         /* block all schedulers and reset given job's ring */
4303         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4304                 struct amdgpu_ring *ring = adev->rings[i];
4305
4306                 if (!ring || !ring->sched.thread)
4307                         continue;
4308
4309                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4310                 amdgpu_fence_driver_force_completion(ring);
4311         }
4312
4313         if(job)
4314                 drm_sched_increase_karma(&job->base);
4315
4316         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4317         if (!amdgpu_sriov_vf(adev)) {
4318
4319                 if (!need_full_reset)
4320                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4321
4322                 if (!need_full_reset) {
4323                         amdgpu_device_ip_pre_soft_reset(adev);
4324                         r = amdgpu_device_ip_soft_reset(adev);
4325                         amdgpu_device_ip_post_soft_reset(adev);
4326                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4327                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4328                                 need_full_reset = true;
4329                         }
4330                 }
4331
4332                 if (need_full_reset)
4333                         r = amdgpu_device_ip_suspend(adev);
4334
4335                 *need_full_reset_arg = need_full_reset;
4336         }
4337
4338         return r;
4339 }
4340
4341 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4342                                struct list_head *device_list_handle,
4343                                bool *need_full_reset_arg,
4344                                bool skip_hw_reset)
4345 {
4346         struct amdgpu_device *tmp_adev = NULL;
4347         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4348         int r = 0;
4349
4350         /*
4351          * ASIC reset has to be done on all HGMI hive nodes ASAP
4352          * to allow proper links negotiation in FW (within 1 sec)
4353          */
4354         if (!skip_hw_reset && need_full_reset) {
4355                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4356                         /* For XGMI run all resets in parallel to speed up the process */
4357                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4358                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4359                                         r = -EALREADY;
4360                         } else
4361                                 r = amdgpu_asic_reset(tmp_adev);
4362
4363                         if (r) {
4364                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4365                                          r, adev_to_drm(tmp_adev)->unique);
4366                                 break;
4367                         }
4368                 }
4369
4370                 /* For XGMI wait for all resets to complete before proceed */
4371                 if (!r) {
4372                         list_for_each_entry(tmp_adev, device_list_handle,
4373                                             gmc.xgmi.head) {
4374                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4375                                         flush_work(&tmp_adev->xgmi_reset_work);
4376                                         r = tmp_adev->asic_reset_res;
4377                                         if (r)
4378                                                 break;
4379                                 }
4380                         }
4381                 }
4382         }
4383
4384         if (!r && amdgpu_ras_intr_triggered()) {
4385                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4386                         if (tmp_adev->mmhub.funcs &&
4387                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4388                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4389                 }
4390
4391                 amdgpu_ras_intr_cleared();
4392         }
4393
4394         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4395                 if (need_full_reset) {
4396                         /* post card */
4397                         if (amdgpu_device_asic_init(tmp_adev))
4398                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4399
4400                         if (!r) {
4401                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4402                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4403                                 if (r)
4404                                         goto out;
4405
4406                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4407                                 if (vram_lost) {
4408                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4409                                         amdgpu_inc_vram_lost(tmp_adev);
4410                                 }
4411
4412                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4413                                 if (r)
4414                                         goto out;
4415
4416                                 r = amdgpu_device_fw_loading(tmp_adev);
4417                                 if (r)
4418                                         return r;
4419
4420                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4421                                 if (r)
4422                                         goto out;
4423
4424                                 if (vram_lost)
4425                                         amdgpu_device_fill_reset_magic(tmp_adev);
4426
4427                                 /*
4428                                  * Add this ASIC as tracked as reset was already
4429                                  * complete successfully.
4430                                  */
4431                                 amdgpu_register_gpu_instance(tmp_adev);
4432
4433                                 r = amdgpu_device_ip_late_init(tmp_adev);
4434                                 if (r)
4435                                         goto out;
4436
4437                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4438
4439                                 /*
4440                                  * The GPU enters bad state once faulty pages
4441                                  * by ECC has reached the threshold, and ras
4442                                  * recovery is scheduled next. So add one check
4443                                  * here to break recovery if it indeed exceeds
4444                                  * bad page threshold, and remind user to
4445                                  * retire this GPU or setting one bigger
4446                                  * bad_page_threshold value to fix this once
4447                                  * probing driver again.
4448                                  */
4449                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4450                                         /* must succeed. */
4451                                         amdgpu_ras_resume(tmp_adev);
4452                                 } else {
4453                                         r = -EINVAL;
4454                                         goto out;
4455                                 }
4456
4457                                 /* Update PSP FW topology after reset */
4458                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4459                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4460                         }
4461                 }
4462
4463 out:
4464                 if (!r) {
4465                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4466                         r = amdgpu_ib_ring_tests(tmp_adev);
4467                         if (r) {
4468                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4469                                 r = amdgpu_device_ip_suspend(tmp_adev);
4470                                 need_full_reset = true;
4471                                 r = -EAGAIN;
4472                                 goto end;
4473                         }
4474                 }
4475
4476                 if (!r)
4477                         r = amdgpu_device_recover_vram(tmp_adev);
4478                 else
4479                         tmp_adev->asic_reset_res = r;
4480         }
4481
4482 end:
4483         *need_full_reset_arg = need_full_reset;
4484         return r;
4485 }
4486
4487 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4488                                 struct amdgpu_hive_info *hive)
4489 {
4490         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4491                 return false;
4492
4493         if (hive) {
4494                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4495         } else {
4496                 down_write(&adev->reset_sem);
4497         }
4498
4499         switch (amdgpu_asic_reset_method(adev)) {
4500         case AMD_RESET_METHOD_MODE1:
4501                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4502                 break;
4503         case AMD_RESET_METHOD_MODE2:
4504                 adev->mp1_state = PP_MP1_STATE_RESET;
4505                 break;
4506         default:
4507                 adev->mp1_state = PP_MP1_STATE_NONE;
4508                 break;
4509         }
4510
4511         return true;
4512 }
4513
4514 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4515 {
4516         amdgpu_vf_error_trans_all(adev);
4517         adev->mp1_state = PP_MP1_STATE_NONE;
4518         atomic_set(&adev->in_gpu_reset, 0);
4519         up_write(&adev->reset_sem);
4520 }
4521
4522 /*
4523  * to lockup a list of amdgpu devices in a hive safely, if not a hive
4524  * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
4525  *
4526  * unlock won't require roll back.
4527  */
4528 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4529 {
4530         struct amdgpu_device *tmp_adev = NULL;
4531
4532         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4533                 if (!hive) {
4534                         dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4535                         return -ENODEV;
4536                 }
4537                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4538                         if (!amdgpu_device_lock_adev(tmp_adev, hive))
4539                                 goto roll_back;
4540                 }
4541         } else if (!amdgpu_device_lock_adev(adev, hive))
4542                 return -EAGAIN;
4543
4544         return 0;
4545 roll_back:
4546         if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4547                 /*
4548                  * if the lockup iteration break in the middle of a hive,
4549                  * it may means there may has a race issue,
4550                  * or a hive device locked up independently.
4551                  * we may be in trouble and may not, so will try to roll back
4552                  * the lock and give out a warnning.
4553                  */
4554                 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4555                 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4556                         amdgpu_device_unlock_adev(tmp_adev);
4557                 }
4558         }
4559         return -EAGAIN;
4560 }
4561
4562 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4563 {
4564         struct pci_dev *p = NULL;
4565
4566         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4567                         adev->pdev->bus->number, 1);
4568         if (p) {
4569                 pm_runtime_enable(&(p->dev));
4570                 pm_runtime_resume(&(p->dev));
4571         }
4572 }
4573
4574 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4575 {
4576         enum amd_reset_method reset_method;
4577         struct pci_dev *p = NULL;
4578         u64 expires;
4579
4580         /*
4581          * For now, only BACO and mode1 reset are confirmed
4582          * to suffer the audio issue without proper suspended.
4583          */
4584         reset_method = amdgpu_asic_reset_method(adev);
4585         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4586              (reset_method != AMD_RESET_METHOD_MODE1))
4587                 return -EINVAL;
4588
4589         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4590                         adev->pdev->bus->number, 1);
4591         if (!p)
4592                 return -ENODEV;
4593
4594         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4595         if (!expires)
4596                 /*
4597                  * If we cannot get the audio device autosuspend delay,
4598                  * a fixed 4S interval will be used. Considering 3S is
4599                  * the audio controller default autosuspend delay setting.
4600                  * 4S used here is guaranteed to cover that.
4601                  */
4602                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4603
4604         while (!pm_runtime_status_suspended(&(p->dev))) {
4605                 if (!pm_runtime_suspend(&(p->dev)))
4606                         break;
4607
4608                 if (expires < ktime_get_mono_fast_ns()) {
4609                         dev_warn(adev->dev, "failed to suspend display audio\n");
4610                         /* TODO: abort the succeeding gpu reset? */
4611                         return -ETIMEDOUT;
4612                 }
4613         }
4614
4615         pm_runtime_disable(&(p->dev));
4616
4617         return 0;
4618 }
4619
4620 /**
4621  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4622  *
4623  * @adev: amdgpu_device pointer
4624  * @job: which job trigger hang
4625  *
4626  * Attempt to reset the GPU if it has hung (all asics).
4627  * Attempt to do soft-reset or full-reset and reinitialize Asic
4628  * Returns 0 for success or an error on failure.
4629  */
4630
4631 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4632                               struct amdgpu_job *job)
4633 {
4634         struct list_head device_list, *device_list_handle =  NULL;
4635         bool need_full_reset = false;
4636         bool job_signaled = false;
4637         struct amdgpu_hive_info *hive = NULL;
4638         struct amdgpu_device *tmp_adev = NULL;
4639         int i, r = 0;
4640         bool need_emergency_restart = false;
4641         bool audio_suspended = false;
4642
4643         /*
4644          * Special case: RAS triggered and full reset isn't supported
4645          */
4646         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4647
4648         /*
4649          * Flush RAM to disk so that after reboot
4650          * the user can read log and see why the system rebooted.
4651          */
4652         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4653                 DRM_WARN("Emergency reboot.");
4654
4655                 ksys_sync_helper();
4656                 emergency_restart();
4657         }
4658
4659         dev_info(adev->dev, "GPU %s begin!\n",
4660                 need_emergency_restart ? "jobs stop":"reset");
4661
4662         /*
4663          * Here we trylock to avoid chain of resets executing from
4664          * either trigger by jobs on different adevs in XGMI hive or jobs on
4665          * different schedulers for same device while this TO handler is running.
4666          * We always reset all schedulers for device and all devices for XGMI
4667          * hive so that should take care of them too.
4668          */
4669         hive = amdgpu_get_xgmi_hive(adev);
4670         if (hive) {
4671                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4672                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4673                                 job ? job->base.id : -1, hive->hive_id);
4674                         amdgpu_put_xgmi_hive(hive);
4675                         if (job)
4676                                 drm_sched_increase_karma(&job->base);
4677                         return 0;
4678                 }
4679                 mutex_lock(&hive->hive_lock);
4680         }
4681
4682         /*
4683          * lock the device before we try to operate the linked list
4684          * if didn't get the device lock, don't touch the linked list since
4685          * others may iterating it.
4686          */
4687         r = amdgpu_device_lock_hive_adev(adev, hive);
4688         if (r) {
4689                 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4690                                         job ? job->base.id : -1);
4691
4692                 /* even we skipped this reset, still need to set the job to guilty */
4693                 if (job)
4694                         drm_sched_increase_karma(&job->base);
4695                 goto skip_recovery;
4696         }
4697
4698         /*
4699          * Build list of devices to reset.
4700          * In case we are in XGMI hive mode, resort the device list
4701          * to put adev in the 1st position.
4702          */
4703         INIT_LIST_HEAD(&device_list);
4704         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4705                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4706                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4707                 device_list_handle = &hive->device_list;
4708         } else {
4709                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4710                 device_list_handle = &device_list;
4711         }
4712
4713         /* block all schedulers and reset given job's ring */
4714         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4715                 /*
4716                  * Try to put the audio codec into suspend state
4717                  * before gpu reset started.
4718                  *
4719                  * Due to the power domain of the graphics device
4720                  * is shared with AZ power domain. Without this,
4721                  * we may change the audio hardware from behind
4722                  * the audio driver's back. That will trigger
4723                  * some audio codec errors.
4724                  */
4725                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4726                         audio_suspended = true;
4727
4728                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4729
4730                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4731
4732                 if (!amdgpu_sriov_vf(tmp_adev))
4733                         amdgpu_amdkfd_pre_reset(tmp_adev);
4734
4735                 /*
4736                  * Mark these ASICs to be reseted as untracked first
4737                  * And add them back after reset completed
4738                  */
4739                 amdgpu_unregister_gpu_instance(tmp_adev);
4740
4741                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4742
4743                 /* disable ras on ALL IPs */
4744                 if (!need_emergency_restart &&
4745                       amdgpu_device_ip_need_full_reset(tmp_adev))
4746                         amdgpu_ras_suspend(tmp_adev);
4747
4748                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4749                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4750
4751                         if (!ring || !ring->sched.thread)
4752                                 continue;
4753
4754                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4755
4756                         if (need_emergency_restart)
4757                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4758                 }
4759                 atomic_inc(&tmp_adev->gpu_reset_counter);
4760         }
4761
4762         if (need_emergency_restart)
4763                 goto skip_sched_resume;
4764
4765         /*
4766          * Must check guilty signal here since after this point all old
4767          * HW fences are force signaled.
4768          *
4769          * job->base holds a reference to parent fence
4770          */
4771         if (job && job->base.s_fence->parent &&
4772             dma_fence_is_signaled(job->base.s_fence->parent)) {
4773                 job_signaled = true;
4774                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4775                 goto skip_hw_reset;
4776         }
4777
4778 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4779         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4780                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4781                                                  (tmp_adev == adev) ? job : NULL,
4782                                                  &need_full_reset);
4783                 /*TODO Should we stop ?*/
4784                 if (r) {
4785                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4786                                   r, adev_to_drm(tmp_adev)->unique);
4787                         tmp_adev->asic_reset_res = r;
4788                 }
4789         }
4790
4791         /* Actual ASIC resets if needed.*/
4792         /* TODO Implement XGMI hive reset logic for SRIOV */
4793         if (amdgpu_sriov_vf(adev)) {
4794                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4795                 if (r)
4796                         adev->asic_reset_res = r;
4797         } else {
4798                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4799                 if (r && r == -EAGAIN)
4800                         goto retry;
4801         }
4802
4803 skip_hw_reset:
4804
4805         /* Post ASIC reset for all devs .*/
4806         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4807
4808                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4809                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4810
4811                         if (!ring || !ring->sched.thread)
4812                                 continue;
4813
4814                         /* No point to resubmit jobs if we didn't HW reset*/
4815                         if (!tmp_adev->asic_reset_res && !job_signaled)
4816                                 drm_sched_resubmit_jobs(&ring->sched);
4817
4818                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4819                 }
4820
4821                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4822                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4823                 }
4824
4825                 tmp_adev->asic_reset_res = 0;
4826
4827                 if (r) {
4828                         /* bad news, how to tell it to userspace ? */
4829                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4830                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4831                 } else {
4832                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4833                 }
4834         }
4835
4836 skip_sched_resume:
4837         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4838                 /*unlock kfd: SRIOV would do it separately */
4839                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4840                         amdgpu_amdkfd_post_reset(tmp_adev);
4841                 if (audio_suspended)
4842                         amdgpu_device_resume_display_audio(tmp_adev);
4843                 amdgpu_device_unlock_adev(tmp_adev);
4844         }
4845
4846 skip_recovery:
4847         if (hive) {
4848                 atomic_set(&hive->in_reset, 0);
4849                 mutex_unlock(&hive->hive_lock);
4850                 amdgpu_put_xgmi_hive(hive);
4851         }
4852
4853         if (r && r != -EAGAIN)
4854                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4855         return r;
4856 }
4857
4858 /**
4859  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4860  *
4861  * @adev: amdgpu_device pointer
4862  *
4863  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4864  * and lanes) of the slot the device is in. Handles APUs and
4865  * virtualized environments where PCIE config space may not be available.
4866  */
4867 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4868 {
4869         struct pci_dev *pdev;
4870         enum pci_bus_speed speed_cap, platform_speed_cap;
4871         enum pcie_link_width platform_link_width;
4872
4873         if (amdgpu_pcie_gen_cap)
4874                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4875
4876         if (amdgpu_pcie_lane_cap)
4877                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4878
4879         /* covers APUs as well */
4880         if (pci_is_root_bus(adev->pdev->bus)) {
4881                 if (adev->pm.pcie_gen_mask == 0)
4882                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4883                 if (adev->pm.pcie_mlw_mask == 0)
4884                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4885                 return;
4886         }
4887
4888         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4889                 return;
4890
4891         pcie_bandwidth_available(adev->pdev, NULL,
4892                                  &platform_speed_cap, &platform_link_width);
4893
4894         if (adev->pm.pcie_gen_mask == 0) {
4895                 /* asic caps */
4896                 pdev = adev->pdev;
4897                 speed_cap = pcie_get_speed_cap(pdev);
4898                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4899                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4900                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4901                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4902                 } else {
4903                         if (speed_cap == PCIE_SPEED_32_0GT)
4904                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4905                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4906                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4907                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4908                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
4909                         else if (speed_cap == PCIE_SPEED_16_0GT)
4910                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4911                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4912                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4913                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4914                         else if (speed_cap == PCIE_SPEED_8_0GT)
4915                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4916                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4917                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4918                         else if (speed_cap == PCIE_SPEED_5_0GT)
4919                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4920                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4921                         else
4922                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4923                 }
4924                 /* platform caps */
4925                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4926                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4927                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4928                 } else {
4929                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
4930                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4931                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4932                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4933                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4934                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
4935                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
4936                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4937                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4938                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4939                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4940                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4941                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4942                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4943                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4944                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4945                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4946                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4947                         else
4948                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4949
4950                 }
4951         }
4952         if (adev->pm.pcie_mlw_mask == 0) {
4953                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4954                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4955                 } else {
4956                         switch (platform_link_width) {
4957                         case PCIE_LNK_X32:
4958                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4959                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4960                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4961                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4962                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4963                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4964                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4965                                 break;
4966                         case PCIE_LNK_X16:
4967                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4968                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4969                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4970                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4971                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4972                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4973                                 break;
4974                         case PCIE_LNK_X12:
4975                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4976                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4977                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4978                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4979                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4980                                 break;
4981                         case PCIE_LNK_X8:
4982                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4983                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4984                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4985                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4986                                 break;
4987                         case PCIE_LNK_X4:
4988                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4989                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4990                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4991                                 break;
4992                         case PCIE_LNK_X2:
4993                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4994                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4995                                 break;
4996                         case PCIE_LNK_X1:
4997                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4998                                 break;
4999                         default:
5000                                 break;
5001                         }
5002                 }
5003         }
5004 }
5005
5006 int amdgpu_device_baco_enter(struct drm_device *dev)
5007 {
5008         struct amdgpu_device *adev = drm_to_adev(dev);
5009         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5010
5011         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5012                 return -ENOTSUPP;
5013
5014         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5015                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5016
5017         return amdgpu_dpm_baco_enter(adev);
5018 }
5019
5020 int amdgpu_device_baco_exit(struct drm_device *dev)
5021 {
5022         struct amdgpu_device *adev = drm_to_adev(dev);
5023         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5024         int ret = 0;
5025
5026         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5027                 return -ENOTSUPP;
5028
5029         ret = amdgpu_dpm_baco_exit(adev);
5030         if (ret)
5031                 return ret;
5032
5033         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5034                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5035
5036         return 0;
5037 }
5038
5039 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5040 {
5041         int i;
5042
5043         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5044                 struct amdgpu_ring *ring = adev->rings[i];
5045
5046                 if (!ring || !ring->sched.thread)
5047                         continue;
5048
5049                 cancel_delayed_work_sync(&ring->sched.work_tdr);
5050         }
5051 }
5052
5053 /**
5054  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5055  * @pdev: PCI device struct
5056  * @state: PCI channel state
5057  *
5058  * Description: Called when a PCI error is detected.
5059  *
5060  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5061  */
5062 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5063 {
5064         struct drm_device *dev = pci_get_drvdata(pdev);
5065         struct amdgpu_device *adev = drm_to_adev(dev);
5066         int i;
5067
5068         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5069
5070         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5071                 DRM_WARN("No support for XGMI hive yet...");
5072                 return PCI_ERS_RESULT_DISCONNECT;
5073         }
5074
5075         switch (state) {
5076         case pci_channel_io_normal:
5077                 return PCI_ERS_RESULT_CAN_RECOVER;
5078         /* Fatal error, prepare for slot reset */
5079         case pci_channel_io_frozen:
5080                 /*
5081                  * Cancel and wait for all TDRs in progress if failing to
5082                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
5083                  *
5084                  * Locking adev->reset_sem will prevent any external access
5085                  * to GPU during PCI error recovery
5086                  */
5087                 while (!amdgpu_device_lock_adev(adev, NULL))
5088                         amdgpu_cancel_all_tdr(adev);
5089
5090                 /*
5091                  * Block any work scheduling as we do for regular GPU reset
5092                  * for the duration of the recovery
5093                  */
5094                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5095                         struct amdgpu_ring *ring = adev->rings[i];
5096
5097                         if (!ring || !ring->sched.thread)
5098                                 continue;
5099
5100                         drm_sched_stop(&ring->sched, NULL);
5101                 }
5102                 atomic_inc(&adev->gpu_reset_counter);
5103                 return PCI_ERS_RESULT_NEED_RESET;
5104         case pci_channel_io_perm_failure:
5105                 /* Permanent error, prepare for device removal */
5106                 return PCI_ERS_RESULT_DISCONNECT;
5107         }
5108
5109         return PCI_ERS_RESULT_NEED_RESET;
5110 }
5111
5112 /**
5113  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5114  * @pdev: pointer to PCI device
5115  */
5116 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5117 {
5118
5119         DRM_INFO("PCI error: mmio enabled callback!!\n");
5120
5121         /* TODO - dump whatever for debugging purposes */
5122
5123         /* This called only if amdgpu_pci_error_detected returns
5124          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5125          * works, no need to reset slot.
5126          */
5127
5128         return PCI_ERS_RESULT_RECOVERED;
5129 }
5130
5131 /**
5132  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5133  * @pdev: PCI device struct
5134  *
5135  * Description: This routine is called by the pci error recovery
5136  * code after the PCI slot has been reset, just before we
5137  * should resume normal operations.
5138  */
5139 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5140 {
5141         struct drm_device *dev = pci_get_drvdata(pdev);
5142         struct amdgpu_device *adev = drm_to_adev(dev);
5143         int r, i;
5144         bool need_full_reset = true;
5145         u32 memsize;
5146         struct list_head device_list;
5147
5148         DRM_INFO("PCI error: slot reset callback!!\n");
5149
5150         INIT_LIST_HEAD(&device_list);
5151         list_add_tail(&adev->gmc.xgmi.head, &device_list);
5152
5153         /* wait for asic to come out of reset */
5154         msleep(500);
5155
5156         /* Restore PCI confspace */
5157         amdgpu_device_load_pci_state(pdev);
5158
5159         /* confirm  ASIC came out of reset */
5160         for (i = 0; i < adev->usec_timeout; i++) {
5161                 memsize = amdgpu_asic_get_config_memsize(adev);
5162
5163                 if (memsize != 0xffffffff)
5164                         break;
5165                 udelay(1);
5166         }
5167         if (memsize == 0xffffffff) {
5168                 r = -ETIME;
5169                 goto out;
5170         }
5171
5172         adev->in_pci_err_recovery = true;
5173         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5174         adev->in_pci_err_recovery = false;
5175         if (r)
5176                 goto out;
5177
5178         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5179
5180 out:
5181         if (!r) {
5182                 if (amdgpu_device_cache_pci_state(adev->pdev))
5183                         pci_restore_state(adev->pdev);
5184
5185                 DRM_INFO("PCIe error recovery succeeded\n");
5186         } else {
5187                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5188                 amdgpu_device_unlock_adev(adev);
5189         }
5190
5191         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5192 }
5193
5194 /**
5195  * amdgpu_pci_resume() - resume normal ops after PCI reset
5196  * @pdev: pointer to PCI device
5197  *
5198  * Called when the error recovery driver tells us that its
5199  * OK to resume normal operation.
5200  */
5201 void amdgpu_pci_resume(struct pci_dev *pdev)
5202 {
5203         struct drm_device *dev = pci_get_drvdata(pdev);
5204         struct amdgpu_device *adev = drm_to_adev(dev);
5205         int i;
5206
5207
5208         DRM_INFO("PCI error: resume callback!!\n");
5209
5210         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5211                 struct amdgpu_ring *ring = adev->rings[i];
5212
5213                 if (!ring || !ring->sched.thread)
5214                         continue;
5215
5216
5217                 drm_sched_resubmit_jobs(&ring->sched);
5218                 drm_sched_start(&ring->sched, true);
5219         }
5220
5221         amdgpu_device_unlock_adev(adev);
5222 }
5223
5224 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5225 {
5226         struct drm_device *dev = pci_get_drvdata(pdev);
5227         struct amdgpu_device *adev = drm_to_adev(dev);
5228         int r;
5229
5230         r = pci_save_state(pdev);
5231         if (!r) {
5232                 kfree(adev->pci_state);
5233
5234                 adev->pci_state = pci_store_saved_state(pdev);
5235
5236                 if (!adev->pci_state) {
5237                         DRM_ERROR("Failed to store PCI saved state");
5238                         return false;
5239                 }
5240         } else {
5241                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5242                 return false;
5243         }
5244
5245         return true;
5246 }
5247
5248 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5249 {
5250         struct drm_device *dev = pci_get_drvdata(pdev);
5251         struct amdgpu_device *adev = drm_to_adev(dev);
5252         int r;
5253
5254         if (!adev->pci_state)
5255                 return false;
5256
5257         r = pci_load_saved_state(pdev, adev->pci_state);
5258
5259         if (!r) {
5260                 pci_restore_state(pdev);
5261         } else {
5262                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5263                 return false;
5264         }
5265
5266         return true;
5267 }
5268
5269