drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
/* gpu_info firmware files declared for this module, one per listed ASIC. */
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

/*
 * NOTE(review): no user of this constant is visible in this part of the
 * file; going by its name it is a resume-related duration in
 * milliseconds -- confirm at the use site.
 */
#define AMDGPU_RESUME_MS		2000
  85
/*
 * Printable ASIC names, terminated by a "LAST" entry.
 *
 * NOTE(review): presumably indexed by the ASIC type enumeration, in which
 * case the order here has to stay in sync with that enum -- confirm
 * against its definition before adding or reordering entries.
 */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"LAST",
};
 117
 118 /**
 119  * DOC: pcie_replay_count
 120  *
 121  * The amdgpu driver provides a sysfs API for reporting the total number
 122  * of PCIe replays (NAKs)
 123  * The file pcie_replay_count is used for this and returns the total
 124  * number of replays as a sum of the NAKs generated and NAKs received
 125  */
 126
 127 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 128                 struct device_attribute *attr, char *buf)
 129 {
 130         struct drm_device *ddev = dev_get_drvdata(dev);
 131         struct amdgpu_device *adev = ddev->dev_private;
 132         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 133
 134         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 135 }
 136
 137 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 138                 amdgpu_device_get_pcie_replay_count, NULL);
 139
/* Forward declaration; the definition is not in this part of the file. */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 141
 142 /**
 143  * DOC: product_name
 144  *
 145  * The amdgpu driver provides a sysfs API for reporting the product name
 146  * for the device
 147  * The file serial_number is used for this and returns the product name
 148  * as returned from the FRU.
 149  * NOTE: This is only available for certain server cards
 150  */
 151
 152 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 153                 struct device_attribute *attr, char *buf)
 154 {
 155         struct drm_device *ddev = dev_get_drvdata(dev);
 156         struct amdgpu_device *adev = ddev->dev_private;
 157
 158         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 159 }
 160
 161 static DEVICE_ATTR(product_name, S_IRUGO,
 162                 amdgpu_device_get_product_name, NULL);
 163
 164 /**
 165  * DOC: product_number
 166  *
 167  * The amdgpu driver provides a sysfs API for reporting the part number
 168  * for the device
 169  * The file serial_number is used for this and returns the part number
 170  * as returned from the FRU.
 171  * NOTE: This is only available for certain server cards
 172  */
 173
 174 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 175                 struct device_attribute *attr, char *buf)
 176 {
 177         struct drm_device *ddev = dev_get_drvdata(dev);
 178         struct amdgpu_device *adev = ddev->dev_private;
 179
 180         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 181 }
 182
 183 static DEVICE_ATTR(product_number, S_IRUGO,
 184                 amdgpu_device_get_product_number, NULL);
 185
 186 /**
 187  * DOC: serial_number
 188  *
 189  * The amdgpu driver provides a sysfs API for reporting the serial number
 190  * for the device
 191  * The file serial_number is used for this and returns the serial number
 192  * as returned from the FRU.
 193  * NOTE: This is only available for certain server cards
 194  */
 195
 196 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 197                 struct device_attribute *attr, char *buf)
 198 {
 199         struct drm_device *ddev = dev_get_drvdata(dev);
 200         struct amdgpu_device *adev = ddev->dev_private;
 201
 202         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 203 }
 204
 205 static DEVICE_ATTR(serial_number, S_IRUGO,
 206                 amdgpu_device_get_serial_number, NULL);
 207
 208 /**
 209  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 210  *
 211  * @dev: drm_device pointer
 212  *
 213  * Returns true if the device is a dGPU with HG/PX power control,
 214  * otherwise return false.
 215  */
 216 bool amdgpu_device_supports_boco(struct drm_device *dev)
 217 {
 218         struct amdgpu_device *adev = dev->dev_private;
 219
 220         if (adev->flags & AMD_IS_PX)
 221                 return true;
 222         return false;
 223 }
 224
 225 /**
 226  * amdgpu_device_supports_baco - Does the device support BACO
 227  *
 228  * @dev: drm_device pointer
 229  *
 230  * Returns true if the device supporte BACO,
 231  * otherwise return false.
 232  */
 233 bool amdgpu_device_supports_baco(struct drm_device *dev)
 234 {
 235         struct amdgpu_device *adev = dev->dev_private;
 236
 237         return amdgpu_asic_supports_baco(adev);
 238 }
 239
 240 /**
 241  * VRAM access helper functions.
 242  *
 243  * amdgpu_device_vram_access - read/write a buffer in vram
 244  *
 245  * @adev: amdgpu_device pointer
 246  * @pos: offset of the buffer in vram
 247  * @buf: virtual address of the buffer in system memory
 248  * @size: read/write size, sizeof(@buf) must > @size
 249  * @write: true - write to vram, otherwise - read from vram
 250  */
 251 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 252                                uint32_t *buf, size_t size, bool write)
 253 {
 254         unsigned long flags;
 255         uint32_t hi = ~0;
 256         uint64_t last;
 257
 258         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 259         for (last = pos + size; pos < last; pos += 4) {
 260                 uint32_t tmp = pos >> 31;
 261
 262                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 263                 if (tmp != hi) {
 264                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 265                         hi = tmp;
 266                 }
 267                 if (write)
 268                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 269                 else
 270                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 271         }
 272         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 273 }
 274
 275 /*
 276  * device register access helper functions.
 277  */
 278 /**
 279  * amdgpu_device_rreg - read a register
 280  *
 281  * @adev: amdgpu_device pointer
 282  * @reg: dword aligned register offset
 283  * @acc_flags: access flags which require special behavior
 284  *
 285  * Returns the 32 bit value from the offset specified.
 286  */
 287 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, uint32_t reg,
 288                             uint32_t acc_flags)
 289 {
 290         uint32_t ret;
 291
 292         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 293                 return amdgpu_kiq_rreg(adev, reg);
 294
 295         if ((reg * 4) < adev->rmmio_size)
 296                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 297         else
 298                 ret = adev->pcie_rreg(adev, (reg * 4));
 299         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 300         return ret;
 301 }
 302
 303 /*
 304  * MMIO register read with bytes helper functions
 305  * @offset:bytes offset from MMIO start
 306  *
 307 */
 308
 309 /**
 310  * amdgpu_mm_rreg8 - read a memory mapped IO register
 311  *
 312  * @adev: amdgpu_device pointer
 313  * @offset: byte aligned register offset
 314  *
 315  * Returns the 8 bit value from the offset specified.
 316  */
 317 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
 318         if (offset < adev->rmmio_size)
 319                 return (readb(adev->rmmio + offset));
 320         BUG();
 321 }
 322
/*
 * MMIO byte-wide register write helper
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
 329 /**
 330  * amdgpu_mm_wreg8 - read a memory mapped IO register
 331  *
 332  * @adev: amdgpu_device pointer
 333  * @offset: byte aligned register offset
 334  * @value: 8 bit value to write
 335  *
 336  * Writes the value specified to the offset specified.
 337  */
 338 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
 339         if (offset < adev->rmmio_size)
 340                 writeb(value, adev->rmmio + offset);
 341         else
 342                 BUG();
 343 }
 344
 345 void static inline amdgpu_device_wreg_no_kiq(struct amdgpu_device *adev, uint32_t reg,
 346                                              uint32_t v, uint32_t acc_flags)
 347 {
 348         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 349
 350         if ((reg * 4) < adev->rmmio_size)
 351                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 352         else
 353                 adev->pcie_wreg(adev, (reg * 4), v);
 354 }
 355
 356 /**
 357  * amdgpu_device_wreg - write to a register
 358  *
 359  * @adev: amdgpu_device pointer
 360  * @reg: dword aligned register offset
 361  * @v: 32 bit value to write to the register
 362  * @acc_flags: access flags which require special behavior
 363  *
 364  * Writes the value specified to the offset specified.
 365  */
 366 void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 367                         uint32_t acc_flags)
 368 {
 369         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 370                 return amdgpu_kiq_wreg(adev, reg, v);
 371
 372         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
 373 }
 374
 375 /*
 376  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 377  *
 378  * this function is invoked only the debugfs register access
 379  * */
 380 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 381                     uint32_t acc_flags)
 382 {
 383         if (amdgpu_sriov_fullaccess(adev) &&
 384                 adev->gfx.rlc.funcs &&
 385                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
 386
 387                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 388                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 389         }
 390
 391         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
 392 }
 393
 394 /**
 395  * amdgpu_io_rreg - read an IO register
 396  *
 397  * @adev: amdgpu_device pointer
 398  * @reg: dword aligned register offset
 399  *
 400  * Returns the 32 bit value from the offset specified.
 401  */
 402 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 403 {
 404         if ((reg * 4) < adev->rio_mem_size)
 405                 return ioread32(adev->rio_mem + (reg * 4));
 406         else {
 407                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 408                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 409         }
 410 }
 411
 412 /**
 413  * amdgpu_io_wreg - write to an IO register
 414  *
 415  * @adev: amdgpu_device pointer
 416  * @reg: dword aligned register offset
 417  * @v: 32 bit value to write to the register
 418  *
 419  * Writes the value specified to the offset specified.
 420  */
 421 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 422 {
 423         if ((reg * 4) < adev->rio_mem_size)
 424                 iowrite32(v, adev->rio_mem + (reg * 4));
 425         else {
 426                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 427                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 428         }
 429 }
 430
 431 /**
 432  * amdgpu_mm_rdoorbell - read a doorbell dword
 433  *
 434  * @adev: amdgpu_device pointer
 435  * @index: doorbell index
 436  *
 437  * Returns the value in the doorbell aperture at the
 438  * requested doorbell index (CIK).
 439  */
 440 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 441 {
 442         if (index < adev->doorbell.num_doorbells) {
 443                 return readl(adev->doorbell.ptr + index);
 444         } else {
 445                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 446                 return 0;
 447         }
 448 }
 449
 450 /**
 451  * amdgpu_mm_wdoorbell - write a doorbell dword
 452  *
 453  * @adev: amdgpu_device pointer
 454  * @index: doorbell index
 455  * @v: value to write
 456  *
 457  * Writes @v to the doorbell aperture at the
 458  * requested doorbell index (CIK).
 459  */
 460 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 461 {
 462         if (index < adev->doorbell.num_doorbells) {
 463                 writel(v, adev->doorbell.ptr + index);
 464         } else {
 465                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 466         }
 467 }
 468
 469 /**
 470  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 471  *
 472  * @adev: amdgpu_device pointer
 473  * @index: doorbell index
 474  *
 475  * Returns the value in the doorbell aperture at the
 476  * requested doorbell index (VEGA10+).
 477  */
 478 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 479 {
 480         if (index < adev->doorbell.num_doorbells) {
 481                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 482         } else {
 483                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 484                 return 0;
 485         }
 486 }
 487
 488 /**
 489  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 490  *
 491  * @adev: amdgpu_device pointer
 492  * @index: doorbell index
 493  * @v: value to write
 494  *
 495  * Writes @v to the doorbell aperture at the
 496  * requested doorbell index (VEGA10+).
 497  */
 498 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 499 {
 500         if (index < adev->doorbell.num_doorbells) {
 501                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 502         } else {
 503                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 504         }
 505 }
 506
 507 /**
 508  * amdgpu_invalid_rreg - dummy reg read function
 509  *
 510  * @adev: amdgpu device pointer
 511  * @reg: offset of register
 512  *
 513  * Dummy register read function.  Used for register blocks
 514  * that certain asics don't have (all asics).
 515  * Returns the value in the register.
 516  */
 517 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 518 {
 519         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 520         BUG();
 521         return 0;
 522 }
 523
 524 /**
 525  * amdgpu_invalid_wreg - dummy reg write function
 526  *
 527  * @adev: amdgpu device pointer
 528  * @reg: offset of register
 529  * @v: value to write to the register
 530  *
 531  * Dummy register read function.  Used for register blocks
 532  * that certain asics don't have (all asics).
 533  */
 534 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 535 {
 536         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 537                   reg, v);
 538         BUG();
 539 }
 540
 541 /**
 542  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 543  *
 544  * @adev: amdgpu device pointer
 545  * @reg: offset of register
 546  *
 547  * Dummy register read function.  Used for register blocks
 548  * that certain asics don't have (all asics).
 549  * Returns the value in the register.
 550  */
 551 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 552 {
 553         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 554         BUG();
 555         return 0;
 556 }
 557
 558 /**
 559  * amdgpu_invalid_wreg64 - dummy reg write function
 560  *
 561  * @adev: amdgpu device pointer
 562  * @reg: offset of register
 563  * @v: value to write to the register
 564  *
 565  * Dummy register read function.  Used for register blocks
 566  * that certain asics don't have (all asics).
 567  */
 568 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 569 {
 570         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 571                   reg, v);
 572         BUG();
 573 }
 574
 575 /**
 576  * amdgpu_block_invalid_rreg - dummy reg read function
 577  *
 578  * @adev: amdgpu device pointer
 579  * @block: offset of instance
 580  * @reg: offset of register
 581  *
 582  * Dummy register read function.  Used for register blocks
 583  * that certain asics don't have (all asics).
 584  * Returns the value in the register.
 585  */
 586 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 587                                           uint32_t block, uint32_t reg)
 588 {
 589         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 590                   reg, block);
 591         BUG();
 592         return 0;
 593 }
 594
 595 /**
 596  * amdgpu_block_invalid_wreg - dummy reg write function
 597  *
 598  * @adev: amdgpu device pointer
 599  * @block: offset of instance
 600  * @reg: offset of register
 601  * @v: value to write to the register
 602  *
 603  * Dummy register read function.  Used for register blocks
 604  * that certain asics don't have (all asics).
 605  */
 606 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 607                                       uint32_t block,
 608                                       uint32_t reg, uint32_t v)
 609 {
 610         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 611                   reg, block, v);
 612         BUG();
 613 }
 614
 615 /**
 616  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 617  *
 618  * @adev: amdgpu device pointer
 619  *
 620  * Allocates a scratch page of VRAM for use by various things in the
 621  * driver.
 622  */
 623 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 624 {
 625         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 626                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 627                                        &adev->vram_scratch.robj,
 628                                        &adev->vram_scratch.gpu_addr,
 629                                        (void **)&adev->vram_scratch.ptr);
 630 }
 631
 632 /**
 633  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 634  *
 635  * @adev: amdgpu device pointer
 636  *
 637  * Frees the VRAM scratch page.
 638  */
 639 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 640 {
 641         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 642 }
 643
 644 /**
 645  * amdgpu_device_program_register_sequence - program an array of registers.
 646  *
 647  * @adev: amdgpu_device pointer
 648  * @registers: pointer to the register array
 649  * @array_size: size of the register array
 650  *
 651  * Programs an array or registers with and and or masks.
 652  * This is a helper for setting golden registers.
 653  */
 654 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 655                                              const u32 *registers,
 656                                              const u32 array_size)
 657 {
 658         u32 tmp, reg, and_mask, or_mask;
 659         int i;
 660
 661         if (array_size % 3)
 662                 return;
 663
 664         for (i = 0; i < array_size; i +=3) {
 665                 reg = registers[i + 0];
 666                 and_mask = registers[i + 1];
 667                 or_mask = registers[i + 2];
 668
 669                 if (and_mask == 0xffffffff) {
 670                         tmp = or_mask;
 671                 } else {
 672                         tmp = RREG32(reg);
 673                         tmp &= ~and_mask;
 674                         if (adev->family >= AMDGPU_FAMILY_AI)
 675                                 tmp |= (or_mask & and_mask);
 676                         else
 677                                 tmp |= or_mask;
 678                 }
 679                 WREG32(reg, tmp);
 680         }
 681 }
 682
 683 /**
 684  * amdgpu_device_pci_config_reset - reset the GPU
 685  *
 686  * @adev: amdgpu_device pointer
 687  *
 688  * Resets the GPU using the pci config reset sequence.
 689  * Only applicable to asics prior to vega10.
 690  */
 691 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 692 {
 693         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 694 }
 695
 696 /*
 697  * GPU doorbell aperture helpers function.
 698  */
 699 /**
 700  * amdgpu_device_doorbell_init - Init doorbell driver information.
 701  *
 702  * @adev: amdgpu_device pointer
 703  *
 704  * Init doorbell driver information (CIK)
 705  * Returns 0 on success, error on failure.
 706  */
 707 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 708 {
 709
 710         /* No doorbell on SI hardware generation */
 711         if (adev->asic_type < CHIP_BONAIRE) {
 712                 adev->doorbell.base = 0;
 713                 adev->doorbell.size = 0;
 714                 adev->doorbell.num_doorbells = 0;
 715                 adev->doorbell.ptr = NULL;
 716                 return 0;
 717         }
 718
 719         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 720                 return -EINVAL;
 721
 722         amdgpu_asic_init_doorbell_index(adev);
 723
 724         /* doorbell bar mapping */
 725         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 726         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 727
 728         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 729                                              adev->doorbell_index.max_assignment+1);
 730         if (adev->doorbell.num_doorbells == 0)
 731                 return -EINVAL;
 732
 733         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 734          * paging queue doorbell use the second page. The
 735          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 736          * doorbells are in the first page. So with paging queue enabled,
 737          * the max num_doorbells should + 1 page (0x400 in dword)
 738          */
 739         if (adev->asic_type >= CHIP_VEGA10)
 740                 adev->doorbell.num_doorbells += 0x400;
 741
 742         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 743                                      adev->doorbell.num_doorbells *
 744                                      sizeof(u32));
 745         if (adev->doorbell.ptr == NULL)
 746                 return -ENOMEM;
 747
 748         return 0;
 749 }
 750
 751 /**
 752  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 753  *
 754  * @adev: amdgpu_device pointer
 755  *
 756  * Tear down doorbell driver information (CIK)
 757  */
 758 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 759 {
 760         iounmap(adev->doorbell.ptr);
 761         adev->doorbell.ptr = NULL;
 762 }
 763
 764
 765
 766 /*
 767  * amdgpu_device_wb_*()
 768  * Writeback is the method by which the GPU updates special pages in memory
 769  * with the status of certain GPU events (fences, ring pointers,etc.).
 770  */
 771
 772 /**
 773  * amdgpu_device_wb_fini - Disable Writeback and free memory
 774  *
 775  * @adev: amdgpu_device pointer
 776  *
 777  * Disables Writeback and frees the Writeback memory (all asics).
 778  * Used at driver shutdown.
 779  */
 780 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 781 {
 782         if (adev->wb.wb_obj) {
 783                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 784                                       &adev->wb.gpu_addr,
 785                                       (void **)&adev->wb.wb);
 786                 adev->wb.wb_obj = NULL;
 787         }
 788 }
 789
 790 /**
 791  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
 792  *
 793  * @adev: amdgpu_device pointer
 794  *
 795  * Initializes writeback and allocates writeback memory (all asics).
 796  * Used at driver startup.
 797  * Returns 0 on success or an -error on failure.
 798  */
 799 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
 800 {
 801         int r;
 802
 803         if (adev->wb.wb_obj == NULL) {
 804                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
 805                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
 806                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
 807                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
 808                                             (void **)&adev->wb.wb);
 809                 if (r) {
 810                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
 811                         return r;
 812                 }
 813
 814                 adev->wb.num_wb = AMDGPU_MAX_WB;
 815                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
 816
 817                 /* clear wb memory */
 818                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
 819         }
 820
 821         return 0;
 822 }
 823
 824 /**
 825  * amdgpu_device_wb_get - Allocate a wb entry
 826  *
 827  * @adev: amdgpu_device pointer
 828  * @wb: wb index
 829  *
 830  * Allocate a wb slot for use by the driver (all asics).
 831  * Returns 0 on success or -EINVAL on failure.
 832  */
 833 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 834 {
 835         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 836
 837         if (offset < adev->wb.num_wb) {
 838                 __set_bit(offset, adev->wb.used);
 839                 *wb = offset << 3; /* convert to dw offset */
 840                 return 0;
 841         } else {
 842                 return -EINVAL;
 843         }
 844 }
 845
 846 /**
 847  * amdgpu_device_wb_free - Free a wb entry
 848  *
 849  * @adev: amdgpu_device pointer
 850  * @wb: wb index
 851  *
 852  * Free a wb slot allocated for use by the driver (all asics)
 853  */
 854 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 855 {
 856         wb >>= 3;
 857         if (wb < adev->wb.num_wb)
 858                 __clear_bit(wb, adev->wb.used);
 859 }
 860
 861 /**
 862  * amdgpu_device_resize_fb_bar - try to resize FB BAR
 863  *
 864  * @adev: amdgpu_device pointer
 865  *
 866  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 867  * to fail, but if any of the BARs is not accessible after the size we abort
 868  * driver loading by returning -ENODEV.
 869  */
 870 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 871 {
 872         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
 873         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
 874         struct pci_bus *root;
 875         struct resource *res;
 876         unsigned i;
 877         u16 cmd;
 878         int r;
 879
 880         /* Bypass for VF */
 881         if (amdgpu_sriov_vf(adev))
 882                 return 0;
 883
 884         /* Check if the root BUS has 64bit memory resources */
 885         root = adev->pdev->bus;
 886         while (root->parent)
 887                 root = root->parent;
 888
 889         pci_bus_for_each_resource(root, res, i) {
 890                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
 891                     res->start > 0x100000000ull)
 892                         break;
 893         }
 894
 895         /* Trying to resize is pointless without a root hub window above 4GB */
 896         if (!res)
 897                 return 0;
 898
 899         /* Disable memory decoding while we change the BAR addresses and size */
 900         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
 901         pci_write_config_word(adev->pdev, PCI_COMMAND,
 902                               cmd & ~PCI_COMMAND_MEMORY);
 903
 904         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
 905         amdgpu_device_doorbell_fini(adev);
 906         if (adev->asic_type >= CHIP_BONAIRE)
 907                 pci_release_resource(adev->pdev, 2);
 908
 909         pci_release_resource(adev->pdev, 0);
 910
 911         r = pci_resize_resource(adev->pdev, 0, rbar_size);
 912         if (r == -ENOSPC)
 913                 DRM_INFO("Not enough PCI address space for a large BAR.");
 914         else if (r && r != -ENOTSUPP)
 915                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
 916
 917         pci_assign_unassigned_bus_resources(adev->pdev->bus);
 918
 919         /* When the doorbell or fb BAR isn't available we have no chance of
 920          * using the device.
 921          */
 922         r = amdgpu_device_doorbell_init(adev);
 923         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
 924                 return -ENODEV;
 925
 926         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
 927
 928         return 0;
 929 }
 930
 931 /*
 932  * GPU helper functions.
 933  */
 934 /**
 935  * amdgpu_device_need_post - check if the hw need post or not
 936  *
 937  * @adev: amdgpu_device pointer
 938  *
 939  * Check if the asic has been initialized (all asics) at driver startup
 940  * or post is needed if  hw reset is performed.
 941  * Returns true if need or false if not.
 942  */
 943 bool amdgpu_device_need_post(struct amdgpu_device *adev)
 944 {
 945         uint32_t reg;
 946
 947         if (amdgpu_sriov_vf(adev))
 948                 return false;
 949
 950         if (amdgpu_passthrough(adev)) {
 951                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
 952                  * some old smc fw still need driver do vPost otherwise gpu hang, while
 953                  * those smc fw version above 22.15 doesn't have this flaw, so we force
 954                  * vpost executed for smc version below 22.15
 955                  */
 956                 if (adev->asic_type == CHIP_FIJI) {
 957                         int err;
 958                         uint32_t fw_ver;
 959                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
 960                         /* force vPost if error occured */
 961                         if (err)
 962                                 return true;
 963
 964                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
 965                         if (fw_ver < 0x00160e00)
 966                                 return true;
 967                 }
 968         }
 969
 970         if (adev->has_hw_reset) {
 971                 adev->has_hw_reset = false;
 972                 return true;
 973         }
 974
 975         /* bios scratch used on CIK+ */
 976         if (adev->asic_type >= CHIP_BONAIRE)
 977                 return amdgpu_atombios_scratch_need_asic_init(adev);
 978
 979         /* check MEM_SIZE for older asics */
 980         reg = amdgpu_asic_get_config_memsize(adev);
 981
 982         if ((reg != 0) && (reg != 0xffffffff))
 983                 return false;
 984
 985         return true;
 986 }
 987
 988 /* if we get transitioned to only one device, take VGA back */
 989 /**
 990  * amdgpu_device_vga_set_decode - enable/disable vga decode
 991  *
 992  * @cookie: amdgpu_device pointer
 993  * @state: enable/disable vga decode
 994  *
 995  * Enable/disable vga decode (all asics).
 996  * Returns VGA resource flags.
 997  */
 998 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
 999 {
1000         struct amdgpu_device *adev = cookie;
1001         amdgpu_asic_set_vga_state(adev, state);
1002         if (state)
1003                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1004                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1005         else
1006                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1007 }
1008
1009 /**
1010  * amdgpu_device_check_block_size - validate the vm block size
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Validates the vm block size specified via module parameter.
1015  * The vm block size defines number of bits in page table versus page directory,
1016  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1017  * page table and the remaining bits are in the page directory.
1018  */
1019 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1020 {
1021         /* defines number of bits in page table versus page directory,
1022          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1023          * page table and the remaining bits are in the page directory */
1024         if (amdgpu_vm_block_size == -1)
1025                 return;
1026
1027         if (amdgpu_vm_block_size < 9) {
1028                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1029                          amdgpu_vm_block_size);
1030                 amdgpu_vm_block_size = -1;
1031         }
1032 }
1033
1034 /**
1035  * amdgpu_device_check_vm_size - validate the vm size
1036  *
1037  * @adev: amdgpu_device pointer
1038  *
1039  * Validates the vm size in GB specified via module parameter.
1040  * The VM size is the size of the GPU virtual memory space in GB.
1041  */
1042 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1043 {
1044         /* no need to check the default value */
1045         if (amdgpu_vm_size == -1)
1046                 return;
1047
1048         if (amdgpu_vm_size < 1) {
1049                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1050                          amdgpu_vm_size);
1051                 amdgpu_vm_size = -1;
1052         }
1053 }
1054
1055 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1056 {
1057         struct sysinfo si;
1058         bool is_os_64 = (sizeof(void *) == 8);
1059         uint64_t total_memory;
1060         uint64_t dram_size_seven_GB = 0x1B8000000;
1061         uint64_t dram_size_three_GB = 0xB8000000;
1062
1063         if (amdgpu_smu_memory_pool_size == 0)
1064                 return;
1065
1066         if (!is_os_64) {
1067                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1068                 goto def_value;
1069         }
1070         si_meminfo(&si);
1071         total_memory = (uint64_t)si.totalram * si.mem_unit;
1072
1073         if ((amdgpu_smu_memory_pool_size == 1) ||
1074                 (amdgpu_smu_memory_pool_size == 2)) {
1075                 if (total_memory < dram_size_three_GB)
1076                         goto def_value1;
1077         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1078                 (amdgpu_smu_memory_pool_size == 8)) {
1079                 if (total_memory < dram_size_seven_GB)
1080                         goto def_value1;
1081         } else {
1082                 DRM_WARN("Smu memory pool size not supported\n");
1083                 goto def_value;
1084         }
1085         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1086
1087         return;
1088
1089 def_value1:
1090         DRM_WARN("No enough system memory\n");
1091 def_value:
1092         adev->pm.smu_prv_buffer_size = 0;
1093 }
1094
1095 /**
1096  * amdgpu_device_check_arguments - validate module params
1097  *
1098  * @adev: amdgpu_device pointer
1099  *
1100  * Validates certain module parameters and updates
1101  * the associated values used by the driver (all asics).
1102  */
1103 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1104 {
1105         if (amdgpu_sched_jobs < 4) {
1106                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1107                          amdgpu_sched_jobs);
1108                 amdgpu_sched_jobs = 4;
1109         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1110                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1111                          amdgpu_sched_jobs);
1112                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1113         }
1114
1115         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1116                 /* gart size must be greater or equal to 32M */
1117                 dev_warn(adev->dev, "gart size (%d) too small\n",
1118                          amdgpu_gart_size);
1119                 amdgpu_gart_size = -1;
1120         }
1121
1122         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1123                 /* gtt size must be greater or equal to 32M */
1124                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1125                                  amdgpu_gtt_size);
1126                 amdgpu_gtt_size = -1;
1127         }
1128
1129         /* valid range is between 4 and 9 inclusive */
1130         if (amdgpu_vm_fragment_size != -1 &&
1131             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1132                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1133                 amdgpu_vm_fragment_size = -1;
1134         }
1135
1136         amdgpu_device_check_smu_prv_buffer_size(adev);
1137
1138         amdgpu_device_check_vm_size(adev);
1139
1140         amdgpu_device_check_block_size(adev);
1141
1142         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1143
1144         amdgpu_gmc_tmz_set(adev);
1145
1146         return 0;
1147 }
1148
1149 /**
1150  * amdgpu_switcheroo_set_state - set switcheroo state
1151  *
1152  * @pdev: pci dev pointer
1153  * @state: vga_switcheroo state
1154  *
1155  * Callback for the switcheroo driver.  Suspends or resumes the
1156  * the asics before or after it is powered up using ACPI methods.
1157  */
1158 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1159 {
1160         struct drm_device *dev = pci_get_drvdata(pdev);
1161         int r;
1162
1163         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1164                 return;
1165
1166         if (state == VGA_SWITCHEROO_ON) {
1167                 pr_info("switched on\n");
1168                 /* don't suspend or resume card normally */
1169                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1170
1171                 pci_set_power_state(dev->pdev, PCI_D0);
1172                 pci_restore_state(dev->pdev);
1173                 r = pci_enable_device(dev->pdev);
1174                 if (r)
1175                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1176                 amdgpu_device_resume(dev, true);
1177
1178                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1179                 drm_kms_helper_poll_enable(dev);
1180         } else {
1181                 pr_info("switched off\n");
1182                 drm_kms_helper_poll_disable(dev);
1183                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1184                 amdgpu_device_suspend(dev, true);
1185                 pci_save_state(dev->pdev);
1186                 /* Shut down the device */
1187                 pci_disable_device(dev->pdev);
1188                 pci_set_power_state(dev->pdev, PCI_D3cold);
1189                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1190         }
1191 }
1192
1193 /**
1194  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1195  *
1196  * @pdev: pci dev pointer
1197  *
1198  * Callback for the switcheroo driver.  Check of the switcheroo
1199  * state can be changed.
1200  * Returns true if the state can be changed, false if not.
1201  */
1202 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1203 {
1204         struct drm_device *dev = pci_get_drvdata(pdev);
1205
1206         /*
1207         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1208         * locking inversion with the driver load path. And the access here is
1209         * completely racy anyway. So don't bother with locking for now.
1210         */
1211         return atomic_read(&dev->open_count) == 0;
1212 }
1213
1214 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1215         .set_gpu_state = amdgpu_switcheroo_set_state,
1216         .reprobe = NULL,
1217         .can_switch = amdgpu_switcheroo_can_switch,
1218 };
1219
1220 /**
1221  * amdgpu_device_ip_set_clockgating_state - set the CG state
1222  *
1223  * @dev: amdgpu_device pointer
1224  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1225  * @state: clockgating state (gate or ungate)
1226  *
1227  * Sets the requested clockgating state for all instances of
1228  * the hardware IP specified.
1229  * Returns the error code from the last instance.
1230  */
1231 int amdgpu_device_ip_set_clockgating_state(void *dev,
1232                                            enum amd_ip_block_type block_type,
1233                                            enum amd_clockgating_state state)
1234 {
1235         struct amdgpu_device *adev = dev;
1236         int i, r = 0;
1237
1238         for (i = 0; i < adev->num_ip_blocks; i++) {
1239                 if (!adev->ip_blocks[i].status.valid)
1240                         continue;
1241                 if (adev->ip_blocks[i].version->type != block_type)
1242                         continue;
1243                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1244                         continue;
1245                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1246                         (void *)adev, state);
1247                 if (r)
1248                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1249                                   adev->ip_blocks[i].version->funcs->name, r);
1250         }
1251         return r;
1252 }
1253
1254 /**
1255  * amdgpu_device_ip_set_powergating_state - set the PG state
1256  *
1257  * @dev: amdgpu_device pointer
1258  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1259  * @state: powergating state (gate or ungate)
1260  *
1261  * Sets the requested powergating state for all instances of
1262  * the hardware IP specified.
1263  * Returns the error code from the last instance.
1264  */
1265 int amdgpu_device_ip_set_powergating_state(void *dev,
1266                                            enum amd_ip_block_type block_type,
1267                                            enum amd_powergating_state state)
1268 {
1269         struct amdgpu_device *adev = dev;
1270         int i, r = 0;
1271
1272         for (i = 0; i < adev->num_ip_blocks; i++) {
1273                 if (!adev->ip_blocks[i].status.valid)
1274                         continue;
1275                 if (adev->ip_blocks[i].version->type != block_type)
1276                         continue;
1277                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1278                         continue;
1279                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1280                         (void *)adev, state);
1281                 if (r)
1282                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1283                                   adev->ip_blocks[i].version->funcs->name, r);
1284         }
1285         return r;
1286 }
1287
1288 /**
1289  * amdgpu_device_ip_get_clockgating_state - get the CG state
1290  *
1291  * @adev: amdgpu_device pointer
1292  * @flags: clockgating feature flags
1293  *
1294  * Walks the list of IPs on the device and updates the clockgating
1295  * flags for each IP.
1296  * Updates @flags with the feature flags for each hardware IP where
1297  * clockgating is enabled.
1298  */
1299 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1300                                             u32 *flags)
1301 {
1302         int i;
1303
1304         for (i = 0; i < adev->num_ip_blocks; i++) {
1305                 if (!adev->ip_blocks[i].status.valid)
1306                         continue;
1307                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1308                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1309         }
1310 }
1311
1312 /**
1313  * amdgpu_device_ip_wait_for_idle - wait for idle
1314  *
1315  * @adev: amdgpu_device pointer
1316  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1317  *
1318  * Waits for the request hardware IP to be idle.
1319  * Returns 0 for success or a negative error code on failure.
1320  */
1321 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1322                                    enum amd_ip_block_type block_type)
1323 {
1324         int i, r;
1325
1326         for (i = 0; i < adev->num_ip_blocks; i++) {
1327                 if (!adev->ip_blocks[i].status.valid)
1328                         continue;
1329                 if (adev->ip_blocks[i].version->type == block_type) {
1330                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1331                         if (r)
1332                                 return r;
1333                         break;
1334                 }
1335         }
1336         return 0;
1337
1338 }
1339
1340 /**
1341  * amdgpu_device_ip_is_idle - is the hardware IP idle
1342  *
1343  * @adev: amdgpu_device pointer
1344  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1345  *
1346  * Check if the hardware IP is idle or not.
1347  * Returns true if it the IP is idle, false if not.
1348  */
1349 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1350                               enum amd_ip_block_type block_type)
1351 {
1352         int i;
1353
1354         for (i = 0; i < adev->num_ip_blocks; i++) {
1355                 if (!adev->ip_blocks[i].status.valid)
1356                         continue;
1357                 if (adev->ip_blocks[i].version->type == block_type)
1358                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1359         }
1360         return true;
1361
1362 }
1363
1364 /**
1365  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1366  *
1367  * @adev: amdgpu_device pointer
1368  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1369  *
1370  * Returns a pointer to the hardware IP block structure
1371  * if it exists for the asic, otherwise NULL.
1372  */
1373 struct amdgpu_ip_block *
1374 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1375                               enum amd_ip_block_type type)
1376 {
1377         int i;
1378
1379         for (i = 0; i < adev->num_ip_blocks; i++)
1380                 if (adev->ip_blocks[i].version->type == type)
1381                         return &adev->ip_blocks[i];
1382
1383         return NULL;
1384 }
1385
1386 /**
1387  * amdgpu_device_ip_block_version_cmp
1388  *
1389  * @adev: amdgpu_device pointer
1390  * @type: enum amd_ip_block_type
1391  * @major: major version
1392  * @minor: minor version
1393  *
1394  * return 0 if equal or greater
1395  * return 1 if smaller or the ip_block doesn't exist
1396  */
1397 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1398                                        enum amd_ip_block_type type,
1399                                        u32 major, u32 minor)
1400 {
1401         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1402
1403         if (ip_block && ((ip_block->version->major > major) ||
1404                         ((ip_block->version->major == major) &&
1405                         (ip_block->version->minor >= minor))))
1406                 return 0;
1407
1408         return 1;
1409 }
1410
1411 /**
1412  * amdgpu_device_ip_block_add
1413  *
1414  * @adev: amdgpu_device pointer
1415  * @ip_block_version: pointer to the IP to add
1416  *
1417  * Adds the IP block driver information to the collection of IPs
1418  * on the asic.
1419  */
1420 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1421                                const struct amdgpu_ip_block_version *ip_block_version)
1422 {
1423         if (!ip_block_version)
1424                 return -EINVAL;
1425
1426         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1427                   ip_block_version->funcs->name);
1428
1429         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1430
1431         return 0;
1432 }
1433
1434 /**
1435  * amdgpu_device_enable_virtual_display - enable virtual display feature
1436  *
1437  * @adev: amdgpu_device pointer
1438  *
1439  * Enabled the virtual display feature if the user has enabled it via
1440  * the module parameter virtual_display.  This feature provides a virtual
1441  * display hardware on headless boards or in virtualized environments.
1442  * This function parses and validates the configuration string specified by
1443  * the user and configues the virtual display configuration (number of
1444  * virtual connectors, crtcs, etc.) specified.
1445  */
1446 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1447 {
1448         adev->enable_virtual_display = false;
1449
1450         if (amdgpu_virtual_display) {
1451                 struct drm_device *ddev = adev->ddev;
1452                 const char *pci_address_name = pci_name(ddev->pdev);
1453                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1454
1455                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1456                 pciaddstr_tmp = pciaddstr;
1457                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1458                         pciaddname = strsep(&pciaddname_tmp, ",");
1459                         if (!strcmp("all", pciaddname)
1460                             || !strcmp(pci_address_name, pciaddname)) {
1461                                 long num_crtc;
1462                                 int res = -1;
1463
1464                                 adev->enable_virtual_display = true;
1465
1466                                 if (pciaddname_tmp)
1467                                         res = kstrtol(pciaddname_tmp, 10,
1468                                                       &num_crtc);
1469
1470                                 if (!res) {
1471                                         if (num_crtc < 1)
1472                                                 num_crtc = 1;
1473                                         if (num_crtc > 6)
1474                                                 num_crtc = 6;
1475                                         adev->mode_info.num_crtc = num_crtc;
1476                                 } else {
1477                                         adev->mode_info.num_crtc = 1;
1478                                 }
1479                                 break;
1480                         }
1481                 }
1482
1483                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1484                          amdgpu_virtual_display, pci_address_name,
1485                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1486
1487                 kfree(pciaddstr);
1488         }
1489 }
1490
1491 /**
1492  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1493  *
1494  * @adev: amdgpu_device pointer
1495  *
1496  * Parses the asic configuration parameters specified in the gpu info
1497  * firmware and makes them availale to the driver for use in configuring
1498  * the asic.
1499  * Returns 0 on success, -EINVAL on failure.
1500  */
1501 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1502 {
1503         const char *chip_name;
1504         char fw_name[30];
1505         int err;
1506         const struct gpu_info_firmware_header_v1_0 *hdr;
1507
1508         adev->firmware.gpu_info_fw = NULL;
1509
1510         switch (adev->asic_type) {
1511         case CHIP_TOPAZ:
1512         case CHIP_TONGA:
1513         case CHIP_FIJI:
1514         case CHIP_POLARIS10:
1515         case CHIP_POLARIS11:
1516         case CHIP_POLARIS12:
1517         case CHIP_VEGAM:
1518         case CHIP_CARRIZO:
1519         case CHIP_STONEY:
1520 #ifdef CONFIG_DRM_AMDGPU_SI
1521         case CHIP_VERDE:
1522         case CHIP_TAHITI:
1523         case CHIP_PITCAIRN:
1524         case CHIP_OLAND:
1525         case CHIP_HAINAN:
1526 #endif
1527 #ifdef CONFIG_DRM_AMDGPU_CIK
1528         case CHIP_BONAIRE:
1529         case CHIP_HAWAII:
1530         case CHIP_KAVERI:
1531         case CHIP_KABINI:
1532         case CHIP_MULLINS:
1533 #endif
1534         case CHIP_VEGA20:
1535         default:
1536                 return 0;
1537         case CHIP_VEGA10:
1538                 chip_name = "vega10";
1539                 break;
1540         case CHIP_VEGA12:
1541                 chip_name = "vega12";
1542                 break;
1543         case CHIP_RAVEN:
1544                 if (adev->rev_id >= 8)
1545                         chip_name = "raven2";
1546                 else if (adev->pdev->device == 0x15d8)
1547                         chip_name = "picasso";
1548                 else
1549                         chip_name = "raven";
1550                 break;
1551         case CHIP_ARCTURUS:
1552                 chip_name = "arcturus";
1553                 break;
1554         case CHIP_RENOIR:
1555                 chip_name = "renoir";
1556                 break;
1557         case CHIP_NAVI10:
1558                 chip_name = "navi10";
1559                 break;
1560         case CHIP_NAVI14:
1561                 chip_name = "navi14";
1562                 break;
1563         case CHIP_NAVI12:
1564                 chip_name = "navi12";
1565                 break;
1566         }
1567
1568         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1569         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1570         if (err) {
1571                 dev_err(adev->dev,
1572                         "Failed to load gpu_info firmware \"%s\"\n",
1573                         fw_name);
1574                 goto out;
1575         }
1576         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1577         if (err) {
1578                 dev_err(adev->dev,
1579                         "Failed to validate gpu_info firmware \"%s\"\n",
1580                         fw_name);
1581                 goto out;
1582         }
1583
1584         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1585         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1586
1587         switch (hdr->version_major) {
1588         case 1:
1589         {
1590                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1591                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1592                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1593
1594                 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
1595                         goto parse_soc_bounding_box;
1596
1597                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1598                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1599                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1600                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1601                 adev->gfx.config.max_texture_channel_caches =
1602                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1603                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1604                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1605                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1606                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1607                 adev->gfx.config.double_offchip_lds_buf =
1608                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1609                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1610                 adev->gfx.cu_info.max_waves_per_simd =
1611                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1612                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1613                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1614                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1615                 if (hdr->version_minor >= 1) {
1616                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1617                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1618                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1619                         adev->gfx.config.num_sc_per_sh =
1620                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1621                         adev->gfx.config.num_packer_per_sc =
1622                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1623                 }
1624
1625 parse_soc_bounding_box:
1626                 /*
1627                  * soc bounding box info is not integrated in disocovery table,
1628                  * we always need to parse it from gpu info firmware.
1629                  */
1630                 if (hdr->version_minor == 2) {
1631                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1632                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1633                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1634                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1635                 }
1636                 break;
1637         }
1638         default:
1639                 dev_err(adev->dev,
1640                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1641                 err = -EINVAL;
1642                 goto out;
1643         }
1644 out:
1645         return err;
1646 }
1647
1648 /**
1649  * amdgpu_device_ip_early_init - run early init for hardware IPs
1650  *
1651  * @adev: amdgpu_device pointer
1652  *
1653  * Early initialization pass for hardware IPs.  The hardware IPs that make
1654  * up each asic are discovered and each IP's early_init callback is run.  This
1655  * is the first stage in initializing the asic.
1656  * Returns 0 on success, negative error code on failure.
1657  */
1658 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1659 {
1660         int i, r;
1661
1662         amdgpu_device_enable_virtual_display(adev);
1663
1664         switch (adev->asic_type) {
1665         case CHIP_TOPAZ:
1666         case CHIP_TONGA:
1667         case CHIP_FIJI:
1668         case CHIP_POLARIS10:
1669         case CHIP_POLARIS11:
1670         case CHIP_POLARIS12:
1671         case CHIP_VEGAM:
1672         case CHIP_CARRIZO:
1673         case CHIP_STONEY:
1674                 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
1675                         adev->family = AMDGPU_FAMILY_CZ;
1676                 else
1677                         adev->family = AMDGPU_FAMILY_VI;
1678
1679                 r = vi_set_ip_blocks(adev);
1680                 if (r)
1681                         return r;
1682                 break;
1683 #ifdef CONFIG_DRM_AMDGPU_SI
1684         case CHIP_VERDE:
1685         case CHIP_TAHITI:
1686         case CHIP_PITCAIRN:
1687         case CHIP_OLAND:
1688         case CHIP_HAINAN:
1689                 adev->family = AMDGPU_FAMILY_SI;
1690                 r = si_set_ip_blocks(adev);
1691                 if (r)
1692                         return r;
1693                 break;
1694 #endif
1695 #ifdef CONFIG_DRM_AMDGPU_CIK
1696         case CHIP_BONAIRE:
1697         case CHIP_HAWAII:
1698         case CHIP_KAVERI:
1699         case CHIP_KABINI:
1700         case CHIP_MULLINS:
1701                 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
1702                         adev->family = AMDGPU_FAMILY_CI;
1703                 else
1704                         adev->family = AMDGPU_FAMILY_KV;
1705
1706                 r = cik_set_ip_blocks(adev);
1707                 if (r)
1708                         return r;
1709                 break;
1710 #endif
1711         case CHIP_VEGA10:
1712         case CHIP_VEGA12:
1713         case CHIP_VEGA20:
1714         case CHIP_RAVEN:
1715         case CHIP_ARCTURUS:
1716         case CHIP_RENOIR:
1717                 if (adev->asic_type == CHIP_RAVEN ||
1718                     adev->asic_type == CHIP_RENOIR)
1719                         adev->family = AMDGPU_FAMILY_RV;
1720                 else
1721                         adev->family = AMDGPU_FAMILY_AI;
1722
1723                 r = soc15_set_ip_blocks(adev);
1724                 if (r)
1725                         return r;
1726                 break;
1727         case  CHIP_NAVI10:
1728         case  CHIP_NAVI14:
1729         case  CHIP_NAVI12:
1730                 adev->family = AMDGPU_FAMILY_NV;
1731
1732                 r = nv_set_ip_blocks(adev);
1733                 if (r)
1734                         return r;
1735                 break;
1736         default:
1737                 /* FIXME: not supported yet */
1738                 return -EINVAL;
1739         }
1740
1741         r = amdgpu_device_parse_gpu_info_fw(adev);
1742         if (r)
1743                 return r;
1744
1745         if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
1746                 amdgpu_discovery_get_gfx_info(adev);
1747
1748         amdgpu_amdkfd_device_probe(adev);
1749
1750         if (amdgpu_sriov_vf(adev)) {
1751                 /* handle vbios stuff prior full access mode for new handshake */
1752                 if (adev->virt.req_init_data_ver == 1) {
1753                         if (!amdgpu_get_bios(adev)) {
1754                                 DRM_ERROR("failed to get vbios\n");
1755                                 return -EINVAL;
1756                         }
1757
1758                         r = amdgpu_atombios_init(adev);
1759                         if (r) {
1760                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1761                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1762                                 return r;
1763                         }
1764                 }
1765         }
1766
1767         /* we need to send REQ_GPU here for legacy handshaker otherwise the vbios
1768          * will not be prepared by host for this VF */
1769         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver < 1) {
1770                 r = amdgpu_virt_request_full_gpu(adev, true);
1771                 if (r)
1772                         return r;
1773         }
1774
1775         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1776         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1777                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1778
1779         for (i = 0; i < adev->num_ip_blocks; i++) {
1780                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1781                         DRM_ERROR("disabled ip block: %d <%s>\n",
1782                                   i, adev->ip_blocks[i].version->funcs->name);
1783                         adev->ip_blocks[i].status.valid = false;
1784                 } else {
1785                         if (adev->ip_blocks[i].version->funcs->early_init) {
1786                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1787                                 if (r == -ENOENT) {
1788                                         adev->ip_blocks[i].status.valid = false;
1789                                 } else if (r) {
1790                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1791                                                   adev->ip_blocks[i].version->funcs->name, r);
1792                                         return r;
1793                                 } else {
1794                                         adev->ip_blocks[i].status.valid = true;
1795                                 }
1796                         } else {
1797                                 adev->ip_blocks[i].status.valid = true;
1798                         }
1799                 }
1800                 /* get the vbios after the asic_funcs are set up */
1801                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1802                         /* skip vbios handling for new handshake */
1803                         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver == 1)
1804                                 continue;
1805
1806                         /* Read BIOS */
1807                         if (!amdgpu_get_bios(adev))
1808                                 return -EINVAL;
1809
1810                         r = amdgpu_atombios_init(adev);
1811                         if (r) {
1812                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1813                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1814                                 return r;
1815                         }
1816                 }
1817         }
1818
1819         adev->cg_flags &= amdgpu_cg_mask;
1820         adev->pg_flags &= amdgpu_pg_mask;
1821
1822         return 0;
1823 }
1824
1825 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1826 {
1827         int i, r;
1828
1829         for (i = 0; i < adev->num_ip_blocks; i++) {
1830                 if (!adev->ip_blocks[i].status.sw)
1831                         continue;
1832                 if (adev->ip_blocks[i].status.hw)
1833                         continue;
1834                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1835                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1836                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1837                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1838                         if (r) {
1839                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1840                                           adev->ip_blocks[i].version->funcs->name, r);
1841                                 return r;
1842                         }
1843                         adev->ip_blocks[i].status.hw = true;
1844                 }
1845         }
1846
1847         return 0;
1848 }
1849
1850 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1851 {
1852         int i, r;
1853
1854         for (i = 0; i < adev->num_ip_blocks; i++) {
1855                 if (!adev->ip_blocks[i].status.sw)
1856                         continue;
1857                 if (adev->ip_blocks[i].status.hw)
1858                         continue;
1859                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1860                 if (r) {
1861                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1862                                   adev->ip_blocks[i].version->funcs->name, r);
1863                         return r;
1864                 }
1865                 adev->ip_blocks[i].status.hw = true;
1866         }
1867
1868         return 0;
1869 }
1870
1871 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1872 {
1873         int r = 0;
1874         int i;
1875         uint32_t smu_version;
1876
1877         if (adev->asic_type >= CHIP_VEGA10) {
1878                 for (i = 0; i < adev->num_ip_blocks; i++) {
1879                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1880                                 continue;
1881
1882                         /* no need to do the fw loading again if already done*/
1883                         if (adev->ip_blocks[i].status.hw == true)
1884                                 break;
1885
1886                         if (adev->in_gpu_reset || adev->in_suspend) {
1887                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1888                                 if (r) {
1889                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1890                                                           adev->ip_blocks[i].version->funcs->name, r);
1891                                         return r;
1892                                 }
1893                         } else {
1894                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1895                                 if (r) {
1896                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1897                                                           adev->ip_blocks[i].version->funcs->name, r);
1898                                         return r;
1899                                 }
1900                         }
1901
1902                         adev->ip_blocks[i].status.hw = true;
1903                         break;
1904                 }
1905         }
1906
1907         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1908                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1909
1910         return r;
1911 }
1912
1913 /**
1914  * amdgpu_device_ip_init - run init for hardware IPs
1915  *
1916  * @adev: amdgpu_device pointer
1917  *
1918  * Main initialization pass for hardware IPs.  The list of all the hardware
1919  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1920  * are run.  sw_init initializes the software state associated with each IP
1921  * and hw_init initializes the hardware associated with each IP.
1922  * Returns 0 on success, negative error code on failure.
1923  */
1924 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1925 {
1926         int i, r;
1927
1928         r = amdgpu_ras_init(adev);
1929         if (r)
1930                 return r;
1931
1932         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver > 0) {
1933                 r = amdgpu_virt_request_full_gpu(adev, true);
1934                 if (r)
1935                         return -EAGAIN;
1936         }
1937
1938         for (i = 0; i < adev->num_ip_blocks; i++) {
1939                 if (!adev->ip_blocks[i].status.valid)
1940                         continue;
1941                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1942                 if (r) {
1943                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1944                                   adev->ip_blocks[i].version->funcs->name, r);
1945                         goto init_failed;
1946                 }
1947                 adev->ip_blocks[i].status.sw = true;
1948
1949                 /* need to do gmc hw init early so we can allocate gpu mem */
1950                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1951                         r = amdgpu_device_vram_scratch_init(adev);
1952                         if (r) {
1953                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
1954                                 goto init_failed;
1955                         }
1956                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
1957                         if (r) {
1958                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
1959                                 goto init_failed;
1960                         }
1961                         r = amdgpu_device_wb_init(adev);
1962                         if (r) {
1963                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
1964                                 goto init_failed;
1965                         }
1966                         adev->ip_blocks[i].status.hw = true;
1967
1968                         /* right after GMC hw init, we create CSA */
1969                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1970                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
1971                                                                 AMDGPU_GEM_DOMAIN_VRAM,
1972                                                                 AMDGPU_CSA_SIZE);
1973                                 if (r) {
1974                                         DRM_ERROR("allocate CSA failed %d\n", r);
1975                                         goto init_failed;
1976                                 }
1977                         }
1978                 }
1979         }
1980
1981         if (amdgpu_sriov_vf(adev))
1982                 amdgpu_virt_init_data_exchange(adev);
1983
1984         r = amdgpu_ib_pool_init(adev);
1985         if (r) {
1986                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
1987                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
1988                 goto init_failed;
1989         }
1990
1991         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
1992         if (r)
1993                 goto init_failed;
1994
1995         r = amdgpu_device_ip_hw_init_phase1(adev);
1996         if (r)
1997                 goto init_failed;
1998
1999         r = amdgpu_device_fw_loading(adev);
2000         if (r)
2001                 goto init_failed;
2002
2003         r = amdgpu_device_ip_hw_init_phase2(adev);
2004         if (r)
2005                 goto init_failed;
2006
2007         /*
2008          * retired pages will be loaded from eeprom and reserved here,
2009          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2010          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2011          * for I2C communication which only true at this point.
2012          * recovery_init may fail, but it can free all resources allocated by
2013          * itself and its failure should not stop amdgpu init process.
2014          *
2015          * Note: theoretically, this should be called before all vram allocations
2016          * to protect retired page from abusing
2017          */
2018         amdgpu_ras_recovery_init(adev);
2019
2020         if (adev->gmc.xgmi.num_physical_nodes > 1)
2021                 amdgpu_xgmi_add_device(adev);
2022         amdgpu_amdkfd_device_init(adev);
2023
2024         amdgpu_fru_get_product_info(adev);
2025
2026 init_failed:
2027         if (amdgpu_sriov_vf(adev))
2028                 amdgpu_virt_release_full_gpu(adev, true);
2029
2030         return r;
2031 }
2032
2033 /**
2034  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2035  *
2036  * @adev: amdgpu_device pointer
2037  *
2038  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2039  * this function before a GPU reset.  If the value is retained after a
2040  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2041  */
2042 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2043 {
2044         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2045 }
2046
2047 /**
2048  * amdgpu_device_check_vram_lost - check if vram is valid
2049  *
2050  * @adev: amdgpu_device pointer
2051  *
2052  * Checks the reset magic value written to the gart pointer in VRAM.
2053  * The driver calls this after a GPU reset to see if the contents of
2054  * VRAM is lost or now.
2055  * returns true if vram is lost, false if not.
2056  */
2057 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2058 {
2059         if (memcmp(adev->gart.ptr, adev->reset_magic,
2060                         AMDGPU_RESET_MAGIC_NUM))
2061                 return true;
2062
2063         if (!adev->in_gpu_reset)
2064                 return false;
2065
2066         /*
2067          * For all ASICs with baco/mode1 reset, the VRAM is
2068          * always assumed to be lost.
2069          */
2070         switch (amdgpu_asic_reset_method(adev)) {
2071         case AMD_RESET_METHOD_BACO:
2072         case AMD_RESET_METHOD_MODE1:
2073                 return true;
2074         default:
2075                 return false;
2076         }
2077 }
2078
2079 /**
2080  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2081  *
2082  * @adev: amdgpu_device pointer
2083  * @state: clockgating state (gate or ungate)
2084  *
2085  * The list of all the hardware IPs that make up the asic is walked and the
2086  * set_clockgating_state callbacks are run.
2087  * Late initialization pass enabling clockgating for hardware IPs.
2088  * Fini or suspend, pass disabling clockgating for hardware IPs.
2089  * Returns 0 on success, negative error code on failure.
2090  */
2091
2092 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2093                                                 enum amd_clockgating_state state)
2094 {
2095         int i, j, r;
2096
2097         if (amdgpu_emu_mode == 1)
2098                 return 0;
2099
2100         for (j = 0; j < adev->num_ip_blocks; j++) {
2101                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2102                 if (!adev->ip_blocks[i].status.late_initialized)
2103                         continue;
2104                 /* skip CG for VCE/UVD, it's handled specially */
2105                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2106                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2107                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2108                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2109                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2110                         /* enable clockgating to save power */
2111                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2112                                                                                      state);
2113                         if (r) {
2114                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2115                                           adev->ip_blocks[i].version->funcs->name, r);
2116                                 return r;
2117                         }
2118                 }
2119         }
2120
2121         return 0;
2122 }
2123
2124 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2125 {
2126         int i, j, r;
2127
2128         if (amdgpu_emu_mode == 1)
2129                 return 0;
2130
2131         for (j = 0; j < adev->num_ip_blocks; j++) {
2132                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2133                 if (!adev->ip_blocks[i].status.late_initialized)
2134                         continue;
2135                 /* skip CG for VCE/UVD, it's handled specially */
2136                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2137                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2138                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2139                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2140                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2141                         /* enable powergating to save power */
2142                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2143                                                                                         state);
2144                         if (r) {
2145                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2146                                           adev->ip_blocks[i].version->funcs->name, r);
2147                                 return r;
2148                         }
2149                 }
2150         }
2151         return 0;
2152 }
2153
2154 static int amdgpu_device_enable_mgpu_fan_boost(void)
2155 {
2156         struct amdgpu_gpu_instance *gpu_ins;
2157         struct amdgpu_device *adev;
2158         int i, ret = 0;
2159
2160         mutex_lock(&mgpu_info.mutex);
2161
2162         /*
2163          * MGPU fan boost feature should be enabled
2164          * only when there are two or more dGPUs in
2165          * the system
2166          */
2167         if (mgpu_info.num_dgpu < 2)
2168                 goto out;
2169
2170         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2171                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2172                 adev = gpu_ins->adev;
2173                 if (!(adev->flags & AMD_IS_APU) &&
2174                     !gpu_ins->mgpu_fan_enabled &&
2175                     adev->powerplay.pp_funcs &&
2176                     adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2177                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2178                         if (ret)
2179                                 break;
2180
2181                         gpu_ins->mgpu_fan_enabled = 1;
2182                 }
2183         }
2184
2185 out:
2186         mutex_unlock(&mgpu_info.mutex);
2187
2188         return ret;
2189 }
2190
2191 /**
2192  * amdgpu_device_ip_late_init - run late init for hardware IPs
2193  *
2194  * @adev: amdgpu_device pointer
2195  *
2196  * Late initialization pass for hardware IPs.  The list of all the hardware
2197  * IPs that make up the asic is walked and the late_init callbacks are run.
2198  * late_init covers any special initialization that an IP requires
2199  * after all of the have been initialized or something that needs to happen
2200  * late in the init process.
2201  * Returns 0 on success, negative error code on failure.
2202  */
2203 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2204 {
2205         struct amdgpu_gpu_instance *gpu_instance;
2206         int i = 0, r;
2207
2208         for (i = 0; i < adev->num_ip_blocks; i++) {
2209                 if (!adev->ip_blocks[i].status.hw)
2210                         continue;
2211                 if (adev->ip_blocks[i].version->funcs->late_init) {
2212                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2213                         if (r) {
2214                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2215                                           adev->ip_blocks[i].version->funcs->name, r);
2216                                 return r;
2217                         }
2218                 }
2219                 adev->ip_blocks[i].status.late_initialized = true;
2220         }
2221
2222         amdgpu_ras_set_error_query_ready(adev, true);
2223
2224         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2225         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2226
2227         amdgpu_device_fill_reset_magic(adev);
2228
2229         r = amdgpu_device_enable_mgpu_fan_boost();
2230         if (r)
2231                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2232
2233
2234         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2235                 mutex_lock(&mgpu_info.mutex);
2236
2237                 /*
2238                  * Reset device p-state to low as this was booted with high.
2239                  *
2240                  * This should be performed only after all devices from the same
2241                  * hive get initialized.
2242                  *
2243                  * However, it's unknown how many device in the hive in advance.
2244                  * As this is counted one by one during devices initializations.
2245                  *
2246                  * So, we wait for all XGMI interlinked devices initialized.
2247                  * This may bring some delays as those devices may come from
2248                  * different hives. But that should be OK.
2249                  */
2250                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2251                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2252                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2253                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2254                                         continue;
2255
2256                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2257                                                 AMDGPU_XGMI_PSTATE_MIN);
2258                                 if (r) {
2259                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2260                                         break;
2261                                 }
2262                         }
2263                 }
2264
2265                 mutex_unlock(&mgpu_info.mutex);
2266         }
2267
2268         return 0;
2269 }
2270
2271 /**
2272  * amdgpu_device_ip_fini - run fini for hardware IPs
2273  *
2274  * @adev: amdgpu_device pointer
2275  *
2276  * Main teardown pass for hardware IPs.  The list of all the hardware
2277  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2278  * are run.  hw_fini tears down the hardware associated with each IP
2279  * and sw_fini tears down any software state associated with each IP.
2280  * Returns 0 on success, negative error code on failure.
2281  */
2282 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2283 {
2284         int i, r;
2285
2286         amdgpu_ras_pre_fini(adev);
2287
2288         if (adev->gmc.xgmi.num_physical_nodes > 1)
2289                 amdgpu_xgmi_remove_device(adev);
2290
2291         amdgpu_amdkfd_device_fini(adev);
2292
2293         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2294         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2295
2296         /* need to disable SMC first */
2297         for (i = 0; i < adev->num_ip_blocks; i++) {
2298                 if (!adev->ip_blocks[i].status.hw)
2299                         continue;
2300                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2301                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2302                         /* XXX handle errors */
2303                         if (r) {
2304                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2305                                           adev->ip_blocks[i].version->funcs->name, r);
2306                         }
2307                         adev->ip_blocks[i].status.hw = false;
2308                         break;
2309                 }
2310         }
2311
2312         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2313                 if (!adev->ip_blocks[i].status.hw)
2314                         continue;
2315
2316                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2317                 /* XXX handle errors */
2318                 if (r) {
2319                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2320                                   adev->ip_blocks[i].version->funcs->name, r);
2321                 }
2322
2323                 adev->ip_blocks[i].status.hw = false;
2324         }
2325
2326
2327         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2328                 if (!adev->ip_blocks[i].status.sw)
2329                         continue;
2330
2331                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2332                         amdgpu_ucode_free_bo(adev);
2333                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2334                         amdgpu_device_wb_fini(adev);
2335                         amdgpu_device_vram_scratch_fini(adev);
2336                         amdgpu_ib_pool_fini(adev);
2337                 }
2338
2339                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2340                 /* XXX handle errors */
2341                 if (r) {
2342                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2343                                   adev->ip_blocks[i].version->funcs->name, r);
2344                 }
2345                 adev->ip_blocks[i].status.sw = false;
2346                 adev->ip_blocks[i].status.valid = false;
2347         }
2348
2349         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2350                 if (!adev->ip_blocks[i].status.late_initialized)
2351                         continue;
2352                 if (adev->ip_blocks[i].version->funcs->late_fini)
2353                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2354                 adev->ip_blocks[i].status.late_initialized = false;
2355         }
2356
2357         amdgpu_ras_fini(adev);
2358
2359         if (amdgpu_sriov_vf(adev))
2360                 if (amdgpu_virt_release_full_gpu(adev, false))
2361                         DRM_ERROR("failed to release exclusive mode on fini\n");
2362
2363         return 0;
2364 }
2365
2366 /**
2367  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2368  *
2369  * @work: work_struct.
2370  */
2371 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2372 {
2373         struct amdgpu_device *adev =
2374                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2375         int r;
2376
2377         r = amdgpu_ib_ring_tests(adev);
2378         if (r)
2379                 DRM_ERROR("ib ring test failed (%d).\n", r);
2380 }
2381
2382 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2383 {
2384         struct amdgpu_device *adev =
2385                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2386
2387         mutex_lock(&adev->gfx.gfx_off_mutex);
2388         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2389                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2390                         adev->gfx.gfx_off_state = true;
2391         }
2392         mutex_unlock(&adev->gfx.gfx_off_mutex);
2393 }
2394
2395 /**
2396  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2397  *
2398  * @adev: amdgpu_device pointer
2399  *
2400  * Main suspend function for hardware IPs.  The list of all the hardware
2401  * IPs that make up the asic is walked, clockgating is disabled and the
2402  * suspend callbacks are run.  suspend puts the hardware and software state
2403  * in each IP into a state suitable for suspend.
2404  * Returns 0 on success, negative error code on failure.
2405  */
2406 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2407 {
2408         int i, r;
2409
2410         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2411         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2412
2413         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2414                 if (!adev->ip_blocks[i].status.valid)
2415                         continue;
2416                 /* displays are handled separately */
2417                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2418                         /* XXX handle errors */
2419                         r = adev->ip_blocks[i].version->funcs->suspend(adev);
2420                         /* XXX handle errors */
2421                         if (r) {
2422                                 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2423                                           adev->ip_blocks[i].version->funcs->name, r);
2424                                 return r;
2425                         }
2426                         adev->ip_blocks[i].status.hw = false;
2427                 }
2428         }
2429
2430         return 0;
2431 }
2432
2433 /**
2434  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2435  *
2436  * @adev: amdgpu_device pointer
2437  *
2438  * Main suspend function for hardware IPs.  The list of all the hardware
2439  * IPs that make up the asic is walked, clockgating is disabled and the
2440  * suspend callbacks are run.  suspend puts the hardware and software state
2441  * in each IP into a state suitable for suspend.
2442  * Returns 0 on success, negative error code on failure.
2443  */
2444 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2445 {
2446         int i, r;
2447
2448         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2449                 if (!adev->ip_blocks[i].status.valid)
2450                         continue;
2451                 /* displays are handled in phase1 */
2452                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2453                         continue;
2454                 /* PSP lost connection when err_event_athub occurs */
2455                 if (amdgpu_ras_intr_triggered() &&
2456                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2457                         adev->ip_blocks[i].status.hw = false;
2458                         continue;
2459                 }
2460                 /* XXX handle errors */
2461                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2462                 /* XXX handle errors */
2463                 if (r) {
2464                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2465                                   adev->ip_blocks[i].version->funcs->name, r);
2466                 }
2467                 adev->ip_blocks[i].status.hw = false;
2468                 /* handle putting the SMC in the appropriate state */
2469                 if(!amdgpu_sriov_vf(adev)){
2470                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2471                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2472                                 if (r) {
2473                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2474                                                         adev->mp1_state, r);
2475                                         return r;
2476                                 }
2477                         }
2478                 }
2479                 adev->ip_blocks[i].status.hw = false;
2480         }
2481
2482         return 0;
2483 }
2484
2485 /**
2486  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2487  *
2488  * @adev: amdgpu_device pointer
2489  *
2490  * Main suspend function for hardware IPs.  The list of all the hardware
2491  * IPs that make up the asic is walked, clockgating is disabled and the
2492  * suspend callbacks are run.  suspend puts the hardware and software state
2493  * in each IP into a state suitable for suspend.
2494  * Returns 0 on success, negative error code on failure.
2495  */
2496 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2497 {
2498         int r;
2499
2500         if (amdgpu_sriov_vf(adev))
2501                 amdgpu_virt_request_full_gpu(adev, false);
2502
2503         r = amdgpu_device_ip_suspend_phase1(adev);
2504         if (r)
2505                 return r;
2506         r = amdgpu_device_ip_suspend_phase2(adev);
2507
2508         if (amdgpu_sriov_vf(adev))
2509                 amdgpu_virt_release_full_gpu(adev, false);
2510
2511         return r;
2512 }
2513
2514 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2515 {
2516         int i, r;
2517
2518         static enum amd_ip_block_type ip_order[] = {
2519                 AMD_IP_BLOCK_TYPE_GMC,
2520                 AMD_IP_BLOCK_TYPE_COMMON,
2521                 AMD_IP_BLOCK_TYPE_PSP,
2522                 AMD_IP_BLOCK_TYPE_IH,
2523         };
2524
2525         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2526                 int j;
2527                 struct amdgpu_ip_block *block;
2528
2529                 for (j = 0; j < adev->num_ip_blocks; j++) {
2530                         block = &adev->ip_blocks[j];
2531
2532                         block->status.hw = false;
2533                         if (block->version->type != ip_order[i] ||
2534                                 !block->status.valid)
2535                                 continue;
2536
2537                         r = block->version->funcs->hw_init(adev);
2538                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2539                         if (r)
2540                                 return r;
2541                         block->status.hw = true;
2542                 }
2543         }
2544
2545         return 0;
2546 }
2547
2548 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2549 {
2550         int i, r;
2551
2552         static enum amd_ip_block_type ip_order[] = {
2553                 AMD_IP_BLOCK_TYPE_SMC,
2554                 AMD_IP_BLOCK_TYPE_DCE,
2555                 AMD_IP_BLOCK_TYPE_GFX,
2556                 AMD_IP_BLOCK_TYPE_SDMA,
2557                 AMD_IP_BLOCK_TYPE_UVD,
2558                 AMD_IP_BLOCK_TYPE_VCE,
2559                 AMD_IP_BLOCK_TYPE_VCN
2560         };
2561
2562         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2563                 int j;
2564                 struct amdgpu_ip_block *block;
2565
2566                 for (j = 0; j < adev->num_ip_blocks; j++) {
2567                         block = &adev->ip_blocks[j];
2568
2569                         if (block->version->type != ip_order[i] ||
2570                                 !block->status.valid ||
2571                                 block->status.hw)
2572                                 continue;
2573
2574                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2575                                 r = block->version->funcs->resume(adev);
2576                         else
2577                                 r = block->version->funcs->hw_init(adev);
2578
2579                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2580                         if (r)
2581                                 return r;
2582                         block->status.hw = true;
2583                 }
2584         }
2585
2586         return 0;
2587 }
2588
2589 /**
2590  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2591  *
2592  * @adev: amdgpu_device pointer
2593  *
2594  * First resume function for hardware IPs.  The list of all the hardware
2595  * IPs that make up the asic is walked and the resume callbacks are run for
2596  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2597  * after a suspend and updates the software state as necessary.  This
2598  * function is also used for restoring the GPU after a GPU reset.
2599  * Returns 0 on success, negative error code on failure.
2600  */
2601 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2602 {
2603         int i, r;
2604
2605         for (i = 0; i < adev->num_ip_blocks; i++) {
2606                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2607                         continue;
2608                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2609                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2610                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2611
2612                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2613                         if (r) {
2614                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2615                                           adev->ip_blocks[i].version->funcs->name, r);
2616                                 return r;
2617                         }
2618                         adev->ip_blocks[i].status.hw = true;
2619                 }
2620         }
2621
2622         return 0;
2623 }
2624
2625 /**
2626  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2627  *
2628  * @adev: amdgpu_device pointer
2629  *
2630  * First resume function for hardware IPs.  The list of all the hardware
2631  * IPs that make up the asic is walked and the resume callbacks are run for
2632  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2633  * functional state after a suspend and updates the software state as
2634  * necessary.  This function is also used for restoring the GPU after a GPU
2635  * reset.
2636  * Returns 0 on success, negative error code on failure.
2637  */
2638 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2639 {
2640         int i, r;
2641
2642         for (i = 0; i < adev->num_ip_blocks; i++) {
2643                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2644                         continue;
2645                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2646                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2647                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2648                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2649                         continue;
2650                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2651                 if (r) {
2652                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2653                                   adev->ip_blocks[i].version->funcs->name, r);
2654                         return r;
2655                 }
2656                 adev->ip_blocks[i].status.hw = true;
2657         }
2658
2659         return 0;
2660 }
2661
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The hardware IPs are split into
 * two resume functions because they are also used in recovering from a GPU
 * reset, where some additional steps need to be taken between them.  In
 * this case (S3/S4) they are run sequentially, with firmware loading in
 * between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int ret = amdgpu_device_ip_resume_phase1(adev);

	if (ret)
		return ret;

	ret = amdgpu_device_fw_loading(adev);
	if (ret)
		return ret;

	return amdgpu_device_ip_resume_phase2(adev);
}
2690
2691 /**
2692  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2693  *
2694  * @adev: amdgpu_device pointer
2695  *
2696  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2697  */
2698 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2699 {
2700         if (amdgpu_sriov_vf(adev)) {
2701                 if (adev->is_atom_fw) {
2702                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2703                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2704                 } else {
2705                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2706                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2707                 }
2708
2709                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2710                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2711         }
2712 }
2713
2714 /**
2715  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2716  *
2717  * @asic_type: AMD asic type
2718  *
2719  * Check if there is DC (new modesetting infrastructre) support for an asic.
2720  * returns true if DC has support, false if not.
2721  */
2722 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2723 {
2724         switch (asic_type) {
2725 #if defined(CONFIG_DRM_AMD_DC)
2726         case CHIP_BONAIRE:
2727         case CHIP_KAVERI:
2728         case CHIP_KABINI:
2729         case CHIP_MULLINS:
2730                 /*
2731                  * We have systems in the wild with these ASICs that require
2732                  * LVDS and VGA support which is not supported with DC.
2733                  *
2734                  * Fallback to the non-DC driver here by default so as not to
2735                  * cause regressions.
2736                  */
2737                 return amdgpu_dc > 0;
2738         case CHIP_HAWAII:
2739         case CHIP_CARRIZO:
2740         case CHIP_STONEY:
2741         case CHIP_POLARIS10:
2742         case CHIP_POLARIS11:
2743         case CHIP_POLARIS12:
2744         case CHIP_VEGAM:
2745         case CHIP_TONGA:
2746         case CHIP_FIJI:
2747         case CHIP_VEGA10:
2748         case CHIP_VEGA12:
2749         case CHIP_VEGA20:
2750 #if defined(CONFIG_DRM_AMD_DC_DCN)
2751         case CHIP_RAVEN:
2752         case CHIP_NAVI10:
2753         case CHIP_NAVI14:
2754         case CHIP_NAVI12:
2755         case CHIP_RENOIR:
2756 #endif
2757                 return amdgpu_dc != 0;
2758 #endif
2759         default:
2760                 if (amdgpu_dc > 0)
2761                         DRM_INFO("Display Core has been requested via kernel parameter "
2762                                          "but isn't supported by ASIC, ignoring\n");
2763                 return false;
2764         }
2765 }
2766
2767 /**
2768  * amdgpu_device_has_dc_support - check if dc is supported
2769  *
2770  * @adev: amdgpu_device_pointer
2771  *
2772  * Returns true for supported, false for not supported
2773  */
2774 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2775 {
2776         if (amdgpu_sriov_vf(adev))
2777                 return false;
2778
2779         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2780 }
2781
2782
2783 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2784 {
2785         struct amdgpu_device *adev =
2786                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2787         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2788
2789         /* It's a bug to not have a hive within this function */
2790         if (WARN_ON(!hive))
2791                 return;
2792
2793         /*
2794          * Use task barrier to synchronize all xgmi reset works across the
2795          * hive. task_barrier_enter and task_barrier_exit will block
2796          * until all the threads running the xgmi reset works reach
2797          * those points. task_barrier_full will do both blocks.
2798          */
2799         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2800
2801                 task_barrier_enter(&hive->tb);
2802                 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2803
2804                 if (adev->asic_reset_res)
2805                         goto fail;
2806
2807                 task_barrier_exit(&hive->tb);
2808                 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2809
2810                 if (adev->asic_reset_res)
2811                         goto fail;
2812
2813                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2814                         adev->mmhub.funcs->reset_ras_error_count(adev);
2815         } else {
2816
2817                 task_barrier_full(&hive->tb);
2818                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2819         }
2820
2821 fail:
2822         if (adev->asic_reset_res)
2823                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2824                          adev->asic_reset_res, adev->ddev->unique);
2825 }
2826
2827 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2828 {
2829         char *input = amdgpu_lockup_timeout;
2830         char *timeout_setting = NULL;
2831         int index = 0;
2832         long timeout;
2833         int ret = 0;
2834
2835         /*
2836          * By default timeout for non compute jobs is 10000.
2837          * And there is no timeout enforced on compute jobs.
2838          * In SR-IOV or passthrough mode, timeout for compute
2839          * jobs are 60000 by default.
2840          */
2841         adev->gfx_timeout = msecs_to_jiffies(10000);
2842         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2843         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2844                 adev->compute_timeout =  msecs_to_jiffies(60000);
2845         else
2846                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2847
2848         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2849                 while ((timeout_setting = strsep(&input, ",")) &&
2850                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2851                         ret = kstrtol(timeout_setting, 0, &timeout);
2852                         if (ret)
2853                                 return ret;
2854
2855                         if (timeout == 0) {
2856                                 index++;
2857                                 continue;
2858                         } else if (timeout < 0) {
2859                                 timeout = MAX_SCHEDULE_TIMEOUT;
2860                         } else {
2861                                 timeout = msecs_to_jiffies(timeout);
2862                         }
2863
2864                         switch (index++) {
2865                         case 0:
2866                                 adev->gfx_timeout = timeout;
2867                                 break;
2868                         case 1:
2869                                 adev->compute_timeout = timeout;
2870                                 break;
2871                         case 2:
2872                                 adev->sdma_timeout = timeout;
2873                                 break;
2874                         case 3:
2875                                 adev->video_timeout = timeout;
2876                                 break;
2877                         default:
2878                                 break;
2879                         }
2880                 }
2881                 /*
2882                  * There is only one value specified and
2883                  * it should apply to all non-compute jobs.
2884                  */
2885                 if (index == 1) {
2886                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2887                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2888                                 adev->compute_timeout = adev->gfx_timeout;
2889                 }
2890         }
2891
2892         return ret;
2893 }
2894
2895 /**
2896  * amdgpu_device_init - initialize the driver
2897  *
2898  * @adev: amdgpu_device pointer
2899  * @ddev: drm dev pointer
2900  * @pdev: pci dev pointer
2901  * @flags: driver flags
2902  *
2903  * Initializes the driver info and hw (all asics).
2904  * Returns 0 for success or an error on failure.
2905  * Called at driver startup.
2906  */
2907 int amdgpu_device_init(struct amdgpu_device *adev,
2908                        struct drm_device *ddev,
2909                        struct pci_dev *pdev,
2910                        uint32_t flags)
2911 {
2912         int r, i;
2913         bool boco = false;
2914         u32 max_MBps;
2915
2916         adev->shutdown = false;
2917         adev->dev = &pdev->dev;
2918         adev->ddev = ddev;
2919         adev->pdev = pdev;
2920         adev->flags = flags;
2921
2922         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2923                 adev->asic_type = amdgpu_force_asic_type;
2924         else
2925                 adev->asic_type = flags & AMD_ASIC_MASK;
2926
2927         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2928         if (amdgpu_emu_mode == 1)
2929                 adev->usec_timeout *= 10;
2930         adev->gmc.gart_size = 512 * 1024 * 1024;
2931         adev->accel_working = false;
2932         adev->num_rings = 0;
2933         adev->mman.buffer_funcs = NULL;
2934         adev->mman.buffer_funcs_ring = NULL;
2935         adev->vm_manager.vm_pte_funcs = NULL;
2936         adev->vm_manager.vm_pte_num_scheds = 0;
2937         adev->gmc.gmc_funcs = NULL;
2938         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2939         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
2940
2941         adev->smc_rreg = &amdgpu_invalid_rreg;
2942         adev->smc_wreg = &amdgpu_invalid_wreg;
2943         adev->pcie_rreg = &amdgpu_invalid_rreg;
2944         adev->pcie_wreg = &amdgpu_invalid_wreg;
2945         adev->pciep_rreg = &amdgpu_invalid_rreg;
2946         adev->pciep_wreg = &amdgpu_invalid_wreg;
2947         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
2948         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
2949         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
2950         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
2951         adev->didt_rreg = &amdgpu_invalid_rreg;
2952         adev->didt_wreg = &amdgpu_invalid_wreg;
2953         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
2954         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
2955         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
2956         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
2957
2958         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
2959                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
2960                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
2961
2962         /* mutex initialization are all done here so we
2963          * can recall function without having locking issues */
2964         atomic_set(&adev->irq.ih.lock, 0);
2965         mutex_init(&adev->firmware.mutex);
2966         mutex_init(&adev->pm.mutex);
2967         mutex_init(&adev->gfx.gpu_clock_mutex);
2968         mutex_init(&adev->srbm_mutex);
2969         mutex_init(&adev->gfx.pipe_reserve_mutex);
2970         mutex_init(&adev->gfx.gfx_off_mutex);
2971         mutex_init(&adev->grbm_idx_mutex);
2972         mutex_init(&adev->mn_lock);
2973         mutex_init(&adev->virt.vf_errors.lock);
2974         hash_init(adev->mn_hash);
2975         mutex_init(&adev->lock_reset);
2976         mutex_init(&adev->psp.mutex);
2977         mutex_init(&adev->notifier_lock);
2978
2979         r = amdgpu_device_check_arguments(adev);
2980         if (r)
2981                 return r;
2982
2983         spin_lock_init(&adev->mmio_idx_lock);
2984         spin_lock_init(&adev->smc_idx_lock);
2985         spin_lock_init(&adev->pcie_idx_lock);
2986         spin_lock_init(&adev->uvd_ctx_idx_lock);
2987         spin_lock_init(&adev->didt_idx_lock);
2988         spin_lock_init(&adev->gc_cac_idx_lock);
2989         spin_lock_init(&adev->se_cac_idx_lock);
2990         spin_lock_init(&adev->audio_endpt_idx_lock);
2991         spin_lock_init(&adev->mm_stats.lock);
2992
2993         INIT_LIST_HEAD(&adev->shadow_list);
2994         mutex_init(&adev->shadow_list_lock);
2995
2996         INIT_DELAYED_WORK(&adev->delayed_init_work,
2997                           amdgpu_device_delayed_init_work_handler);
2998         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
2999                           amdgpu_device_delay_enable_gfx_off);
3000
3001         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3002
3003         adev->gfx.gfx_off_req_count = 1;
3004         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3005
3006         /* Registers mapping */
3007         /* TODO: block userspace mapping of io register */
3008         if (adev->asic_type >= CHIP_BONAIRE) {
3009                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3010                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3011         } else {
3012                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3013                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3014         }
3015
3016         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3017         if (adev->rmmio == NULL) {
3018                 return -ENOMEM;
3019         }
3020         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3021         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3022
3023         /* io port mapping */
3024         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3025                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3026                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3027                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3028                         break;
3029                 }
3030         }
3031         if (adev->rio_mem == NULL)
3032                 DRM_INFO("PCI I/O BAR is not found.\n");
3033
3034         /* enable PCIE atomic ops */
3035         r = pci_enable_atomic_ops_to_root(adev->pdev,
3036                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3037                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3038         if (r) {
3039                 adev->have_atomics_support = false;
3040                 DRM_INFO("PCIE atomic ops is not supported\n");
3041         } else {
3042                 adev->have_atomics_support = true;
3043         }
3044
3045         amdgpu_device_get_pcie_info(adev);
3046
3047         if (amdgpu_mcbp)
3048                 DRM_INFO("MCBP is enabled\n");
3049
3050         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3051                 adev->enable_mes = true;
3052
3053         /* detect hw virtualization here */
3054         amdgpu_detect_virtualization(adev);
3055
3056         r = amdgpu_device_get_job_timeout_settings(adev);
3057         if (r) {
3058                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3059                 return r;
3060         }
3061
3062         /* early init functions */
3063         r = amdgpu_device_ip_early_init(adev);
3064         if (r)
3065                 return r;
3066
3067         /* doorbell bar mapping and doorbell index init*/
3068         amdgpu_device_doorbell_init(adev);
3069
3070         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3071         /* this will fail for cards that aren't VGA class devices, just
3072          * ignore it */
3073         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3074
3075         if (amdgpu_device_supports_boco(ddev))
3076                 boco = true;
3077         if (amdgpu_has_atpx() &&
3078             (amdgpu_is_atpx_hybrid() ||
3079              amdgpu_has_atpx_dgpu_power_cntl()) &&
3080             !pci_is_thunderbolt_attached(adev->pdev))
3081                 vga_switcheroo_register_client(adev->pdev,
3082                                                &amdgpu_switcheroo_ops, boco);
3083         if (boco)
3084                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3085
3086         if (amdgpu_emu_mode == 1) {
3087                 /* post the asic on emulation mode */
3088                 emu_soc_asic_init(adev);
3089                 goto fence_driver_init;
3090         }
3091
3092         /* detect if we are with an SRIOV vbios */
3093         amdgpu_device_detect_sriov_bios(adev);
3094
3095         /* check if we need to reset the asic
3096          *  E.g., driver was not cleanly unloaded previously, etc.
3097          */
3098         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3099                 r = amdgpu_asic_reset(adev);
3100                 if (r) {
3101                         dev_err(adev->dev, "asic reset on init failed\n");
3102                         goto failed;
3103                 }
3104         }
3105
3106         /* Post card if necessary */
3107         if (amdgpu_device_need_post(adev)) {
3108                 if (!adev->bios) {
3109                         dev_err(adev->dev, "no vBIOS found\n");
3110                         r = -EINVAL;
3111                         goto failed;
3112                 }
3113                 DRM_INFO("GPU posting now...\n");
3114                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3115                 if (r) {
3116                         dev_err(adev->dev, "gpu post error!\n");
3117                         goto failed;
3118                 }
3119         }
3120
3121         if (adev->is_atom_fw) {
3122                 /* Initialize clocks */
3123                 r = amdgpu_atomfirmware_get_clock_info(adev);
3124                 if (r) {
3125                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3126                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3127                         goto failed;
3128                 }
3129         } else {
3130                 /* Initialize clocks */
3131                 r = amdgpu_atombios_get_clock_info(adev);
3132                 if (r) {
3133                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3134                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3135                         goto failed;
3136                 }
3137                 /* init i2c buses */
3138                 if (!amdgpu_device_has_dc_support(adev))
3139                         amdgpu_atombios_i2c_init(adev);
3140         }
3141
3142 fence_driver_init:
3143         /* Fence driver */
3144         r = amdgpu_fence_driver_init(adev);
3145         if (r) {
3146                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3147                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3148                 goto failed;
3149         }
3150
3151         /* init the mode config */
3152         drm_mode_config_init(adev->ddev);
3153
3154         r = amdgpu_device_ip_init(adev);
3155         if (r) {
3156                 /* failed in exclusive mode due to timeout */
3157                 if (amdgpu_sriov_vf(adev) &&
3158                     !amdgpu_sriov_runtime(adev) &&
3159                     amdgpu_virt_mmio_blocked(adev) &&
3160                     !amdgpu_virt_wait_reset(adev)) {
3161                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3162                         /* Don't send request since VF is inactive. */
3163                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3164                         adev->virt.ops = NULL;
3165                         r = -EAGAIN;
3166                         goto failed;
3167                 }
3168                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3169                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3170                 goto failed;
3171         }
3172
3173         dev_info(adev->dev,
3174                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3175                         adev->gfx.config.max_shader_engines,
3176                         adev->gfx.config.max_sh_per_se,
3177                         adev->gfx.config.max_cu_per_sh,
3178                         adev->gfx.cu_info.number);
3179
3180         adev->accel_working = true;
3181
3182         amdgpu_vm_check_compute_bug(adev);
3183
3184         /* Initialize the buffer migration limit. */
3185         if (amdgpu_moverate >= 0)
3186                 max_MBps = amdgpu_moverate;
3187         else
3188                 max_MBps = 8; /* Allow 8 MB/s. */
3189         /* Get a log2 for easy divisions. */
3190         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3191
3192         amdgpu_fbdev_init(adev);
3193
3194         r = amdgpu_pm_sysfs_init(adev);
3195         if (r) {
3196                 adev->pm_sysfs_en = false;
3197                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3198         } else
3199                 adev->pm_sysfs_en = true;
3200
3201         r = amdgpu_ucode_sysfs_init(adev);
3202         if (r) {
3203                 adev->ucode_sysfs_en = false;
3204                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3205         } else
3206                 adev->ucode_sysfs_en = true;
3207
3208         if ((amdgpu_testing & 1)) {
3209                 if (adev->accel_working)
3210                         amdgpu_test_moves(adev);
3211                 else
3212                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3213         }
3214         if (amdgpu_benchmarking) {
3215                 if (adev->accel_working)
3216                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3217                 else
3218                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3219         }
3220
3221         /*
3222          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3223          * Otherwise the mgpu fan boost feature will be skipped due to the
3224          * gpu instance is counted less.
3225          */
3226         amdgpu_register_gpu_instance(adev);
3227
3228         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3229          * explicit gating rather than handling it automatically.
3230          */
3231         r = amdgpu_device_ip_late_init(adev);
3232         if (r) {
3233                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3234                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3235                 goto failed;
3236         }
3237
3238         /* must succeed. */
3239         amdgpu_ras_resume(adev);
3240
3241         queue_delayed_work(system_wq, &adev->delayed_init_work,
3242                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3243
3244         r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
3245         if (r) {
3246                 dev_err(adev->dev, "Could not create pcie_replay_count");
3247                 return r;
3248         }
3249
3250         r = device_create_file(adev->dev, &dev_attr_product_name);
3251         if (r) {
3252                 dev_err(adev->dev, "Could not create product_name");
3253                 return r;
3254         }
3255
3256         r = device_create_file(adev->dev, &dev_attr_product_number);
3257         if (r) {
3258                 dev_err(adev->dev, "Could not create product_number");
3259                 return r;
3260         }
3261
3262         r = device_create_file(adev->dev, &dev_attr_serial_number);
3263         if (r) {
3264                 dev_err(adev->dev, "Could not create serial_number");
3265                 return r;
3266         }
3267
3268         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3269                 r = amdgpu_pmu_init(adev);
3270         if (r)
3271                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3272
3273         return 0;
3274
3275 failed:
3276         amdgpu_vf_error_trans_all(adev);
3277         if (boco)
3278                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3279
3280         return r;
3281 }
3282
3283 /**
3284  * amdgpu_device_fini - tear down the driver
3285  *
3286  * @adev: amdgpu_device pointer
3287  *
3288  * Tear down the driver info (all asics).
3289  * Called at driver shutdown.
3290  */
3291 void amdgpu_device_fini(struct amdgpu_device *adev)
3292 {
3293         int r;
3294
3295         DRM_INFO("amdgpu: finishing device.\n");
3296         flush_delayed_work(&adev->delayed_init_work);
3297         adev->shutdown = true;
3298
3299         /* make sure IB test finished before entering exclusive mode
3300          * to avoid preemption on IB test
3301          * */
3302         if (amdgpu_sriov_vf(adev))
3303                 amdgpu_virt_request_full_gpu(adev, false);
3304
3305         /* disable all interrupts */
3306         amdgpu_irq_disable_all(adev);
3307         if (adev->mode_info.mode_config_initialized){
3308                 if (!amdgpu_device_has_dc_support(adev))
3309                         drm_helper_force_disable_all(adev->ddev);
3310                 else
3311                         drm_atomic_helper_shutdown(adev->ddev);
3312         }
3313         amdgpu_fence_driver_fini(adev);
3314         if (adev->pm_sysfs_en)
3315                 amdgpu_pm_sysfs_fini(adev);
3316         amdgpu_fbdev_fini(adev);
3317         r = amdgpu_device_ip_fini(adev);
3318         if (adev->firmware.gpu_info_fw) {
3319                 release_firmware(adev->firmware.gpu_info_fw);
3320                 adev->firmware.gpu_info_fw = NULL;
3321         }
3322         adev->accel_working = false;
3323         /* free i2c buses */
3324         if (!amdgpu_device_has_dc_support(adev))
3325                 amdgpu_i2c_fini(adev);
3326
3327         if (amdgpu_emu_mode != 1)
3328                 amdgpu_atombios_fini(adev);
3329
3330         kfree(adev->bios);
3331         adev->bios = NULL;
3332         if (amdgpu_has_atpx() &&
3333             (amdgpu_is_atpx_hybrid() ||
3334              amdgpu_has_atpx_dgpu_power_cntl()) &&
3335             !pci_is_thunderbolt_attached(adev->pdev))
3336                 vga_switcheroo_unregister_client(adev->pdev);
3337         if (amdgpu_device_supports_boco(adev->ddev))
3338                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3339         vga_client_register(adev->pdev, NULL, NULL, NULL);
3340         if (adev->rio_mem)
3341                 pci_iounmap(adev->pdev, adev->rio_mem);
3342         adev->rio_mem = NULL;
3343         iounmap(adev->rmmio);
3344         adev->rmmio = NULL;
3345         amdgpu_device_doorbell_fini(adev);
3346
3347         device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
3348         if (adev->ucode_sysfs_en)
3349                 amdgpu_ucode_sysfs_fini(adev);
3350         device_remove_file(adev->dev, &dev_attr_product_name);
3351         device_remove_file(adev->dev, &dev_attr_product_number);
3352         device_remove_file(adev->dev, &dev_attr_serial_number);
3353         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3354                 amdgpu_pmu_fini(adev);
3355         if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
3356                 amdgpu_discovery_fini(adev);
3357 }
3358
3359
3360 /*
3361  * Suspend & resume.
3362  */
3363 /**
3364  * amdgpu_device_suspend - initiate device suspend
3365  *
3366  * @dev: drm dev pointer
3367  * @suspend: suspend state
3368  * @fbcon : notify the fbdev of suspend
3369  *
3370  * Puts the hw in the suspend state (all asics).
3371  * Returns 0 for success or an error on failure.
3372  * Called at driver suspend.
3373  */
3374 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3375 {
3376         struct amdgpu_device *adev;
3377         struct drm_crtc *crtc;
3378         struct drm_connector *connector;
3379         struct drm_connector_list_iter iter;
3380         int r;
3381
3382         if (dev == NULL || dev->dev_private == NULL) {
3383                 return -ENODEV;
3384         }
3385
3386         adev = dev->dev_private;
3387
3388         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3389                 return 0;
3390
3391         adev->in_suspend = true;
3392         drm_kms_helper_poll_disable(dev);
3393
3394         if (fbcon)
3395                 amdgpu_fbdev_set_suspend(adev, 1);
3396
3397         cancel_delayed_work_sync(&adev->delayed_init_work);
3398
3399         if (!amdgpu_device_has_dc_support(adev)) {
3400                 /* turn off display hw */
3401                 drm_modeset_lock_all(dev);
3402                 drm_connector_list_iter_begin(dev, &iter);
3403                 drm_for_each_connector_iter(connector, &iter)
3404                         drm_helper_connector_dpms(connector,
3405                                                   DRM_MODE_DPMS_OFF);
3406                 drm_connector_list_iter_end(&iter);
3407                 drm_modeset_unlock_all(dev);
3408                         /* unpin the front buffers and cursors */
3409                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3410                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3411                         struct drm_framebuffer *fb = crtc->primary->fb;
3412                         struct amdgpu_bo *robj;
3413
3414                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3415                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3416                                 r = amdgpu_bo_reserve(aobj, true);
3417                                 if (r == 0) {
3418                                         amdgpu_bo_unpin(aobj);
3419                                         amdgpu_bo_unreserve(aobj);
3420                                 }
3421                         }
3422
3423                         if (fb == NULL || fb->obj[0] == NULL) {
3424                                 continue;
3425                         }
3426                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3427                         /* don't unpin kernel fb objects */
3428                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3429                                 r = amdgpu_bo_reserve(robj, true);
3430                                 if (r == 0) {
3431                                         amdgpu_bo_unpin(robj);
3432                                         amdgpu_bo_unreserve(robj);
3433                                 }
3434                         }
3435                 }
3436         }
3437
3438         amdgpu_ras_suspend(adev);
3439
3440         r = amdgpu_device_ip_suspend_phase1(adev);
3441
3442         amdgpu_amdkfd_suspend(adev, !fbcon);
3443
3444         /* evict vram memory */
3445         amdgpu_bo_evict_vram(adev);
3446
3447         amdgpu_fence_driver_suspend(adev);
3448
3449         r = amdgpu_device_ip_suspend_phase2(adev);
3450
3451         /* evict remaining vram memory
3452          * This second call to evict vram is to evict the gart page table
3453          * using the CPU.
3454          */
3455         amdgpu_bo_evict_vram(adev);
3456
3457         return 0;
3458 }
3459
3460 /**
3461  * amdgpu_device_resume - initiate device resume
3462  *
3463  * @dev: drm dev pointer
3464  * @resume: resume state
3465  * @fbcon : notify the fbdev of resume
3466  *
3467  * Bring the hw back to operating state (all asics).
3468  * Returns 0 for success or an error on failure.
3469  * Called at driver resume.
3470  */
3471 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3472 {
3473         struct drm_connector *connector;
3474         struct drm_connector_list_iter iter;
3475         struct amdgpu_device *adev = dev->dev_private;
3476         struct drm_crtc *crtc;
3477         int r = 0;
3478
3479         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3480                 return 0;
3481
3482         /* post card */
3483         if (amdgpu_device_need_post(adev)) {
3484                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3485                 if (r)
3486                         DRM_ERROR("amdgpu asic init failed\n");
3487         }
3488
3489         r = amdgpu_device_ip_resume(adev);
3490         if (r) {
3491                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3492                 return r;
3493         }
3494         amdgpu_fence_driver_resume(adev);
3495
3496
3497         r = amdgpu_device_ip_late_init(adev);
3498         if (r)
3499                 return r;
3500
3501         queue_delayed_work(system_wq, &adev->delayed_init_work,
3502                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3503
3504         if (!amdgpu_device_has_dc_support(adev)) {
3505                 /* pin cursors */
3506                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3507                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3508
3509                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3510                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3511                                 r = amdgpu_bo_reserve(aobj, true);
3512                                 if (r == 0) {
3513                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3514                                         if (r != 0)
3515                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3516                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3517                                         amdgpu_bo_unreserve(aobj);
3518                                 }
3519                         }
3520                 }
3521         }
3522         r = amdgpu_amdkfd_resume(adev, !fbcon);
3523         if (r)
3524                 return r;
3525
3526         /* Make sure IB tests flushed */
3527         flush_delayed_work(&adev->delayed_init_work);
3528
3529         /* blat the mode back in */
3530         if (fbcon) {
3531                 if (!amdgpu_device_has_dc_support(adev)) {
3532                         /* pre DCE11 */
3533                         drm_helper_resume_force_mode(dev);
3534
3535                         /* turn on display hw */
3536                         drm_modeset_lock_all(dev);
3537
3538                         drm_connector_list_iter_begin(dev, &iter);
3539                         drm_for_each_connector_iter(connector, &iter)
3540                                 drm_helper_connector_dpms(connector,
3541                                                           DRM_MODE_DPMS_ON);
3542                         drm_connector_list_iter_end(&iter);
3543
3544                         drm_modeset_unlock_all(dev);
3545                 }
3546                 amdgpu_fbdev_set_suspend(adev, 0);
3547         }
3548
3549         drm_kms_helper_poll_enable(dev);
3550
3551         amdgpu_ras_resume(adev);
3552
3553         /*
3554          * Most of the connector probing functions try to acquire runtime pm
3555          * refs to ensure that the GPU is powered on when connector polling is
3556          * performed. Since we're calling this from a runtime PM callback,
3557          * trying to acquire rpm refs will cause us to deadlock.
3558          *
3559          * Since we're guaranteed to be holding the rpm lock, it's safe to
3560          * temporarily disable the rpm helpers so this doesn't deadlock us.
3561          */
3562 #ifdef CONFIG_PM
3563         dev->dev->power.disable_depth++;
3564 #endif
3565         if (!amdgpu_device_has_dc_support(adev))
3566                 drm_helper_hpd_irq_event(dev);
3567         else
3568                 drm_kms_helper_hotplug_event(dev);
3569 #ifdef CONFIG_PM
3570         dev->dev->power.disable_depth--;
3571 #endif
3572         adev->in_suspend = false;
3573
3574         return 0;
3575 }
3576
3577 /**
3578  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3579  *
3580  * @adev: amdgpu_device pointer
3581  *
3582  * The list of all the hardware IPs that make up the asic is walked and
3583  * the check_soft_reset callbacks are run.  check_soft_reset determines
3584  * if the asic is still hung or not.
3585  * Returns true if any of the IPs are still in a hung state, false if not.
3586  */
3587 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3588 {
3589         int i;
3590         bool asic_hang = false;
3591
3592         if (amdgpu_sriov_vf(adev))
3593                 return true;
3594
3595         if (amdgpu_asic_need_full_reset(adev))
3596                 return true;
3597
3598         for (i = 0; i < adev->num_ip_blocks; i++) {
3599                 if (!adev->ip_blocks[i].status.valid)
3600                         continue;
3601                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3602                         adev->ip_blocks[i].status.hang =
3603                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3604                 if (adev->ip_blocks[i].status.hang) {
3605                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3606                         asic_hang = true;
3607                 }
3608         }
3609         return asic_hang;
3610 }
3611
3612 /**
3613  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3614  *
3615  * @adev: amdgpu_device pointer
3616  *
3617  * The list of all the hardware IPs that make up the asic is walked and the
3618  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3619  * handles any IP specific hardware or software state changes that are
3620  * necessary for a soft reset to succeed.
3621  * Returns 0 on success, negative error code on failure.
3622  */
3623 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3624 {
3625         int i, r = 0;
3626
3627         for (i = 0; i < adev->num_ip_blocks; i++) {
3628                 if (!adev->ip_blocks[i].status.valid)
3629                         continue;
3630                 if (adev->ip_blocks[i].status.hang &&
3631                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3632                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3633                         if (r)
3634                                 return r;
3635                 }
3636         }
3637
3638         return 0;
3639 }
3640
3641 /**
3642  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3643  *
3644  * @adev: amdgpu_device pointer
3645  *
3646  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3647  * reset is necessary to recover.
3648  * Returns true if a full asic reset is required, false if not.
3649  */
3650 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3651 {
3652         int i;
3653
3654         if (amdgpu_asic_need_full_reset(adev))
3655                 return true;
3656
3657         for (i = 0; i < adev->num_ip_blocks; i++) {
3658                 if (!adev->ip_blocks[i].status.valid)
3659                         continue;
3660                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3661                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3662                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3663                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3664                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3665                         if (adev->ip_blocks[i].status.hang) {
3666                                 DRM_INFO("Some block need full reset!\n");
3667                                 return true;
3668                         }
3669                 }
3670         }
3671         return false;
3672 }
3673
3674 /**
3675  * amdgpu_device_ip_soft_reset - do a soft reset
3676  *
3677  * @adev: amdgpu_device pointer
3678  *
3679  * The list of all the hardware IPs that make up the asic is walked and the
3680  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3681  * IP specific hardware or software state changes that are necessary to soft
3682  * reset the IP.
3683  * Returns 0 on success, negative error code on failure.
3684  */
3685 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3686 {
3687         int i, r = 0;
3688
3689         for (i = 0; i < adev->num_ip_blocks; i++) {
3690                 if (!adev->ip_blocks[i].status.valid)
3691                         continue;
3692                 if (adev->ip_blocks[i].status.hang &&
3693                     adev->ip_blocks[i].version->funcs->soft_reset) {
3694                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3695                         if (r)
3696                                 return r;
3697                 }
3698         }
3699
3700         return 0;
3701 }
3702
3703 /**
3704  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3705  *
3706  * @adev: amdgpu_device pointer
3707  *
3708  * The list of all the hardware IPs that make up the asic is walked and the
3709  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3710  * handles any IP specific hardware or software state changes that are
3711  * necessary after the IP has been soft reset.
3712  * Returns 0 on success, negative error code on failure.
3713  */
3714 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3715 {
3716         int i, r = 0;
3717
3718         for (i = 0; i < adev->num_ip_blocks; i++) {
3719                 if (!adev->ip_blocks[i].status.valid)
3720                         continue;
3721                 if (adev->ip_blocks[i].status.hang &&
3722                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3723                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3724                 if (r)
3725                         return r;
3726         }
3727
3728         return 0;
3729 }
3730
3731 /**
3732  * amdgpu_device_recover_vram - Recover some VRAM contents
3733  *
3734  * @adev: amdgpu_device pointer
3735  *
3736  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3737  * restore things like GPUVM page tables after a GPU reset where
3738  * the contents of VRAM might be lost.
3739  *
3740  * Returns:
3741  * 0 on success, negative error code on failure.
3742  */
3743 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3744 {
3745         struct dma_fence *fence = NULL, *next = NULL;
3746         struct amdgpu_bo *shadow;
3747         long r = 1, tmo;
3748
3749         if (amdgpu_sriov_runtime(adev))
3750                 tmo = msecs_to_jiffies(8000);
3751         else
3752                 tmo = msecs_to_jiffies(100);
3753
3754         DRM_INFO("recover vram bo from shadow start\n");
3755         mutex_lock(&adev->shadow_list_lock);
3756         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3757
3758                 /* No need to recover an evicted BO */
3759                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3760                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3761                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3762                         continue;
3763
3764                 r = amdgpu_bo_restore_shadow(shadow, &next);
3765                 if (r)
3766                         break;
3767
3768                 if (fence) {
3769                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3770                         dma_fence_put(fence);
3771                         fence = next;
3772                         if (tmo == 0) {
3773                                 r = -ETIMEDOUT;
3774                                 break;
3775                         } else if (tmo < 0) {
3776                                 r = tmo;
3777                                 break;
3778                         }
3779                 } else {
3780                         fence = next;
3781                 }
3782         }
3783         mutex_unlock(&adev->shadow_list_lock);
3784
3785         if (fence)
3786                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3787         dma_fence_put(fence);
3788
3789         if (r < 0 || tmo <= 0) {
3790                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3791                 return -EIO;
3792         }
3793
3794         DRM_INFO("recover vram bo from shadow done\n");
3795         return 0;
3796 }
3797
3798
3799 /**
3800  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3801  *
3802  * @adev: amdgpu device pointer
3803  * @from_hypervisor: request from hypervisor
3804  *
3805  * do VF FLR and reinitialize Asic
3806  * return 0 means succeeded otherwise failed
3807  */
3808 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3809                                      bool from_hypervisor)
3810 {
3811         int r;
3812
3813         if (from_hypervisor)
3814                 r = amdgpu_virt_request_full_gpu(adev, true);
3815         else
3816                 r = amdgpu_virt_reset_gpu(adev);
3817         if (r)
3818                 return r;
3819
3820         amdgpu_amdkfd_pre_reset(adev);
3821
3822         /* Resume IP prior to SMC */
3823         r = amdgpu_device_ip_reinit_early_sriov(adev);
3824         if (r)
3825                 goto error;
3826
3827         amdgpu_virt_init_data_exchange(adev);
3828         /* we need recover gart prior to run SMC/CP/SDMA resume */
3829         amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3830
3831         r = amdgpu_device_fw_loading(adev);
3832         if (r)
3833                 return r;
3834
3835         /* now we are okay to resume SMC/CP/SDMA */
3836         r = amdgpu_device_ip_reinit_late_sriov(adev);
3837         if (r)
3838                 goto error;
3839
3840         amdgpu_irq_gpu_reset_resume_helper(adev);
3841         r = amdgpu_ib_ring_tests(adev);
3842         amdgpu_amdkfd_post_reset(adev);
3843
3844 error:
3845         amdgpu_virt_release_full_gpu(adev, true);
3846         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3847                 amdgpu_inc_vram_lost(adev);
3848                 r = amdgpu_device_recover_vram(adev);
3849         }
3850
3851         return r;
3852 }
3853
3854 /**
3855  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3856  *
3857  * @adev: amdgpu device pointer
3858  *
3859  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3860  * a hung GPU.
3861  */
3862 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3863 {
3864         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3865                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3866                 return false;
3867         }
3868
3869         if (amdgpu_gpu_recovery == 0)
3870                 goto disabled;
3871
3872         if (amdgpu_sriov_vf(adev))
3873                 return true;
3874
3875         if (amdgpu_gpu_recovery == -1) {
3876                 switch (adev->asic_type) {
3877                 case CHIP_BONAIRE:
3878                 case CHIP_HAWAII:
3879                 case CHIP_TOPAZ:
3880                 case CHIP_TONGA:
3881                 case CHIP_FIJI:
3882                 case CHIP_POLARIS10:
3883                 case CHIP_POLARIS11:
3884                 case CHIP_POLARIS12:
3885                 case CHIP_VEGAM:
3886                 case CHIP_VEGA20:
3887                 case CHIP_VEGA10:
3888                 case CHIP_VEGA12:
3889                 case CHIP_RAVEN:
3890                 case CHIP_ARCTURUS:
3891                 case CHIP_RENOIR:
3892                 case CHIP_NAVI10:
3893                 case CHIP_NAVI14:
3894                 case CHIP_NAVI12:
3895                         break;
3896                 default:
3897                         goto disabled;
3898                 }
3899         }
3900
3901         return true;
3902
3903 disabled:
3904                 DRM_INFO("GPU recovery disabled.\n");
3905                 return false;
3906 }
3907
3908
3909 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3910                                         struct amdgpu_job *job,
3911                                         bool *need_full_reset_arg)
3912 {
3913         int i, r = 0;
3914         bool need_full_reset  = *need_full_reset_arg;
3915
3916         /* block all schedulers and reset given job's ring */
3917         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3918                 struct amdgpu_ring *ring = adev->rings[i];
3919
3920                 if (!ring || !ring->sched.thread)
3921                         continue;
3922
3923                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3924                 amdgpu_fence_driver_force_completion(ring);
3925         }
3926
3927         if(job)
3928                 drm_sched_increase_karma(&job->base);
3929
3930         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3931         if (!amdgpu_sriov_vf(adev)) {
3932
3933                 if (!need_full_reset)
3934                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3935
3936                 if (!need_full_reset) {
3937                         amdgpu_device_ip_pre_soft_reset(adev);
3938                         r = amdgpu_device_ip_soft_reset(adev);
3939                         amdgpu_device_ip_post_soft_reset(adev);
3940                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3941                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
3942                                 need_full_reset = true;
3943                         }
3944                 }
3945
3946                 if (need_full_reset)
3947                         r = amdgpu_device_ip_suspend(adev);
3948
3949                 *need_full_reset_arg = need_full_reset;
3950         }
3951
3952         return r;
3953 }
3954
3955 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
3956                                struct list_head *device_list_handle,
3957                                bool *need_full_reset_arg)
3958 {
3959         struct amdgpu_device *tmp_adev = NULL;
3960         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
3961         int r = 0;
3962
3963         /*
3964          * ASIC reset has to be done on all HGMI hive nodes ASAP
3965          * to allow proper links negotiation in FW (within 1 sec)
3966          */
3967         if (need_full_reset) {
3968                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3969                         /* For XGMI run all resets in parallel to speed up the process */
3970                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3971                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
3972                                         r = -EALREADY;
3973                         } else
3974                                 r = amdgpu_asic_reset(tmp_adev);
3975
3976                         if (r) {
3977                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
3978                                          r, tmp_adev->ddev->unique);
3979                                 break;
3980                         }
3981                 }
3982
3983                 /* For XGMI wait for all resets to complete before proceed */
3984                 if (!r) {
3985                         list_for_each_entry(tmp_adev, device_list_handle,
3986                                             gmc.xgmi.head) {
3987                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3988                                         flush_work(&tmp_adev->xgmi_reset_work);
3989                                         r = tmp_adev->asic_reset_res;
3990                                         if (r)
3991                                                 break;
3992                                 }
3993                         }
3994                 }
3995         }
3996
3997         if (!r && amdgpu_ras_intr_triggered()) {
3998                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3999                         if (tmp_adev->mmhub.funcs &&
4000                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4001                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4002                 }
4003
4004                 amdgpu_ras_intr_cleared();
4005         }
4006
4007         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4008                 if (need_full_reset) {
4009                         /* post card */
4010                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4011                                 DRM_WARN("asic atom init failed!");
4012
4013                         if (!r) {
4014                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4015                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4016                                 if (r)
4017                                         goto out;
4018
4019                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4020                                 if (vram_lost) {
4021                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4022                                         amdgpu_inc_vram_lost(tmp_adev);
4023                                 }
4024
4025                                 r = amdgpu_gtt_mgr_recover(
4026                                         &tmp_adev->mman.bdev.man[TTM_PL_TT]);
4027                                 if (r)
4028                                         goto out;
4029
4030                                 r = amdgpu_device_fw_loading(tmp_adev);
4031                                 if (r)
4032                                         return r;
4033
4034                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4035                                 if (r)
4036                                         goto out;
4037
4038                                 if (vram_lost)
4039                                         amdgpu_device_fill_reset_magic(tmp_adev);
4040
4041                                 /*
4042                                  * Add this ASIC as tracked as reset was already
4043                                  * complete successfully.
4044                                  */
4045                                 amdgpu_register_gpu_instance(tmp_adev);
4046
4047                                 r = amdgpu_device_ip_late_init(tmp_adev);
4048                                 if (r)
4049                                         goto out;
4050
4051                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4052
4053                                 /* must succeed. */
4054                                 amdgpu_ras_resume(tmp_adev);
4055
4056                                 /* Update PSP FW topology after reset */
4057                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4058                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4059                         }
4060                 }
4061
4062
4063 out:
4064                 if (!r) {
4065                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4066                         r = amdgpu_ib_ring_tests(tmp_adev);
4067                         if (r) {
4068                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4069                                 r = amdgpu_device_ip_suspend(tmp_adev);
4070                                 need_full_reset = true;
4071                                 r = -EAGAIN;
4072                                 goto end;
4073                         }
4074                 }
4075
4076                 if (!r)
4077                         r = amdgpu_device_recover_vram(tmp_adev);
4078                 else
4079                         tmp_adev->asic_reset_res = r;
4080         }
4081
4082 end:
4083         *need_full_reset_arg = need_full_reset;
4084         return r;
4085 }
4086
     /**
      * amdgpu_device_lock_adev - take the reset lock and mark the device as in reset
      *
      * @adev: amdgpu device pointer
      * @trylock: when true only try to take adev->lock_reset and fail if it is
      *           contended; when false block until it is acquired
      *
      * Once the lock is held, bumps adev->gpu_reset_counter, sets
      * adev->in_gpu_reset and derives adev->mp1_state from the reset method
      * reported by amdgpu_asic_reset_method().
      * Returns true with adev->lock_reset held, or false (nothing modified)
      * if @trylock was set and the lock could not be taken.
      */
4087 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4088 {
             enum amd_reset_method method;

             /* the only failure mode: a non-blocking attempt on a contended lock */
             if (trylock && !mutex_trylock(&adev->lock_reset))
4091                 return false;

             if (!trylock)
4093                 mutex_lock(&adev->lock_reset);
4094
4095         atomic_inc(&adev->gpu_reset_counter);
4096         adev->in_gpu_reset = true;

             /* MODE1 and MODE2 get a dedicated MP1 state, everything else NONE */
             method = amdgpu_asic_reset_method(adev);
             if (method == AMD_RESET_METHOD_MODE1)
4099                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
             else if (method == AMD_RESET_METHOD_MODE2)
4102                 adev->mp1_state = PP_MP1_STATE_RESET;
             else
4105                 adev->mp1_state = PP_MP1_STATE_NONE;
4108
4109         return true;
4110 }
4111
     /**
      * amdgpu_device_unlock_adev - leave the reset state and drop the reset lock
      *
      * @adev: amdgpu device pointer
      *
      * Reverts the per-device state set by amdgpu_device_lock_adev():
      * adev->mp1_state goes back to PP_MP1_STATE_NONE, adev->in_gpu_reset is
      * cleared and adev->lock_reset is released.  adev->gpu_reset_counter is
      * left untouched.  The caller must hold adev->lock_reset.
      */
4112 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4113 {
             /* NOTE(review): presumably pushes queued VF error records out - confirm */
4114         amdgpu_vf_error_trans_all(adev);
4115         adev->mp1_state = PP_MP1_STATE_NONE;
4116         adev->in_gpu_reset = false;
             /* unlock last, after the reset state has been cleared */
4117         mutex_unlock(&adev->lock_reset);
4118 }
4119
     /**
      * amdgpu_device_resume_display_audio - re-enable runtime PM of the display audio device
      *
      * @adev: amdgpu device pointer
      *
      * Looks up the PCI device at devfn 1 on the GPU's own bus (the function
      * amdgpu_device_suspend_display_audio() operates on) and, if it exists,
      * re-enables runtime PM for it and resumes it.  Does nothing when no
      * such device is present.
      */
4120 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4121 {
4122         struct pci_dev *p = NULL;
4123
4124         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4125                         adev->pdev->bus->number, 1);
4126         if (p) {
4127                 pm_runtime_enable(&(p->dev));
4128                 pm_runtime_resume(&(p->dev));
4129         }

             /*
              * pci_get_domain_bus_and_slot() hands back a referenced device;
              * drop that reference (pci_dev_put() accepts NULL).
              */
             pci_dev_put(p);
4130 }
4131
     /**
      * amdgpu_device_suspend_display_audio - runtime-suspend the display audio device
      *
      * @adev: amdgpu device pointer
      *
      * Only acts when the reset method is BACO or MODE1.  Looks up the PCI
      * device at devfn 1 on the GPU's own bus, keeps calling
      * pm_runtime_suspend() on it until it reports suspended or the deadline
      * passes, and then disables runtime PM for it so it stays suspended.
      *
      * Returns 0 on success, -EINVAL for any other reset method, -ENODEV when
      * the device does not exist and -ETIMEDOUT when it could not be suspended
      * before the deadline.  The device reference taken for the lookup is
      * dropped on every path.
      */
4132 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4133 {
4134         enum amd_reset_method reset_method;
4135         struct pci_dev *p = NULL;
4136         u64 expires;
4137
4138         /*
4139          * For now, only BACO and mode1 reset are confirmed
4140          * to suffer the audio issue without proper suspended.
4141          */
4142         reset_method = amdgpu_asic_reset_method(adev);
4143         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4144              (reset_method != AMD_RESET_METHOD_MODE1))
4145                 return -EINVAL;
4146
4147         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4148                         adev->pdev->bus->number, 1);
4149         if (!p)
4150                 return -ENODEV;
4151
4152         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4153         if (!expires)
4154                 /*
4155                  * If we cannot get the audio device autosuspend delay,
4156                  * a fixed 4S interval will be used. Considering 3S is
4157                  * the audio controller default autosuspend delay setting.
4158                  * 4S used here is guaranteed to cover that.
4159                  */
4160                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4L;
4161
4162         while (!pm_runtime_status_suspended(&(p->dev))) {
4163                 if (!pm_runtime_suspend(&(p->dev)))
4164                         break;
4165
4166                 if (expires < ktime_get_mono_fast_ns()) {
4167                         dev_warn(adev->dev, "failed to suspend display audio\n");
                             /* release the reference taken by the lookup above */
                             pci_dev_put(p);
4168                         /* TODO: abort the succeeding gpu reset? */
4169                         return -ETIMEDOUT;
4170                 }
4171         }
4172
4173         pm_runtime_disable(&(p->dev));
4174
             /*
              * The resume side looks the device up again, so the reference
              * taken here is not needed past this point.
              */
             pci_dev_put(p);
4175         return 0;
4176 }
4177
4178 /**
4179  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4180  *
4181  * @adev: amdgpu device pointer
4182  * @job: which job trigger hang, may be NULL
4183  *
4184  * Attempt to reset the GPU if it has hung (all asics).
4185  * Attempt to do soft-reset or full-reset and reinitialize Asic
4186  * Returns 0 for success or an error on failure.
      * 0 is also returned when the recovery is skipped because another reset
      * is already in progress for the device or its XGMI hive.
4187  */
4188
4189 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4190                               struct amdgpu_job *job)
4191 {
4192         struct list_head device_list, *device_list_handle =  NULL;
4193         bool need_full_reset = false;
4194         bool job_signaled = false;
4195         struct amdgpu_hive_info *hive = NULL;
4196         struct amdgpu_device *tmp_adev = NULL;
4197         int i, r = 0;
4198         bool in_ras_intr = amdgpu_ras_intr_triggered();
4199         bool use_baco =
4200                 (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
4201                 true : false;
4202         bool audio_suspended = false;
4203
4204         /*
4205          * Flush RAM to disk so that after reboot
4206          * the user can read log and see why the system rebooted.
4207          */
4208         if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
4209
4210                 DRM_WARN("Emergency reboot.");
4211
4212                 ksys_sync_helper();
4213                 emergency_restart();
4214         }
4215
4216         dev_info(adev->dev, "GPU %s begin!\n",
4217                 (in_ras_intr && !use_baco) ? "jobs stop":"reset");
4218
4219         /*
4220          * Here we trylock to avoid chain of resets executing from
4221          * either trigger by jobs on different adevs in XGMI hive or jobs on
4222          * different schedulers for same device while this TO handler is running.
4223          * We always reset all schedulers for device and all devices for XGMI
4224          * hive so that should take care of them too.
4225          */
             /*
              * NOTE(review): hive->hive_lock appears to be held when a hive is
              * returned (it is dropped on every exit below) - confirm against
              * amdgpu_get_xgmi_hive().
              */
4226         hive = amdgpu_get_xgmi_hive(adev, true);
4227         if (hive && !mutex_trylock(&hive->reset_lock)) {
4228                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4229                           job ? job->base.id : -1, hive->hive_id);
4230                 mutex_unlock(&hive->hive_lock);
4231                 return 0;
4232         }
4233
4234         /*
4235          * Build list of devices to reset.
4236          * In case we are in XGMI hive mode, resort the device list
4237          * to put adev in the 1st position.
4238          */
4239         INIT_LIST_HEAD(&device_list);
4240         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4241                 if (!hive)
4242                         return -ENODEV;
4243                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4244                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4245                 device_list_handle = &hive->device_list;
4246         } else {
4247                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4248                 device_list_handle = &device_list;
4249         }
4250
4251         /* block all schedulers and reset given job's ring */
4252         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4253                 if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
4254                         DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4255                                   job ? job->base.id : -1);
                             /*
                              * The device lock is only try-locked when there
                              * is no hive, so hive is NULL whenever locking
                              * fails; never dereference it unconditionally.
                              */
                             if (hive) {
                                     mutex_unlock(&hive->reset_lock);
4256                                 mutex_unlock(&hive->hive_lock);
                             }
4257                         return 0;
4258                 }
4259
4260                 /*
4261                  * Try to put the audio codec into suspend state
4262                  * before gpu reset started.
4263                  *
4264                  * Due to the power domain of the graphics device
4265                  * is shared with AZ power domain. Without this,
4266                  * we may change the audio hardware from behind
4267                  * the audio driver's back. That will trigger
4268                  * some audio codec errors.
4269                  */
4270                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4271                         audio_suspended = true;
4272
4273                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4274
4275                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4276
4277                 if (!amdgpu_sriov_vf(tmp_adev))
4278                         amdgpu_amdkfd_pre_reset(tmp_adev);
4279
4280                 /*
4281                  * Mark these ASICs to be reseted as untracked first
4282                  * And add them back after reset completed
4283                  */
4284                 amdgpu_unregister_gpu_instance(tmp_adev);
4285
4286                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4287
4288                 /* disable ras on ALL IPs */
4289                 if (!(in_ras_intr && !use_baco) &&
4290                       amdgpu_device_ip_need_full_reset(tmp_adev))
4291                         amdgpu_ras_suspend(tmp_adev);
4292
4293                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4294                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4295
4296                         if (!ring || !ring->sched.thread)
4297                                 continue;
4298
4299                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4300
4301                         if (in_ras_intr && !use_baco)
4302                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4303                 }
4304         }
4305
             /* RAS interrupt without BACO: jobs were only stopped, skip the reset */
4306         if (in_ras_intr && !use_baco)
4307                 goto skip_sched_resume;
4308
4309         /*
4310          * Must check guilty signal here since after this point all old
4311          * HW fences are force signaled.
4312          *
4313          * job->base holds a reference to parent fence
4314          */
4315         if (job && job->base.s_fence->parent &&
4316             dma_fence_is_signaled(job->base.s_fence->parent)) {
4317                 job_signaled = true;
4318                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4319                 goto skip_hw_reset;
4320         }
4321
4322 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4323         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4324                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4325                                                  NULL,
4326                                                  &need_full_reset);
4327                 /*TODO Should we stop ?*/
4328                 if (r) {
4329                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4330                                   r, tmp_adev->ddev->unique);
4331                         tmp_adev->asic_reset_res = r;
4332                 }
4333         }
4334
4335         /* Actual ASIC resets if needed.*/
4336         /* TODO Implement XGMI hive reset logic for SRIOV */
4337         if (amdgpu_sriov_vf(adev)) {
4338                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4339                 if (r)
4340                         adev->asic_reset_res = r;
4341         } else {
4342                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4343                 if (r && r == -EAGAIN)
4344                         goto retry;
4345         }
4346
4347 skip_hw_reset:
4348
4349         /* Post ASIC reset for all devs .*/
4350         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4351
4352                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4353                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4354
4355                         if (!ring || !ring->sched.thread)
4356                                 continue;
4357
4358                         /* No point to resubmit jobs if we didn't HW reset*/
4359                         if (!tmp_adev->asic_reset_res && !job_signaled)
4360                                 drm_sched_resubmit_jobs(&ring->sched);
4361
4362                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4363                 }
4364
4365                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4366                         drm_helper_resume_force_mode(tmp_adev->ddev);
4367                 }
4368
4369                 tmp_adev->asic_reset_res = 0;
4370
4371                 if (r) {
4372                         /* bad news, how to tell it to userspace ? */
4373                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4374                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4375                 } else {
4376                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4377                 }
4378         }
4379
4380 skip_sched_resume:
4381         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4382                 /*unlock kfd: SRIOV would do it separately */
4383                 if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
4384                         amdgpu_amdkfd_post_reset(tmp_adev);
4385                 if (audio_suspended)
4386                         amdgpu_device_resume_display_audio(tmp_adev);
4387                 amdgpu_device_unlock_adev(tmp_adev);
4388         }
4389
4390         if (hive) {
4391                 mutex_unlock(&hive->reset_lock);
4392                 mutex_unlock(&hive->hive_lock);
4393         }
4394
4395         if (r)
4396                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4397         return r;
4398 }
4399
4400 /**
4401  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4402  *
4403  * @adev: amdgpu_device pointer
4404  *
4405  * Fetches and stores in the driver the PCIE capabilities (gen speed
4406  * and lanes) of the slot the device is in. Handles APUs and
4407  * virtualized environments where PCIE config space may not be available.
      *
      * A non-zero amdgpu_pcie_gen_cap or amdgpu_pcie_lane_cap replaces the
      * corresponding mask, and a mask that is already non-zero is never
      * recomputed from the hardware.
4408  */
4409 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4410 {
             enum pci_bus_speed platform_speed;
             enum pcie_link_width platform_width;
4414
4415         if (amdgpu_pcie_gen_cap)
4416                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4417
4418         if (amdgpu_pcie_lane_cap)
4419                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4420
4421         /* covers APUs as well */
4422         if (pci_is_root_bus(adev->pdev->bus)) {
                     if (!adev->pm.pcie_gen_mask)
4424                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
                     if (!adev->pm.pcie_mlw_mask)
4426                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4427                 return;
4428         }
4429
             /* both masks already known: no need to query the link at all */
4430         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4431                 return;
4432
             pcie_bandwidth_available(adev->pdev, NULL,
                                      &platform_speed, &platform_width);
4435
             if (!adev->pm.pcie_gen_mask) {
4437                 /* asic caps */
                     switch (pcie_get_speed_cap(adev->pdev)) {
                     case PCI_SPEED_UNKNOWN:
4441                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4442                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4443                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
                             break;
                     case PCIE_SPEED_16_0GT:
4446                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4447                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4448                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4449                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
                             break;
                     case PCIE_SPEED_8_0GT:
4451                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4452                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4453                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
                             break;
                     case PCIE_SPEED_5_0GT:
4455                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4456                                                    CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
                             break;
                     default:
4458                         adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
                             break;
                     }

4460                 /* platform caps */
                     switch (platform_speed) {
                     case PCI_SPEED_UNKNOWN:
4462                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4463                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                             break;
                     case PCIE_SPEED_16_0GT:
4466                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4467                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4468                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4469                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
                             break;
                     case PCIE_SPEED_8_0GT:
4471                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4472                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4473                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
                             break;
                     case PCIE_SPEED_5_0GT:
4475                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4476                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                             break;
                     default:
4478                         adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
                             break;
                     }
4481         }

             /* lane-width mask already known: nothing left to derive */
             if (adev->pm.pcie_mlw_mask)
                     return;

             /* each width also advertises every narrower width */
             switch (platform_width) {
             case PCIE_LNK_WIDTH_UNKNOWN:
4484                 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
                     break;
4487         case PCIE_LNK_X32:
4488                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4489                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4490                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4491                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4492                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4493                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4494                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4495                 break;
4496         case PCIE_LNK_X16:
4497                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4498                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4499                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4500                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4501                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4502                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4503                 break;
4504         case PCIE_LNK_X12:
4505                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4506                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4507                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4508                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4509                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4510                 break;
4511         case PCIE_LNK_X8:
4512                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4513                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4514                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4515                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4516                 break;
4517         case PCIE_LNK_X4:
4518                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4519                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4520                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4521                 break;
4522         case PCIE_LNK_X2:
4523                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4524                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4525                 break;
4526         case PCIE_LNK_X1:
4527                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4528                 break;
4529         default:
4530                 break;
4531         }
4534 }
4535
     /**
      * amdgpu_device_baco_enter - put the device into BACO
      *
      * @dev: drm device pointer
      *
      * When RAS is supported the nbio doorbell interrupt is disabled first,
      * then amdgpu_dpm_baco_enter() is called.
      * Returns -ENOTSUPP if amdgpu_device_supports_baco() reports no BACO
      * support, otherwise the result of amdgpu_dpm_baco_enter().
      */
4536 int amdgpu_device_baco_enter(struct drm_device *dev)
4537 {
4538         struct amdgpu_device *adev = dev->dev_private;
4539         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4540
4541         if (!amdgpu_device_supports_baco(adev->ddev))
4542                 return -ENOTSUPP;
4543
             /*
              * NOTE(review): the hook is treated as optional here; it is
              * presumably not implemented by every nbio version - confirm.
              */
4544         if (ras && ras->supported &&
                 adev->nbio.funcs &&
                 adev->nbio.funcs->enable_doorbell_interrupt)
4545                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4546
4547         return amdgpu_dpm_baco_enter(adev);
4548 }
4549
     /**
      * amdgpu_device_baco_exit - bring the device out of BACO
      *
      * @dev: drm device pointer
      *
      * Calls amdgpu_dpm_baco_exit() and, only if that succeeded and RAS is
      * supported, re-enables the nbio doorbell interrupt.
      * Returns -ENOTSUPP if amdgpu_device_supports_baco() reports no BACO
      * support, the error from amdgpu_dpm_baco_exit() on failure, 0 otherwise.
      */
4550 int amdgpu_device_baco_exit(struct drm_device *dev)
4551 {
4552         struct amdgpu_device *adev = dev->dev_private;
4553         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4554         int ret = 0;
4555
4556         if (!amdgpu_device_supports_baco(adev->ddev))
4557                 return -ENOTSUPP;
4558
4559         ret = amdgpu_dpm_baco_exit(adev);
4560         if (ret)
4561                 return ret;
4562
             /*
              * NOTE(review): the hook is treated as optional here; it is
              * presumably not implemented by every nbio version - confirm.
              */
4563         if (ras && ras->supported &&
                 adev->nbio.funcs &&
                 adev->nbio.funcs->enable_doorbell_interrupt)
4564                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4565
4566         return 0;
4567 }