drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33 #include <linux/iommu.h>
  34 #include <linux/pci.h>
  35 #include <linux/devcoredump.h>
  36 #include <generated/utsrelease.h>
  37 #include <linux/pci-p2pdma.h>
  38 #include <linux/apple-gmux.h>
  39
  40 #include <drm/drm_aperture.h>
  41 #include <drm/drm_atomic_helper.h>
  42 #include <drm/drm_crtc_helper.h>
  43 #include <drm/drm_fb_helper.h>
  44 #include <drm/drm_probe_helper.h>
  45 #include <drm/amdgpu_drm.h>
  46 #include <linux/vgaarb.h>
  47 #include <linux/vga_switcheroo.h>
  48 #include <linux/efi.h>
  49 #include "amdgpu.h"
  50 #include "amdgpu_trace.h"
  51 #include "amdgpu_i2c.h"
  52 #include "atom.h"
  53 #include "amdgpu_atombios.h"
  54 #include "amdgpu_atomfirmware.h"
  55 #include "amd_pcie.h"
  56 #ifdef CONFIG_DRM_AMDGPU_SI
  57 #include "si.h"
  58 #endif
  59 #ifdef CONFIG_DRM_AMDGPU_CIK
  60 #include "cik.h"
  61 #endif
  62 #include "vi.h"
  63 #include "soc15.h"
  64 #include "nv.h"
  65 #include "bif/bif_4_1_d.h"
  66 #include <linux/firmware.h>
  67 #include "amdgpu_vf_error.h"
  68
  69 #include "amdgpu_amdkfd.h"
  70 #include "amdgpu_pm.h"
  71
  72 #include "amdgpu_xgmi.h"
  73 #include "amdgpu_ras.h"
  74 #include "amdgpu_pmu.h"
  75 #include "amdgpu_fru_eeprom.h"
  76 #include "amdgpu_reset.h"
  77
  78 #include <linux/suspend.h>
  79 #include <drm/task_barrier.h>
  80 #include <linux/pm_runtime.h>
  81
  82 #include <drm/drm_drv.h>
  83
  84 #if IS_ENABLED(CONFIG_X86)
  85 #include <asm/intel-family.h>
  86 #endif
  87
  88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  95
  96 #define AMDGPU_RESUME_MS                2000
  97 #define AMDGPU_MAX_RETRY_LIMIT          2
  98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
  99
 100 static const struct drm_driver amdgpu_kms_driver;
 101
 102 const char *amdgpu_asic_name[] = {
 103         "TAHITI",
 104         "PITCAIRN",
 105         "VERDE",
 106         "OLAND",
 107         "HAINAN",
 108         "BONAIRE",
 109         "KAVERI",
 110         "KABINI",
 111         "HAWAII",
 112         "MULLINS",
 113         "TOPAZ",
 114         "TONGA",
 115         "FIJI",
 116         "CARRIZO",
 117         "STONEY",
 118         "POLARIS10",
 119         "POLARIS11",
 120         "POLARIS12",
 121         "VEGAM",
 122         "VEGA10",
 123         "VEGA12",
 124         "VEGA20",
 125         "RAVEN",
 126         "ARCTURUS",
 127         "RENOIR",
 128         "ALDEBARAN",
 129         "NAVI10",
 130         "CYAN_SKILLFISH",
 131         "NAVI14",
 132         "NAVI12",
 133         "SIENNA_CICHLID",
 134         "NAVY_FLOUNDER",
 135         "VANGOGH",
 136         "DIMGREY_CAVEFISH",
 137         "BEIGE_GOBY",
 138         "YELLOW_CARP",
 139         "IP DISCOVERY",
 140         "LAST",
 141 };
 142
 143 /**
 144  * DOC: pcie_replay_count
 145  *
 146  * The amdgpu driver provides a sysfs API for reporting the total number
 147  * of PCIe replays (NAKs)
 148  * The file pcie_replay_count is used for this and returns the total
 149  * number of replays as a sum of the NAKs generated and NAKs received
 150  */
 151
 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 153                 struct device_attribute *attr, char *buf)
 154 {
 155         struct drm_device *ddev = dev_get_drvdata(dev);
 156         struct amdgpu_device *adev = drm_to_adev(ddev);
 157         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 158
 159         return sysfs_emit(buf, "%llu\n", cnt);
 160 }
 161
 162 static DEVICE_ATTR(pcie_replay_count, 0444,
 163                 amdgpu_device_get_pcie_replay_count, NULL);
 164
 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 166
 167
 168 /**
 169  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 170  *
 171  * @dev: drm_device pointer
 172  *
 173  * Returns true if the device is a dGPU with ATPX power control,
 174  * otherwise return false.
 175  */
 176 bool amdgpu_device_supports_px(struct drm_device *dev)
 177 {
 178         struct amdgpu_device *adev = drm_to_adev(dev);
 179
 180         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
 181                 return true;
 182         return false;
 183 }
 184
 185 /**
 186  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 187  *
 188  * @dev: drm_device pointer
 189  *
 190  * Returns true if the device is a dGPU with ACPI power control,
 191  * otherwise return false.
 192  */
 193 bool amdgpu_device_supports_boco(struct drm_device *dev)
 194 {
 195         struct amdgpu_device *adev = drm_to_adev(dev);
 196
 197         if (adev->has_pr3 ||
 198             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
 199                 return true;
 200         return false;
 201 }
 202
 203 /**
 204  * amdgpu_device_supports_baco - Does the device support BACO
 205  *
 206  * @dev: drm_device pointer
 207  *
 208  * Returns true if the device supporte BACO,
 209  * otherwise return false.
 210  */
 211 bool amdgpu_device_supports_baco(struct drm_device *dev)
 212 {
 213         struct amdgpu_device *adev = drm_to_adev(dev);
 214
 215         return amdgpu_asic_supports_baco(adev);
 216 }
 217
 218 /**
 219  * amdgpu_device_supports_smart_shift - Is the device dGPU with
 220  * smart shift support
 221  *
 222  * @dev: drm_device pointer
 223  *
 224  * Returns true if the device is a dGPU with Smart Shift support,
 225  * otherwise returns false.
 226  */
 227 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
 228 {
 229         return (amdgpu_device_supports_boco(dev) &&
 230                 amdgpu_acpi_is_power_shift_control_supported());
 231 }
 232
 233 /*
 234  * VRAM access helper functions
 235  */
 236
 237 /**
 238  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 239  *
 240  * @adev: amdgpu_device pointer
 241  * @pos: offset of the buffer in vram
 242  * @buf: virtual address of the buffer in system memory
 243  * @size: read/write size, sizeof(@buf) must > @size
 244  * @write: true - write to vram, otherwise - read from vram
 245  */
 246 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 247                              void *buf, size_t size, bool write)
 248 {
 249         unsigned long flags;
 250         uint32_t hi = ~0, tmp = 0;
 251         uint32_t *data = buf;
 252         uint64_t last;
 253         int idx;
 254
 255         if (!drm_dev_enter(adev_to_drm(adev), &idx))
 256                 return;
 257
 258         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
 259
 260         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 261         for (last = pos + size; pos < last; pos += 4) {
 262                 tmp = pos >> 31;
 263
 264                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 265                 if (tmp != hi) {
 266                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 267                         hi = tmp;
 268                 }
 269                 if (write)
 270                         WREG32_NO_KIQ(mmMM_DATA, *data++);
 271                 else
 272                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
 273         }
 274
 275         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 276         drm_dev_exit(idx);
 277 }
 278
 279 /**
 280  * amdgpu_device_aper_access - access vram by vram aperature
 281  *
 282  * @adev: amdgpu_device pointer
 283  * @pos: offset of the buffer in vram
 284  * @buf: virtual address of the buffer in system memory
 285  * @size: read/write size, sizeof(@buf) must > @size
 286  * @write: true - write to vram, otherwise - read from vram
 287  *
 288  * The return value means how many bytes have been transferred.
 289  */
 290 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
 291                                  void *buf, size_t size, bool write)
 292 {
 293 #ifdef CONFIG_64BIT
 294         void __iomem *addr;
 295         size_t count = 0;
 296         uint64_t last;
 297
 298         if (!adev->mman.aper_base_kaddr)
 299                 return 0;
 300
 301         last = min(pos + size, adev->gmc.visible_vram_size);
 302         if (last > pos) {
 303                 addr = adev->mman.aper_base_kaddr + pos;
 304                 count = last - pos;
 305
 306                 if (write) {
 307                         memcpy_toio(addr, buf, count);
 308                         /* Make sure HDP write cache flush happens without any reordering
 309                          * after the system memory contents are sent over PCIe device
 310                          */
 311                         mb();
 312                         amdgpu_device_flush_hdp(adev, NULL);
 313                 } else {
 314                         amdgpu_device_invalidate_hdp(adev, NULL);
 315                         /* Make sure HDP read cache is invalidated before issuing a read
 316                          * to the PCIe device
 317                          */
 318                         mb();
 319                         memcpy_fromio(buf, addr, count);
 320                 }
 321
 322         }
 323
 324         return count;
 325 #else
 326         return 0;
 327 #endif
 328 }
 329
 330 /**
 331  * amdgpu_device_vram_access - read/write a buffer in vram
 332  *
 333  * @adev: amdgpu_device pointer
 334  * @pos: offset of the buffer in vram
 335  * @buf: virtual address of the buffer in system memory
 336  * @size: read/write size, sizeof(@buf) must > @size
 337  * @write: true - write to vram, otherwise - read from vram
 338  */
 339 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 340                                void *buf, size_t size, bool write)
 341 {
 342         size_t count;
 343
 344         /* try to using vram apreature to access vram first */
 345         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 346         size -= count;
 347         if (size) {
 348                 /* using MM to access rest vram */
 349                 pos += count;
 350                 buf += count;
 351                 amdgpu_device_mm_access(adev, pos, buf, size, write);
 352         }
 353 }
 354
 355 /*
 356  * register access helper functions.
 357  */
 358
 359 /* Check if hw access should be skipped because of hotplug or device error */
 360 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
 361 {
 362         if (adev->no_hw_access)
 363                 return true;
 364
 365 #ifdef CONFIG_LOCKDEP
 366         /*
 367          * This is a bit complicated to understand, so worth a comment. What we assert
 368          * here is that the GPU reset is not running on another thread in parallel.
 369          *
 370          * For this we trylock the read side of the reset semaphore, if that succeeds
 371          * we know that the reset is not running in paralell.
 372          *
 373          * If the trylock fails we assert that we are either already holding the read
 374          * side of the lock or are the reset thread itself and hold the write side of
 375          * the lock.
 376          */
 377         if (in_task()) {
 378                 if (down_read_trylock(&adev->reset_domain->sem))
 379                         up_read(&adev->reset_domain->sem);
 380                 else
 381                         lockdep_assert_held(&adev->reset_domain->sem);
 382         }
 383 #endif
 384         return false;
 385 }
 386
 387 /**
 388  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 389  *
 390  * @adev: amdgpu_device pointer
 391  * @reg: dword aligned register offset
 392  * @acc_flags: access flags which require special behavior
 393  *
 394  * Returns the 32 bit value from the offset specified.
 395  */
 396 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 397                             uint32_t reg, uint32_t acc_flags)
 398 {
 399         uint32_t ret;
 400
 401         if (amdgpu_device_skip_hw_access(adev))
 402                 return 0;
 403
 404         if ((reg * 4) < adev->rmmio_size) {
 405                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 406                     amdgpu_sriov_runtime(adev) &&
 407                     down_read_trylock(&adev->reset_domain->sem)) {
 408                         ret = amdgpu_kiq_rreg(adev, reg);
 409                         up_read(&adev->reset_domain->sem);
 410                 } else {
 411                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 412                 }
 413         } else {
 414                 ret = adev->pcie_rreg(adev, reg * 4);
 415         }
 416
 417         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 418
 419         return ret;
 420 }
 421
 422 /*
 423  * MMIO register read with bytes helper functions
 424  * @offset:bytes offset from MMIO start
 425  */
 426
 427 /**
 428  * amdgpu_mm_rreg8 - read a memory mapped IO register
 429  *
 430  * @adev: amdgpu_device pointer
 431  * @offset: byte aligned register offset
 432  *
 433  * Returns the 8 bit value from the offset specified.
 434  */
 435 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 436 {
 437         if (amdgpu_device_skip_hw_access(adev))
 438                 return 0;
 439
 440         if (offset < adev->rmmio_size)
 441                 return (readb(adev->rmmio + offset));
 442         BUG();
 443 }
 444
 445 /*
 446  * MMIO register write with bytes helper functions
 447  * @offset:bytes offset from MMIO start
 448  * @value: the value want to be written to the register
 449  */
 450
 451 /**
 452  * amdgpu_mm_wreg8 - read a memory mapped IO register
 453  *
 454  * @adev: amdgpu_device pointer
 455  * @offset: byte aligned register offset
 456  * @value: 8 bit value to write
 457  *
 458  * Writes the value specified to the offset specified.
 459  */
 460 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 461 {
 462         if (amdgpu_device_skip_hw_access(adev))
 463                 return;
 464
 465         if (offset < adev->rmmio_size)
 466                 writeb(value, adev->rmmio + offset);
 467         else
 468                 BUG();
 469 }
 470
 471 /**
 472  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 473  *
 474  * @adev: amdgpu_device pointer
 475  * @reg: dword aligned register offset
 476  * @v: 32 bit value to write to the register
 477  * @acc_flags: access flags which require special behavior
 478  *
 479  * Writes the value specified to the offset specified.
 480  */
 481 void amdgpu_device_wreg(struct amdgpu_device *adev,
 482                         uint32_t reg, uint32_t v,
 483                         uint32_t acc_flags)
 484 {
 485         if (amdgpu_device_skip_hw_access(adev))
 486                 return;
 487
 488         if ((reg * 4) < adev->rmmio_size) {
 489                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 490                     amdgpu_sriov_runtime(adev) &&
 491                     down_read_trylock(&adev->reset_domain->sem)) {
 492                         amdgpu_kiq_wreg(adev, reg, v);
 493                         up_read(&adev->reset_domain->sem);
 494                 } else {
 495                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 496                 }
 497         } else {
 498                 adev->pcie_wreg(adev, reg * 4, v);
 499         }
 500
 501         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 502 }
 503
 504 /**
 505  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 506  *
 507  * @adev: amdgpu_device pointer
 508  * @reg: mmio/rlc register
 509  * @v: value to write
 510  *
 511  * this function is invoked only for the debugfs register access
 512  */
 513 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 514                              uint32_t reg, uint32_t v,
 515                              uint32_t xcc_id)
 516 {
 517         if (amdgpu_device_skip_hw_access(adev))
 518                 return;
 519
 520         if (amdgpu_sriov_fullaccess(adev) &&
 521             adev->gfx.rlc.funcs &&
 522             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 523                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 524                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
 525         } else if ((reg * 4) >= adev->rmmio_size) {
 526                 adev->pcie_wreg(adev, reg * 4, v);
 527         } else {
 528                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 529         }
 530 }
 531
 532 /**
 533  * amdgpu_device_indirect_rreg - read an indirect register
 534  *
 535  * @adev: amdgpu_device pointer
 536  * @reg_addr: indirect register address to read from
 537  *
 538  * Returns the value of indirect register @reg_addr
 539  */
 540 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 541                                 u32 reg_addr)
 542 {
 543         unsigned long flags, pcie_index, pcie_data;
 544         void __iomem *pcie_index_offset;
 545         void __iomem *pcie_data_offset;
 546         u32 r;
 547
 548         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 549         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 550
 551         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 552         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 553         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 554
 555         writel(reg_addr, pcie_index_offset);
 556         readl(pcie_index_offset);
 557         r = readl(pcie_data_offset);
 558         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 559
 560         return r;
 561 }
 562
 563 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
 564                                     u64 reg_addr)
 565 {
 566         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 567         u32 r;
 568         void __iomem *pcie_index_offset;
 569         void __iomem *pcie_index_hi_offset;
 570         void __iomem *pcie_data_offset;
 571
 572         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 573         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 574         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 575                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 576         else
 577                 pcie_index_hi = 0;
 578
 579         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 580         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 581         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 582         if (pcie_index_hi != 0)
 583                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 584                                 pcie_index_hi * 4;
 585
 586         writel(reg_addr, pcie_index_offset);
 587         readl(pcie_index_offset);
 588         if (pcie_index_hi != 0) {
 589                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 590                 readl(pcie_index_hi_offset);
 591         }
 592         r = readl(pcie_data_offset);
 593
 594         /* clear the high bits */
 595         if (pcie_index_hi != 0) {
 596                 writel(0, pcie_index_hi_offset);
 597                 readl(pcie_index_hi_offset);
 598         }
 599
 600         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 601
 602         return r;
 603 }
 604
 605 /**
 606  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 607  *
 608  * @adev: amdgpu_device pointer
 609  * @reg_addr: indirect register address to read from
 610  *
 611  * Returns the value of indirect register @reg_addr
 612  */
 613 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 614                                   u32 reg_addr)
 615 {
 616         unsigned long flags, pcie_index, pcie_data;
 617         void __iomem *pcie_index_offset;
 618         void __iomem *pcie_data_offset;
 619         u64 r;
 620
 621         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 622         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 623
 624         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 625         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 626         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 627
 628         /* read low 32 bits */
 629         writel(reg_addr, pcie_index_offset);
 630         readl(pcie_index_offset);
 631         r = readl(pcie_data_offset);
 632         /* read high 32 bits */
 633         writel(reg_addr + 4, pcie_index_offset);
 634         readl(pcie_index_offset);
 635         r |= ((u64)readl(pcie_data_offset) << 32);
 636         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 637
 638         return r;
 639 }
 640
 641 /**
 642  * amdgpu_device_indirect_wreg - write an indirect register address
 643  *
 644  * @adev: amdgpu_device pointer
 645  * @reg_addr: indirect register offset
 646  * @reg_data: indirect register data
 647  *
 648  */
 649 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 650                                  u32 reg_addr, u32 reg_data)
 651 {
 652         unsigned long flags, pcie_index, pcie_data;
 653         void __iomem *pcie_index_offset;
 654         void __iomem *pcie_data_offset;
 655
 656         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 657         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 658
 659         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 660         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 661         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 662
 663         writel(reg_addr, pcie_index_offset);
 664         readl(pcie_index_offset);
 665         writel(reg_data, pcie_data_offset);
 666         readl(pcie_data_offset);
 667         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 668 }
 669
 670 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
 671                                      u64 reg_addr, u32 reg_data)
 672 {
 673         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 674         void __iomem *pcie_index_offset;
 675         void __iomem *pcie_index_hi_offset;
 676         void __iomem *pcie_data_offset;
 677
 678         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 679         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 680         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 681                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 682         else
 683                 pcie_index_hi = 0;
 684
 685         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 686         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 687         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 688         if (pcie_index_hi != 0)
 689                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 690                                 pcie_index_hi * 4;
 691
 692         writel(reg_addr, pcie_index_offset);
 693         readl(pcie_index_offset);
 694         if (pcie_index_hi != 0) {
 695                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 696                 readl(pcie_index_hi_offset);
 697         }
 698         writel(reg_data, pcie_data_offset);
 699         readl(pcie_data_offset);
 700
 701         /* clear the high bits */
 702         if (pcie_index_hi != 0) {
 703                 writel(0, pcie_index_hi_offset);
 704                 readl(pcie_index_hi_offset);
 705         }
 706
 707         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 708 }
 709
 710 /**
 711  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 712  *
 713  * @adev: amdgpu_device pointer
 714  * @reg_addr: indirect register offset
 715  * @reg_data: indirect register data
 716  *
 717  */
 718 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 719                                    u32 reg_addr, u64 reg_data)
 720 {
 721         unsigned long flags, pcie_index, pcie_data;
 722         void __iomem *pcie_index_offset;
 723         void __iomem *pcie_data_offset;
 724
 725         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 726         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 727
 728         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 729         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 730         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 731
 732         /* write low 32 bits */
 733         writel(reg_addr, pcie_index_offset);
 734         readl(pcie_index_offset);
 735         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 736         readl(pcie_data_offset);
 737         /* write high 32 bits */
 738         writel(reg_addr + 4, pcie_index_offset);
 739         readl(pcie_index_offset);
 740         writel((u32)(reg_data >> 32), pcie_data_offset);
 741         readl(pcie_data_offset);
 742         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 743 }
 744
 745 /**
 746  * amdgpu_device_get_rev_id - query device rev_id
 747  *
 748  * @adev: amdgpu_device pointer
 749  *
 750  * Return device rev_id
 751  */
 752 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
 753 {
 754         return adev->nbio.funcs->get_rev_id(adev);
 755 }
 756
 757 /**
 758  * amdgpu_invalid_rreg - dummy reg read function
 759  *
 760  * @adev: amdgpu_device pointer
 761  * @reg: offset of register
 762  *
 763  * Dummy register read function.  Used for register blocks
 764  * that certain asics don't have (all asics).
 765  * Returns the value in the register.
 766  */
 767 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 768 {
 769         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 770         BUG();
 771         return 0;
 772 }
 773
 774 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
 775 {
 776         DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
 777         BUG();
 778         return 0;
 779 }
 780
 781 /**
 782  * amdgpu_invalid_wreg - dummy reg write function
 783  *
 784  * @adev: amdgpu_device pointer
 785  * @reg: offset of register
 786  * @v: value to write to the register
 787  *
 788  * Dummy register read function.  Used for register blocks
 789  * that certain asics don't have (all asics).
 790  */
 791 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 792 {
 793         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 794                   reg, v);
 795         BUG();
 796 }
 797
 798 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
 799 {
 800         DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
 801                   reg, v);
 802         BUG();
 803 }
 804
 805 /**
 806  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 807  *
 808  * @adev: amdgpu_device pointer
 809  * @reg: offset of register
 810  *
 811  * Dummy register read function.  Used for register blocks
 812  * that certain asics don't have (all asics).
 813  * Returns the value in the register.
 814  */
 815 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 816 {
 817         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 818         BUG();
 819         return 0;
 820 }
 821
 822 /**
 823  * amdgpu_invalid_wreg64 - dummy reg write function
 824  *
 825  * @adev: amdgpu_device pointer
 826  * @reg: offset of register
 827  * @v: value to write to the register
 828  *
 829  * Dummy register read function.  Used for register blocks
 830  * that certain asics don't have (all asics).
 831  */
 832 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 833 {
 834         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 835                   reg, v);
 836         BUG();
 837 }
 838
 839 /**
 840  * amdgpu_block_invalid_rreg - dummy reg read function
 841  *
 842  * @adev: amdgpu_device pointer
 843  * @block: offset of instance
 844  * @reg: offset of register
 845  *
 846  * Dummy register read function.  Used for register blocks
 847  * that certain asics don't have (all asics).
 848  * Returns the value in the register.
 849  */
 850 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 851                                           uint32_t block, uint32_t reg)
 852 {
 853         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 854                   reg, block);
 855         BUG();
 856         return 0;
 857 }
 858
 859 /**
 860  * amdgpu_block_invalid_wreg - dummy reg write function
 861  *
 862  * @adev: amdgpu_device pointer
 863  * @block: offset of instance
 864  * @reg: offset of register
 865  * @v: value to write to the register
 866  *
 867  * Dummy register read function.  Used for register blocks
 868  * that certain asics don't have (all asics).
 869  */
 870 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 871                                       uint32_t block,
 872                                       uint32_t reg, uint32_t v)
 873 {
 874         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 875                   reg, block, v);
 876         BUG();
 877 }
 878
 879 /**
 880  * amdgpu_device_asic_init - Wrapper for atom asic_init
 881  *
 882  * @adev: amdgpu_device pointer
 883  *
 884  * Does any asic specific work and then calls atom asic init.
 885  */
 886 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 887 {
 888         int ret;
 889
 890         amdgpu_asic_pre_asic_init(adev);
 891
 892         if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
 893             adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
 894                 amdgpu_psp_wait_for_bootloader(adev);
 895                 ret = amdgpu_atomfirmware_asic_init(adev, true);
 896                 return ret;
 897         } else {
 898                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 899         }
 900
 901         return 0;
 902 }
 903
 904 /**
 905  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 906  *
 907  * @adev: amdgpu_device pointer
 908  *
 909  * Allocates a scratch page of VRAM for use by various things in the
 910  * driver.
 911  */
 912 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
 913 {
 914         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
 915                                        AMDGPU_GEM_DOMAIN_VRAM |
 916                                        AMDGPU_GEM_DOMAIN_GTT,
 917                                        &adev->mem_scratch.robj,
 918                                        &adev->mem_scratch.gpu_addr,
 919                                        (void **)&adev->mem_scratch.ptr);
 920 }
 921
 922 /**
 923  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 924  *
 925  * @adev: amdgpu_device pointer
 926  *
 927  * Frees the VRAM scratch page.
 928  */
 929 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
 930 {
 931         amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
 932 }
 933
 934 /**
 935  * amdgpu_device_program_register_sequence - program an array of registers.
 936  *
 937  * @adev: amdgpu_device pointer
 938  * @registers: pointer to the register array
 939  * @array_size: size of the register array
 940  *
 941  * Programs an array or registers with and or masks.
 942  * This is a helper for setting golden registers.
 943  */
 944 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 945                                              const u32 *registers,
 946                                              const u32 array_size)
 947 {
 948         u32 tmp, reg, and_mask, or_mask;
 949         int i;
 950
 951         if (array_size % 3)
 952                 return;
 953
 954         for (i = 0; i < array_size; i += 3) {
 955                 reg = registers[i + 0];
 956                 and_mask = registers[i + 1];
 957                 or_mask = registers[i + 2];
 958
 959                 if (and_mask == 0xffffffff) {
 960                         tmp = or_mask;
 961                 } else {
 962                         tmp = RREG32(reg);
 963                         tmp &= ~and_mask;
 964                         if (adev->family >= AMDGPU_FAMILY_AI)
 965                                 tmp |= (or_mask & and_mask);
 966                         else
 967                                 tmp |= or_mask;
 968                 }
 969                 WREG32(reg, tmp);
 970         }
 971 }
 972
 973 /**
 974  * amdgpu_device_pci_config_reset - reset the GPU
 975  *
 976  * @adev: amdgpu_device pointer
 977  *
 978  * Resets the GPU using the pci config reset sequence.
 979  * Only applicable to asics prior to vega10.
 980  */
 981 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 982 {
 983         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 984 }
 985
 986 /**
 987  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 988  *
 989  * @adev: amdgpu_device pointer
 990  *
 991  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 992  */
 993 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
 994 {
 995         return pci_reset_function(adev->pdev);
 996 }
 997
 998 /*
 999  * amdgpu_device_wb_*()
1000  * Writeback is the method by which the GPU updates special pages in memory
1001  * with the status of certain GPU events (fences, ring pointers,etc.).
1002  */
1003
1004 /**
1005  * amdgpu_device_wb_fini - Disable Writeback and free memory
1006  *
1007  * @adev: amdgpu_device pointer
1008  *
1009  * Disables Writeback and frees the Writeback memory (all asics).
1010  * Used at driver shutdown.
1011  */
1012 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1013 {
1014         if (adev->wb.wb_obj) {
1015                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1016                                       &adev->wb.gpu_addr,
1017                                       (void **)&adev->wb.wb);
1018                 adev->wb.wb_obj = NULL;
1019         }
1020 }
1021
1022 /**
1023  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1024  *
1025  * @adev: amdgpu_device pointer
1026  *
1027  * Initializes writeback and allocates writeback memory (all asics).
1028  * Used at driver startup.
1029  * Returns 0 on success or an -error on failure.
1030  */
1031 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1032 {
1033         int r;
1034
1035         if (adev->wb.wb_obj == NULL) {
1036                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1037                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1038                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1039                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1040                                             (void **)&adev->wb.wb);
1041                 if (r) {
1042                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1043                         return r;
1044                 }
1045
1046                 adev->wb.num_wb = AMDGPU_MAX_WB;
1047                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1048
1049                 /* clear wb memory */
1050                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1051         }
1052
1053         return 0;
1054 }
1055
1056 /**
1057  * amdgpu_device_wb_get - Allocate a wb entry
1058  *
1059  * @adev: amdgpu_device pointer
1060  * @wb: wb index
1061  *
1062  * Allocate a wb slot for use by the driver (all asics).
1063  * Returns 0 on success or -EINVAL on failure.
1064  */
1065 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1066 {
1067         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1068
1069         if (offset < adev->wb.num_wb) {
1070                 __set_bit(offset, adev->wb.used);
1071                 *wb = offset << 3; /* convert to dw offset */
1072                 return 0;
1073         } else {
1074                 return -EINVAL;
1075         }
1076 }
1077
1078 /**
1079  * amdgpu_device_wb_free - Free a wb entry
1080  *
1081  * @adev: amdgpu_device pointer
1082  * @wb: wb index
1083  *
1084  * Free a wb slot allocated for use by the driver (all asics)
1085  */
1086 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1087 {
1088         wb >>= 3;
1089         if (wb < adev->wb.num_wb)
1090                 __clear_bit(wb, adev->wb.used);
1091 }
1092
1093 /**
1094  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1095  *
1096  * @adev: amdgpu_device pointer
1097  *
1098  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1099  * to fail, but if any of the BARs is not accessible after the size we abort
1100  * driver loading by returning -ENODEV.
1101  */
1102 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1103 {
1104         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1105         struct pci_bus *root;
1106         struct resource *res;
1107         unsigned int i;
1108         u16 cmd;
1109         int r;
1110
1111         if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1112                 return 0;
1113
1114         /* Bypass for VF */
1115         if (amdgpu_sriov_vf(adev))
1116                 return 0;
1117
1118         /* skip if the bios has already enabled large BAR */
1119         if (adev->gmc.real_vram_size &&
1120             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1121                 return 0;
1122
1123         /* Check if the root BUS has 64bit memory resources */
1124         root = adev->pdev->bus;
1125         while (root->parent)
1126                 root = root->parent;
1127
1128         pci_bus_for_each_resource(root, res, i) {
1129                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1130                     res->start > 0x100000000ull)
1131                         break;
1132         }
1133
1134         /* Trying to resize is pointless without a root hub window above 4GB */
1135         if (!res)
1136                 return 0;
1137
1138         /* Limit the BAR size to what is available */
1139         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1140                         rbar_size);
1141
1142         /* Disable memory decoding while we change the BAR addresses and size */
1143         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1144         pci_write_config_word(adev->pdev, PCI_COMMAND,
1145                               cmd & ~PCI_COMMAND_MEMORY);
1146
1147         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1148         amdgpu_doorbell_fini(adev);
1149         if (adev->asic_type >= CHIP_BONAIRE)
1150                 pci_release_resource(adev->pdev, 2);
1151
1152         pci_release_resource(adev->pdev, 0);
1153
1154         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1155         if (r == -ENOSPC)
1156                 DRM_INFO("Not enough PCI address space for a large BAR.");
1157         else if (r && r != -ENOTSUPP)
1158                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1159
1160         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1161
1162         /* When the doorbell or fb BAR isn't available we have no chance of
1163          * using the device.
1164          */
1165         r = amdgpu_doorbell_init(adev);
1166         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1167                 return -ENODEV;
1168
1169         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1170
1171         return 0;
1172 }
1173
1174 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1175 {
1176         if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1177                 return false;
1178
1179         return true;
1180 }
1181
1182 /*
1183  * GPU helpers function.
1184  */
1185 /**
1186  * amdgpu_device_need_post - check if the hw need post or not
1187  *
1188  * @adev: amdgpu_device pointer
1189  *
1190  * Check if the asic has been initialized (all asics) at driver startup
1191  * or post is needed if  hw reset is performed.
1192  * Returns true if need or false if not.
1193  */
1194 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1195 {
1196         uint32_t reg;
1197
1198         if (amdgpu_sriov_vf(adev))
1199                 return false;
1200
1201         if (!amdgpu_device_read_bios(adev))
1202                 return false;
1203
1204         if (amdgpu_passthrough(adev)) {
1205                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1206                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1207                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1208                  * vpost executed for smc version below 22.15
1209                  */
1210                 if (adev->asic_type == CHIP_FIJI) {
1211                         int err;
1212                         uint32_t fw_ver;
1213
1214                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1215                         /* force vPost if error occured */
1216                         if (err)
1217                                 return true;
1218
1219                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1220                         if (fw_ver < 0x00160e00)
1221                                 return true;
1222                 }
1223         }
1224
1225         /* Don't post if we need to reset whole hive on init */
1226         if (adev->gmc.xgmi.pending_reset)
1227                 return false;
1228
1229         if (adev->has_hw_reset) {
1230                 adev->has_hw_reset = false;
1231                 return true;
1232         }
1233
1234         /* bios scratch used on CIK+ */
1235         if (adev->asic_type >= CHIP_BONAIRE)
1236                 return amdgpu_atombios_scratch_need_asic_init(adev);
1237
1238         /* check MEM_SIZE for older asics */
1239         reg = amdgpu_asic_get_config_memsize(adev);
1240
1241         if ((reg != 0) && (reg != 0xffffffff))
1242                 return false;
1243
1244         return true;
1245 }
1246
1247 /*
1248  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1249  * speed switching. Until we have confirmation from Intel that a specific host
1250  * supports it, it's safer that we keep it disabled for all.
1251  *
1252  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1253  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1254  */
1255 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1256 {
1257 #if IS_ENABLED(CONFIG_X86)
1258         struct cpuinfo_x86 *c = &cpu_data(0);
1259
1260         if (c->x86_vendor == X86_VENDOR_INTEL)
1261                 return false;
1262 #endif
1263         return true;
1264 }
1265
1266 /**
1267  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1268  *
1269  * @adev: amdgpu_device pointer
1270  *
1271  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1272  * be set for this device.
1273  *
1274  * Returns true if it should be used or false if not.
1275  */
1276 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1277 {
1278         switch (amdgpu_aspm) {
1279         case -1:
1280                 break;
1281         case 0:
1282                 return false;
1283         case 1:
1284                 return true;
1285         default:
1286                 return false;
1287         }
1288         return pcie_aspm_enabled(adev->pdev);
1289 }
1290
1291 bool amdgpu_device_aspm_support_quirk(void)
1292 {
1293 #if IS_ENABLED(CONFIG_X86)
1294         struct cpuinfo_x86 *c = &cpu_data(0);
1295
1296         return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1297 #else
1298         return true;
1299 #endif
1300 }
1301
1302 /* if we get transitioned to only one device, take VGA back */
1303 /**
1304  * amdgpu_device_vga_set_decode - enable/disable vga decode
1305  *
1306  * @pdev: PCI device pointer
1307  * @state: enable/disable vga decode
1308  *
1309  * Enable/disable vga decode (all asics).
1310  * Returns VGA resource flags.
1311  */
1312 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1313                 bool state)
1314 {
1315         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1316
1317         amdgpu_asic_set_vga_state(adev, state);
1318         if (state)
1319                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1320                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1321         else
1322                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1323 }
1324
1325 /**
1326  * amdgpu_device_check_block_size - validate the vm block size
1327  *
1328  * @adev: amdgpu_device pointer
1329  *
1330  * Validates the vm block size specified via module parameter.
1331  * The vm block size defines number of bits in page table versus page directory,
1332  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1333  * page table and the remaining bits are in the page directory.
1334  */
1335 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1336 {
1337         /* defines number of bits in page table versus page directory,
1338          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1339          * page table and the remaining bits are in the page directory
1340          */
1341         if (amdgpu_vm_block_size == -1)
1342                 return;
1343
1344         if (amdgpu_vm_block_size < 9) {
1345                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1346                          amdgpu_vm_block_size);
1347                 amdgpu_vm_block_size = -1;
1348         }
1349 }
1350
1351 /**
1352  * amdgpu_device_check_vm_size - validate the vm size
1353  *
1354  * @adev: amdgpu_device pointer
1355  *
1356  * Validates the vm size in GB specified via module parameter.
1357  * The VM size is the size of the GPU virtual memory space in GB.
1358  */
1359 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1360 {
1361         /* no need to check the default value */
1362         if (amdgpu_vm_size == -1)
1363                 return;
1364
1365         if (amdgpu_vm_size < 1) {
1366                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1367                          amdgpu_vm_size);
1368                 amdgpu_vm_size = -1;
1369         }
1370 }
1371
1372 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1373 {
1374         struct sysinfo si;
1375         bool is_os_64 = (sizeof(void *) == 8);
1376         uint64_t total_memory;
1377         uint64_t dram_size_seven_GB = 0x1B8000000;
1378         uint64_t dram_size_three_GB = 0xB8000000;
1379
1380         if (amdgpu_smu_memory_pool_size == 0)
1381                 return;
1382
1383         if (!is_os_64) {
1384                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1385                 goto def_value;
1386         }
1387         si_meminfo(&si);
1388         total_memory = (uint64_t)si.totalram * si.mem_unit;
1389
1390         if ((amdgpu_smu_memory_pool_size == 1) ||
1391                 (amdgpu_smu_memory_pool_size == 2)) {
1392                 if (total_memory < dram_size_three_GB)
1393                         goto def_value1;
1394         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1395                 (amdgpu_smu_memory_pool_size == 8)) {
1396                 if (total_memory < dram_size_seven_GB)
1397                         goto def_value1;
1398         } else {
1399                 DRM_WARN("Smu memory pool size not supported\n");
1400                 goto def_value;
1401         }
1402         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1403
1404         return;
1405
1406 def_value1:
1407         DRM_WARN("No enough system memory\n");
1408 def_value:
1409         adev->pm.smu_prv_buffer_size = 0;
1410 }
1411
1412 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1413 {
1414         if (!(adev->flags & AMD_IS_APU) ||
1415             adev->asic_type < CHIP_RAVEN)
1416                 return 0;
1417
1418         switch (adev->asic_type) {
1419         case CHIP_RAVEN:
1420                 if (adev->pdev->device == 0x15dd)
1421                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1422                 if (adev->pdev->device == 0x15d8)
1423                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1424                 break;
1425         case CHIP_RENOIR:
1426                 if ((adev->pdev->device == 0x1636) ||
1427                     (adev->pdev->device == 0x164c))
1428                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1429                 else
1430                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1431                 break;
1432         case CHIP_VANGOGH:
1433                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1434                 break;
1435         case CHIP_YELLOW_CARP:
1436                 break;
1437         case CHIP_CYAN_SKILLFISH:
1438                 if ((adev->pdev->device == 0x13FE) ||
1439                     (adev->pdev->device == 0x143F))
1440                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1441                 break;
1442         default:
1443                 break;
1444         }
1445
1446         return 0;
1447 }
1448
1449 /**
1450  * amdgpu_device_check_arguments - validate module params
1451  *
1452  * @adev: amdgpu_device pointer
1453  *
1454  * Validates certain module parameters and updates
1455  * the associated values used by the driver (all asics).
1456  */
1457 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1458 {
1459         if (amdgpu_sched_jobs < 4) {
1460                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1461                          amdgpu_sched_jobs);
1462                 amdgpu_sched_jobs = 4;
1463         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1464                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1465                          amdgpu_sched_jobs);
1466                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1467         }
1468
1469         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1470                 /* gart size must be greater or equal to 32M */
1471                 dev_warn(adev->dev, "gart size (%d) too small\n",
1472                          amdgpu_gart_size);
1473                 amdgpu_gart_size = -1;
1474         }
1475
1476         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1477                 /* gtt size must be greater or equal to 32M */
1478                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1479                                  amdgpu_gtt_size);
1480                 amdgpu_gtt_size = -1;
1481         }
1482
1483         /* valid range is between 4 and 9 inclusive */
1484         if (amdgpu_vm_fragment_size != -1 &&
1485             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1486                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1487                 amdgpu_vm_fragment_size = -1;
1488         }
1489
1490         if (amdgpu_sched_hw_submission < 2) {
1491                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1492                          amdgpu_sched_hw_submission);
1493                 amdgpu_sched_hw_submission = 2;
1494         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1495                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1496                          amdgpu_sched_hw_submission);
1497                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1498         }
1499
1500         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1501                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1502                 amdgpu_reset_method = -1;
1503         }
1504
1505         amdgpu_device_check_smu_prv_buffer_size(adev);
1506
1507         amdgpu_device_check_vm_size(adev);
1508
1509         amdgpu_device_check_block_size(adev);
1510
1511         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1512
1513         return 0;
1514 }
1515
1516 /**
1517  * amdgpu_switcheroo_set_state - set switcheroo state
1518  *
1519  * @pdev: pci dev pointer
1520  * @state: vga_switcheroo state
1521  *
1522  * Callback for the switcheroo driver.  Suspends or resumes
1523  * the asics before or after it is powered up using ACPI methods.
1524  */
1525 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1526                                         enum vga_switcheroo_state state)
1527 {
1528         struct drm_device *dev = pci_get_drvdata(pdev);
1529         int r;
1530
1531         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1532                 return;
1533
1534         if (state == VGA_SWITCHEROO_ON) {
1535                 pr_info("switched on\n");
1536                 /* don't suspend or resume card normally */
1537                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1538
1539                 pci_set_power_state(pdev, PCI_D0);
1540                 amdgpu_device_load_pci_state(pdev);
1541                 r = pci_enable_device(pdev);
1542                 if (r)
1543                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1544                 amdgpu_device_resume(dev, true);
1545
1546                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1547         } else {
1548                 pr_info("switched off\n");
1549                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1550                 amdgpu_device_suspend(dev, true);
1551                 amdgpu_device_cache_pci_state(pdev);
1552                 /* Shut down the device */
1553                 pci_disable_device(pdev);
1554                 pci_set_power_state(pdev, PCI_D3cold);
1555                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1556         }
1557 }
1558
1559 /**
1560  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1561  *
1562  * @pdev: pci dev pointer
1563  *
1564  * Callback for the switcheroo driver.  Check of the switcheroo
1565  * state can be changed.
1566  * Returns true if the state can be changed, false if not.
1567  */
1568 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1569 {
1570         struct drm_device *dev = pci_get_drvdata(pdev);
1571
1572        /*
1573         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1574         * locking inversion with the driver load path. And the access here is
1575         * completely racy anyway. So don't bother with locking for now.
1576         */
1577         return atomic_read(&dev->open_count) == 0;
1578 }
1579
1580 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1581         .set_gpu_state = amdgpu_switcheroo_set_state,
1582         .reprobe = NULL,
1583         .can_switch = amdgpu_switcheroo_can_switch,
1584 };
1585
1586 /**
1587  * amdgpu_device_ip_set_clockgating_state - set the CG state
1588  *
1589  * @dev: amdgpu_device pointer
1590  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1591  * @state: clockgating state (gate or ungate)
1592  *
1593  * Sets the requested clockgating state for all instances of
1594  * the hardware IP specified.
1595  * Returns the error code from the last instance.
1596  */
1597 int amdgpu_device_ip_set_clockgating_state(void *dev,
1598                                            enum amd_ip_block_type block_type,
1599                                            enum amd_clockgating_state state)
1600 {
1601         struct amdgpu_device *adev = dev;
1602         int i, r = 0;
1603
1604         for (i = 0; i < adev->num_ip_blocks; i++) {
1605                 if (!adev->ip_blocks[i].status.valid)
1606                         continue;
1607                 if (adev->ip_blocks[i].version->type != block_type)
1608                         continue;
1609                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1610                         continue;
1611                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1612                         (void *)adev, state);
1613                 if (r)
1614                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1615                                   adev->ip_blocks[i].version->funcs->name, r);
1616         }
1617         return r;
1618 }
1619
1620 /**
1621  * amdgpu_device_ip_set_powergating_state - set the PG state
1622  *
1623  * @dev: amdgpu_device pointer
1624  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1625  * @state: powergating state (gate or ungate)
1626  *
1627  * Sets the requested powergating state for all instances of
1628  * the hardware IP specified.
1629  * Returns the error code from the last instance.
1630  */
1631 int amdgpu_device_ip_set_powergating_state(void *dev,
1632                                            enum amd_ip_block_type block_type,
1633                                            enum amd_powergating_state state)
1634 {
1635         struct amdgpu_device *adev = dev;
1636         int i, r = 0;
1637
1638         for (i = 0; i < adev->num_ip_blocks; i++) {
1639                 if (!adev->ip_blocks[i].status.valid)
1640                         continue;
1641                 if (adev->ip_blocks[i].version->type != block_type)
1642                         continue;
1643                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1644                         continue;
1645                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1646                         (void *)adev, state);
1647                 if (r)
1648                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1649                                   adev->ip_blocks[i].version->funcs->name, r);
1650         }
1651         return r;
1652 }
1653
1654 /**
1655  * amdgpu_device_ip_get_clockgating_state - get the CG state
1656  *
1657  * @adev: amdgpu_device pointer
1658  * @flags: clockgating feature flags
1659  *
1660  * Walks the list of IPs on the device and updates the clockgating
1661  * flags for each IP.
1662  * Updates @flags with the feature flags for each hardware IP where
1663  * clockgating is enabled.
1664  */
1665 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1666                                             u64 *flags)
1667 {
1668         int i;
1669
1670         for (i = 0; i < adev->num_ip_blocks; i++) {
1671                 if (!adev->ip_blocks[i].status.valid)
1672                         continue;
1673                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1674                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1675         }
1676 }
1677
1678 /**
1679  * amdgpu_device_ip_wait_for_idle - wait for idle
1680  *
1681  * @adev: amdgpu_device pointer
1682  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1683  *
1684  * Waits for the request hardware IP to be idle.
1685  * Returns 0 for success or a negative error code on failure.
1686  */
1687 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1688                                    enum amd_ip_block_type block_type)
1689 {
1690         int i, r;
1691
1692         for (i = 0; i < adev->num_ip_blocks; i++) {
1693                 if (!adev->ip_blocks[i].status.valid)
1694                         continue;
1695                 if (adev->ip_blocks[i].version->type == block_type) {
1696                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1697                         if (r)
1698                                 return r;
1699                         break;
1700                 }
1701         }
1702         return 0;
1703
1704 }
1705
1706 /**
1707  * amdgpu_device_ip_is_idle - is the hardware IP idle
1708  *
1709  * @adev: amdgpu_device pointer
1710  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1711  *
1712  * Check if the hardware IP is idle or not.
1713  * Returns true if it the IP is idle, false if not.
1714  */
1715 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1716                               enum amd_ip_block_type block_type)
1717 {
1718         int i;
1719
1720         for (i = 0; i < adev->num_ip_blocks; i++) {
1721                 if (!adev->ip_blocks[i].status.valid)
1722                         continue;
1723                 if (adev->ip_blocks[i].version->type == block_type)
1724                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1725         }
1726         return true;
1727
1728 }
1729
1730 /**
1731  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1732  *
1733  * @adev: amdgpu_device pointer
1734  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1735  *
1736  * Returns a pointer to the hardware IP block structure
1737  * if it exists for the asic, otherwise NULL.
1738  */
1739 struct amdgpu_ip_block *
1740 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1741                               enum amd_ip_block_type type)
1742 {
1743         int i;
1744
1745         for (i = 0; i < adev->num_ip_blocks; i++)
1746                 if (adev->ip_blocks[i].version->type == type)
1747                         return &adev->ip_blocks[i];
1748
1749         return NULL;
1750 }
1751
1752 /**
1753  * amdgpu_device_ip_block_version_cmp
1754  *
1755  * @adev: amdgpu_device pointer
1756  * @type: enum amd_ip_block_type
1757  * @major: major version
1758  * @minor: minor version
1759  *
1760  * return 0 if equal or greater
1761  * return 1 if smaller or the ip_block doesn't exist
1762  */
1763 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1764                                        enum amd_ip_block_type type,
1765                                        u32 major, u32 minor)
1766 {
1767         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1768
1769         if (ip_block && ((ip_block->version->major > major) ||
1770                         ((ip_block->version->major == major) &&
1771                         (ip_block->version->minor >= minor))))
1772                 return 0;
1773
1774         return 1;
1775 }
1776
1777 /**
1778  * amdgpu_device_ip_block_add
1779  *
1780  * @adev: amdgpu_device pointer
1781  * @ip_block_version: pointer to the IP to add
1782  *
1783  * Adds the IP block driver information to the collection of IPs
1784  * on the asic.
1785  */
1786 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1787                                const struct amdgpu_ip_block_version *ip_block_version)
1788 {
1789         if (!ip_block_version)
1790                 return -EINVAL;
1791
1792         switch (ip_block_version->type) {
1793         case AMD_IP_BLOCK_TYPE_VCN:
1794                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1795                         return 0;
1796                 break;
1797         case AMD_IP_BLOCK_TYPE_JPEG:
1798                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1799                         return 0;
1800                 break;
1801         default:
1802                 break;
1803         }
1804
1805         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1806                   ip_block_version->funcs->name);
1807
1808         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1809
1810         return 0;
1811 }
1812
1813 /**
1814  * amdgpu_device_enable_virtual_display - enable virtual display feature
1815  *
1816  * @adev: amdgpu_device pointer
1817  *
1818  * Enabled the virtual display feature if the user has enabled it via
1819  * the module parameter virtual_display.  This feature provides a virtual
1820  * display hardware on headless boards or in virtualized environments.
1821  * This function parses and validates the configuration string specified by
1822  * the user and configues the virtual display configuration (number of
1823  * virtual connectors, crtcs, etc.) specified.
1824  */
1825 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1826 {
1827         adev->enable_virtual_display = false;
1828
1829         if (amdgpu_virtual_display) {
1830                 const char *pci_address_name = pci_name(adev->pdev);
1831                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1832
1833                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1834                 pciaddstr_tmp = pciaddstr;
1835                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1836                         pciaddname = strsep(&pciaddname_tmp, ",");
1837                         if (!strcmp("all", pciaddname)
1838                             || !strcmp(pci_address_name, pciaddname)) {
1839                                 long num_crtc;
1840                                 int res = -1;
1841
1842                                 adev->enable_virtual_display = true;
1843
1844                                 if (pciaddname_tmp)
1845                                         res = kstrtol(pciaddname_tmp, 10,
1846                                                       &num_crtc);
1847
1848                                 if (!res) {
1849                                         if (num_crtc < 1)
1850                                                 num_crtc = 1;
1851                                         if (num_crtc > 6)
1852                                                 num_crtc = 6;
1853                                         adev->mode_info.num_crtc = num_crtc;
1854                                 } else {
1855                                         adev->mode_info.num_crtc = 1;
1856                                 }
1857                                 break;
1858                         }
1859                 }
1860
1861                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1862                          amdgpu_virtual_display, pci_address_name,
1863                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1864
1865                 kfree(pciaddstr);
1866         }
1867 }
1868
1869 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1870 {
1871         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1872                 adev->mode_info.num_crtc = 1;
1873                 adev->enable_virtual_display = true;
1874                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1875                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1876         }
1877 }
1878
1879 /**
1880  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1881  *
1882  * @adev: amdgpu_device pointer
1883  *
1884  * Parses the asic configuration parameters specified in the gpu info
1885  * firmware and makes them availale to the driver for use in configuring
1886  * the asic.
1887  * Returns 0 on success, -EINVAL on failure.
1888  */
1889 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1890 {
1891         const char *chip_name;
1892         char fw_name[40];
1893         int err;
1894         const struct gpu_info_firmware_header_v1_0 *hdr;
1895
1896         adev->firmware.gpu_info_fw = NULL;
1897
1898         if (adev->mman.discovery_bin) {
1899                 /*
1900                  * FIXME: The bounding box is still needed by Navi12, so
1901                  * temporarily read it from gpu_info firmware. Should be dropped
1902                  * when DAL no longer needs it.
1903                  */
1904                 if (adev->asic_type != CHIP_NAVI12)
1905                         return 0;
1906         }
1907
1908         switch (adev->asic_type) {
1909         default:
1910                 return 0;
1911         case CHIP_VEGA10:
1912                 chip_name = "vega10";
1913                 break;
1914         case CHIP_VEGA12:
1915                 chip_name = "vega12";
1916                 break;
1917         case CHIP_RAVEN:
1918                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1919                         chip_name = "raven2";
1920                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1921                         chip_name = "picasso";
1922                 else
1923                         chip_name = "raven";
1924                 break;
1925         case CHIP_ARCTURUS:
1926                 chip_name = "arcturus";
1927                 break;
1928         case CHIP_NAVI12:
1929                 chip_name = "navi12";
1930                 break;
1931         }
1932
1933         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1934         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1935         if (err) {
1936                 dev_err(adev->dev,
1937                         "Failed to get gpu_info firmware \"%s\"\n",
1938                         fw_name);
1939                 goto out;
1940         }
1941
1942         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1943         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1944
1945         switch (hdr->version_major) {
1946         case 1:
1947         {
1948                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1949                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1950                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1951
1952                 /*
1953                  * Should be droped when DAL no longer needs it.
1954                  */
1955                 if (adev->asic_type == CHIP_NAVI12)
1956                         goto parse_soc_bounding_box;
1957
1958                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1959                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1960                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1961                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1962                 adev->gfx.config.max_texture_channel_caches =
1963                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1964                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1965                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1966                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1967                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1968                 adev->gfx.config.double_offchip_lds_buf =
1969                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1970                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1971                 adev->gfx.cu_info.max_waves_per_simd =
1972                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1973                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1974                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1975                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1976                 if (hdr->version_minor >= 1) {
1977                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1978                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1979                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1980                         adev->gfx.config.num_sc_per_sh =
1981                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1982                         adev->gfx.config.num_packer_per_sc =
1983                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1984                 }
1985
1986 parse_soc_bounding_box:
1987                 /*
1988                  * soc bounding box info is not integrated in disocovery table,
1989                  * we always need to parse it from gpu info firmware if needed.
1990                  */
1991                 if (hdr->version_minor == 2) {
1992                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1993                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1994                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1995                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1996                 }
1997                 break;
1998         }
1999         default:
2000                 dev_err(adev->dev,
2001                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2002                 err = -EINVAL;
2003                 goto out;
2004         }
2005 out:
2006         return err;
2007 }
2008
2009 /**
2010  * amdgpu_device_ip_early_init - run early init for hardware IPs
2011  *
2012  * @adev: amdgpu_device pointer
2013  *
2014  * Early initialization pass for hardware IPs.  The hardware IPs that make
2015  * up each asic are discovered each IP's early_init callback is run.  This
2016  * is the first stage in initializing the asic.
2017  * Returns 0 on success, negative error code on failure.
2018  */
2019 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2020 {
2021         struct drm_device *dev = adev_to_drm(adev);
2022         struct pci_dev *parent;
2023         int i, r;
2024         bool total;
2025
2026         amdgpu_device_enable_virtual_display(adev);
2027
2028         if (amdgpu_sriov_vf(adev)) {
2029                 r = amdgpu_virt_request_full_gpu(adev, true);
2030                 if (r)
2031                         return r;
2032         }
2033
2034         switch (adev->asic_type) {
2035 #ifdef CONFIG_DRM_AMDGPU_SI
2036         case CHIP_VERDE:
2037         case CHIP_TAHITI:
2038         case CHIP_PITCAIRN:
2039         case CHIP_OLAND:
2040         case CHIP_HAINAN:
2041                 adev->family = AMDGPU_FAMILY_SI;
2042                 r = si_set_ip_blocks(adev);
2043                 if (r)
2044                         return r;
2045                 break;
2046 #endif
2047 #ifdef CONFIG_DRM_AMDGPU_CIK
2048         case CHIP_BONAIRE:
2049         case CHIP_HAWAII:
2050         case CHIP_KAVERI:
2051         case CHIP_KABINI:
2052         case CHIP_MULLINS:
2053                 if (adev->flags & AMD_IS_APU)
2054                         adev->family = AMDGPU_FAMILY_KV;
2055                 else
2056                         adev->family = AMDGPU_FAMILY_CI;
2057
2058                 r = cik_set_ip_blocks(adev);
2059                 if (r)
2060                         return r;
2061                 break;
2062 #endif
2063         case CHIP_TOPAZ:
2064         case CHIP_TONGA:
2065         case CHIP_FIJI:
2066         case CHIP_POLARIS10:
2067         case CHIP_POLARIS11:
2068         case CHIP_POLARIS12:
2069         case CHIP_VEGAM:
2070         case CHIP_CARRIZO:
2071         case CHIP_STONEY:
2072                 if (adev->flags & AMD_IS_APU)
2073                         adev->family = AMDGPU_FAMILY_CZ;
2074                 else
2075                         adev->family = AMDGPU_FAMILY_VI;
2076
2077                 r = vi_set_ip_blocks(adev);
2078                 if (r)
2079                         return r;
2080                 break;
2081         default:
2082                 r = amdgpu_discovery_set_ip_blocks(adev);
2083                 if (r)
2084                         return r;
2085                 break;
2086         }
2087
2088         if (amdgpu_has_atpx() &&
2089             (amdgpu_is_atpx_hybrid() ||
2090              amdgpu_has_atpx_dgpu_power_cntl()) &&
2091             ((adev->flags & AMD_IS_APU) == 0) &&
2092             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2093                 adev->flags |= AMD_IS_PX;
2094
2095         if (!(adev->flags & AMD_IS_APU)) {
2096                 parent = pci_upstream_bridge(adev->pdev);
2097                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2098         }
2099
2100
2101         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2102         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2103                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2104         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2105                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2106
2107         total = true;
2108         for (i = 0; i < adev->num_ip_blocks; i++) {
2109                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2110                         DRM_WARN("disabled ip block: %d <%s>\n",
2111                                   i, adev->ip_blocks[i].version->funcs->name);
2112                         adev->ip_blocks[i].status.valid = false;
2113                 } else {
2114                         if (adev->ip_blocks[i].version->funcs->early_init) {
2115                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2116                                 if (r == -ENOENT) {
2117                                         adev->ip_blocks[i].status.valid = false;
2118                                 } else if (r) {
2119                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2120                                                   adev->ip_blocks[i].version->funcs->name, r);
2121                                         total = false;
2122                                 } else {
2123                                         adev->ip_blocks[i].status.valid = true;
2124                                 }
2125                         } else {
2126                                 adev->ip_blocks[i].status.valid = true;
2127                         }
2128                 }
2129                 /* get the vbios after the asic_funcs are set up */
2130                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2131                         r = amdgpu_device_parse_gpu_info_fw(adev);
2132                         if (r)
2133                                 return r;
2134
2135                         /* Read BIOS */
2136                         if (amdgpu_device_read_bios(adev)) {
2137                                 if (!amdgpu_get_bios(adev))
2138                                         return -EINVAL;
2139
2140                                 r = amdgpu_atombios_init(adev);
2141                                 if (r) {
2142                                         dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2143                                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2144                                         return r;
2145                                 }
2146                         }
2147
2148                         /*get pf2vf msg info at it's earliest time*/
2149                         if (amdgpu_sriov_vf(adev))
2150                                 amdgpu_virt_init_data_exchange(adev);
2151
2152                 }
2153         }
2154         if (!total)
2155                 return -ENODEV;
2156
2157         amdgpu_amdkfd_device_probe(adev);
2158         adev->cg_flags &= amdgpu_cg_mask;
2159         adev->pg_flags &= amdgpu_pg_mask;
2160
2161         return 0;
2162 }
2163
2164 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2165 {
2166         int i, r;
2167
2168         for (i = 0; i < adev->num_ip_blocks; i++) {
2169                 if (!adev->ip_blocks[i].status.sw)
2170                         continue;
2171                 if (adev->ip_blocks[i].status.hw)
2172                         continue;
2173                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2174                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2175                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2176                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2177                         if (r) {
2178                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2179                                           adev->ip_blocks[i].version->funcs->name, r);
2180                                 return r;
2181                         }
2182                         adev->ip_blocks[i].status.hw = true;
2183                 }
2184         }
2185
2186         return 0;
2187 }
2188
2189 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2190 {
2191         int i, r;
2192
2193         for (i = 0; i < adev->num_ip_blocks; i++) {
2194                 if (!adev->ip_blocks[i].status.sw)
2195                         continue;
2196                 if (adev->ip_blocks[i].status.hw)
2197                         continue;
2198                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2199                 if (r) {
2200                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2201                                   adev->ip_blocks[i].version->funcs->name, r);
2202                         return r;
2203                 }
2204                 adev->ip_blocks[i].status.hw = true;
2205         }
2206
2207         return 0;
2208 }
2209
2210 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2211 {
2212         int r = 0;
2213         int i;
2214         uint32_t smu_version;
2215
2216         if (adev->asic_type >= CHIP_VEGA10) {
2217                 for (i = 0; i < adev->num_ip_blocks; i++) {
2218                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2219                                 continue;
2220
2221                         if (!adev->ip_blocks[i].status.sw)
2222                                 continue;
2223
2224                         /* no need to do the fw loading again if already done*/
2225                         if (adev->ip_blocks[i].status.hw == true)
2226                                 break;
2227
2228                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2229                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2230                                 if (r) {
2231                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2232                                                           adev->ip_blocks[i].version->funcs->name, r);
2233                                         return r;
2234                                 }
2235                         } else {
2236                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2237                                 if (r) {
2238                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2239                                                           adev->ip_blocks[i].version->funcs->name, r);
2240                                         return r;
2241                                 }
2242                         }
2243
2244                         adev->ip_blocks[i].status.hw = true;
2245                         break;
2246                 }
2247         }
2248
2249         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2250                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2251
2252         return r;
2253 }
2254
2255 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2256 {
2257         long timeout;
2258         int r, i;
2259
2260         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2261                 struct amdgpu_ring *ring = adev->rings[i];
2262
2263                 /* No need to setup the GPU scheduler for rings that don't need it */
2264                 if (!ring || ring->no_scheduler)
2265                         continue;
2266
2267                 switch (ring->funcs->type) {
2268                 case AMDGPU_RING_TYPE_GFX:
2269                         timeout = adev->gfx_timeout;
2270                         break;
2271                 case AMDGPU_RING_TYPE_COMPUTE:
2272                         timeout = adev->compute_timeout;
2273                         break;
2274                 case AMDGPU_RING_TYPE_SDMA:
2275                         timeout = adev->sdma_timeout;
2276                         break;
2277                 default:
2278                         timeout = adev->video_timeout;
2279                         break;
2280                 }
2281
2282                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2283                                    ring->num_hw_submission, 0,
2284                                    timeout, adev->reset_domain->wq,
2285                                    ring->sched_score, ring->name,
2286                                    adev->dev);
2287                 if (r) {
2288                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2289                                   ring->name);
2290                         return r;
2291                 }
2292         }
2293
2294         amdgpu_xcp_update_partition_sched_list(adev);
2295
2296         return 0;
2297 }
2298
2299
2300 /**
2301  * amdgpu_device_ip_init - run init for hardware IPs
2302  *
2303  * @adev: amdgpu_device pointer
2304  *
2305  * Main initialization pass for hardware IPs.  The list of all the hardware
2306  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2307  * are run.  sw_init initializes the software state associated with each IP
2308  * and hw_init initializes the hardware associated with each IP.
2309  * Returns 0 on success, negative error code on failure.
2310  */
2311 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2312 {
2313         int i, r;
2314
2315         r = amdgpu_ras_init(adev);
2316         if (r)
2317                 return r;
2318
2319         for (i = 0; i < adev->num_ip_blocks; i++) {
2320                 if (!adev->ip_blocks[i].status.valid)
2321                         continue;
2322                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2323                 if (r) {
2324                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2325                                   adev->ip_blocks[i].version->funcs->name, r);
2326                         goto init_failed;
2327                 }
2328                 adev->ip_blocks[i].status.sw = true;
2329
2330                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2331                         /* need to do common hw init early so everything is set up for gmc */
2332                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2333                         if (r) {
2334                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2335                                 goto init_failed;
2336                         }
2337                         adev->ip_blocks[i].status.hw = true;
2338                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2339                         /* need to do gmc hw init early so we can allocate gpu mem */
2340                         /* Try to reserve bad pages early */
2341                         if (amdgpu_sriov_vf(adev))
2342                                 amdgpu_virt_exchange_data(adev);
2343
2344                         r = amdgpu_device_mem_scratch_init(adev);
2345                         if (r) {
2346                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2347                                 goto init_failed;
2348                         }
2349                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2350                         if (r) {
2351                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2352                                 goto init_failed;
2353                         }
2354                         r = amdgpu_device_wb_init(adev);
2355                         if (r) {
2356                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2357                                 goto init_failed;
2358                         }
2359                         adev->ip_blocks[i].status.hw = true;
2360
2361                         /* right after GMC hw init, we create CSA */
2362                         if (adev->gfx.mcbp) {
2363                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2364                                                                AMDGPU_GEM_DOMAIN_VRAM |
2365                                                                AMDGPU_GEM_DOMAIN_GTT,
2366                                                                AMDGPU_CSA_SIZE);
2367                                 if (r) {
2368                                         DRM_ERROR("allocate CSA failed %d\n", r);
2369                                         goto init_failed;
2370                                 }
2371                         }
2372                 }
2373         }
2374
2375         if (amdgpu_sriov_vf(adev))
2376                 amdgpu_virt_init_data_exchange(adev);
2377
2378         r = amdgpu_ib_pool_init(adev);
2379         if (r) {
2380                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2381                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2382                 goto init_failed;
2383         }
2384
2385         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2386         if (r)
2387                 goto init_failed;
2388
2389         r = amdgpu_device_ip_hw_init_phase1(adev);
2390         if (r)
2391                 goto init_failed;
2392
2393         r = amdgpu_device_fw_loading(adev);
2394         if (r)
2395                 goto init_failed;
2396
2397         r = amdgpu_device_ip_hw_init_phase2(adev);
2398         if (r)
2399                 goto init_failed;
2400
2401         /*
2402          * retired pages will be loaded from eeprom and reserved here,
2403          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2404          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2405          * for I2C communication which only true at this point.
2406          *
2407          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2408          * failure from bad gpu situation and stop amdgpu init process
2409          * accordingly. For other failed cases, it will still release all
2410          * the resource and print error message, rather than returning one
2411          * negative value to upper level.
2412          *
2413          * Note: theoretically, this should be called before all vram allocations
2414          * to protect retired page from abusing
2415          */
2416         r = amdgpu_ras_recovery_init(adev);
2417         if (r)
2418                 goto init_failed;
2419
2420         /**
2421          * In case of XGMI grab extra reference for reset domain for this device
2422          */
2423         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2424                 if (amdgpu_xgmi_add_device(adev) == 0) {
2425                         if (!amdgpu_sriov_vf(adev)) {
2426                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2427
2428                                 if (WARN_ON(!hive)) {
2429                                         r = -ENOENT;
2430                                         goto init_failed;
2431                                 }
2432
2433                                 if (!hive->reset_domain ||
2434                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2435                                         r = -ENOENT;
2436                                         amdgpu_put_xgmi_hive(hive);
2437                                         goto init_failed;
2438                                 }
2439
2440                                 /* Drop the early temporary reset domain we created for device */
2441                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2442                                 adev->reset_domain = hive->reset_domain;
2443                                 amdgpu_put_xgmi_hive(hive);
2444                         }
2445                 }
2446         }
2447
2448         r = amdgpu_device_init_schedulers(adev);
2449         if (r)
2450                 goto init_failed;
2451
2452         /* Don't init kfd if whole hive need to be reset during init */
2453         if (!adev->gmc.xgmi.pending_reset) {
2454                 kgd2kfd_init_zone_device(adev);
2455                 amdgpu_amdkfd_device_init(adev);
2456         }
2457
2458         amdgpu_fru_get_product_info(adev);
2459
2460 init_failed:
2461
2462         return r;
2463 }
2464
2465 /**
2466  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2467  *
2468  * @adev: amdgpu_device pointer
2469  *
2470  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2471  * this function before a GPU reset.  If the value is retained after a
2472  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2473  */
2474 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2475 {
2476         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2477 }
2478
2479 /**
2480  * amdgpu_device_check_vram_lost - check if vram is valid
2481  *
2482  * @adev: amdgpu_device pointer
2483  *
2484  * Checks the reset magic value written to the gart pointer in VRAM.
2485  * The driver calls this after a GPU reset to see if the contents of
2486  * VRAM is lost or now.
2487  * returns true if vram is lost, false if not.
2488  */
2489 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2490 {
2491         if (memcmp(adev->gart.ptr, adev->reset_magic,
2492                         AMDGPU_RESET_MAGIC_NUM))
2493                 return true;
2494
2495         if (!amdgpu_in_reset(adev))
2496                 return false;
2497
2498         /*
2499          * For all ASICs with baco/mode1 reset, the VRAM is
2500          * always assumed to be lost.
2501          */
2502         switch (amdgpu_asic_reset_method(adev)) {
2503         case AMD_RESET_METHOD_BACO:
2504         case AMD_RESET_METHOD_MODE1:
2505                 return true;
2506         default:
2507                 return false;
2508         }
2509 }
2510
2511 /**
2512  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2513  *
2514  * @adev: amdgpu_device pointer
2515  * @state: clockgating state (gate or ungate)
2516  *
2517  * The list of all the hardware IPs that make up the asic is walked and the
2518  * set_clockgating_state callbacks are run.
2519  * Late initialization pass enabling clockgating for hardware IPs.
2520  * Fini or suspend, pass disabling clockgating for hardware IPs.
2521  * Returns 0 on success, negative error code on failure.
2522  */
2523
2524 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2525                                enum amd_clockgating_state state)
2526 {
2527         int i, j, r;
2528
2529         if (amdgpu_emu_mode == 1)
2530                 return 0;
2531
2532         for (j = 0; j < adev->num_ip_blocks; j++) {
2533                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2534                 if (!adev->ip_blocks[i].status.late_initialized)
2535                         continue;
2536                 /* skip CG for GFX, SDMA on S0ix */
2537                 if (adev->in_s0ix &&
2538                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2539                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2540                         continue;
2541                 /* skip CG for VCE/UVD, it's handled specially */
2542                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2543                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2544                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2545                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2546                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2547                         /* enable clockgating to save power */
2548                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2549                                                                                      state);
2550                         if (r) {
2551                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2552                                           adev->ip_blocks[i].version->funcs->name, r);
2553                                 return r;
2554                         }
2555                 }
2556         }
2557
2558         return 0;
2559 }
2560
2561 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2562                                enum amd_powergating_state state)
2563 {
2564         int i, j, r;
2565
2566         if (amdgpu_emu_mode == 1)
2567                 return 0;
2568
2569         for (j = 0; j < adev->num_ip_blocks; j++) {
2570                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2571                 if (!adev->ip_blocks[i].status.late_initialized)
2572                         continue;
2573                 /* skip PG for GFX, SDMA on S0ix */
2574                 if (adev->in_s0ix &&
2575                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2576                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2577                         continue;
2578                 /* skip CG for VCE/UVD, it's handled specially */
2579                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2580                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2581                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2582                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2583                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2584                         /* enable powergating to save power */
2585                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2586                                                                                         state);
2587                         if (r) {
2588                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2589                                           adev->ip_blocks[i].version->funcs->name, r);
2590                                 return r;
2591                         }
2592                 }
2593         }
2594         return 0;
2595 }
2596
2597 static int amdgpu_device_enable_mgpu_fan_boost(void)
2598 {
2599         struct amdgpu_gpu_instance *gpu_ins;
2600         struct amdgpu_device *adev;
2601         int i, ret = 0;
2602
2603         mutex_lock(&mgpu_info.mutex);
2604
2605         /*
2606          * MGPU fan boost feature should be enabled
2607          * only when there are two or more dGPUs in
2608          * the system
2609          */
2610         if (mgpu_info.num_dgpu < 2)
2611                 goto out;
2612
2613         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2614                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2615                 adev = gpu_ins->adev;
2616                 if (!(adev->flags & AMD_IS_APU) &&
2617                     !gpu_ins->mgpu_fan_enabled) {
2618                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2619                         if (ret)
2620                                 break;
2621
2622                         gpu_ins->mgpu_fan_enabled = 1;
2623                 }
2624         }
2625
2626 out:
2627         mutex_unlock(&mgpu_info.mutex);
2628
2629         return ret;
2630 }
2631
2632 /**
2633  * amdgpu_device_ip_late_init - run late init for hardware IPs
2634  *
2635  * @adev: amdgpu_device pointer
2636  *
2637  * Late initialization pass for hardware IPs.  The list of all the hardware
2638  * IPs that make up the asic is walked and the late_init callbacks are run.
2639  * late_init covers any special initialization that an IP requires
2640  * after all of the have been initialized or something that needs to happen
2641  * late in the init process.
2642  * Returns 0 on success, negative error code on failure.
2643  */
2644 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2645 {
2646         struct amdgpu_gpu_instance *gpu_instance;
2647         int i = 0, r;
2648
2649         for (i = 0; i < adev->num_ip_blocks; i++) {
2650                 if (!adev->ip_blocks[i].status.hw)
2651                         continue;
2652                 if (adev->ip_blocks[i].version->funcs->late_init) {
2653                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2654                         if (r) {
2655                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2656                                           adev->ip_blocks[i].version->funcs->name, r);
2657                                 return r;
2658                         }
2659                 }
2660                 adev->ip_blocks[i].status.late_initialized = true;
2661         }
2662
2663         r = amdgpu_ras_late_init(adev);
2664         if (r) {
2665                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2666                 return r;
2667         }
2668
2669         amdgpu_ras_set_error_query_ready(adev, true);
2670
2671         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2672         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2673
2674         amdgpu_device_fill_reset_magic(adev);
2675
2676         r = amdgpu_device_enable_mgpu_fan_boost();
2677         if (r)
2678                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2679
2680         /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2681         if (amdgpu_passthrough(adev) &&
2682             ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2683              adev->asic_type == CHIP_ALDEBARAN))
2684                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2685
2686         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2687                 mutex_lock(&mgpu_info.mutex);
2688
2689                 /*
2690                  * Reset device p-state to low as this was booted with high.
2691                  *
2692                  * This should be performed only after all devices from the same
2693                  * hive get initialized.
2694                  *
2695                  * However, it's unknown how many device in the hive in advance.
2696                  * As this is counted one by one during devices initializations.
2697                  *
2698                  * So, we wait for all XGMI interlinked devices initialized.
2699                  * This may bring some delays as those devices may come from
2700                  * different hives. But that should be OK.
2701                  */
2702                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2703                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2704                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2705                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2706                                         continue;
2707
2708                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2709                                                 AMDGPU_XGMI_PSTATE_MIN);
2710                                 if (r) {
2711                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2712                                         break;
2713                                 }
2714                         }
2715                 }
2716
2717                 mutex_unlock(&mgpu_info.mutex);
2718         }
2719
2720         return 0;
2721 }
2722
2723 /**
2724  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2725  *
2726  * @adev: amdgpu_device pointer
2727  *
2728  * For ASICs need to disable SMC first
2729  */
2730 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2731 {
2732         int i, r;
2733
2734         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2735                 return;
2736
2737         for (i = 0; i < adev->num_ip_blocks; i++) {
2738                 if (!adev->ip_blocks[i].status.hw)
2739                         continue;
2740                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2741                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2742                         /* XXX handle errors */
2743                         if (r) {
2744                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2745                                           adev->ip_blocks[i].version->funcs->name, r);
2746                         }
2747                         adev->ip_blocks[i].status.hw = false;
2748                         break;
2749                 }
2750         }
2751 }
2752
2753 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2754 {
2755         int i, r;
2756
2757         for (i = 0; i < adev->num_ip_blocks; i++) {
2758                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2759                         continue;
2760
2761                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2762                 if (r) {
2763                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2764                                   adev->ip_blocks[i].version->funcs->name, r);
2765                 }
2766         }
2767
2768         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2769         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2770
2771         amdgpu_amdkfd_suspend(adev, false);
2772
2773         /* Workaroud for ASICs need to disable SMC first */
2774         amdgpu_device_smu_fini_early(adev);
2775
2776         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2777                 if (!adev->ip_blocks[i].status.hw)
2778                         continue;
2779
2780                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2781                 /* XXX handle errors */
2782                 if (r) {
2783                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2784                                   adev->ip_blocks[i].version->funcs->name, r);
2785                 }
2786
2787                 adev->ip_blocks[i].status.hw = false;
2788         }
2789
2790         if (amdgpu_sriov_vf(adev)) {
2791                 if (amdgpu_virt_release_full_gpu(adev, false))
2792                         DRM_ERROR("failed to release exclusive mode on fini\n");
2793         }
2794
2795         return 0;
2796 }
2797
2798 /**
2799  * amdgpu_device_ip_fini - run fini for hardware IPs
2800  *
2801  * @adev: amdgpu_device pointer
2802  *
2803  * Main teardown pass for hardware IPs.  The list of all the hardware
2804  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2805  * are run.  hw_fini tears down the hardware associated with each IP
2806  * and sw_fini tears down any software state associated with each IP.
2807  * Returns 0 on success, negative error code on failure.
2808  */
2809 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2810 {
2811         int i, r;
2812
2813         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2814                 amdgpu_virt_release_ras_err_handler_data(adev);
2815
2816         if (adev->gmc.xgmi.num_physical_nodes > 1)
2817                 amdgpu_xgmi_remove_device(adev);
2818
2819         amdgpu_amdkfd_device_fini_sw(adev);
2820
2821         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2822                 if (!adev->ip_blocks[i].status.sw)
2823                         continue;
2824
2825                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2826                         amdgpu_ucode_free_bo(adev);
2827                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2828                         amdgpu_device_wb_fini(adev);
2829                         amdgpu_device_mem_scratch_fini(adev);
2830                         amdgpu_ib_pool_fini(adev);
2831                 }
2832
2833                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2834                 /* XXX handle errors */
2835                 if (r) {
2836                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2837                                   adev->ip_blocks[i].version->funcs->name, r);
2838                 }
2839                 adev->ip_blocks[i].status.sw = false;
2840                 adev->ip_blocks[i].status.valid = false;
2841         }
2842
2843         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2844                 if (!adev->ip_blocks[i].status.late_initialized)
2845                         continue;
2846                 if (adev->ip_blocks[i].version->funcs->late_fini)
2847                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2848                 adev->ip_blocks[i].status.late_initialized = false;
2849         }
2850
2851         amdgpu_ras_fini(adev);
2852
2853         return 0;
2854 }
2855
2856 /**
2857  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2858  *
2859  * @work: work_struct.
2860  */
2861 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2862 {
2863         struct amdgpu_device *adev =
2864                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2865         int r;
2866
2867         r = amdgpu_ib_ring_tests(adev);
2868         if (r)
2869                 DRM_ERROR("ib ring test failed (%d).\n", r);
2870 }
2871
2872 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2873 {
2874         struct amdgpu_device *adev =
2875                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2876
2877         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2878         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2879
2880         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2881                 adev->gfx.gfx_off_state = true;
2882 }
2883
2884 /**
2885  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2886  *
2887  * @adev: amdgpu_device pointer
2888  *
2889  * Main suspend function for hardware IPs.  The list of all the hardware
2890  * IPs that make up the asic is walked, clockgating is disabled and the
2891  * suspend callbacks are run.  suspend puts the hardware and software state
2892  * in each IP into a state suitable for suspend.
2893  * Returns 0 on success, negative error code on failure.
2894  */
2895 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2896 {
2897         int i, r;
2898
2899         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2900         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2901
2902         /*
2903          * Per PMFW team's suggestion, driver needs to handle gfxoff
2904          * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2905          * scenario. Add the missing df cstate disablement here.
2906          */
2907         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2908                 dev_warn(adev->dev, "Failed to disallow df cstate");
2909
2910         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2911                 if (!adev->ip_blocks[i].status.valid)
2912                         continue;
2913
2914                 /* displays are handled separately */
2915                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2916                         continue;
2917
2918                 /* XXX handle errors */
2919                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2920                 /* XXX handle errors */
2921                 if (r) {
2922                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2923                                   adev->ip_blocks[i].version->funcs->name, r);
2924                         return r;
2925                 }
2926
2927                 adev->ip_blocks[i].status.hw = false;
2928         }
2929
2930         return 0;
2931 }
2932
2933 /**
2934  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2935  *
2936  * @adev: amdgpu_device pointer
2937  *
2938  * Main suspend function for hardware IPs.  The list of all the hardware
2939  * IPs that make up the asic is walked, clockgating is disabled and the
2940  * suspend callbacks are run.  suspend puts the hardware and software state
2941  * in each IP into a state suitable for suspend.
2942  * Returns 0 on success, negative error code on failure.
2943  */
2944 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2945 {
2946         int i, r;
2947
2948         if (adev->in_s0ix)
2949                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2950
2951         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2952                 if (!adev->ip_blocks[i].status.valid)
2953                         continue;
2954                 /* displays are handled in phase1 */
2955                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2956                         continue;
2957                 /* PSP lost connection when err_event_athub occurs */
2958                 if (amdgpu_ras_intr_triggered() &&
2959                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2960                         adev->ip_blocks[i].status.hw = false;
2961                         continue;
2962                 }
2963
2964                 /* skip unnecessary suspend if we do not initialize them yet */
2965                 if (adev->gmc.xgmi.pending_reset &&
2966                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2967                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2968                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2969                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2970                         adev->ip_blocks[i].status.hw = false;
2971                         continue;
2972                 }
2973
2974                 /* skip suspend of gfx/mes and psp for S0ix
2975                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
2976                  * like at runtime. PSP is also part of the always on hardware
2977                  * so no need to suspend it.
2978                  */
2979                 if (adev->in_s0ix &&
2980                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2981                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2982                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
2983                         continue;
2984
2985                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
2986                 if (adev->in_s0ix &&
2987                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
2988                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2989                         continue;
2990
2991                 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
2992                  * These are in TMR, hence are expected to be reused by PSP-TOS to reload
2993                  * from this location and RLC Autoload automatically also gets loaded
2994                  * from here based on PMFW -> PSP message during re-init sequence.
2995                  * Therefore, the psp suspend & resume should be skipped to avoid destroy
2996                  * the TMR and reload FWs again for IMU enabled APU ASICs.
2997                  */
2998                 if (amdgpu_in_reset(adev) &&
2999                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3000                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3001                         continue;
3002
3003                 /* XXX handle errors */
3004                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3005                 /* XXX handle errors */
3006                 if (r) {
3007                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3008                                   adev->ip_blocks[i].version->funcs->name, r);
3009                 }
3010                 adev->ip_blocks[i].status.hw = false;
3011                 /* handle putting the SMC in the appropriate state */
3012                 if (!amdgpu_sriov_vf(adev)) {
3013                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3014                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3015                                 if (r) {
3016                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3017                                                         adev->mp1_state, r);
3018                                         return r;
3019                                 }
3020                         }
3021                 }
3022         }
3023
3024         return 0;
3025 }
3026
3027 /**
3028  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3029  *
3030  * @adev: amdgpu_device pointer
3031  *
3032  * Main suspend function for hardware IPs.  The list of all the hardware
3033  * IPs that make up the asic is walked, clockgating is disabled and the
3034  * suspend callbacks are run.  suspend puts the hardware and software state
3035  * in each IP into a state suitable for suspend.
3036  * Returns 0 on success, negative error code on failure.
3037  */
3038 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3039 {
3040         int r;
3041
3042         if (amdgpu_sriov_vf(adev)) {
3043                 amdgpu_virt_fini_data_exchange(adev);
3044                 amdgpu_virt_request_full_gpu(adev, false);
3045         }
3046
3047         r = amdgpu_device_ip_suspend_phase1(adev);
3048         if (r)
3049                 return r;
3050         r = amdgpu_device_ip_suspend_phase2(adev);
3051
3052         if (amdgpu_sriov_vf(adev))
3053                 amdgpu_virt_release_full_gpu(adev, false);
3054
3055         return r;
3056 }
3057
3058 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3059 {
3060         int i, r;
3061
3062         static enum amd_ip_block_type ip_order[] = {
3063                 AMD_IP_BLOCK_TYPE_COMMON,
3064                 AMD_IP_BLOCK_TYPE_GMC,
3065                 AMD_IP_BLOCK_TYPE_PSP,
3066                 AMD_IP_BLOCK_TYPE_IH,
3067         };
3068
3069         for (i = 0; i < adev->num_ip_blocks; i++) {
3070                 int j;
3071                 struct amdgpu_ip_block *block;
3072
3073                 block = &adev->ip_blocks[i];
3074                 block->status.hw = false;
3075
3076                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3077
3078                         if (block->version->type != ip_order[j] ||
3079                                 !block->status.valid)
3080                                 continue;
3081
3082                         r = block->version->funcs->hw_init(adev);
3083                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3084                         if (r)
3085                                 return r;
3086                         block->status.hw = true;
3087                 }
3088         }
3089
3090         return 0;
3091 }
3092
3093 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3094 {
3095         int i, r;
3096
3097         static enum amd_ip_block_type ip_order[] = {
3098                 AMD_IP_BLOCK_TYPE_SMC,
3099                 AMD_IP_BLOCK_TYPE_DCE,
3100                 AMD_IP_BLOCK_TYPE_GFX,
3101                 AMD_IP_BLOCK_TYPE_SDMA,
3102                 AMD_IP_BLOCK_TYPE_MES,
3103                 AMD_IP_BLOCK_TYPE_UVD,
3104                 AMD_IP_BLOCK_TYPE_VCE,
3105                 AMD_IP_BLOCK_TYPE_VCN,
3106                 AMD_IP_BLOCK_TYPE_JPEG
3107         };
3108
3109         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3110                 int j;
3111                 struct amdgpu_ip_block *block;
3112
3113                 for (j = 0; j < adev->num_ip_blocks; j++) {
3114                         block = &adev->ip_blocks[j];
3115
3116                         if (block->version->type != ip_order[i] ||
3117                                 !block->status.valid ||
3118                                 block->status.hw)
3119                                 continue;
3120
3121                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3122                                 r = block->version->funcs->resume(adev);
3123                         else
3124                                 r = block->version->funcs->hw_init(adev);
3125
3126                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3127                         if (r)
3128                                 return r;
3129                         block->status.hw = true;
3130                 }
3131         }
3132
3133         return 0;
3134 }
3135
3136 /**
3137  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3138  *
3139  * @adev: amdgpu_device pointer
3140  *
3141  * First resume function for hardware IPs.  The list of all the hardware
3142  * IPs that make up the asic is walked and the resume callbacks are run for
3143  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3144  * after a suspend and updates the software state as necessary.  This
3145  * function is also used for restoring the GPU after a GPU reset.
3146  * Returns 0 on success, negative error code on failure.
3147  */
3148 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3149 {
3150         int i, r;
3151
3152         for (i = 0; i < adev->num_ip_blocks; i++) {
3153                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3154                         continue;
3155                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3156                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3157                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3158                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3159
3160                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3161                         if (r) {
3162                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3163                                           adev->ip_blocks[i].version->funcs->name, r);
3164                                 return r;
3165                         }
3166                         adev->ip_blocks[i].status.hw = true;
3167                 }
3168         }
3169
3170         return 0;
3171 }
3172
3173 /**
3174  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3175  *
3176  * @adev: amdgpu_device pointer
3177  *
3178  * First resume function for hardware IPs.  The list of all the hardware
3179  * IPs that make up the asic is walked and the resume callbacks are run for
3180  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3181  * functional state after a suspend and updates the software state as
3182  * necessary.  This function is also used for restoring the GPU after a GPU
3183  * reset.
3184  * Returns 0 on success, negative error code on failure.
3185  */
3186 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3187 {
3188         int i, r;
3189
3190         for (i = 0; i < adev->num_ip_blocks; i++) {
3191                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3192                         continue;
3193                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3194                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3195                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3196                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3197                         continue;
3198                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3199                 if (r) {
3200                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3201                                   adev->ip_blocks[i].version->funcs->name, r);
3202                         return r;
3203                 }
3204                 adev->ip_blocks[i].status.hw = true;
3205         }
3206
3207         return 0;
3208 }
3209
3210 /**
3211  * amdgpu_device_ip_resume - run resume for hardware IPs
3212  *
3213  * @adev: amdgpu_device pointer
3214  *
3215  * Main resume function for hardware IPs.  The hardware IPs
3216  * are split into two resume functions because they are
3217  * also used in recovering from a GPU reset and some additional
3218  * steps need to be take between them.  In this case (S3/S4) they are
3219  * run sequentially.
3220  * Returns 0 on success, negative error code on failure.
3221  */
3222 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3223 {
3224         int r;
3225
3226         r = amdgpu_device_ip_resume_phase1(adev);
3227         if (r)
3228                 return r;
3229
3230         r = amdgpu_device_fw_loading(adev);
3231         if (r)
3232                 return r;
3233
3234         r = amdgpu_device_ip_resume_phase2(adev);
3235
3236         return r;
3237 }
3238
3239 /**
3240  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3241  *
3242  * @adev: amdgpu_device pointer
3243  *
3244  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3245  */
3246 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3247 {
3248         if (amdgpu_sriov_vf(adev)) {
3249                 if (adev->is_atom_fw) {
3250                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3251                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3252                 } else {
3253                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3254                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3255                 }
3256
3257                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3258                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3259         }
3260 }
3261
3262 /**
3263  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3264  *
3265  * @asic_type: AMD asic type
3266  *
3267  * Check if there is DC (new modesetting infrastructre) support for an asic.
3268  * returns true if DC has support, false if not.
3269  */
3270 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3271 {
3272         switch (asic_type) {
3273 #ifdef CONFIG_DRM_AMDGPU_SI
3274         case CHIP_HAINAN:
3275 #endif
3276         case CHIP_TOPAZ:
3277                 /* chips with no display hardware */
3278                 return false;
3279 #if defined(CONFIG_DRM_AMD_DC)
3280         case CHIP_TAHITI:
3281         case CHIP_PITCAIRN:
3282         case CHIP_VERDE:
3283         case CHIP_OLAND:
3284                 /*
3285                  * We have systems in the wild with these ASICs that require
3286                  * LVDS and VGA support which is not supported with DC.
3287                  *
3288                  * Fallback to the non-DC driver here by default so as not to
3289                  * cause regressions.
3290                  */
3291 #if defined(CONFIG_DRM_AMD_DC_SI)
3292                 return amdgpu_dc > 0;
3293 #else
3294                 return false;
3295 #endif
3296         case CHIP_BONAIRE:
3297         case CHIP_KAVERI:
3298         case CHIP_KABINI:
3299         case CHIP_MULLINS:
3300                 /*
3301                  * We have systems in the wild with these ASICs that require
3302                  * VGA support which is not supported with DC.
3303                  *
3304                  * Fallback to the non-DC driver here by default so as not to
3305                  * cause regressions.
3306                  */
3307                 return amdgpu_dc > 0;
3308         default:
3309                 return amdgpu_dc != 0;
3310 #else
3311         default:
3312                 if (amdgpu_dc > 0)
3313                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3314                 return false;
3315 #endif
3316         }
3317 }
3318
3319 /**
3320  * amdgpu_device_has_dc_support - check if dc is supported
3321  *
3322  * @adev: amdgpu_device pointer
3323  *
3324  * Returns true for supported, false for not supported
3325  */
3326 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3327 {
3328         if (adev->enable_virtual_display ||
3329             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3330                 return false;
3331
3332         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3333 }
3334
3335 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3336 {
3337         struct amdgpu_device *adev =
3338                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3339         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3340
3341         /* It's a bug to not have a hive within this function */
3342         if (WARN_ON(!hive))
3343                 return;
3344
3345         /*
3346          * Use task barrier to synchronize all xgmi reset works across the
3347          * hive. task_barrier_enter and task_barrier_exit will block
3348          * until all the threads running the xgmi reset works reach
3349          * those points. task_barrier_full will do both blocks.
3350          */
3351         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3352
3353                 task_barrier_enter(&hive->tb);
3354                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3355
3356                 if (adev->asic_reset_res)
3357                         goto fail;
3358
3359                 task_barrier_exit(&hive->tb);
3360                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3361
3362                 if (adev->asic_reset_res)
3363                         goto fail;
3364
3365                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3366                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3367                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3368         } else {
3369
3370                 task_barrier_full(&hive->tb);
3371                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3372         }
3373
3374 fail:
3375         if (adev->asic_reset_res)
3376                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3377                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3378         amdgpu_put_xgmi_hive(hive);
3379 }
3380
3381 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3382 {
3383         char *input = amdgpu_lockup_timeout;
3384         char *timeout_setting = NULL;
3385         int index = 0;
3386         long timeout;
3387         int ret = 0;
3388
3389         /*
3390          * By default timeout for non compute jobs is 10000
3391          * and 60000 for compute jobs.
3392          * In SR-IOV or passthrough mode, timeout for compute
3393          * jobs are 60000 by default.
3394          */
3395         adev->gfx_timeout = msecs_to_jiffies(10000);
3396         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3397         if (amdgpu_sriov_vf(adev))
3398                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3399                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3400         else
3401                 adev->compute_timeout =  msecs_to_jiffies(60000);
3402
3403         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3404                 while ((timeout_setting = strsep(&input, ",")) &&
3405                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3406                         ret = kstrtol(timeout_setting, 0, &timeout);
3407                         if (ret)
3408                                 return ret;
3409
3410                         if (timeout == 0) {
3411                                 index++;
3412                                 continue;
3413                         } else if (timeout < 0) {
3414                                 timeout = MAX_SCHEDULE_TIMEOUT;
3415                                 dev_warn(adev->dev, "lockup timeout disabled");
3416                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3417                         } else {
3418                                 timeout = msecs_to_jiffies(timeout);
3419                         }
3420
3421                         switch (index++) {
3422                         case 0:
3423                                 adev->gfx_timeout = timeout;
3424                                 break;
3425                         case 1:
3426                                 adev->compute_timeout = timeout;
3427                                 break;
3428                         case 2:
3429                                 adev->sdma_timeout = timeout;
3430                                 break;
3431                         case 3:
3432                                 adev->video_timeout = timeout;
3433                                 break;
3434                         default:
3435                                 break;
3436                         }
3437                 }
3438                 /*
3439                  * There is only one value specified and
3440                  * it should apply to all non-compute jobs.
3441                  */
3442                 if (index == 1) {
3443                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3444                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3445                                 adev->compute_timeout = adev->gfx_timeout;
3446                 }
3447         }
3448
3449         return ret;
3450 }
3451
3452 /**
3453  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3454  *
3455  * @adev: amdgpu_device pointer
3456  *
3457  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3458  */
3459 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3460 {
3461         struct iommu_domain *domain;
3462
3463         domain = iommu_get_domain_for_dev(adev->dev);
3464         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3465                 adev->ram_is_direct_mapped = true;
3466 }
3467
3468 static const struct attribute *amdgpu_dev_attributes[] = {
3469         &dev_attr_pcie_replay_count.attr,
3470         NULL
3471 };
3472
3473 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3474 {
3475         if (amdgpu_mcbp == 1)
3476                 adev->gfx.mcbp = true;
3477         else if (amdgpu_mcbp == 0)
3478                 adev->gfx.mcbp = false;
3479         else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3480                  (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3481                  adev->gfx.num_gfx_rings)
3482                 adev->gfx.mcbp = true;
3483
3484         if (amdgpu_sriov_vf(adev))
3485                 adev->gfx.mcbp = true;
3486
3487         if (adev->gfx.mcbp)
3488                 DRM_INFO("MCBP is enabled\n");
3489 }
3490
3491 /**
3492  * amdgpu_device_init - initialize the driver
3493  *
3494  * @adev: amdgpu_device pointer
3495  * @flags: driver flags
3496  *
3497  * Initializes the driver info and hw (all asics).
3498  * Returns 0 for success or an error on failure.
3499  * Called at driver startup.
3500  */
3501 int amdgpu_device_init(struct amdgpu_device *adev,
3502                        uint32_t flags)
3503 {
3504         struct drm_device *ddev = adev_to_drm(adev);
3505         struct pci_dev *pdev = adev->pdev;
3506         int r, i;
3507         bool px = false;
3508         u32 max_MBps;
3509         int tmp;
3510
3511         adev->shutdown = false;
3512         adev->flags = flags;
3513
3514         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3515                 adev->asic_type = amdgpu_force_asic_type;
3516         else
3517                 adev->asic_type = flags & AMD_ASIC_MASK;
3518
3519         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3520         if (amdgpu_emu_mode == 1)
3521                 adev->usec_timeout *= 10;
3522         adev->gmc.gart_size = 512 * 1024 * 1024;
3523         adev->accel_working = false;
3524         adev->num_rings = 0;
3525         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3526         adev->mman.buffer_funcs = NULL;
3527         adev->mman.buffer_funcs_ring = NULL;
3528         adev->vm_manager.vm_pte_funcs = NULL;
3529         adev->vm_manager.vm_pte_num_scheds = 0;
3530         adev->gmc.gmc_funcs = NULL;
3531         adev->harvest_ip_mask = 0x0;
3532         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3533         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3534
3535         adev->smc_rreg = &amdgpu_invalid_rreg;
3536         adev->smc_wreg = &amdgpu_invalid_wreg;
3537         adev->pcie_rreg = &amdgpu_invalid_rreg;
3538         adev->pcie_wreg = &amdgpu_invalid_wreg;
3539         adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3540         adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3541         adev->pciep_rreg = &amdgpu_invalid_rreg;
3542         adev->pciep_wreg = &amdgpu_invalid_wreg;
3543         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3544         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3545         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3546         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3547         adev->didt_rreg = &amdgpu_invalid_rreg;
3548         adev->didt_wreg = &amdgpu_invalid_wreg;
3549         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3550         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3551         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3552         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3553
3554         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3555                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3556                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3557
3558         /* mutex initialization are all done here so we
3559          * can recall function without having locking issues
3560          */
3561         mutex_init(&adev->firmware.mutex);
3562         mutex_init(&adev->pm.mutex);
3563         mutex_init(&adev->gfx.gpu_clock_mutex);
3564         mutex_init(&adev->srbm_mutex);
3565         mutex_init(&adev->gfx.pipe_reserve_mutex);
3566         mutex_init(&adev->gfx.gfx_off_mutex);
3567         mutex_init(&adev->gfx.partition_mutex);
3568         mutex_init(&adev->grbm_idx_mutex);
3569         mutex_init(&adev->mn_lock);
3570         mutex_init(&adev->virt.vf_errors.lock);
3571         hash_init(adev->mn_hash);
3572         mutex_init(&adev->psp.mutex);
3573         mutex_init(&adev->notifier_lock);
3574         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3575         mutex_init(&adev->benchmark_mutex);
3576
3577         amdgpu_device_init_apu_flags(adev);
3578
3579         r = amdgpu_device_check_arguments(adev);
3580         if (r)
3581                 return r;
3582
3583         spin_lock_init(&adev->mmio_idx_lock);
3584         spin_lock_init(&adev->smc_idx_lock);
3585         spin_lock_init(&adev->pcie_idx_lock);
3586         spin_lock_init(&adev->uvd_ctx_idx_lock);
3587         spin_lock_init(&adev->didt_idx_lock);
3588         spin_lock_init(&adev->gc_cac_idx_lock);
3589         spin_lock_init(&adev->se_cac_idx_lock);
3590         spin_lock_init(&adev->audio_endpt_idx_lock);
3591         spin_lock_init(&adev->mm_stats.lock);
3592
3593         INIT_LIST_HEAD(&adev->shadow_list);
3594         mutex_init(&adev->shadow_list_lock);
3595
3596         INIT_LIST_HEAD(&adev->reset_list);
3597
3598         INIT_LIST_HEAD(&adev->ras_list);
3599
3600         INIT_DELAYED_WORK(&adev->delayed_init_work,
3601                           amdgpu_device_delayed_init_work_handler);
3602         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3603                           amdgpu_device_delay_enable_gfx_off);
3604
3605         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3606
3607         adev->gfx.gfx_off_req_count = 1;
3608         adev->gfx.gfx_off_residency = 0;
3609         adev->gfx.gfx_off_entrycount = 0;
3610         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3611
3612         atomic_set(&adev->throttling_logging_enabled, 1);
3613         /*
3614          * If throttling continues, logging will be performed every minute
3615          * to avoid log flooding. "-1" is subtracted since the thermal
3616          * throttling interrupt comes every second. Thus, the total logging
3617          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3618          * for throttling interrupt) = 60 seconds.
3619          */
3620         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3621         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3622
3623         /* Registers mapping */
3624         /* TODO: block userspace mapping of io register */
3625         if (adev->asic_type >= CHIP_BONAIRE) {
3626                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3627                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3628         } else {
3629                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3630                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3631         }
3632
3633         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3634                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3635
3636         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3637         if (!adev->rmmio)
3638                 return -ENOMEM;
3639
3640         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3641         DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3642
3643         /*
3644          * Reset domain needs to be present early, before XGMI hive discovered
3645          * (if any) and intitialized to use reset sem and in_gpu reset flag
3646          * early on during init and before calling to RREG32.
3647          */
3648         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3649         if (!adev->reset_domain)
3650                 return -ENOMEM;
3651
3652         /* detect hw virtualization here */
3653         amdgpu_detect_virtualization(adev);
3654
3655         amdgpu_device_get_pcie_info(adev);
3656
3657         r = amdgpu_device_get_job_timeout_settings(adev);
3658         if (r) {
3659                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3660                 return r;
3661         }
3662
3663         /* early init functions */
3664         r = amdgpu_device_ip_early_init(adev);
3665         if (r)
3666                 return r;
3667
3668         amdgpu_device_set_mcbp(adev);
3669
3670         /* Get rid of things like offb */
3671         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3672         if (r)
3673                 return r;
3674
3675         /* Enable TMZ based on IP_VERSION */
3676         amdgpu_gmc_tmz_set(adev);
3677
3678         amdgpu_gmc_noretry_set(adev);
3679         /* Need to get xgmi info early to decide the reset behavior*/
3680         if (adev->gmc.xgmi.supported) {
3681                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3682                 if (r)
3683                         return r;
3684         }
3685
3686         /* enable PCIE atomic ops */
3687         if (amdgpu_sriov_vf(adev)) {
3688                 if (adev->virt.fw_reserve.p_pf2vf)
3689                         adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3690                                                       adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3691                                 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3692         /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
3693          * internal path natively support atomics, set have_atomics_support to true.
3694          */
3695         } else if ((adev->flags & AMD_IS_APU) &&
3696                    (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3697                 adev->have_atomics_support = true;
3698         } else {
3699                 adev->have_atomics_support =
3700                         !pci_enable_atomic_ops_to_root(adev->pdev,
3701                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3702                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3703         }
3704
3705         if (!adev->have_atomics_support)
3706                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3707
3708         /* doorbell bar mapping and doorbell index init*/
3709         amdgpu_doorbell_init(adev);
3710
3711         if (amdgpu_emu_mode == 1) {
3712                 /* post the asic on emulation mode */
3713                 emu_soc_asic_init(adev);
3714                 goto fence_driver_init;
3715         }
3716
3717         amdgpu_reset_init(adev);
3718
3719         /* detect if we are with an SRIOV vbios */
3720         if (adev->bios)
3721                 amdgpu_device_detect_sriov_bios(adev);
3722
3723         /* check if we need to reset the asic
3724          *  E.g., driver was not cleanly unloaded previously, etc.
3725          */
3726         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3727                 if (adev->gmc.xgmi.num_physical_nodes) {
3728                         dev_info(adev->dev, "Pending hive reset.\n");
3729                         adev->gmc.xgmi.pending_reset = true;
3730                         /* Only need to init necessary block for SMU to handle the reset */
3731                         for (i = 0; i < adev->num_ip_blocks; i++) {
3732                                 if (!adev->ip_blocks[i].status.valid)
3733                                         continue;
3734                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3735                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3736                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3737                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3738                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3739                                                 adev->ip_blocks[i].version->funcs->name);
3740                                         adev->ip_blocks[i].status.hw = true;
3741                                 }
3742                         }
3743                 } else {
3744                         tmp = amdgpu_reset_method;
3745                         /* It should do a default reset when loading or reloading the driver,
3746                          * regardless of the module parameter reset_method.
3747                          */
3748                         amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3749                         r = amdgpu_asic_reset(adev);
3750                         amdgpu_reset_method = tmp;
3751                         if (r) {
3752                                 dev_err(adev->dev, "asic reset on init failed\n");
3753                                 goto failed;
3754                         }
3755                 }
3756         }
3757
3758         /* Post card if necessary */
3759         if (amdgpu_device_need_post(adev)) {
3760                 if (!adev->bios) {
3761                         dev_err(adev->dev, "no vBIOS found\n");
3762                         r = -EINVAL;
3763                         goto failed;
3764                 }
3765                 DRM_INFO("GPU posting now...\n");
3766                 r = amdgpu_device_asic_init(adev);
3767                 if (r) {
3768                         dev_err(adev->dev, "gpu post error!\n");
3769                         goto failed;
3770                 }
3771         }
3772
3773         if (adev->bios) {
3774                 if (adev->is_atom_fw) {
3775                         /* Initialize clocks */
3776                         r = amdgpu_atomfirmware_get_clock_info(adev);
3777                         if (r) {
3778                                 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3779                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3780                                 goto failed;
3781                         }
3782                 } else {
3783                         /* Initialize clocks */
3784                         r = amdgpu_atombios_get_clock_info(adev);
3785                         if (r) {
3786                                 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3787                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3788                                 goto failed;
3789                         }
3790                         /* init i2c buses */
3791                         if (!amdgpu_device_has_dc_support(adev))
3792                                 amdgpu_atombios_i2c_init(adev);
3793                 }
3794         }
3795
3796 fence_driver_init:
3797         /* Fence driver */
3798         r = amdgpu_fence_driver_sw_init(adev);
3799         if (r) {
3800                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3801                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3802                 goto failed;
3803         }
3804
3805         /* init the mode config */
3806         drm_mode_config_init(adev_to_drm(adev));
3807
3808         r = amdgpu_device_ip_init(adev);
3809         if (r) {
3810                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3811                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3812                 goto release_ras_con;
3813         }
3814
3815         amdgpu_fence_driver_hw_init(adev);
3816
3817         dev_info(adev->dev,
3818                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3819                         adev->gfx.config.max_shader_engines,
3820                         adev->gfx.config.max_sh_per_se,
3821                         adev->gfx.config.max_cu_per_sh,
3822                         adev->gfx.cu_info.number);
3823
3824         adev->accel_working = true;
3825
3826         amdgpu_vm_check_compute_bug(adev);
3827
3828         /* Initialize the buffer migration limit. */
3829         if (amdgpu_moverate >= 0)
3830                 max_MBps = amdgpu_moverate;
3831         else
3832                 max_MBps = 8; /* Allow 8 MB/s. */
3833         /* Get a log2 for easy divisions. */
3834         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3835
3836         r = amdgpu_atombios_sysfs_init(adev);
3837         if (r)
3838                 drm_err(&adev->ddev,
3839                         "registering atombios sysfs failed (%d).\n", r);
3840
3841         r = amdgpu_pm_sysfs_init(adev);
3842         if (r)
3843                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3844
3845         r = amdgpu_ucode_sysfs_init(adev);
3846         if (r) {
3847                 adev->ucode_sysfs_en = false;
3848                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3849         } else
3850                 adev->ucode_sysfs_en = true;
3851
3852         /*
3853          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3854          * Otherwise the mgpu fan boost feature will be skipped due to the
3855          * gpu instance is counted less.
3856          */
3857         amdgpu_register_gpu_instance(adev);
3858
3859         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3860          * explicit gating rather than handling it automatically.
3861          */
3862         if (!adev->gmc.xgmi.pending_reset) {
3863                 r = amdgpu_device_ip_late_init(adev);
3864                 if (r) {
3865                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3866                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3867                         goto release_ras_con;
3868                 }
3869                 /* must succeed. */
3870                 amdgpu_ras_resume(adev);
3871                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3872                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3873         }
3874
3875         if (amdgpu_sriov_vf(adev)) {
3876                 amdgpu_virt_release_full_gpu(adev, true);
3877                 flush_delayed_work(&adev->delayed_init_work);
3878         }
3879
3880         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3881         if (r)
3882                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3883
3884         amdgpu_fru_sysfs_init(adev);
3885
3886         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3887                 r = amdgpu_pmu_init(adev);
3888         if (r)
3889                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3890
3891         /* Have stored pci confspace at hand for restore in sudden PCI error */
3892         if (amdgpu_device_cache_pci_state(adev->pdev))
3893                 pci_restore_state(pdev);
3894
3895         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3896         /* this will fail for cards that aren't VGA class devices, just
3897          * ignore it
3898          */
3899         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3900                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3901
3902         px = amdgpu_device_supports_px(ddev);
3903
3904         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3905                                 apple_gmux_detect(NULL, NULL)))
3906                 vga_switcheroo_register_client(adev->pdev,
3907                                                &amdgpu_switcheroo_ops, px);
3908
3909         if (px)
3910                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3911
3912         if (adev->gmc.xgmi.pending_reset)
3913                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3914                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3915
3916         amdgpu_device_check_iommu_direct_map(adev);
3917
3918         return 0;
3919
3920 release_ras_con:
3921         if (amdgpu_sriov_vf(adev))
3922                 amdgpu_virt_release_full_gpu(adev, true);
3923
3924         /* failed in exclusive mode due to timeout */
3925         if (amdgpu_sriov_vf(adev) &&
3926                 !amdgpu_sriov_runtime(adev) &&
3927                 amdgpu_virt_mmio_blocked(adev) &&
3928                 !amdgpu_virt_wait_reset(adev)) {
3929                 dev_err(adev->dev, "VF exclusive mode timeout\n");
3930                 /* Don't send request since VF is inactive. */
3931                 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3932                 adev->virt.ops = NULL;
3933                 r = -EAGAIN;
3934         }
3935         amdgpu_release_ras_context(adev);
3936
3937 failed:
3938         amdgpu_vf_error_trans_all(adev);
3939
3940         return r;
3941 }
3942
3943 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3944 {
3945
3946         /* Clear all CPU mappings pointing to this device */
3947         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3948
3949         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3950         amdgpu_doorbell_fini(adev);
3951
3952         iounmap(adev->rmmio);
3953         adev->rmmio = NULL;
3954         if (adev->mman.aper_base_kaddr)
3955                 iounmap(adev->mman.aper_base_kaddr);
3956         adev->mman.aper_base_kaddr = NULL;
3957
3958         /* Memory manager related */
3959         if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
3960                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3961                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3962         }
3963 }
3964
3965 /**
3966  * amdgpu_device_fini_hw - tear down the driver
3967  *
3968  * @adev: amdgpu_device pointer
3969  *
3970  * Tear down the driver info (all asics).
3971  * Called at driver shutdown.
3972  */
3973 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3974 {
3975         dev_info(adev->dev, "amdgpu: finishing device.\n");
3976         flush_delayed_work(&adev->delayed_init_work);
3977         adev->shutdown = true;
3978
3979         /* make sure IB test finished before entering exclusive mode
3980          * to avoid preemption on IB test
3981          */
3982         if (amdgpu_sriov_vf(adev)) {
3983                 amdgpu_virt_request_full_gpu(adev, false);
3984                 amdgpu_virt_fini_data_exchange(adev);
3985         }
3986
3987         /* disable all interrupts */
3988         amdgpu_irq_disable_all(adev);
3989         if (adev->mode_info.mode_config_initialized) {
3990                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3991                         drm_helper_force_disable_all(adev_to_drm(adev));
3992                 else
3993                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3994         }
3995         amdgpu_fence_driver_hw_fini(adev);
3996
3997         if (adev->mman.initialized)
3998                 drain_workqueue(adev->mman.bdev.wq);
3999
4000         if (adev->pm.sysfs_initialized)
4001                 amdgpu_pm_sysfs_fini(adev);
4002         if (adev->ucode_sysfs_en)
4003                 amdgpu_ucode_sysfs_fini(adev);
4004         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4005         amdgpu_fru_sysfs_fini(adev);
4006
4007         /* disable ras feature must before hw fini */
4008         amdgpu_ras_pre_fini(adev);
4009
4010         amdgpu_device_ip_fini_early(adev);
4011
4012         amdgpu_irq_fini_hw(adev);
4013
4014         if (adev->mman.initialized)
4015                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4016
4017         amdgpu_gart_dummy_page_fini(adev);
4018
4019         if (drm_dev_is_unplugged(adev_to_drm(adev)))
4020                 amdgpu_device_unmap_mmio(adev);
4021
4022 }
4023
4024 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4025 {
4026         int idx;
4027         bool px;
4028
4029         amdgpu_fence_driver_sw_fini(adev);
4030         amdgpu_device_ip_fini(adev);
4031         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4032         adev->accel_working = false;
4033         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4034
4035         amdgpu_reset_fini(adev);
4036
4037         /* free i2c buses */
4038         if (!amdgpu_device_has_dc_support(adev))
4039                 amdgpu_i2c_fini(adev);
4040
4041         if (amdgpu_emu_mode != 1)
4042                 amdgpu_atombios_fini(adev);
4043
4044         kfree(adev->bios);
4045         adev->bios = NULL;
4046
4047         px = amdgpu_device_supports_px(adev_to_drm(adev));
4048
4049         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4050                                 apple_gmux_detect(NULL, NULL)))
4051                 vga_switcheroo_unregister_client(adev->pdev);
4052
4053         if (px)
4054                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4055
4056         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4057                 vga_client_unregister(adev->pdev);
4058
4059         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4060
4061                 iounmap(adev->rmmio);
4062                 adev->rmmio = NULL;
4063                 amdgpu_doorbell_fini(adev);
4064                 drm_dev_exit(idx);
4065         }
4066
4067         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4068                 amdgpu_pmu_fini(adev);
4069         if (adev->mman.discovery_bin)
4070                 amdgpu_discovery_fini(adev);
4071
4072         amdgpu_reset_put_reset_domain(adev->reset_domain);
4073         adev->reset_domain = NULL;
4074
4075         kfree(adev->pci_state);
4076
4077 }
4078
4079 /**
4080  * amdgpu_device_evict_resources - evict device resources
4081  * @adev: amdgpu device object
4082  *
4083  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4084  * of the vram memory type. Mainly used for evicting device resources
4085  * at suspend time.
4086  *
4087  */
4088 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4089 {
4090         int ret;
4091
4092         /* No need to evict vram on APUs for suspend to ram or s2idle */
4093         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4094                 return 0;
4095
4096         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4097         if (ret)
4098                 DRM_WARN("evicting device resources failed\n");
4099         return ret;
4100 }
4101
4102 /*
4103  * Suspend & resume.
4104  */
4105 /**
4106  * amdgpu_device_suspend - initiate device suspend
4107  *
4108  * @dev: drm dev pointer
4109  * @fbcon : notify the fbdev of suspend
4110  *
4111  * Puts the hw in the suspend state (all asics).
4112  * Returns 0 for success or an error on failure.
4113  * Called at driver suspend.
4114  */
4115 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4116 {
4117         struct amdgpu_device *adev = drm_to_adev(dev);
4118         int r = 0;
4119
4120         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4121                 return 0;
4122
4123         adev->in_suspend = true;
4124
4125         /* Evict the majority of BOs before grabbing the full access */
4126         r = amdgpu_device_evict_resources(adev);
4127         if (r)
4128                 return r;
4129
4130         if (amdgpu_sriov_vf(adev)) {
4131                 amdgpu_virt_fini_data_exchange(adev);
4132                 r = amdgpu_virt_request_full_gpu(adev, false);
4133                 if (r)
4134                         return r;
4135         }
4136
4137         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4138                 DRM_WARN("smart shift update failed\n");
4139
4140         if (fbcon)
4141                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4142
4143         cancel_delayed_work_sync(&adev->delayed_init_work);
4144         flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4145
4146         amdgpu_ras_suspend(adev);
4147
4148         amdgpu_device_ip_suspend_phase1(adev);
4149
4150         if (!adev->in_s0ix)
4151                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4152
4153         r = amdgpu_device_evict_resources(adev);
4154         if (r)
4155                 return r;
4156
4157         amdgpu_fence_driver_hw_fini(adev);
4158
4159         amdgpu_device_ip_suspend_phase2(adev);
4160
4161         if (amdgpu_sriov_vf(adev))
4162                 amdgpu_virt_release_full_gpu(adev, false);
4163
4164         return 0;
4165 }
4166
4167 /**
4168  * amdgpu_device_resume - initiate device resume
4169  *
4170  * @dev: drm dev pointer
4171  * @fbcon : notify the fbdev of resume
4172  *
4173  * Bring the hw back to operating state (all asics).
4174  * Returns 0 for success or an error on failure.
4175  * Called at driver resume.
4176  */
4177 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4178 {
4179         struct amdgpu_device *adev = drm_to_adev(dev);
4180         int r = 0;
4181
4182         if (amdgpu_sriov_vf(adev)) {
4183                 r = amdgpu_virt_request_full_gpu(adev, true);
4184                 if (r)
4185                         return r;
4186         }
4187
4188         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4189                 return 0;
4190
4191         if (adev->in_s0ix)
4192                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4193
4194         /* post card */
4195         if (amdgpu_device_need_post(adev)) {
4196                 r = amdgpu_device_asic_init(adev);
4197                 if (r)
4198                         dev_err(adev->dev, "amdgpu asic init failed\n");
4199         }
4200
4201         r = amdgpu_device_ip_resume(adev);
4202
4203         if (r) {
4204                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4205                 goto exit;
4206         }
4207         amdgpu_fence_driver_hw_init(adev);
4208
4209         r = amdgpu_device_ip_late_init(adev);
4210         if (r)
4211                 goto exit;
4212
4213         queue_delayed_work(system_wq, &adev->delayed_init_work,
4214                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4215
4216         if (!adev->in_s0ix) {
4217                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4218                 if (r)
4219                         goto exit;
4220         }
4221
4222 exit:
4223         if (amdgpu_sriov_vf(adev)) {
4224                 amdgpu_virt_init_data_exchange(adev);
4225                 amdgpu_virt_release_full_gpu(adev, true);
4226         }
4227
4228         if (r)
4229                 return r;
4230
4231         /* Make sure IB tests flushed */
4232         flush_delayed_work(&adev->delayed_init_work);
4233
4234         if (fbcon)
4235                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4236
4237         amdgpu_ras_resume(adev);
4238
4239         if (adev->mode_info.num_crtc) {
4240                 /*
4241                  * Most of the connector probing functions try to acquire runtime pm
4242                  * refs to ensure that the GPU is powered on when connector polling is
4243                  * performed. Since we're calling this from a runtime PM callback,
4244                  * trying to acquire rpm refs will cause us to deadlock.
4245                  *
4246                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4247                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4248                  */
4249 #ifdef CONFIG_PM
4250                 dev->dev->power.disable_depth++;
4251 #endif
4252                 if (!adev->dc_enabled)
4253                         drm_helper_hpd_irq_event(dev);
4254                 else
4255                         drm_kms_helper_hotplug_event(dev);
4256 #ifdef CONFIG_PM
4257                 dev->dev->power.disable_depth--;
4258 #endif
4259         }
4260         adev->in_suspend = false;
4261
4262         if (adev->enable_mes)
4263                 amdgpu_mes_self_test(adev);
4264
4265         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4266                 DRM_WARN("smart shift update failed\n");
4267
4268         return 0;
4269 }
4270
4271 /**
4272  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4273  *
4274  * @adev: amdgpu_device pointer
4275  *
4276  * The list of all the hardware IPs that make up the asic is walked and
4277  * the check_soft_reset callbacks are run.  check_soft_reset determines
4278  * if the asic is still hung or not.
4279  * Returns true if any of the IPs are still in a hung state, false if not.
4280  */
4281 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4282 {
4283         int i;
4284         bool asic_hang = false;
4285
4286         if (amdgpu_sriov_vf(adev))
4287                 return true;
4288
4289         if (amdgpu_asic_need_full_reset(adev))
4290                 return true;
4291
4292         for (i = 0; i < adev->num_ip_blocks; i++) {
4293                 if (!adev->ip_blocks[i].status.valid)
4294                         continue;
4295                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4296                         adev->ip_blocks[i].status.hang =
4297                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4298                 if (adev->ip_blocks[i].status.hang) {
4299                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4300                         asic_hang = true;
4301                 }
4302         }
4303         return asic_hang;
4304 }
4305
4306 /**
4307  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4308  *
4309  * @adev: amdgpu_device pointer
4310  *
4311  * The list of all the hardware IPs that make up the asic is walked and the
4312  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4313  * handles any IP specific hardware or software state changes that are
4314  * necessary for a soft reset to succeed.
4315  * Returns 0 on success, negative error code on failure.
4316  */
4317 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4318 {
4319         int i, r = 0;
4320
4321         for (i = 0; i < adev->num_ip_blocks; i++) {
4322                 if (!adev->ip_blocks[i].status.valid)
4323                         continue;
4324                 if (adev->ip_blocks[i].status.hang &&
4325                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4326                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4327                         if (r)
4328                                 return r;
4329                 }
4330         }
4331
4332         return 0;
4333 }
4334
4335 /**
4336  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4337  *
4338  * @adev: amdgpu_device pointer
4339  *
4340  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4341  * reset is necessary to recover.
4342  * Returns true if a full asic reset is required, false if not.
4343  */
4344 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4345 {
4346         int i;
4347
4348         if (amdgpu_asic_need_full_reset(adev))
4349                 return true;
4350
4351         for (i = 0; i < adev->num_ip_blocks; i++) {
4352                 if (!adev->ip_blocks[i].status.valid)
4353                         continue;
4354                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4355                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4356                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4357                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4358                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4359                         if (adev->ip_blocks[i].status.hang) {
4360                                 dev_info(adev->dev, "Some block need full reset!\n");
4361                                 return true;
4362                         }
4363                 }
4364         }
4365         return false;
4366 }
4367
4368 /**
4369  * amdgpu_device_ip_soft_reset - do a soft reset
4370  *
4371  * @adev: amdgpu_device pointer
4372  *
4373  * The list of all the hardware IPs that make up the asic is walked and the
4374  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4375  * IP specific hardware or software state changes that are necessary to soft
4376  * reset the IP.
4377  * Returns 0 on success, negative error code on failure.
4378  */
4379 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4380 {
4381         int i, r = 0;
4382
4383         for (i = 0; i < adev->num_ip_blocks; i++) {
4384                 if (!adev->ip_blocks[i].status.valid)
4385                         continue;
4386                 if (adev->ip_blocks[i].status.hang &&
4387                     adev->ip_blocks[i].version->funcs->soft_reset) {
4388                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4389                         if (r)
4390                                 return r;
4391                 }
4392         }
4393
4394         return 0;
4395 }
4396
4397 /**
4398  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4399  *
4400  * @adev: amdgpu_device pointer
4401  *
4402  * The list of all the hardware IPs that make up the asic is walked and the
4403  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4404  * handles any IP specific hardware or software state changes that are
4405  * necessary after the IP has been soft reset.
4406  * Returns 0 on success, negative error code on failure.
4407  */
4408 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4409 {
4410         int i, r = 0;
4411
4412         for (i = 0; i < adev->num_ip_blocks; i++) {
4413                 if (!adev->ip_blocks[i].status.valid)
4414                         continue;
4415                 if (adev->ip_blocks[i].status.hang &&
4416                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4417                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4418                 if (r)
4419                         return r;
4420         }
4421
4422         return 0;
4423 }
4424
4425 /**
4426  * amdgpu_device_recover_vram - Recover some VRAM contents
4427  *
4428  * @adev: amdgpu_device pointer
4429  *
4430  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4431  * restore things like GPUVM page tables after a GPU reset where
4432  * the contents of VRAM might be lost.
4433  *
4434  * Returns:
4435  * 0 on success, negative error code on failure.
4436  */
4437 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4438 {
4439         struct dma_fence *fence = NULL, *next = NULL;
4440         struct amdgpu_bo *shadow;
4441         struct amdgpu_bo_vm *vmbo;
4442         long r = 1, tmo;
4443
4444         if (amdgpu_sriov_runtime(adev))
4445                 tmo = msecs_to_jiffies(8000);
4446         else
4447                 tmo = msecs_to_jiffies(100);
4448
4449         dev_info(adev->dev, "recover vram bo from shadow start\n");
4450         mutex_lock(&adev->shadow_list_lock);
4451         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4452                 /* If vm is compute context or adev is APU, shadow will be NULL */
4453                 if (!vmbo->shadow)
4454                         continue;
4455                 shadow = vmbo->shadow;
4456
4457                 /* No need to recover an evicted BO */
4458                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4459                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4460                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4461                         continue;
4462
4463                 r = amdgpu_bo_restore_shadow(shadow, &next);
4464                 if (r)
4465                         break;
4466
4467                 if (fence) {
4468                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4469                         dma_fence_put(fence);
4470                         fence = next;
4471                         if (tmo == 0) {
4472                                 r = -ETIMEDOUT;
4473                                 break;
4474                         } else if (tmo < 0) {
4475                                 r = tmo;
4476                                 break;
4477                         }
4478                 } else {
4479                         fence = next;
4480                 }
4481         }
4482         mutex_unlock(&adev->shadow_list_lock);
4483
4484         if (fence)
4485                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4486         dma_fence_put(fence);
4487
4488         if (r < 0 || tmo <= 0) {
4489                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4490                 return -EIO;
4491         }
4492
4493         dev_info(adev->dev, "recover vram bo from shadow done\n");
4494         return 0;
4495 }
4496
4497
4498 /**
4499  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4500  *
4501  * @adev: amdgpu_device pointer
4502  * @from_hypervisor: request from hypervisor
4503  *
4504  * do VF FLR and reinitialize Asic
4505  * return 0 means succeeded otherwise failed
4506  */
4507 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4508                                      bool from_hypervisor)
4509 {
4510         int r;
4511         struct amdgpu_hive_info *hive = NULL;
4512         int retry_limit = 0;
4513
4514 retry:
4515         amdgpu_amdkfd_pre_reset(adev);
4516
4517         if (from_hypervisor)
4518                 r = amdgpu_virt_request_full_gpu(adev, true);
4519         else
4520                 r = amdgpu_virt_reset_gpu(adev);
4521         if (r)
4522                 return r;
4523         amdgpu_irq_gpu_reset_resume_helper(adev);
4524
4525         /* some sw clean up VF needs to do before recover */
4526         amdgpu_virt_post_reset(adev);
4527
4528         /* Resume IP prior to SMC */
4529         r = amdgpu_device_ip_reinit_early_sriov(adev);
4530         if (r)
4531                 goto error;
4532
4533         amdgpu_virt_init_data_exchange(adev);
4534
4535         r = amdgpu_device_fw_loading(adev);
4536         if (r)
4537                 return r;
4538
4539         /* now we are okay to resume SMC/CP/SDMA */
4540         r = amdgpu_device_ip_reinit_late_sriov(adev);
4541         if (r)
4542                 goto error;
4543
4544         hive = amdgpu_get_xgmi_hive(adev);
4545         /* Update PSP FW topology after reset */
4546         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4547                 r = amdgpu_xgmi_update_topology(hive, adev);
4548
4549         if (hive)
4550                 amdgpu_put_xgmi_hive(hive);
4551
4552         if (!r) {
4553                 r = amdgpu_ib_ring_tests(adev);
4554
4555                 amdgpu_amdkfd_post_reset(adev);
4556         }
4557
4558 error:
4559         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4560                 amdgpu_inc_vram_lost(adev);
4561                 r = amdgpu_device_recover_vram(adev);
4562         }
4563         amdgpu_virt_release_full_gpu(adev, true);
4564
4565         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4566                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4567                         retry_limit++;
4568                         goto retry;
4569                 } else
4570                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4571         }
4572
4573         return r;
4574 }
4575
4576 /**
4577  * amdgpu_device_has_job_running - check if there is any job in mirror list
4578  *
4579  * @adev: amdgpu_device pointer
4580  *
4581  * check if there is any job in mirror list
4582  */
4583 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4584 {
4585         int i;
4586         struct drm_sched_job *job;
4587
4588         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4589                 struct amdgpu_ring *ring = adev->rings[i];
4590
4591                 if (!ring || !ring->sched.thread)
4592                         continue;
4593
4594                 spin_lock(&ring->sched.job_list_lock);
4595                 job = list_first_entry_or_null(&ring->sched.pending_list,
4596                                                struct drm_sched_job, list);
4597                 spin_unlock(&ring->sched.job_list_lock);
4598                 if (job)
4599                         return true;
4600         }
4601         return false;
4602 }
4603
4604 /**
4605  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4606  *
4607  * @adev: amdgpu_device pointer
4608  *
4609  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4610  * a hung GPU.
4611  */
4612 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4613 {
4614
4615         if (amdgpu_gpu_recovery == 0)
4616                 goto disabled;
4617
4618         /* Skip soft reset check in fatal error mode */
4619         if (!amdgpu_ras_is_poison_mode_supported(adev))
4620                 return true;
4621
4622         if (amdgpu_sriov_vf(adev))
4623                 return true;
4624
4625         if (amdgpu_gpu_recovery == -1) {
4626                 switch (adev->asic_type) {
4627 #ifdef CONFIG_DRM_AMDGPU_SI
4628                 case CHIP_VERDE:
4629                 case CHIP_TAHITI:
4630                 case CHIP_PITCAIRN:
4631                 case CHIP_OLAND:
4632                 case CHIP_HAINAN:
4633 #endif
4634 #ifdef CONFIG_DRM_AMDGPU_CIK
4635                 case CHIP_KAVERI:
4636                 case CHIP_KABINI:
4637                 case CHIP_MULLINS:
4638 #endif
4639                 case CHIP_CARRIZO:
4640                 case CHIP_STONEY:
4641                 case CHIP_CYAN_SKILLFISH:
4642                         goto disabled;
4643                 default:
4644                         break;
4645                 }
4646         }
4647
4648         return true;
4649
4650 disabled:
4651                 dev_info(adev->dev, "GPU recovery disabled.\n");
4652                 return false;
4653 }
4654
4655 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4656 {
4657         u32 i;
4658         int ret = 0;
4659
4660         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4661
4662         dev_info(adev->dev, "GPU mode1 reset\n");
4663
4664         /* disable BM */
4665         pci_clear_master(adev->pdev);
4666
4667         amdgpu_device_cache_pci_state(adev->pdev);
4668
4669         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4670                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4671                 ret = amdgpu_dpm_mode1_reset(adev);
4672         } else {
4673                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4674                 ret = psp_gpu_reset(adev);
4675         }
4676
4677         if (ret)
4678                 goto mode1_reset_failed;
4679
4680         amdgpu_device_load_pci_state(adev->pdev);
4681         ret = amdgpu_psp_wait_for_bootloader(adev);
4682         if (ret)
4683                 goto mode1_reset_failed;
4684
4685         /* wait for asic to come out of reset */
4686         for (i = 0; i < adev->usec_timeout; i++) {
4687                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4688
4689                 if (memsize != 0xffffffff)
4690                         break;
4691                 udelay(1);
4692         }
4693
4694         if (i >= adev->usec_timeout) {
4695                 ret = -ETIMEDOUT;
4696                 goto mode1_reset_failed;
4697         }
4698
4699         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4700
4701         return 0;
4702
4703 mode1_reset_failed:
4704         dev_err(adev->dev, "GPU mode1 reset failed\n");
4705         return ret;
4706 }
4707
4708 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4709                                  struct amdgpu_reset_context *reset_context)
4710 {
4711         int i, r = 0;
4712         struct amdgpu_job *job = NULL;
4713         bool need_full_reset =
4714                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4715
4716         if (reset_context->reset_req_dev == adev)
4717                 job = reset_context->job;
4718
4719         if (amdgpu_sriov_vf(adev)) {
4720                 /* stop the data exchange thread */
4721                 amdgpu_virt_fini_data_exchange(adev);
4722         }
4723
4724         amdgpu_fence_driver_isr_toggle(adev, true);
4725
4726         /* block all schedulers and reset given job's ring */
4727         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4728                 struct amdgpu_ring *ring = adev->rings[i];
4729
4730                 if (!ring || !ring->sched.thread)
4731                         continue;
4732
4733                 /* Clear job fence from fence drv to avoid force_completion
4734                  * leave NULL and vm flush fence in fence drv
4735                  */
4736                 amdgpu_fence_driver_clear_job_fences(ring);
4737
4738                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4739                 amdgpu_fence_driver_force_completion(ring);
4740         }
4741
4742         amdgpu_fence_driver_isr_toggle(adev, false);
4743
4744         if (job && job->vm)
4745                 drm_sched_increase_karma(&job->base);
4746
4747         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4748         /* If reset handler not implemented, continue; otherwise return */
4749         if (r == -EOPNOTSUPP)
4750                 r = 0;
4751         else
4752                 return r;
4753
4754         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4755         if (!amdgpu_sriov_vf(adev)) {
4756
4757                 if (!need_full_reset)
4758                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4759
4760                 if (!need_full_reset && amdgpu_gpu_recovery &&
4761                     amdgpu_device_ip_check_soft_reset(adev)) {
4762                         amdgpu_device_ip_pre_soft_reset(adev);
4763                         r = amdgpu_device_ip_soft_reset(adev);
4764                         amdgpu_device_ip_post_soft_reset(adev);
4765                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4766                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4767                                 need_full_reset = true;
4768                         }
4769                 }
4770
4771                 if (need_full_reset)
4772                         r = amdgpu_device_ip_suspend(adev);
4773                 if (need_full_reset)
4774                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4775                 else
4776                         clear_bit(AMDGPU_NEED_FULL_RESET,
4777                                   &reset_context->flags);
4778         }
4779
4780         return r;
4781 }
4782
4783 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4784 {
4785         int i;
4786
4787         lockdep_assert_held(&adev->reset_domain->sem);
4788
4789         for (i = 0; i < adev->num_regs; i++) {
4790                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4791                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4792                                              adev->reset_dump_reg_value[i]);
4793         }
4794
4795         return 0;
4796 }
4797
4798 #ifdef CONFIG_DEV_COREDUMP
4799 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4800                 size_t count, void *data, size_t datalen)
4801 {
4802         struct drm_printer p;
4803         struct amdgpu_device *adev = data;
4804         struct drm_print_iterator iter;
4805         int i;
4806
4807         iter.data = buffer;
4808         iter.offset = 0;
4809         iter.start = offset;
4810         iter.remain = count;
4811
4812         p = drm_coredump_printer(&iter);
4813
4814         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4815         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4816         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4817         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4818         if (adev->reset_task_info.pid)
4819                 drm_printf(&p, "process_name: %s PID: %d\n",
4820                            adev->reset_task_info.process_name,
4821                            adev->reset_task_info.pid);
4822
4823         if (adev->reset_vram_lost)
4824                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4825         if (adev->num_regs) {
4826                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4827
4828                 for (i = 0; i < adev->num_regs; i++)
4829                         drm_printf(&p, "0x%08x: 0x%08x\n",
4830                                    adev->reset_dump_reg_list[i],
4831                                    adev->reset_dump_reg_value[i]);
4832         }
4833
4834         return count - iter.remain;
4835 }
4836
4837 static void amdgpu_devcoredump_free(void *data)
4838 {
4839 }
4840
4841 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4842 {
4843         struct drm_device *dev = adev_to_drm(adev);
4844
4845         ktime_get_ts64(&adev->reset_time);
4846         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4847                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4848 }
4849 #endif
4850
4851 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4852                          struct amdgpu_reset_context *reset_context)
4853 {
4854         struct amdgpu_device *tmp_adev = NULL;
4855         bool need_full_reset, skip_hw_reset, vram_lost = false;
4856         int r = 0;
4857         bool gpu_reset_for_dev_remove = 0;
4858
4859         /* Try reset handler method first */
4860         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4861                                     reset_list);
4862         amdgpu_reset_reg_dumps(tmp_adev);
4863
4864         reset_context->reset_device_list = device_list_handle;
4865         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4866         /* If reset handler not implemented, continue; otherwise return */
4867         if (r == -EOPNOTSUPP)
4868                 r = 0;
4869         else
4870                 return r;
4871
4872         /* Reset handler not implemented, use the default method */
4873         need_full_reset =
4874                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4875         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4876
4877         gpu_reset_for_dev_remove =
4878                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4879                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4880
4881         /*
4882          * ASIC reset has to be done on all XGMI hive nodes ASAP
4883          * to allow proper links negotiation in FW (within 1 sec)
4884          */
4885         if (!skip_hw_reset && need_full_reset) {
4886                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4887                         /* For XGMI run all resets in parallel to speed up the process */
4888                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4889                                 tmp_adev->gmc.xgmi.pending_reset = false;
4890                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4891                                         r = -EALREADY;
4892                         } else
4893                                 r = amdgpu_asic_reset(tmp_adev);
4894
4895                         if (r) {
4896                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4897                                          r, adev_to_drm(tmp_adev)->unique);
4898                                 break;
4899                         }
4900                 }
4901
4902                 /* For XGMI wait for all resets to complete before proceed */
4903                 if (!r) {
4904                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4905                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4906                                         flush_work(&tmp_adev->xgmi_reset_work);
4907                                         r = tmp_adev->asic_reset_res;
4908                                         if (r)
4909                                                 break;
4910                                 }
4911                         }
4912                 }
4913         }
4914
4915         if (!r && amdgpu_ras_intr_triggered()) {
4916                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4917                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4918                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4919                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4920                 }
4921
4922                 amdgpu_ras_intr_cleared();
4923         }
4924
4925         /* Since the mode1 reset affects base ip blocks, the
4926          * phase1 ip blocks need to be resumed. Otherwise there
4927          * will be a BIOS signature error and the psp bootloader
4928          * can't load kdb on the next amdgpu install.
4929          */
4930         if (gpu_reset_for_dev_remove) {
4931                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4932                         amdgpu_device_ip_resume_phase1(tmp_adev);
4933
4934                 goto end;
4935         }
4936
4937         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4938                 if (need_full_reset) {
4939                         /* post card */
4940                         r = amdgpu_device_asic_init(tmp_adev);
4941                         if (r) {
4942                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4943                         } else {
4944                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4945
4946                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4947                                 if (r)
4948                                         goto out;
4949
4950                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4951 #ifdef CONFIG_DEV_COREDUMP
4952                                 tmp_adev->reset_vram_lost = vram_lost;
4953                                 memset(&tmp_adev->reset_task_info, 0,
4954                                                 sizeof(tmp_adev->reset_task_info));
4955                                 if (reset_context->job && reset_context->job->vm)
4956                                         tmp_adev->reset_task_info =
4957                                                 reset_context->job->vm->task_info;
4958                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4959 #endif
4960                                 if (vram_lost) {
4961                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4962                                         amdgpu_inc_vram_lost(tmp_adev);
4963                                 }
4964
4965                                 r = amdgpu_device_fw_loading(tmp_adev);
4966                                 if (r)
4967                                         return r;
4968
4969                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4970                                 if (r)
4971                                         goto out;
4972
4973                                 if (vram_lost)
4974                                         amdgpu_device_fill_reset_magic(tmp_adev);
4975
4976                                 /*
4977                                  * Add this ASIC as tracked as reset was already
4978                                  * complete successfully.
4979                                  */
4980                                 amdgpu_register_gpu_instance(tmp_adev);
4981
4982                                 if (!reset_context->hive &&
4983                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4984                                         amdgpu_xgmi_add_device(tmp_adev);
4985
4986                                 r = amdgpu_device_ip_late_init(tmp_adev);
4987                                 if (r)
4988                                         goto out;
4989
4990                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4991
4992                                 /*
4993                                  * The GPU enters bad state once faulty pages
4994                                  * by ECC has reached the threshold, and ras
4995                                  * recovery is scheduled next. So add one check
4996                                  * here to break recovery if it indeed exceeds
4997                                  * bad page threshold, and remind user to
4998                                  * retire this GPU or setting one bigger
4999                                  * bad_page_threshold value to fix this once
5000                                  * probing driver again.
5001                                  */
5002                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5003                                         /* must succeed. */
5004                                         amdgpu_ras_resume(tmp_adev);
5005                                 } else {
5006                                         r = -EINVAL;
5007                                         goto out;
5008                                 }
5009
5010                                 /* Update PSP FW topology after reset */
5011                                 if (reset_context->hive &&
5012                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5013                                         r = amdgpu_xgmi_update_topology(
5014                                                 reset_context->hive, tmp_adev);
5015                         }
5016                 }
5017
5018 out:
5019                 if (!r) {
5020                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5021                         r = amdgpu_ib_ring_tests(tmp_adev);
5022                         if (r) {
5023                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5024                                 need_full_reset = true;
5025                                 r = -EAGAIN;
5026                                 goto end;
5027                         }
5028                 }
5029
5030                 if (!r)
5031                         r = amdgpu_device_recover_vram(tmp_adev);
5032                 else
5033                         tmp_adev->asic_reset_res = r;
5034         }
5035
5036 end:
5037         if (need_full_reset)
5038                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5039         else
5040                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5041         return r;
5042 }
5043
5044 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5045 {
5046
5047         switch (amdgpu_asic_reset_method(adev)) {
5048         case AMD_RESET_METHOD_MODE1:
5049                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5050                 break;
5051         case AMD_RESET_METHOD_MODE2:
5052                 adev->mp1_state = PP_MP1_STATE_RESET;
5053                 break;
5054         default:
5055                 adev->mp1_state = PP_MP1_STATE_NONE;
5056                 break;
5057         }
5058 }
5059
5060 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5061 {
5062         amdgpu_vf_error_trans_all(adev);
5063         adev->mp1_state = PP_MP1_STATE_NONE;
5064 }
5065
5066 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5067 {
5068         struct pci_dev *p = NULL;
5069
5070         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5071                         adev->pdev->bus->number, 1);
5072         if (p) {
5073                 pm_runtime_enable(&(p->dev));
5074                 pm_runtime_resume(&(p->dev));
5075         }
5076
5077         pci_dev_put(p);
5078 }
5079
5080 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5081 {
5082         enum amd_reset_method reset_method;
5083         struct pci_dev *p = NULL;
5084         u64 expires;
5085
5086         /*
5087          * For now, only BACO and mode1 reset are confirmed
5088          * to suffer the audio issue without proper suspended.
5089          */
5090         reset_method = amdgpu_asic_reset_method(adev);
5091         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5092              (reset_method != AMD_RESET_METHOD_MODE1))
5093                 return -EINVAL;
5094
5095         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5096                         adev->pdev->bus->number, 1);
5097         if (!p)
5098                 return -ENODEV;
5099
5100         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5101         if (!expires)
5102                 /*
5103                  * If we cannot get the audio device autosuspend delay,
5104                  * a fixed 4S interval will be used. Considering 3S is
5105                  * the audio controller default autosuspend delay setting.
5106                  * 4S used here is guaranteed to cover that.
5107                  */
5108                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5109
5110         while (!pm_runtime_status_suspended(&(p->dev))) {
5111                 if (!pm_runtime_suspend(&(p->dev)))
5112                         break;
5113
5114                 if (expires < ktime_get_mono_fast_ns()) {
5115                         dev_warn(adev->dev, "failed to suspend display audio\n");
5116                         pci_dev_put(p);
5117                         /* TODO: abort the succeeding gpu reset? */
5118                         return -ETIMEDOUT;
5119                 }
5120         }
5121
5122         pm_runtime_disable(&(p->dev));
5123
5124         pci_dev_put(p);
5125         return 0;
5126 }
5127
5128 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5129 {
5130         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5131
5132 #if defined(CONFIG_DEBUG_FS)
5133         if (!amdgpu_sriov_vf(adev))
5134                 cancel_work(&adev->reset_work);
5135 #endif
5136
5137         if (adev->kfd.dev)
5138                 cancel_work(&adev->kfd.reset_work);
5139
5140         if (amdgpu_sriov_vf(adev))
5141                 cancel_work(&adev->virt.flr_work);
5142
5143         if (con && adev->ras_enabled)
5144                 cancel_work(&con->recovery_work);
5145
5146 }
5147
5148 /**
5149  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5150  *
5151  * @adev: amdgpu_device pointer
5152  * @job: which job trigger hang
5153  * @reset_context: amdgpu reset context pointer
5154  *
5155  * Attempt to reset the GPU if it has hung (all asics).
5156  * Attempt to do soft-reset or full-reset and reinitialize Asic
5157  * Returns 0 for success or an error on failure.
5158  */
5159
5160 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5161                               struct amdgpu_job *job,
5162                               struct amdgpu_reset_context *reset_context)
5163 {
5164         struct list_head device_list, *device_list_handle =  NULL;
5165         bool job_signaled = false;
5166         struct amdgpu_hive_info *hive = NULL;
5167         struct amdgpu_device *tmp_adev = NULL;
5168         int i, r = 0;
5169         bool need_emergency_restart = false;
5170         bool audio_suspended = false;
5171         bool gpu_reset_for_dev_remove = false;
5172
5173         gpu_reset_for_dev_remove =
5174                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5175                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5176
5177         /*
5178          * Special case: RAS triggered and full reset isn't supported
5179          */
5180         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5181
5182         /*
5183          * Flush RAM to disk so that after reboot
5184          * the user can read log and see why the system rebooted.
5185          */
5186         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5187                 DRM_WARN("Emergency reboot.");
5188
5189                 ksys_sync_helper();
5190                 emergency_restart();
5191         }
5192
5193         dev_info(adev->dev, "GPU %s begin!\n",
5194                 need_emergency_restart ? "jobs stop":"reset");
5195
5196         if (!amdgpu_sriov_vf(adev))
5197                 hive = amdgpu_get_xgmi_hive(adev);
5198         if (hive)
5199                 mutex_lock(&hive->hive_lock);
5200
5201         reset_context->job = job;
5202         reset_context->hive = hive;
5203         /*
5204          * Build list of devices to reset.
5205          * In case we are in XGMI hive mode, resort the device list
5206          * to put adev in the 1st position.
5207          */
5208         INIT_LIST_HEAD(&device_list);
5209         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5210                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5211                         list_add_tail(&tmp_adev->reset_list, &device_list);
5212                         if (gpu_reset_for_dev_remove && adev->shutdown)
5213                                 tmp_adev->shutdown = true;
5214                 }
5215                 if (!list_is_first(&adev->reset_list, &device_list))
5216                         list_rotate_to_front(&adev->reset_list, &device_list);
5217                 device_list_handle = &device_list;
5218         } else {
5219                 list_add_tail(&adev->reset_list, &device_list);
5220                 device_list_handle = &device_list;
5221         }
5222
5223         /* We need to lock reset domain only once both for XGMI and single device */
5224         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5225                                     reset_list);
5226         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5227
5228         /* block all schedulers and reset given job's ring */
5229         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5230
5231                 amdgpu_device_set_mp1_state(tmp_adev);
5232
5233                 /*
5234                  * Try to put the audio codec into suspend state
5235                  * before gpu reset started.
5236                  *
5237                  * Due to the power domain of the graphics device
5238                  * is shared with AZ power domain. Without this,
5239                  * we may change the audio hardware from behind
5240                  * the audio driver's back. That will trigger
5241                  * some audio codec errors.
5242                  */
5243                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5244                         audio_suspended = true;
5245
5246                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5247
5248                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5249
5250                 if (!amdgpu_sriov_vf(tmp_adev))
5251                         amdgpu_amdkfd_pre_reset(tmp_adev);
5252
5253                 /*
5254                  * Mark these ASICs to be reseted as untracked first
5255                  * And add them back after reset completed
5256                  */
5257                 amdgpu_unregister_gpu_instance(tmp_adev);
5258
5259                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5260
5261                 /* disable ras on ALL IPs */
5262                 if (!need_emergency_restart &&
5263                       amdgpu_device_ip_need_full_reset(tmp_adev))
5264                         amdgpu_ras_suspend(tmp_adev);
5265
5266                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5267                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5268
5269                         if (!ring || !ring->sched.thread)
5270                                 continue;
5271
5272                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5273
5274                         if (need_emergency_restart)
5275                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5276                 }
5277                 atomic_inc(&tmp_adev->gpu_reset_counter);
5278         }
5279
5280         if (need_emergency_restart)
5281                 goto skip_sched_resume;
5282
5283         /*
5284          * Must check guilty signal here since after this point all old
5285          * HW fences are force signaled.
5286          *
5287          * job->base holds a reference to parent fence
5288          */
5289         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5290                 job_signaled = true;
5291                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5292                 goto skip_hw_reset;
5293         }
5294
5295 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5296         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5297                 if (gpu_reset_for_dev_remove) {
5298                         /* Workaroud for ASICs need to disable SMC first */
5299                         amdgpu_device_smu_fini_early(tmp_adev);
5300                 }
5301                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5302                 /*TODO Should we stop ?*/
5303                 if (r) {
5304                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5305                                   r, adev_to_drm(tmp_adev)->unique);
5306                         tmp_adev->asic_reset_res = r;
5307                 }
5308
5309                 /*
5310                  * Drop all pending non scheduler resets. Scheduler resets
5311                  * were already dropped during drm_sched_stop
5312                  */
5313                 amdgpu_device_stop_pending_resets(tmp_adev);
5314         }
5315
5316         /* Actual ASIC resets if needed.*/
5317         /* Host driver will handle XGMI hive reset for SRIOV */
5318         if (amdgpu_sriov_vf(adev)) {
5319                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5320                 if (r)
5321                         adev->asic_reset_res = r;
5322
5323                 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5324                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5325                     adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5326                         amdgpu_ras_resume(adev);
5327         } else {
5328                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5329                 if (r && r == -EAGAIN)
5330                         goto retry;
5331
5332                 if (!r && gpu_reset_for_dev_remove)
5333                         goto recover_end;
5334         }
5335
5336 skip_hw_reset:
5337
5338         /* Post ASIC reset for all devs .*/
5339         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5340
5341                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5342                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5343
5344                         if (!ring || !ring->sched.thread)
5345                                 continue;
5346
5347                         drm_sched_start(&ring->sched, true);
5348                 }
5349
5350                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5351                         amdgpu_mes_self_test(tmp_adev);
5352
5353                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5354                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5355
5356                 if (tmp_adev->asic_reset_res)
5357                         r = tmp_adev->asic_reset_res;
5358
5359                 tmp_adev->asic_reset_res = 0;
5360
5361                 if (r) {
5362                         /* bad news, how to tell it to userspace ? */
5363                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5364                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5365                 } else {
5366                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5367                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5368                                 DRM_WARN("smart shift update failed\n");
5369                 }
5370         }
5371
5372 skip_sched_resume:
5373         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5374                 /* unlock kfd: SRIOV would do it separately */
5375                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5376                         amdgpu_amdkfd_post_reset(tmp_adev);
5377
5378                 /* kfd_post_reset will do nothing if kfd device is not initialized,
5379                  * need to bring up kfd here if it's not be initialized before
5380                  */
5381                 if (!adev->kfd.init_complete)
5382                         amdgpu_amdkfd_device_init(adev);
5383
5384                 if (audio_suspended)
5385                         amdgpu_device_resume_display_audio(tmp_adev);
5386
5387                 amdgpu_device_unset_mp1_state(tmp_adev);
5388
5389                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5390         }
5391
5392 recover_end:
5393         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5394                                             reset_list);
5395         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5396
5397         if (hive) {
5398                 mutex_unlock(&hive->hive_lock);
5399                 amdgpu_put_xgmi_hive(hive);
5400         }
5401
5402         if (r)
5403                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5404
5405         atomic_set(&adev->reset_domain->reset_res, r);
5406         return r;
5407 }
5408
5409 /**
5410  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
5411  *
5412  * @adev: amdgpu_device pointer
5413  *
5414  * Fetchs and stores in the driver the PCIE capabilities (gen speed
5415  * and lanes) of the slot the device is in. Handles APUs and
5416  * virtualized environments where PCIE config space may not be available.
5417  */
5418 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5419 {
5420         struct pci_dev *pdev;
5421         enum pci_bus_speed speed_cap, platform_speed_cap;
5422         enum pcie_link_width platform_link_width;
5423
5424         if (amdgpu_pcie_gen_cap)
5425                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5426
5427         if (amdgpu_pcie_lane_cap)
5428                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5429
5430         /* covers APUs as well */
5431         if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5432                 if (adev->pm.pcie_gen_mask == 0)
5433                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5434                 if (adev->pm.pcie_mlw_mask == 0)
5435                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5436                 return;
5437         }
5438
5439         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5440                 return;
5441
5442         pcie_bandwidth_available(adev->pdev, NULL,
5443                                  &platform_speed_cap, &platform_link_width);
5444
5445         if (adev->pm.pcie_gen_mask == 0) {
5446                 /* asic caps */
5447                 pdev = adev->pdev;
5448                 speed_cap = pcie_get_speed_cap(pdev);
5449                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5450                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5451                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5452                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5453                 } else {
5454                         if (speed_cap == PCIE_SPEED_32_0GT)
5455                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5456                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5457                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5458                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5459                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5460                         else if (speed_cap == PCIE_SPEED_16_0GT)
5461                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5462                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5463                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5464                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5465                         else if (speed_cap == PCIE_SPEED_8_0GT)
5466                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5467                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5468                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5469                         else if (speed_cap == PCIE_SPEED_5_0GT)
5470                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5471                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5472                         else
5473                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5474                 }
5475                 /* platform caps */
5476                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5477                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5478                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5479                 } else {
5480                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5481                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5482                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5483                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5484                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5485                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5486                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5487                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5488                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5489                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5490                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5491                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5492                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5493                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5494                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5495                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5496                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5497                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5498                         else
5499                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5500
5501                 }
5502         }
5503         if (adev->pm.pcie_mlw_mask == 0) {
5504                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5505                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5506                 } else {
5507                         switch (platform_link_width) {
5508                         case PCIE_LNK_X32:
5509                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5510                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5511                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5512                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5513                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5514                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5515                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5516                                 break;
5517                         case PCIE_LNK_X16:
5518                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5519                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5520                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5521                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5522                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5523                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5524                                 break;
5525                         case PCIE_LNK_X12:
5526                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5527                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5528                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5529                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5530                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5531                                 break;
5532                         case PCIE_LNK_X8:
5533                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5534                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5535                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5536                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5537                                 break;
5538                         case PCIE_LNK_X4:
5539                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5540                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5541                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5542                                 break;
5543                         case PCIE_LNK_X2:
5544                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5545                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5546                                 break;
5547                         case PCIE_LNK_X1:
5548                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5549                                 break;
5550                         default:
5551                                 break;
5552                         }
5553                 }
5554         }
5555 }
5556
5557 /**
5558  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5559  *
5560  * @adev: amdgpu_device pointer
5561  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5562  *
5563  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5564  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5565  * @peer_adev.
5566  */
5567 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5568                                       struct amdgpu_device *peer_adev)
5569 {
5570 #ifdef CONFIG_HSA_AMD_P2P
5571         uint64_t address_mask = peer_adev->dev->dma_mask ?
5572                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5573         resource_size_t aper_limit =
5574                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5575         bool p2p_access =
5576                 !adev->gmc.xgmi.connected_to_cpu &&
5577                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5578
5579         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5580                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5581                 !(adev->gmc.aper_base & address_mask ||
5582                   aper_limit & address_mask));
5583 #else
5584         return false;
5585 #endif
5586 }
5587
5588 int amdgpu_device_baco_enter(struct drm_device *dev)
5589 {
5590         struct amdgpu_device *adev = drm_to_adev(dev);
5591         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5592
5593         if (!amdgpu_device_supports_baco(dev))
5594                 return -ENOTSUPP;
5595
5596         if (ras && adev->ras_enabled &&
5597             adev->nbio.funcs->enable_doorbell_interrupt)
5598                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5599
5600         return amdgpu_dpm_baco_enter(adev);
5601 }
5602
5603 int amdgpu_device_baco_exit(struct drm_device *dev)
5604 {
5605         struct amdgpu_device *adev = drm_to_adev(dev);
5606         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5607         int ret = 0;
5608
5609         if (!amdgpu_device_supports_baco(dev))
5610                 return -ENOTSUPP;
5611
5612         ret = amdgpu_dpm_baco_exit(adev);
5613         if (ret)
5614                 return ret;
5615
5616         if (ras && adev->ras_enabled &&
5617             adev->nbio.funcs->enable_doorbell_interrupt)
5618                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5619
5620         if (amdgpu_passthrough(adev) &&
5621             adev->nbio.funcs->clear_doorbell_interrupt)
5622                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5623
5624         return 0;
5625 }
5626
5627 /**
5628  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5629  * @pdev: PCI device struct
5630  * @state: PCI channel state
5631  *
5632  * Description: Called when a PCI error is detected.
5633  *
5634  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5635  */
5636 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5637 {
5638         struct drm_device *dev = pci_get_drvdata(pdev);
5639         struct amdgpu_device *adev = drm_to_adev(dev);
5640         int i;
5641
5642         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5643
5644         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5645                 DRM_WARN("No support for XGMI hive yet...");
5646                 return PCI_ERS_RESULT_DISCONNECT;
5647         }
5648
5649         adev->pci_channel_state = state;
5650
5651         switch (state) {
5652         case pci_channel_io_normal:
5653                 return PCI_ERS_RESULT_CAN_RECOVER;
5654         /* Fatal error, prepare for slot reset */
5655         case pci_channel_io_frozen:
5656                 /*
5657                  * Locking adev->reset_domain->sem will prevent any external access
5658                  * to GPU during PCI error recovery
5659                  */
5660                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5661                 amdgpu_device_set_mp1_state(adev);
5662
5663                 /*
5664                  * Block any work scheduling as we do for regular GPU reset
5665                  * for the duration of the recovery
5666                  */
5667                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5668                         struct amdgpu_ring *ring = adev->rings[i];
5669
5670                         if (!ring || !ring->sched.thread)
5671                                 continue;
5672
5673                         drm_sched_stop(&ring->sched, NULL);
5674                 }
5675                 atomic_inc(&adev->gpu_reset_counter);
5676                 return PCI_ERS_RESULT_NEED_RESET;
5677         case pci_channel_io_perm_failure:
5678                 /* Permanent error, prepare for device removal */
5679                 return PCI_ERS_RESULT_DISCONNECT;
5680         }
5681
5682         return PCI_ERS_RESULT_NEED_RESET;
5683 }
5684
5685 /**
5686  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5687  * @pdev: pointer to PCI device
5688  */
5689 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5690 {
5691
5692         DRM_INFO("PCI error: mmio enabled callback!!\n");
5693
5694         /* TODO - dump whatever for debugging purposes */
5695
5696         /* This called only if amdgpu_pci_error_detected returns
5697          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5698          * works, no need to reset slot.
5699          */
5700
5701         return PCI_ERS_RESULT_RECOVERED;
5702 }
5703
5704 /**
5705  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5706  * @pdev: PCI device struct
5707  *
5708  * Description: This routine is called by the pci error recovery
5709  * code after the PCI slot has been reset, just before we
5710  * should resume normal operations.
5711  */
5712 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5713 {
5714         struct drm_device *dev = pci_get_drvdata(pdev);
5715         struct amdgpu_device *adev = drm_to_adev(dev);
5716         int r, i;
5717         struct amdgpu_reset_context reset_context;
5718         u32 memsize;
5719         struct list_head device_list;
5720
5721         DRM_INFO("PCI error: slot reset callback!!\n");
5722
5723         memset(&reset_context, 0, sizeof(reset_context));
5724
5725         INIT_LIST_HEAD(&device_list);
5726         list_add_tail(&adev->reset_list, &device_list);
5727
5728         /* wait for asic to come out of reset */
5729         msleep(500);
5730
5731         /* Restore PCI confspace */
5732         amdgpu_device_load_pci_state(pdev);
5733
5734         /* confirm  ASIC came out of reset */
5735         for (i = 0; i < adev->usec_timeout; i++) {
5736                 memsize = amdgpu_asic_get_config_memsize(adev);
5737
5738                 if (memsize != 0xffffffff)
5739                         break;
5740                 udelay(1);
5741         }
5742         if (memsize == 0xffffffff) {
5743                 r = -ETIME;
5744                 goto out;
5745         }
5746
5747         reset_context.method = AMD_RESET_METHOD_NONE;
5748         reset_context.reset_req_dev = adev;
5749         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5750         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5751
5752         adev->no_hw_access = true;
5753         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5754         adev->no_hw_access = false;
5755         if (r)
5756                 goto out;
5757
5758         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5759
5760 out:
5761         if (!r) {
5762                 if (amdgpu_device_cache_pci_state(adev->pdev))
5763                         pci_restore_state(adev->pdev);
5764
5765                 DRM_INFO("PCIe error recovery succeeded\n");
5766         } else {
5767                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5768                 amdgpu_device_unset_mp1_state(adev);
5769                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5770         }
5771
5772         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5773 }
5774
5775 /**
5776  * amdgpu_pci_resume() - resume normal ops after PCI reset
5777  * @pdev: pointer to PCI device
5778  *
5779  * Called when the error recovery driver tells us that its
5780  * OK to resume normal operation.
5781  */
5782 void amdgpu_pci_resume(struct pci_dev *pdev)
5783 {
5784         struct drm_device *dev = pci_get_drvdata(pdev);
5785         struct amdgpu_device *adev = drm_to_adev(dev);
5786         int i;
5787
5788
5789         DRM_INFO("PCI error: resume callback!!\n");
5790
5791         /* Only continue execution for the case of pci_channel_io_frozen */
5792         if (adev->pci_channel_state != pci_channel_io_frozen)
5793                 return;
5794
5795         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5796                 struct amdgpu_ring *ring = adev->rings[i];
5797
5798                 if (!ring || !ring->sched.thread)
5799                         continue;
5800
5801                 drm_sched_start(&ring->sched, true);
5802         }
5803
5804         amdgpu_device_unset_mp1_state(adev);
5805         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5806 }
5807
5808 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5809 {
5810         struct drm_device *dev = pci_get_drvdata(pdev);
5811         struct amdgpu_device *adev = drm_to_adev(dev);
5812         int r;
5813
5814         r = pci_save_state(pdev);
5815         if (!r) {
5816                 kfree(adev->pci_state);
5817
5818                 adev->pci_state = pci_store_saved_state(pdev);
5819
5820                 if (!adev->pci_state) {
5821                         DRM_ERROR("Failed to store PCI saved state");
5822                         return false;
5823                 }
5824         } else {
5825                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5826                 return false;
5827         }
5828
5829         return true;
5830 }
5831
5832 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5833 {
5834         struct drm_device *dev = pci_get_drvdata(pdev);
5835         struct amdgpu_device *adev = drm_to_adev(dev);
5836         int r;
5837
5838         if (!adev->pci_state)
5839                 return false;
5840
5841         r = pci_load_saved_state(pdev, adev->pci_state);
5842
5843         if (!r) {
5844                 pci_restore_state(pdev);
5845         } else {
5846                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5847                 return false;
5848         }
5849
5850         return true;
5851 }
5852
5853 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5854                 struct amdgpu_ring *ring)
5855 {
5856 #ifdef CONFIG_X86_64
5857         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5858                 return;
5859 #endif
5860         if (adev->gmc.xgmi.connected_to_cpu)
5861                 return;
5862
5863         if (ring && ring->funcs->emit_hdp_flush)
5864                 amdgpu_ring_emit_hdp_flush(ring);
5865         else
5866                 amdgpu_asic_flush_hdp(adev, ring);
5867 }
5868
5869 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5870                 struct amdgpu_ring *ring)
5871 {
5872 #ifdef CONFIG_X86_64
5873         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5874                 return;
5875 #endif
5876         if (adev->gmc.xgmi.connected_to_cpu)
5877                 return;
5878
5879         amdgpu_asic_invalidate_hdp(adev, ring);
5880 }
5881
5882 int amdgpu_in_reset(struct amdgpu_device *adev)
5883 {
5884         return atomic_read(&adev->reset_domain->in_gpu_reset);
5885 }
5886
5887 /**
5888  * amdgpu_device_halt() - bring hardware to some kind of halt state
5889  *
5890  * @adev: amdgpu_device pointer
5891  *
5892  * Bring hardware to some kind of halt state so that no one can touch it
5893  * any more. It will help to maintain error context when error occurred.
5894  * Compare to a simple hang, the system will keep stable at least for SSH
5895  * access. Then it should be trivial to inspect the hardware state and
5896  * see what's going on. Implemented as following:
5897  *
5898  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5899  *    clears all CPU mappings to device, disallows remappings through page faults
5900  * 2. amdgpu_irq_disable_all() disables all interrupts
5901  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5902  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
5903  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5904  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5905  *    flush any in flight DMA operations
5906  */
5907 void amdgpu_device_halt(struct amdgpu_device *adev)
5908 {
5909         struct pci_dev *pdev = adev->pdev;
5910         struct drm_device *ddev = adev_to_drm(adev);
5911
5912         amdgpu_xcp_dev_unplug(adev);
5913         drm_dev_unplug(ddev);
5914
5915         amdgpu_irq_disable_all(adev);
5916
5917         amdgpu_fence_driver_hw_fini(adev);
5918
5919         adev->no_hw_access = true;
5920
5921         amdgpu_device_unmap_mmio(adev);
5922
5923         pci_disable_device(pdev);
5924         pci_wait_for_pending_transaction(pdev);
5925 }
5926
5927 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5928                                 u32 reg)
5929 {
5930         unsigned long flags, address, data;
5931         u32 r;
5932
5933         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5934         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5935
5936         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5937         WREG32(address, reg * 4);
5938         (void)RREG32(address);
5939         r = RREG32(data);
5940         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5941         return r;
5942 }
5943
5944 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5945                                 u32 reg, u32 v)
5946 {
5947         unsigned long flags, address, data;
5948
5949         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5950         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5951
5952         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5953         WREG32(address, reg * 4);
5954         (void)RREG32(address);
5955         WREG32(data, v);
5956         (void)RREG32(data);
5957         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5958 }
5959
5960 /**
5961  * amdgpu_device_switch_gang - switch to a new gang
5962  * @adev: amdgpu_device pointer
5963  * @gang: the gang to switch to
5964  *
5965  * Try to switch to a new gang.
5966  * Returns: NULL if we switched to the new gang or a reference to the current
5967  * gang leader.
5968  */
5969 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5970                                             struct dma_fence *gang)
5971 {
5972         struct dma_fence *old = NULL;
5973
5974         do {
5975                 dma_fence_put(old);
5976                 rcu_read_lock();
5977                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5978                 rcu_read_unlock();
5979
5980                 if (old == gang)
5981                         break;
5982
5983                 if (!dma_fence_is_signaled(old))
5984                         return old;
5985
5986         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5987                          old, gang) != old);
5988
5989         dma_fence_put(old);
5990         return NULL;
5991 }
5992
5993 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5994 {
5995         switch (adev->asic_type) {
5996 #ifdef CONFIG_DRM_AMDGPU_SI
5997         case CHIP_HAINAN:
5998 #endif
5999         case CHIP_TOPAZ:
6000                 /* chips with no display hardware */
6001                 return false;
6002 #ifdef CONFIG_DRM_AMDGPU_SI
6003         case CHIP_TAHITI:
6004         case CHIP_PITCAIRN:
6005         case CHIP_VERDE:
6006         case CHIP_OLAND:
6007 #endif
6008 #ifdef CONFIG_DRM_AMDGPU_CIK
6009         case CHIP_BONAIRE:
6010         case CHIP_HAWAII:
6011         case CHIP_KAVERI:
6012         case CHIP_KABINI:
6013         case CHIP_MULLINS:
6014 #endif
6015         case CHIP_TONGA:
6016         case CHIP_FIJI:
6017         case CHIP_POLARIS10:
6018         case CHIP_POLARIS11:
6019         case CHIP_POLARIS12:
6020         case CHIP_VEGAM:
6021         case CHIP_CARRIZO:
6022         case CHIP_STONEY:
6023                 /* chips with display hardware */
6024                 return true;
6025         default:
6026                 /* IP discovery */
6027                 if (!adev->ip_versions[DCE_HWIP][0] ||
6028                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6029                         return false;
6030                 return true;
6031         }
6032 }
6033
6034 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6035                 uint32_t inst, uint32_t reg_addr, char reg_name[],
6036                 uint32_t expected_value, uint32_t mask)
6037 {
6038         uint32_t ret = 0;
6039         uint32_t old_ = 0;
6040         uint32_t tmp_ = RREG32(reg_addr);
6041         uint32_t loop = adev->usec_timeout;
6042
6043         while ((tmp_ & (mask)) != (expected_value)) {
6044                 if (old_ != tmp_) {
6045                         loop = adev->usec_timeout;
6046                         old_ = tmp_;
6047                 } else
6048                         udelay(1);
6049                 tmp_ = RREG32(reg_addr);
6050                 loop--;
6051                 if (!loop) {
6052                         DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6053                                   inst, reg_name, (uint32_t)expected_value,
6054                                   (uint32_t)(tmp_ & (mask)));
6055                         ret = -ETIMEDOUT;
6056                         break;
6057                 }
6058         }
6059         return ret;
6060 }