drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drmP.h>
  29 #include <drm/drm_vma_manager.h>
  30 #include <drm/i915_drm.h>
  31 #include "i915_drv.h"
  32 #include "i915_gem_clflush.h"
  33 #include "i915_vgpu.h"
  34 #include "i915_trace.h"
  35 #include "intel_drv.h"
  36 #include "intel_frontbuffer.h"
  37 #include "intel_mocs.h"
  38 #include "intel_workarounds.h"
  39 #include "i915_gemfs.h"
  40 #include <linux/dma-fence-array.h>
  41 #include <linux/kthread.h>
  42 #include <linux/reservation.h>
  43 #include <linux/shmem_fs.h>
  44 #include <linux/slab.h>
  45 #include <linux/stop_machine.h>
  46 #include <linux/swap.h>
  47 #include <linux/pci.h>
  48 #include <linux/dma-buf.h>
  49
/* Forward declaration so that code earlier in the file can call it before its definition. */
static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  51
  52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  53 {
  54         if (obj->cache_dirty)
  55                 return false;
  56
  57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  58                 return true;
  59
  60         return obj->pin_global; /* currently in use by HW, keep flushed */
  61 }
  62
  63 static int
  64 insert_mappable_node(struct i915_ggtt *ggtt,
  65                      struct drm_mm_node *node, u32 size)
  66 {
  67         memset(node, 0, sizeof(*node));
  68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  69                                            size, 0, I915_COLOR_UNEVICTABLE,
  70                                            0, ggtt->mappable_end,
  71                                            DRM_MM_INSERT_LOW);
  72 }
  73
  74 static void
  75 remove_mappable_node(struct drm_mm_node *node)
  76 {
  77         drm_mm_remove_node(node);
  78 }
  79
  80 /* some bookkeeping */
  81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  82                                   u64 size)
  83 {
  84         spin_lock(&dev_priv->mm.object_stat_lock);
  85         dev_priv->mm.object_count++;
  86         dev_priv->mm.object_memory += size;
  87         spin_unlock(&dev_priv->mm.object_stat_lock);
  88 }
  89
  90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  91                                      u64 size)
  92 {
  93         spin_lock(&dev_priv->mm.object_stat_lock);
  94         dev_priv->mm.object_count--;
  95         dev_priv->mm.object_memory -= size;
  96         spin_unlock(&dev_priv->mm.object_stat_lock);
  97 }
  98
  99 static int
 100 i915_gem_wait_for_error(struct i915_gpu_error *error)
 101 {
 102         int ret;
 103
 104         might_sleep();
 105
 106         /*
 107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 108          * userspace. If it takes that long something really bad is going on and
 109          * we should simply try to bail out and fail as gracefully as possible.
 110          */
 111         ret = wait_event_interruptible_timeout(error->reset_queue,
 112                                                !i915_reset_backoff(error),
 113                                                I915_RESET_TIMEOUT);
 114         if (ret == 0) {
 115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 116                 return -EIO;
 117         } else if (ret < 0) {
 118                 return ret;
 119         } else {
 120                 return 0;
 121         }
 122 }
 123
 124 int i915_mutex_lock_interruptible(struct drm_device *dev)
 125 {
 126         struct drm_i915_private *dev_priv = to_i915(dev);
 127         int ret;
 128
 129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 130         if (ret)
 131                 return ret;
 132
 133         ret = mutex_lock_interruptible(&dev->struct_mutex);
 134         if (ret)
 135                 return ret;
 136
 137         return 0;
 138 }
 139
/*
 * Park the GT once it has gone idle.
 *
 * The caller must hold struct_mutex, and there must be no active requests
 * or active rings (all asserted below). If the GT is not marked awake there
 * is nothing to do and I915_EPOCH_INVALID is returned. Otherwise the
 * engines, timelines, PMU and vma tracking are parked, gt.awake is cleared,
 * the references taken in i915_gem_unpark() (GT_IRQ display power domain and
 * runtime pm) are released, and the current gt.epoch is returned.
 */
static u32 __i915_gem_park(struct drm_i915_private *i915)
{
        GEM_TRACE("\n");

        lockdep_assert_held(&i915->drm.struct_mutex);
        GEM_BUG_ON(i915->gt.active_requests);
        GEM_BUG_ON(!list_empty(&i915->gt.active_rings));

        /* Already parked: report that no valid epoch applies. */
        if (!i915->gt.awake)
                return I915_EPOCH_INVALID;

        GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);

        /*
         * Be paranoid and flush a concurrent interrupt to make sure
         * we don't reactivate any irq tasklets after parking.
         *
         * FIXME: Note that even though we have waited for execlists to be idle,
         * there may still be an in-flight interrupt even though the CSB
         * is now empty. synchronize_irq() makes sure that a residual interrupt
         * is completed before we continue, but it doesn't prevent the HW from
         * raising a spurious interrupt later. To complete the shield we should
         * coordinate disabling the CS irq with flushing the interrupts.
         */
        synchronize_irq(i915->drm.irq);

        intel_engines_park(i915);
        i915_timelines_park(i915);

        i915_pmu_gt_parked(i915);
        i915_vma_parked(i915);

        i915->gt.awake = false;

        if (INTEL_GEN(i915) >= 6)
                gen6_rps_idle(i915);

        /* Pairs with intel_display_power_get() in i915_gem_unpark(). */
        intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);

        /* Pairs with intel_runtime_pm_get_noresume() in i915_gem_unpark(). */
        intel_runtime_pm_put(i915);

        return i915->gt.epoch;
}
 183
 184 void i915_gem_park(struct drm_i915_private *i915)
 185 {
 186         GEM_TRACE("\n");
 187
 188         lockdep_assert_held(&i915->drm.struct_mutex);
 189         GEM_BUG_ON(i915->gt.active_requests);
 190
 191         if (!i915->gt.awake)
 192                 return;
 193
 194         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 195         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 196 }
 197
 198 void i915_gem_unpark(struct drm_i915_private *i915)
 199 {
 200         GEM_TRACE("\n");
 201
 202         lockdep_assert_held(&i915->drm.struct_mutex);
 203         GEM_BUG_ON(!i915->gt.active_requests);
 204
 205         if (i915->gt.awake)
 206                 return;
 207
 208         intel_runtime_pm_get_noresume(i915);
 209
 210         /*
 211          * It seems that the DMC likes to transition between the DC states a lot
 212          * when there are no connected displays (no active power domains) during
 213          * command submission.
 214          *
 215          * This activity has negative impact on the performance of the chip with
 216          * huge latencies observed in the interrupt handler and elsewhere.
 217          *
 218          * Work around it by grabbing a GT IRQ power domain whilst there is any
 219          * GT activity, preventing any DC state transitions.
 220          */
 221         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 222
 223         i915->gt.awake = true;
 224         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 225                 i915->gt.epoch = 1;
 226
 227         intel_enable_gt_powersave(i915);
 228         i915_update_gfx_val(i915);
 229         if (INTEL_GEN(i915) >= 6)
 230                 gen6_rps_busy(i915);
 231         i915_pmu_gt_unparked(i915);
 232
 233         intel_engines_unpark(i915);
 234
 235         i915_queue_hangcheck(i915);
 236
 237         queue_delayed_work(i915->wq,
 238                            &i915->gt.retire_work,
 239                            round_jiffies_up_relative(HZ));
 240 }
 241
 242 int
 243 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 244                             struct drm_file *file)
 245 {
 246         struct drm_i915_private *dev_priv = to_i915(dev);
 247         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 248         struct drm_i915_gem_get_aperture *args = data;
 249         struct i915_vma *vma;
 250         u64 pinned;
 251
 252         pinned = ggtt->vm.reserved;
 253         mutex_lock(&dev->struct_mutex);
 254         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 255                 if (i915_vma_is_pinned(vma))
 256                         pinned += vma->node.size;
 257         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 258                 if (i915_vma_is_pinned(vma))
 259                         pinned += vma->node.size;
 260         mutex_unlock(&dev->struct_mutex);
 261
 262         args->aper_size = ggtt->vm.total;
 263         args->aper_available_size = args->aper_size - pinned;
 264
 265         return 0;
 266 }
 267
 268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 269 {
 270         struct address_space *mapping = obj->base.filp->f_mapping;
 271         drm_dma_handle_t *phys;
 272         struct sg_table *st;
 273         struct scatterlist *sg;
 274         char *vaddr;
 275         int i;
 276         int err;
 277
 278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 279                 return -EINVAL;
 280
 281         /* Always aligning to the object size, allows a single allocation
 282          * to handle all possible callers, and given typical object sizes,
 283          * the alignment of the buddy allocation will naturally match.
 284          */
 285         phys = drm_pci_alloc(obj->base.dev,
 286                              roundup_pow_of_two(obj->base.size),
 287                              roundup_pow_of_two(obj->base.size));
 288         if (!phys)
 289                 return -ENOMEM;
 290
 291         vaddr = phys->vaddr;
 292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 293                 struct page *page;
 294                 char *src;
 295
 296                 page = shmem_read_mapping_page(mapping, i);
 297                 if (IS_ERR(page)) {
 298                         err = PTR_ERR(page);
 299                         goto err_phys;
 300                 }
 301
 302                 src = kmap_atomic(page);
 303                 memcpy(vaddr, src, PAGE_SIZE);
 304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 305                 kunmap_atomic(src);
 306
 307                 put_page(page);
 308                 vaddr += PAGE_SIZE;
 309         }
 310
 311         i915_gem_chipset_flush(to_i915(obj->base.dev));
 312
 313         st = kmalloc(sizeof(*st), GFP_KERNEL);
 314         if (!st) {
 315                 err = -ENOMEM;
 316                 goto err_phys;
 317         }
 318
 319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 320                 kfree(st);
 321                 err = -ENOMEM;
 322                 goto err_phys;
 323         }
 324
 325         sg = st->sgl;
 326         sg->offset = 0;
 327         sg->length = obj->base.size;
 328
 329         sg_dma_address(sg) = phys->busaddr;
 330         sg_dma_len(sg) = obj->base.size;
 331
 332         obj->phys_handle = phys;
 333
 334         __i915_gem_object_set_pages(obj, st, sg->length);
 335
 336         return 0;
 337
 338 err_phys:
 339         drm_pci_free(obj->base.dev, phys);
 340
 341         return err;
 342 }
 343
 344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 345 {
 346         obj->read_domains = I915_GEM_DOMAIN_CPU;
 347         obj->write_domain = I915_GEM_DOMAIN_CPU;
 348         if (cpu_write_needs_clflush(obj))
 349                 obj->cache_dirty = true;
 350 }
 351
 352 static void
 353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 354                                 struct sg_table *pages,
 355                                 bool needs_clflush)
 356 {
 357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 358
 359         if (obj->mm.madv == I915_MADV_DONTNEED)
 360                 obj->mm.dirty = false;
 361
 362         if (needs_clflush &&
 363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 365                 drm_clflush_sg(pages);
 366
 367         __start_cpu_write(obj);
 368 }
 369
 370 static void
 371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 372                                struct sg_table *pages)
 373 {
 374         __i915_gem_object_release_shmem(obj, pages, false);
 375
 376         if (obj->mm.dirty) {
 377                 struct address_space *mapping = obj->base.filp->f_mapping;
 378                 char *vaddr = obj->phys_handle->vaddr;
 379                 int i;
 380
 381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 382                         struct page *page;
 383                         char *dst;
 384
 385                         page = shmem_read_mapping_page(mapping, i);
 386                         if (IS_ERR(page))
 387                                 continue;
 388
 389                         dst = kmap_atomic(page);
 390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 391                         memcpy(dst, vaddr, PAGE_SIZE);
 392                         kunmap_atomic(dst);
 393
 394                         set_page_dirty(page);
 395                         if (obj->mm.madv == I915_MADV_WILLNEED)
 396                                 mark_page_accessed(page);
 397                         put_page(page);
 398                         vaddr += PAGE_SIZE;
 399                 }
 400                 obj->mm.dirty = false;
 401         }
 402
 403         sg_free_table(pages);
 404         kfree(pages);
 405
 406         drm_pci_free(obj->base.dev, obj->phys_handle);
 407 }
 408
 409 static void
 410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 411 {
 412         i915_gem_object_unpin_pages(obj);
 413 }
 414
 415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 416         .get_pages = i915_gem_object_get_pages_phys,
 417         .put_pages = i915_gem_object_put_pages_phys,
 418         .release = i915_gem_object_release_phys,
 419 };
 420
 421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 422
 423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 424 {
 425         struct i915_vma *vma;
 426         LIST_HEAD(still_in_list);
 427         int ret;
 428
 429         lockdep_assert_held(&obj->base.dev->struct_mutex);
 430
 431         /* Closed vma are removed from the obj->vma_list - but they may
 432          * still have an active binding on the object. To remove those we
 433          * must wait for all rendering to complete to the object (as unbinding
 434          * must anyway), and retire the requests.
 435          */
 436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 437         if (ret)
 438                 return ret;
 439
 440         while ((vma = list_first_entry_or_null(&obj->vma_list,
 441                                                struct i915_vma,
 442                                                obj_link))) {
 443                 list_move_tail(&vma->obj_link, &still_in_list);
 444                 ret = i915_vma_unbind(vma);
 445                 if (ret)
 446                         break;
 447         }
 448         list_splice(&still_in_list, &obj->vma_list);
 449
 450         return ret;
 451 }
 452
 453 static long
 454 i915_gem_object_wait_fence(struct dma_fence *fence,
 455                            unsigned int flags,
 456                            long timeout,
 457                            struct intel_rps_client *rps_client)
 458 {
 459         struct i915_request *rq;
 460
 461         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 462
 463         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 464                 return timeout;
 465
 466         if (!dma_fence_is_i915(fence))
 467                 return dma_fence_wait_timeout(fence,
 468                                               flags & I915_WAIT_INTERRUPTIBLE,
 469                                               timeout);
 470
 471         rq = to_request(fence);
 472         if (i915_request_completed(rq))
 473                 goto out;
 474
 475         /*
 476          * This client is about to stall waiting for the GPU. In many cases
 477          * this is undesirable and limits the throughput of the system, as
 478          * many clients cannot continue processing user input/output whilst
 479          * blocked. RPS autotuning may take tens of milliseconds to respond
 480          * to the GPU load and thus incurs additional latency for the client.
 481          * We can circumvent that by promoting the GPU frequency to maximum
 482          * before we wait. This makes the GPU throttle up much more quickly
 483          * (good for benchmarks and user experience, e.g. window animations),
 484          * but at a cost of spending more power processing the workload
 485          * (bad for battery). Not all clients even want their results
 486          * immediately and for them we should just let the GPU select its own
 487          * frequency to maximise efficiency. To prevent a single client from
 488          * forcing the clocks too high for the whole system, we only allow
 489          * each client to waitboost once in a busy period.
 490          */
 491         if (rps_client && !i915_request_started(rq)) {
 492                 if (INTEL_GEN(rq->i915) >= 6)
 493                         gen6_rps_boost(rq, rps_client);
 494         }
 495
 496         timeout = i915_request_wait(rq, flags, timeout);
 497
 498 out:
 499         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 500                 i915_request_retire_upto(rq);
 501
 502         return timeout;
 503 }
 504
/*
 * Wait on the fences tracked by a reservation object.
 *
 * @resv: reservation object whose fences are waited upon
 * @flags: I915_WAIT_* flags; with I915_WAIT_ALL all shared fences and then
 *         the exclusive fence are waited upon, otherwise only the exclusive
 *         fence
 * @timeout: time budget, threaded through each individual fence wait
 * @rps_client: forwarded to i915_gem_object_wait_fence()
 *
 * Returns the timeout value left after the last fence wait, a negative
 * value if a wait failed, or the error from snapshotting the fences.
 */
static long
i915_gem_object_wait_reservation(struct reservation_object *resv,
                                 unsigned int flags,
                                 long timeout,
                                 struct intel_rps_client *rps_client)
{
        /* Sampled up front so the pruning below can detect later changes. */
        unsigned int seq = __read_seqcount_begin(&resv->seq);
        struct dma_fence *excl;
        bool prune_fences = false;

        if (flags & I915_WAIT_ALL) {
                struct dma_fence **shared;
                unsigned int count, i;
                int ret;

                ret = reservation_object_get_fences_rcu(resv,
                                                        &excl, &count, &shared);
                if (ret)
                        return ret;

                for (i = 0; i < count; i++) {
                        timeout = i915_gem_object_wait_fence(shared[i],
                                                             flags, timeout,
                                                             rps_client);
                        /* Leave shared[i] for the cleanup loop below. */
                        if (timeout < 0)
                                break;

                        dma_fence_put(shared[i]);
                }

                /* Drop the references on any fences we did not get to. */
                for (; i < count; i++)
                        dma_fence_put(shared[i]);
                kfree(shared);

                /*
                 * If both shared fences and an exclusive fence exist,
                 * then by construction the shared fences must be later
                 * than the exclusive fence. If we successfully wait for
                 * all the shared fences, we know that the exclusive fence
                 * must all be signaled. If all the shared fences are
                 * signaled, we can prune the array and recover the
                 * floating references on the fences/requests.
                 */
                prune_fences = count && timeout >= 0;
        } else {
                excl = reservation_object_get_excl_rcu(resv);
        }

        /* Skip the exclusive fence if an earlier wait already failed. */
        if (excl && timeout >= 0)
                timeout = i915_gem_object_wait_fence(excl, flags, timeout,
                                                     rps_client);

        dma_fence_put(excl);

        /*
         * Opportunistically prune the fences iff we know they have *all* been
         * signaled and that the reservation object has not been changed (i.e.
         * no new fences have been added).
         */
        if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
                if (reservation_object_trylock(resv)) {
                        /* Re-check now that the lock is held. */
                        if (!__read_seqcount_retry(&resv->seq, seq))
                                reservation_object_add_excl_fence(resv, NULL);
                        reservation_object_unlock(resv);
                }
        }

        return timeout;
}
 574
 575 static void __fence_set_priority(struct dma_fence *fence,
 576                                  const struct i915_sched_attr *attr)
 577 {
 578         struct i915_request *rq;
 579         struct intel_engine_cs *engine;
 580
 581         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 582                 return;
 583
 584         rq = to_request(fence);
 585         engine = rq->engine;
 586
 587         local_bh_disable();
 588         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 589         if (engine->schedule)
 590                 engine->schedule(rq, attr);
 591         rcu_read_unlock();
 592         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 593 }
 594
 595 static void fence_set_priority(struct dma_fence *fence,
 596                                const struct i915_sched_attr *attr)
 597 {
 598         /* Recurse once into a fence-array */
 599         if (dma_fence_is_array(fence)) {
 600                 struct dma_fence_array *array = to_dma_fence_array(fence);
 601                 int i;
 602
 603                 for (i = 0; i < array->num_fences; i++)
 604                         __fence_set_priority(array->fences[i], attr);
 605         } else {
 606                 __fence_set_priority(fence, attr);
 607         }
 608 }
 609
 610 int
 611 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 612                               unsigned int flags,
 613                               const struct i915_sched_attr *attr)
 614 {
 615         struct dma_fence *excl;
 616
 617         if (flags & I915_WAIT_ALL) {
 618                 struct dma_fence **shared;
 619                 unsigned int count, i;
 620                 int ret;
 621
 622                 ret = reservation_object_get_fences_rcu(obj->resv,
 623                                                         &excl, &count, &shared);
 624                 if (ret)
 625                         return ret;
 626
 627                 for (i = 0; i < count; i++) {
 628                         fence_set_priority(shared[i], attr);
 629                         dma_fence_put(shared[i]);
 630                 }
 631
 632                 kfree(shared);
 633         } else {
 634                 excl = reservation_object_get_excl_rcu(obj->resv);
 635         }
 636
 637         if (excl) {
 638                 fence_set_priority(excl, attr);
 639                 dma_fence_put(excl);
 640         }
 641         return 0;
 642 }
 643
 644 /**
 645  * Waits for rendering to the object to be completed
 646  * @obj: i915 gem object
 647  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 648  * @timeout: how long to wait
 649  * @rps_client: client (user process) to charge for any waitboosting
 650  */
 651 int
 652 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 653                      unsigned int flags,
 654                      long timeout,
 655                      struct intel_rps_client *rps_client)
 656 {
 657         might_sleep();
 658 #if IS_ENABLED(CONFIG_LOCKDEP)
 659         GEM_BUG_ON(debug_locks &&
 660                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 661                    !!(flags & I915_WAIT_LOCKED));
 662 #endif
 663         GEM_BUG_ON(timeout < 0);
 664
 665         timeout = i915_gem_object_wait_reservation(obj->resv,
 666                                                    flags, timeout,
 667                                                    rps_client);
 668         return timeout < 0 ? timeout : 0;
 669 }
 670
 671 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 672 {
 673         struct drm_i915_file_private *fpriv = file->driver_priv;
 674
 675         return &fpriv->rps_client;
 676 }
 677
 678 static int
 679 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 680                      struct drm_i915_gem_pwrite *args,
 681                      struct drm_file *file)
 682 {
 683         void *vaddr = obj->phys_handle->vaddr + args->offset;
 684         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 685
 686         /* We manually control the domain here and pretend that it
 687          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 688          */
 689         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 690         if (copy_from_user(vaddr, user_data, args->size))
 691                 return -EFAULT;
 692
 693         drm_clflush_virt_range(vaddr, args->size);
 694         i915_gem_chipset_flush(to_i915(obj->base.dev));
 695
 696         intel_fb_obj_flush(obj, ORIGIN_CPU);
 697         return 0;
 698 }
 699
 700 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 701 {
 702         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 703 }
 704
 705 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 706 {
 707         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 708         kmem_cache_free(dev_priv->objects, obj);
 709 }
 710
 711 static int
 712 i915_gem_create(struct drm_file *file,
 713                 struct drm_i915_private *dev_priv,
 714                 uint64_t size,
 715                 uint32_t *handle_p)
 716 {
 717         struct drm_i915_gem_object *obj;
 718         int ret;
 719         u32 handle;
 720
 721         size = roundup(size, PAGE_SIZE);
 722         if (size == 0)
 723                 return -EINVAL;
 724
 725         /* Allocate the new object */
 726         obj = i915_gem_object_create(dev_priv, size);
 727         if (IS_ERR(obj))
 728                 return PTR_ERR(obj);
 729
 730         ret = drm_gem_handle_create(file, &obj->base, &handle);
 731         /* drop reference from allocate - handle holds it now */
 732         i915_gem_object_put(obj);
 733         if (ret)
 734                 return ret;
 735
 736         *handle_p = handle;
 737         return 0;
 738 }
 739
 740 int
 741 i915_gem_dumb_create(struct drm_file *file,
 742                      struct drm_device *dev,
 743                      struct drm_mode_create_dumb *args)
 744 {
 745         /* have to work out size/pitch and return them */
 746         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 747         args->size = args->pitch * args->height;
 748         return i915_gem_create(file, to_i915(dev),
 749                                args->size, &args->handle);
 750 }
 751
 752 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 753 {
 754         return !(obj->cache_level == I915_CACHE_NONE ||
 755                  obj->cache_level == I915_CACHE_WT);
 756 }
 757
 758 /**
 759  * Creates a new mm object and returns a handle to it.
 760  * @dev: drm device pointer
 761  * @data: ioctl data blob
 762  * @file: drm file pointer
 763  */
 764 int
 765 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 766                       struct drm_file *file)
 767 {
 768         struct drm_i915_private *dev_priv = to_i915(dev);
 769         struct drm_i915_gem_create *args = data;
 770
 771         i915_gem_flush_free_objects(dev_priv);
 772
 773         return i915_gem_create(file, dev_priv,
 774                                args->size, &args->handle);
 775 }
 776
 777 static inline enum fb_op_origin
 778 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 779 {
 780         return (domain == I915_GEM_DOMAIN_GTT ?
 781                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 782 }
 783
/*
 * Order and, where the hardware requires it, flush writes made through the
 * GGTT. A write barrier is always issued; on devices whose info does not
 * advertise has_coherent_ggtt, a chipset flush and a posting read of the
 * render ring HEAD register follow.
 */
void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
{
        /*
         * No actual flushing is required for the GTT write domain for reads
         * from the GTT domain. Writes to it "immediately" go to main memory
         * as far as we know, so there's no chipset flush. It also doesn't
         * land in the GPU render cache.
         *
         * However, we do have to enforce the order so that all writes through
         * the GTT land before any writes to the device, such as updates to
         * the GATT itself.
         *
         * We also have to wait a bit for the writes to land from the GTT.
         * An uncached read (i.e. mmio) seems to be ideal for the round-trip
         * timing. This issue has only been observed when switching quickly
         * between GTT writes and CPU reads from inside the kernel on recent hw,
         * and it appears to only affect discrete GTT blocks (i.e. on LLC
         * system agents we cannot reproduce this behaviour, until Cannonlake
         * that was!).
         */

        wmb();

        /* Coherent GGTT: the barrier above is all that is needed. */
        if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
                return;

        i915_gem_chipset_flush(dev_priv);

        /* The mmio read is done with a runtime-pm reference and uncore.lock held. */
        intel_runtime_pm_get(dev_priv);
        spin_lock_irq(&dev_priv->uncore.lock);

        POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));

        spin_unlock_irq(&dev_priv->uncore.lock);
        intel_runtime_pm_put(dev_priv);
}
 820 /* Flush obj's current write domain if it is one of @flush_domains and then clear obj->write_domain; otherwise do nothing. */
 821 static void
 822 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 823 {
 824         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 825         struct i915_vma *vma;
 826
 827         if (!(obj->write_domain & flush_domains)) /* current write domain was not requested: leave it untouched */
 828                 return;
 829
 830         switch (obj->write_domain) {
 831         case I915_GEM_DOMAIN_GTT:
 832                 i915_gem_flush_ggtt_writes(dev_priv);
 833
 834                 intel_fb_obj_flush(obj,
 835                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 836
 837                 for_each_ggtt_vma(vma, obj) {
 838                         if (vma->iomap) /* vmas with an iomap are skipped: i915_vma_unset_ggtt_write() is not called for them */
 839                                 continue;
 840
 841                         i915_vma_unset_ggtt_write(vma);
 842                 }
 843                 break;
 844
 845         case I915_GEM_DOMAIN_WC:
 846                 wmb(); /* a write barrier is the only flush performed for the WC domain */
 847                 break;
 848
 849         case I915_GEM_DOMAIN_CPU:
 850                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 851                 break;
 852
 853         case I915_GEM_DOMAIN_RENDER:
 854                 if (gpu_write_needs_clflush(obj))
 855                         obj->cache_dirty = true; /* only recorded here; no clflush is issued by this function */
 856                 break;
 857         }
 858
 859         obj->write_domain = 0; /* reached for every case above, including write domains not listed in the switch */
 860 }
 861
 862 /*
 863  * Pins the specified object's pages and synchronizes the object with
 864  * GPU accesses. Sets needs_clflush to non-zero (CLFLUSH_BEFORE) if the caller
 865  * should flush the object from the CPU cache before reading. Must be called with struct_mutex held. Returns 0 with the pages pinned, or a negative errno (-ENODEV if the object has no struct pages), in which case no pin taken here is left behind.
 866  */
 867 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 868                                     unsigned int *needs_clflush)
 869 {
 870         int ret;
 871
 872         lockdep_assert_held(&obj->base.dev->struct_mutex);
 873
 874         *needs_clflush = 0; /* initialised first, so it is 0 on every early-error return */
 875         if (!i915_gem_object_has_struct_page(obj))
 876                 return -ENODEV;
 877
 878         ret = i915_gem_object_wait(obj,
 879                                    I915_WAIT_INTERRUPTIBLE |
 880                                    I915_WAIT_LOCKED,
 881                                    MAX_SCHEDULE_TIMEOUT,
 882                                    NULL);
 883         if (ret)
 884                 return ret;
 885
 886         ret = i915_gem_object_pin_pages(obj);
 887         if (ret)
 888                 return ret;
 889
 890         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 891             !static_cpu_has(X86_FEATURE_CLFLUSH)) { /* coherent for reads, or no clflush instruction: move to the CPU domain instead of manual flushing */
 892                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 893                 if (ret)
 894                         goto err_unpin;
 895                 else
 896                         goto out;
 897         }
 898
 899         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 900
 901         /* If we're not in the cpu read domain, do not move the object
 902          * there; ask the caller to manually flush cachelines (if required). This
 903          * optimizes for the case when the gpu will dirty the data
 904          * anyway again before the next pread happens.
 905          */
 906         if (!obj->cache_dirty &&
 907             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 908                 *needs_clflush = CLFLUSH_BEFORE;
 909
 910 out:
 911         /* return with the pages pinned */
 912         return 0;
 913
 914 err_unpin:
 915         i915_gem_object_unpin_pages(obj); /* drop the pin taken above before reporting the error */
 916         return ret;
 917 }
 918 /* Write-side preparation for CPU access through the object's struct pages: waits for all access (I915_WAIT_ALL), pins the pages and reports in *needs_clflush which of CLFLUSH_BEFORE / CLFLUSH_AFTER the caller must perform. Must be called with struct_mutex held. Returns 0 with the pages pinned, or a negative errno. */
 919 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 920                                      unsigned int *needs_clflush)
 921 {
 922         int ret;
 923
 924         lockdep_assert_held(&obj->base.dev->struct_mutex);
 925
 926         *needs_clflush = 0; /* initialised first, so it is 0 on every early-error return */
 927         if (!i915_gem_object_has_struct_page(obj))
 928                 return -ENODEV;
 929
 930         ret = i915_gem_object_wait(obj,
 931                                    I915_WAIT_INTERRUPTIBLE |
 932                                    I915_WAIT_LOCKED |
 933                                    I915_WAIT_ALL,
 934                                    MAX_SCHEDULE_TIMEOUT,
 935                                    NULL);
 936         if (ret)
 937                 return ret;
 938
 939         ret = i915_gem_object_pin_pages(obj);
 940         if (ret)
 941                 return ret;
 942
 943         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 944             !static_cpu_has(X86_FEATURE_CLFLUSH)) { /* coherent for writes, or no clflush instruction: move to the CPU write domain instead of manual flushing */
 945                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 946                 if (ret)
 947                         goto err_unpin;
 948                 else
 949                         goto out;
 950         }
 951
 952         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 953
 954         /* If we're not in the cpu write domain, do not move the object
 955          * there; ask the caller to manually flush cachelines (as required).
 956          * This optimizes for the case when the gpu will use the data
 957          * right away and we therefore have to clflush anyway.
 958          */
 959         if (!obj->cache_dirty) {
 960                 *needs_clflush |= CLFLUSH_AFTER;
 961
 962                 /*
 963                  * Same trick applies to invalidate partially written
 964                  * cachelines read before writing.
 965                  */
 966                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
 967                         *needs_clflush |= CLFLUSH_BEFORE;
 968         }
 969
 970 out: /* success path for both the set-to-CPU-domain branch and the manual-clflush branch */
 971         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 972         obj->mm.dirty = true;
 973         /* return with the pages pinned */
 974         return 0;
 975
 976 err_unpin:
 977         i915_gem_object_unpin_pages(obj); /* drop the pin taken above before reporting the error */
 978         return ret;
 979 }
 980 /* Copy @len bytes starting at @offset within @page to @user_data, clflushing that range first when @needs_clflush is set. Returns 0 on success or -EFAULT if any byte could not be copied. */
 981 static int
 982 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
 983             bool needs_clflush)
 984 {
 985         char *vaddr;
 986         int ret;
 987
 988         vaddr = kmap(page);
 989
 990         if (needs_clflush)
 991                 drm_clflush_virt_range(vaddr + offset, len);
 992
 993         ret = __copy_to_user(user_data, vaddr + offset, len); /* unchecked variant: i915_gem_pread_ioctl() runs access_ok() on the whole user range */
 994
 995         kunmap(page);
 996
 997         return ret ? -EFAULT : 0; /* any non-zero result from the copy is reported as -EFAULT */
 998 }
 999 /* pread through the object's struct pages: prepare the object under struct_mutex, then copy page by page to userspace with the mutex dropped. Returns 0 or a negative errno. */
1000 static int
1001 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1002                      struct drm_i915_gem_pread *args)
1003 {
1004         char __user *user_data;
1005         u64 remain;
1006         unsigned int needs_clflush;
1007         unsigned int idx, offset;
1008         int ret;
1009
1010         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1011         if (ret)
1012                 return ret;
1013
1014         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1015         mutex_unlock(&obj->base.dev->struct_mutex); /* only the prepare step runs under struct_mutex; the copy loop below does not */
1016         if (ret)
1017                 return ret;
1018
1019         remain = args->size;
1020         user_data = u64_to_user_ptr(args->data_ptr);
1021         offset = offset_in_page(args->offset);
1022         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1023                 struct page *page = i915_gem_object_get_page(obj, idx);
1024                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset); /* never cross a page boundary in one copy */
1025
1026                 ret = shmem_pread(page, offset, length, user_data,
1027                                   needs_clflush);
1028                 if (ret)
1029                         break;
1030
1031                 remain -= length;
1032                 user_data += length;
1033                 offset = 0; /* only the first page can start at a non-zero offset */
1034         }
1035
1036         i915_gem_obj_finish_shmem_access(obj); /* runs on success and on copy failure; presumably releases the pin left by the prepare step - TODO confirm */
1037         return ret;
1038 }
1039 /* Copy @length bytes from the aperture page at @base (+ @offset) to @user_data. An atomic WC mapping is tried first; if that copy is incomplete the whole copy is repeated through a non-atomic mapping. Returns true if some bytes remain uncopied. */
1040 static inline bool
1041 gtt_user_read(struct io_mapping *mapping,
1042               loff_t base, int offset,
1043               char __user *user_data, int length)
1044 {
1045         void __iomem *vaddr;
1046         unsigned long unwritten;
1047
1048         /* We can use the cpu mem copy function because this is X86. */
1049         vaddr = io_mapping_map_atomic_wc(mapping, base);
1050         unwritten = __copy_to_user_inatomic(user_data,
1051                                             (void __force *)vaddr + offset,
1052                                             length);
1053         io_mapping_unmap_atomic(vaddr);
1054         if (unwritten) { /* atomic attempt left bytes uncopied: redo the full range via a non-atomic mapping */
1055                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1056                 unwritten = copy_to_user(user_data,
1057                                          (void __force *)vaddr + offset,
1058                                          length);
1059                 io_mapping_unmap(vaddr);
1060         }
1061         return unwritten; /* byte count is converted to bool: non-zero means failure */
1062 }
1063 /* pread through the GGTT aperture. Either the whole object is pinned mappable, or (fallback) a single-page mappable node is reserved and re-pointed at each object page in turn. Returns 0 or a negative errno. */
1064 static int
1065 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1066                    const struct drm_i915_gem_pread *args)
1067 {
1068         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1069         struct i915_ggtt *ggtt = &i915->ggtt;
1070         struct drm_mm_node node;
1071         struct i915_vma *vma;
1072         void __user *user_data;
1073         u64 remain, offset;
1074         int ret;
1075
1076         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1077         if (ret)
1078                 return ret;
1079
1080         intel_runtime_pm_get(i915);
1081         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1082                                        PIN_MAPPABLE |
1083                                        PIN_NONFAULT |
1084                                        PIN_NONBLOCK);
1085         if (!IS_ERR(vma)) { /* whole object is pinned in the mappable aperture */
1086                 node.start = i915_ggtt_offset(vma);
1087                 node.allocated = false; /* node.allocated selects vma mode (false) vs single-page mode (true) in the loop and cleanup below */
1088                 ret = i915_vma_put_fence(vma);
1089                 if (ret) {
1090                         i915_vma_unpin(vma);
1091                         vma = ERR_PTR(ret); /* treat as a failed pin so the single-page fallback is taken */
1092                 }
1093         }
1094         if (IS_ERR(vma)) { /* fallback: reserve one page of mappable aperture */
1095                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1096                 if (ret)
1097                         goto out_unlock;
1098                 GEM_BUG_ON(!node.allocated);
1099         }
1100
1101         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1102         if (ret)
1103                 goto out_unpin;
1104
1105         mutex_unlock(&i915->drm.struct_mutex); /* the copy to userspace runs without struct_mutex held */
1106
1107         user_data = u64_to_user_ptr(args->data_ptr);
1108         remain = args->size;
1109         offset = args->offset;
1110
1111         while (remain > 0) {
1112                 /* Operation in this page
1113                  *
1114                  * page_base = page offset within aperture
1115                  * page_offset = offset within page
1116                  * page_length = bytes to copy for this page
1117                  */
1118                 u32 page_base = node.start;
1119                 unsigned page_offset = offset_in_page(offset);
1120                 unsigned page_length = PAGE_SIZE - page_offset;
1121                 page_length = remain < page_length ? remain : page_length;
1122                 if (node.allocated) { /* single-page mode: point the reserved slot at the current object page */
1123                         wmb();
1124                         ggtt->vm.insert_page(&ggtt->vm,
1125                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1126                                              node.start, I915_CACHE_NONE, 0);
1127                         wmb();
1128                 } else {
1129                         page_base += offset & PAGE_MASK; /* vma mode: index directly into the pinned object */
1130                 }
1131
1132                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1133                                   user_data, page_length)) {
1134                         ret = -EFAULT;
1135                         break;
1136                 }
1137
1138                 remain -= page_length;
1139                 user_data += page_length;
1140                 offset += page_length;
1141         }
1142
1143         mutex_lock(&i915->drm.struct_mutex); /* retaken (non-interruptibly) for the cleanup below */
1144 out_unpin:
1145         if (node.allocated) {
1146                 wmb();
1147                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1148                 remove_mappable_node(&node);
1149         } else {
1150                 i915_vma_unpin(vma);
1151         }
1152 out_unlock:
1153         intel_runtime_pm_put(i915); /* pairs with intel_runtime_pm_get() taken right after locking */
1154         mutex_unlock(&i915->drm.struct_mutex);
1155
1156         return ret;
1157 }
1158
1159 /**
1160  * i915_gem_pread_ioctl - Reads data from the object referenced by handle.
1161  * @dev: drm device pointer
1162  * @data: ioctl data blob (used as struct drm_i915_gem_pread)
1163  * @file: drm file pointer
1164  *
1165  * Returns 0 on success (a zero-size request succeeds immediately) or a negative errno. On error, the contents of *data are undefined.
1166  */
1167 int
1168 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1169                      struct drm_file *file)
1170 {
1171         struct drm_i915_gem_pread *args = data;
1172         struct drm_i915_gem_object *obj;
1173         int ret;
1174
1175         if (args->size == 0)
1176                 return 0;
1177
1178         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1179                        args->size)) /* validated once here; the copy helpers use unchecked __copy_to_user variants */
1180                 return -EFAULT;
1181
1182         obj = i915_gem_object_lookup(file, args->handle);
1183         if (!obj)
1184                 return -ENOENT;
1185
1186         /* Bounds check source.  */
1187         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1188                 ret = -EINVAL;
1189                 goto out;
1190         }
1191
1192         trace_i915_gem_object_pread(obj, args->offset, args->size);
1193
1194         ret = i915_gem_object_wait(obj,
1195                                    I915_WAIT_INTERRUPTIBLE,
1196                                    MAX_SCHEDULE_TIMEOUT,
1197                                    to_rps_client(file));
1198         if (ret)
1199                 goto out;
1200
1201         ret = i915_gem_object_pin_pages(obj);
1202         if (ret)
1203                 goto out;
1204
1205         ret = i915_gem_shmem_pread(obj, args);
1206         if (ret == -EFAULT || ret == -ENODEV) /* shmem path faulted or object has no struct pages: retry through the GGTT */
1207                 ret = i915_gem_gtt_pread(obj, args);
1208
1209         i915_gem_object_unpin_pages(obj); /* pairs with the pin taken in this function */
1210 out:
1211         i915_gem_object_put(obj); /* drop the reference taken by the lookup */
1212         return ret;
1213 }
1214
1215 /* Copy @length bytes from @user_data into the aperture page at @base (+ @offset).
1216  * The first attempt is the fast path, which uses an atomic mapping and cannot handle page faults in the source data; if it copies short, the whole copy is repeated through a non-atomic mapping. Returns true if some bytes remain uncopied.
1217  */
1218
1219 static inline bool
1220 ggtt_write(struct io_mapping *mapping,
1221            loff_t base, int offset,
1222            char __user *user_data, int length)
1223 {
1224         void __iomem *vaddr;
1225         unsigned long unwritten;
1226
1227         /* We can use the cpu mem copy function because this is X86. */
1228         vaddr = io_mapping_map_atomic_wc(mapping, base);
1229         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1230                                                       user_data, length);
1231         io_mapping_unmap_atomic(vaddr);
1232         if (unwritten) { /* atomic attempt left bytes uncopied: redo the full range via a non-atomic mapping */
1233                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1234                 unwritten = copy_from_user((void __force *)vaddr + offset,
1235                                            user_data, length);
1236                 io_mapping_unmap(vaddr);
1237         }
1238
1239         return unwritten; /* byte count is converted to bool: non-zero means failure */
1240 }
1241
1242 /**
1243  * i915_gem_gtt_pwrite_fast - This is the fast pwrite path, where we copy the data directly from the
1244  * user into the GTT, uncached. Returns 0 or a negative errno; i915_gem_pwrite_ioctl() retries -EFAULT and -ENOSPC through its phys/shmem fallback.
1245  * @obj: i915 GEM object
1246  * @args: pwrite arguments structure
1247  */
1248 static int
1249 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1250                          const struct drm_i915_gem_pwrite *args)
1251 {
1252         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1253         struct i915_ggtt *ggtt = &i915->ggtt;
1254         struct drm_mm_node node;
1255         struct i915_vma *vma;
1256         u64 remain, offset;
1257         void __user *user_data;
1258         int ret;
1259
1260         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1261         if (ret)
1262                 return ret;
1263
1264         if (i915_gem_object_has_struct_page(obj)) {
1265                 /*
1266                  * Avoid waking the device up if we can fallback, as
1267                  * waking/resuming is very slow (worst-case 10-100 ms
1268                  * depending on PCI sleeps and our own resume time).
1269                  * This easily dwarfs any performance advantage from
1270                  * using the cache bypass of indirect GGTT access.
1271                  */
1272                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1273                         ret = -EFAULT; /* -EFAULT is one of the codes that sends the ioctl to its fallback path */
1274                         goto out_unlock; /* no runtime-pm reference was taken, so skip out_rpm */
1275                 }
1276         } else {
1277                 /* No backing pages, no fallback, we must force GGTT access */
1278                 intel_runtime_pm_get(i915);
1279         }
1280
1281         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1282                                        PIN_MAPPABLE |
1283                                        PIN_NONFAULT |
1284                                        PIN_NONBLOCK);
1285         if (!IS_ERR(vma)) { /* whole object is pinned in the mappable aperture */
1286                 node.start = i915_ggtt_offset(vma);
1287                 node.allocated = false; /* node.allocated selects vma mode (false) vs single-page mode (true) in the loop and cleanup below */
1288                 ret = i915_vma_put_fence(vma);
1289                 if (ret) {
1290                         i915_vma_unpin(vma);
1291                         vma = ERR_PTR(ret); /* treat as a failed pin so the single-page fallback is taken */
1292                 }
1293         }
1294         if (IS_ERR(vma)) { /* fallback: reserve one page of mappable aperture */
1295                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1296                 if (ret)
1297                         goto out_rpm;
1298                 GEM_BUG_ON(!node.allocated);
1299         }
1300
1301         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1302         if (ret)
1303                 goto out_unpin;
1304
1305         mutex_unlock(&i915->drm.struct_mutex); /* the copy from userspace runs without struct_mutex held */
1306
1307         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1308
1309         user_data = u64_to_user_ptr(args->data_ptr);
1310         offset = args->offset;
1311         remain = args->size;
1312         while (remain) {
1313                 /* Operation in this page
1314                  *
1315                  * page_base = page offset within aperture
1316                  * page_offset = offset within page
1317                  * page_length = bytes to copy for this page
1318                  */
1319                 u32 page_base = node.start;
1320                 unsigned int page_offset = offset_in_page(offset);
1321                 unsigned int page_length = PAGE_SIZE - page_offset;
1322                 page_length = remain < page_length ? remain : page_length;
1323                 if (node.allocated) {
1324                         wmb(); /* flush the write before we modify the GGTT */
1325                         ggtt->vm.insert_page(&ggtt->vm,
1326                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1327                                              node.start, I915_CACHE_NONE, 0);
1328                         wmb(); /* flush modifications to the GGTT (insert_page) */
1329                 } else {
1330                         page_base += offset & PAGE_MASK; /* vma mode: index directly into the pinned object */
1331                 }
1332                 /* If we get a fault while copying data, then (presumably) our
1333                  * source page isn't available.  Return the error and we'll
1334                  * retry in the slow path.
1335                  * If the object is non-shmem backed, we retry again with the
1336                  * path that handles page fault.
1337                  */
1338                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1339                                user_data, page_length)) {
1340                         ret = -EFAULT;
1341                         break;
1342                 }
1343
1344                 remain -= page_length;
1345                 user_data += page_length;
1346                 offset += page_length;
1347         }
1348         intel_fb_obj_flush(obj, ORIGIN_CPU); /* runs after the loop whether or not a copy failed */
1349
1350         mutex_lock(&i915->drm.struct_mutex); /* retaken (non-interruptibly) for the cleanup below */
1351 out_unpin:
1352         if (node.allocated) {
1353                 wmb();
1354                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1355                 remove_mappable_node(&node);
1356         } else {
1357                 i915_vma_unpin(vma);
1358         }
1359 out_rpm:
1360         intel_runtime_pm_put(i915); /* pairs with whichever runtime-pm get succeeded above */
1361 out_unlock:
1362         mutex_unlock(&i915->drm.struct_mutex);
1363         return ret;
1364 }
1365
1366 /* Per-page copy function for the shmem pwrite fastpath.
1367  * Flushes invalid cachelines before writing to the target if
1368  * needs_clflush_before is set and flushes out any written cachelines after
1369  * writing if needs_clflush_after is set. Returns 0 on success or -EFAULT if the copy from userspace was incomplete.
1370  */
1371 static int
1372 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1373              bool needs_clflush_before,
1374              bool needs_clflush_after)
1375 {
1376         char *vaddr;
1377         int ret;
1378
1379         vaddr = kmap(page);
1380
1381         if (needs_clflush_before)
1382                 drm_clflush_virt_range(vaddr + offset, len);
1383
1384         ret = __copy_from_user(vaddr + offset, user_data, len); /* unchecked variant: i915_gem_pwrite_ioctl() runs access_ok() on the whole user range */
1385         if (!ret && needs_clflush_after) /* the post-write flush is skipped when the copy was incomplete */
1386                 drm_clflush_virt_range(vaddr + offset, len);
1387
1388         kunmap(page);
1389
1390         return ret ? -EFAULT : 0; /* any non-zero result from the copy is reported as -EFAULT */
1391 }
1392 /* pwrite through the object's struct pages: prepare the object under struct_mutex, then copy page by page from userspace with the mutex dropped. Returns 0 or a negative errno. */
1393 static int
1394 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1395                       const struct drm_i915_gem_pwrite *args)
1396 {
1397         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1398         void __user *user_data;
1399         u64 remain;
1400         unsigned int partial_cacheline_write;
1401         unsigned int needs_clflush;
1402         unsigned int offset, idx;
1403         int ret;
1404
1405         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1406         if (ret)
1407                 return ret;
1408
1409         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1410         mutex_unlock(&i915->drm.struct_mutex); /* only the prepare step runs under struct_mutex; the copy loop below does not */
1411         if (ret)
1412                 return ret;
1413
1414         /* If we don't overwrite a cacheline completely we need to be
1415          * careful to have up-to-date data by first clflushing. Don't
1416          * overcomplicate things and flush the entire span being written.
1417          */
1418         partial_cacheline_write = 0;
1419         if (needs_clflush & CLFLUSH_BEFORE)
1420                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1; /* used as a low-bits mask; assumes the clflush size is a power of two - TODO confirm */
1421
1422         user_data = u64_to_user_ptr(args->data_ptr);
1423         remain = args->size;
1424         offset = offset_in_page(args->offset);
1425         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1426                 struct page *page = i915_gem_object_get_page(obj, idx);
1427                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset); /* never cross a page boundary in one copy */
1428
1429                 ret = shmem_pwrite(page, offset, length, user_data,
1430                                    (offset | length) & partial_cacheline_write, /* non-zero only when the mask is set and offset or length has bits below the cacheline size */
1431                                    needs_clflush & CLFLUSH_AFTER);
1432                 if (ret)
1433                         break;
1434
1435                 remain -= length;
1436                 user_data += length;
1437                 offset = 0; /* only the first page can start at a non-zero offset */
1438         }
1439
1440         intel_fb_obj_flush(obj, ORIGIN_CPU);
1441         i915_gem_obj_finish_shmem_access(obj); /* runs on success and on copy failure; presumably releases the pin left by the prepare step - TODO confirm */
1442         return ret;
1443 }
1444
1445 /**
1446  * i915_gem_pwrite_ioctl - Writes data to the object referenced by handle.
1447  * @dev: drm device
1448  * @data: ioctl data blob (used as struct drm_i915_gem_pwrite)
1449  * @file: drm file
1450  *
1451  * Returns 0 on success (a zero-size request succeeds immediately) or a negative errno. On error, the contents of the buffer that were to be modified are undefined.
1452  */
1453 int
1454 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1455                       struct drm_file *file)
1456 {
1457         struct drm_i915_gem_pwrite *args = data;
1458         struct drm_i915_gem_object *obj;
1459         int ret;
1460
1461         if (args->size == 0)
1462                 return 0;
1463
1464         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size)) /* validated once here; the copy helpers use unchecked __copy_from_user variants */
1465                 return -EFAULT;
1466
1467         obj = i915_gem_object_lookup(file, args->handle);
1468         if (!obj)
1469                 return -ENOENT;
1470
1471         /* Bounds check destination. */
1472         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1473                 ret = -EINVAL;
1474                 goto err;
1475         }
1476
1477         /* Writes not allowed into this read-only object */
1478         if (i915_gem_object_is_readonly(obj)) {
1479                 ret = -EINVAL;
1480                 goto err;
1481         }
1482
1483         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1484
1485         ret = -ENODEV; /* -ENODEV means no object-specific pwrite handled the request; any other value is final */
1486         if (obj->ops->pwrite)
1487                 ret = obj->ops->pwrite(obj, args);
1488         if (ret != -ENODEV)
1489                 goto err; /* also taken on success: "err" is the common exit label */
1490
1491         ret = i915_gem_object_wait(obj,
1492                                    I915_WAIT_INTERRUPTIBLE |
1493                                    I915_WAIT_ALL,
1494                                    MAX_SCHEDULE_TIMEOUT,
1495                                    to_rps_client(file));
1496         if (ret)
1497                 goto err;
1498
1499         ret = i915_gem_object_pin_pages(obj);
1500         if (ret)
1501                 goto err;
1502
1503         ret = -EFAULT; /* default, so the fallback below runs when the GTT path is not attempted */
1504         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1505          * it would end up going through the fenced access, and we'll get
1506          * different detiling behavior between reading and writing.
1507          * pread/pwrite currently are reading and writing from the CPU
1508          * perspective, requiring manual detiling by the client.
1509          */
1510         if (!i915_gem_object_has_struct_page(obj) ||
1511             cpu_write_needs_clflush(obj))
1512                 /* Note that the gtt paths might fail with non-page-backed user
1513                  * pointers (e.g. gtt mappings when moving data between
1514                  * textures). Fallback to the shmem path in that case.
1515                  */
1516                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1517
1518         if (ret == -EFAULT || ret == -ENOSPC) { /* GTT path skipped, faulted, or found no aperture space */
1519                 if (obj->phys_handle)
1520                         ret = i915_gem_phys_pwrite(obj, args, file);
1521                 else
1522                         ret = i915_gem_shmem_pwrite(obj, args);
1523         }
1524
1525         i915_gem_object_unpin_pages(obj); /* pairs with the pin taken in this function */
1526 err:
1527         i915_gem_object_put(obj); /* drop the reference taken by the lookup */
1528         return ret;
1529 }
1530 /* Move obj's GGTT vmas that are idle and have an allocated node to the tail of their vm's inactive list, then move obj to the tail of the bound or unbound object list. The object's pages must be pinned (GEM_BUG_ON otherwise). */
1531 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1532 {
1533         struct drm_i915_private *i915;
1534         struct list_head *list;
1535         struct i915_vma *vma;
1536
1537         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1538
1539         for_each_ggtt_vma(vma, obj) {
1540                 if (i915_vma_is_active(vma)) /* active vmas are left where they are */
1541                         continue;
1542
1543                 if (!drm_mm_node_allocated(&vma->node)) /* vmas without an allocated node are skipped */
1544                         continue;
1545
1546                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1547         }
1548
1549         i915 = to_i915(obj->base.dev);
1550         spin_lock(&i915->mm.obj_lock); /* obj_lock is held only around the object-list move below */
1551         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1552         list_move_tail(&obj->mm.link, list);
1553         spin_unlock(&i915->mm.obj_lock);
1554 }
1555
1556 /**
1557  * i915_gem_set_domain_ioctl - Called when user space prepares to use an object with the CPU, either
1558  * through the mmap ioctl's mapping or a GTT mapping. Returns 0 or a negative errno (-EINVAL for GPU or inconsistent domains, -ENOENT for a bad handle, -ENXIO for proxy objects).
1559  * @dev: drm device
1560  * @data: ioctl data blob (used as struct drm_i915_gem_set_domain)
1561  * @file: drm file
1562  */
1563 int
1564 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1565                           struct drm_file *file)
1566 {
1567         struct drm_i915_gem_set_domain *args = data;
1568         struct drm_i915_gem_object *obj;
1569         uint32_t read_domains = args->read_domains;
1570         uint32_t write_domain = args->write_domain;
1571         int err;
1572
1573         /* Only handle setting domains to types used by the CPU. */
1574         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1575                 return -EINVAL;
1576
1577         /* Having something in the write domain implies it's in the read
1578          * domain, and only that read domain.  Enforce that in the request.
1579          */
1580         if (write_domain != 0 && read_domains != write_domain)
1581                 return -EINVAL;
1582
1583         obj = i915_gem_object_lookup(file, args->handle);
1584         if (!obj)
1585                 return -ENOENT;
1586
1587         /* Try to flush the object off the GPU without holding the lock.
1588          * We will repeat the flush holding the lock in the normal manner
1589          * to catch cases where we are gazumped.
1590          */
1591         err = i915_gem_object_wait(obj,
1592                                    I915_WAIT_INTERRUPTIBLE |
1593                                    I915_WAIT_PRIORITY |
1594                                    (write_domain ? I915_WAIT_ALL : 0), /* I915_WAIT_ALL is added only when a write domain is requested */
1595                                    MAX_SCHEDULE_TIMEOUT,
1596                                    to_rps_client(file));
1597         if (err)
1598                 goto out;
1599
1600         /*
1601          * Proxy objects do not control access to the backing storage, ergo
1602          * they cannot be used as a means to manipulate the cache domain
1603          * tracking for that backing storage. The proxy object is always
1604          * considered to be outside of any cache domain.
1605          */
1606         if (i915_gem_object_is_proxy(obj)) {
1607                 err = -ENXIO;
1608                 goto out;
1609         }
1610
1611         /*
1612          * Flush and acquire obj->pages so that we are coherent through
1613          * direct access in memory with previous cached writes through
1614          * shmemfs and that our cache domain tracking remains valid.
1615          * For example, if the obj->filp was moved to swap without us
1616          * being notified and releasing the pages, we would mistakenly
1617          * continue to assume that the obj remained out of the CPU cached
1618          * domain.
1619          */
1620         err = i915_gem_object_pin_pages(obj);
1621         if (err)
1622                 goto out;
1623
1624         err = i915_mutex_lock_interruptible(dev);
1625         if (err)
1626                 goto out_unpin;
1627
1628         if (read_domains & I915_GEM_DOMAIN_WC) /* precedence when several read domains are set: WC, then GTT, otherwise CPU */
1629                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1630         else if (read_domains & I915_GEM_DOMAIN_GTT)
1631                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1632         else
1633                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1634
1635         /* And bump the LRU for this access */
1636         i915_gem_object_bump_inactive_ggtt(obj); /* NOTE(review): executed even when the domain change above returned an error */
1637
1638         mutex_unlock(&dev->struct_mutex);
1639
1640         if (write_domain != 0) /* NOTE(review): err is not consulted here either */
1641                 intel_fb_obj_invalidate(obj,
1642                                         fb_write_origin(obj, write_domain));
1643
1644 out_unpin:
1645         i915_gem_object_unpin_pages(obj); /* pairs with the pin taken in this function */
1646 out:
1647         i915_gem_object_put(obj); /* drop the reference taken by the lookup */
1648         return err;
1649 }
1650
1651 /**
1652  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer. Returns 0, or -ENOENT if the handle does not resolve to an object.
1653  * @dev: drm device
1654  * @data: ioctl data blob (used as struct drm_i915_gem_sw_finish)
1655  * @file: drm file
1656  */
1657 int
1658 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1659                          struct drm_file *file)
1660 {
1661         struct drm_i915_gem_sw_finish *args = data;
1662         struct drm_i915_gem_object *obj;
1663
1664         obj = i915_gem_object_lookup(file, args->handle);
1665         if (!obj)
1666                 return -ENOENT;
1667
1668         /*
1669          * Proxy objects are barred from CPU access, so there is no
1670          * need to ban sw_finish as it is a nop.
1671          */
1672
1673         /* Pinned buffers may be scanout, so flush the cache */
1674         i915_gem_object_flush_if_display(obj);
1675         i915_gem_object_put(obj); /* drop the reference taken by the lookup */
1676
1677         return 0;
1678 }
1679
1680 /**
1681  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1682  *                       it is mapped to.
1683  * @dev: drm device
1684  * @data: ioctl data blob
1685  * @file: drm file
1686  *
1687  * While the mapping holds a reference on the contents of the object, it doesn't
1688  * imply a ref on the object itself.
1689  *
1690  * IMPORTANT:
1691  *
1692  * DRM driver writers who look a this function as an example for how to do GEM
1693  * mmap support, please don't implement mmap support like here. The modern way
1694  * to implement DRM mmap support is with an mmap offset ioctl (like
1695  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1696  * That way debug tooling like valgrind will understand what's going on, hiding
1697  * the mmap call in a driver private ioctl will break that. The i915 driver only
1698  * does cpu mmaps this way because we didn't know better.
1699  */
1700 int
1701 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1702                     struct drm_file *file)
1703 {
1704         struct drm_i915_gem_mmap *args = data;
1705         struct drm_i915_gem_object *obj;
1706         unsigned long addr;
1707
1708         if (args->flags & ~(I915_MMAP_WC))
1709                 return -EINVAL;
1710
1711         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1712                 return -ENODEV;
1713
1714         obj = i915_gem_object_lookup(file, args->handle);
1715         if (!obj)
1716                 return -ENOENT;
1717
1718         /* prime objects have no backing filp to GEM mmap
1719          * pages from.
1720          */
1721         if (!obj->base.filp) {
1722                 i915_gem_object_put(obj);
1723                 return -ENXIO;
1724         }
1725
1726         addr = vm_mmap(obj->base.filp, 0, args->size,
1727                        PROT_READ | PROT_WRITE, MAP_SHARED,
1728                        args->offset);
1729         if (args->flags & I915_MMAP_WC) {
1730                 struct mm_struct *mm = current->mm;
1731                 struct vm_area_struct *vma;
1732
1733                 if (down_write_killable(&mm->mmap_sem)) {
1734                         i915_gem_object_put(obj);
1735                         return -EINTR;
1736                 }
1737                 vma = find_vma(mm, addr);
1738                 if (vma)
1739                         vma->vm_page_prot =
1740                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1741                 else
1742                         addr = -ENOMEM;
1743                 up_write(&mm->mmap_sem);
1744
1745                 /* This may race, but that's ok, it only gets set */
1746                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1747         }
1748         i915_gem_object_put(obj);
1749         if (IS_ERR((void *)addr))
1750                 return addr;
1751
1752         args->addr_ptr = (uint64_t) addr;
1753
1754         return 0;
1755 }
1756
/*
 * Convert the tile row size of @obj (presumably in bytes -- confirm against
 * i915_gem_object_get_tile_row_size()) into a page count, rounding down.
 */
static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
{
	return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
}
1761
/**
 * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
 *
 * A history of the GTT mmap interface:
 *
 * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to be
 *     aligned and suitable for fencing, and still fit into the available
 *     mappable space left by the pinned display objects. A classic problem
 *     we called the page-fault-of-doom where we would ping-pong between
 *     two objects that could not fit inside the GTT and so the memcpy
 *     would page one object in at the expense of the other between every
 *     single byte.
 *
 * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
 *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
 *     object is too large for the available space (or simply too large
 *     for the mappable aperture!), a view is created instead and faulted
 *     into userspace. (This view is aligned and sized appropriately for
 *     fenced access.)
 *
 * 2 - Recognise WC as a separate cache domain so that we can flush the
 *     delayed writes via GTT before performing direct access via WC.
 *
 * Restrictions:
 *
 *  * snoopable objects cannot be accessed via the GTT. It can cause machine
 *    hangs on some architectures, corruption on others. An attempt to service
 *    a GTT page fault from a snoopable object will generate a SIGBUS.
 *
 *  * the object must be able to fit into RAM (physical memory, though not
 *    limited to the mappable aperture).
 *
 *
 * Caveats:
 *
 *  * a new GTT page fault will synchronize rendering from the GPU and flush
 *    all data to system memory. Subsequent access will not be synchronized.
 *
 *  * all mappings are revoked on runtime device suspend.
 *
 *  * there are only 8, 16 or 32 fence registers to share between all users
 *    (older machines require fence register for display and blitter access
 *    as well). Contention of the fence registers will cause the previous users
 *    to be unmapped and any new access will generate new page faults.
 *
 *  * running out of memory while servicing a fault may generate a SIGBUS,
 *    rather than the expected SIGSEGV.
 *
 * Returns the highest interface revision listed in the history above.
 */
int i915_gem_mmap_gtt_version(void)
{
	/* Keep in step with the interface history documented above. */
	enum { I915_MMAP_GTT_VERSION = 2 };

	return I915_MMAP_GTT_VERSION;
}
1814
1815 static inline struct i915_ggtt_view
1816 compute_partial_view(const struct drm_i915_gem_object *obj,
1817                      pgoff_t page_offset,
1818                      unsigned int chunk)
1819 {
1820         struct i915_ggtt_view view;
1821
1822         if (i915_gem_object_is_tiled(obj))
1823                 chunk = roundup(chunk, tile_row_pages(obj));
1824
1825         view.type = I915_GGTT_VIEW_PARTIAL;
1826         view.partial.offset = rounddown(page_offset, chunk);
1827         view.partial.size =
1828                 min_t(unsigned int, chunk,
1829                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1830
1831         /* If the partial covers the entire object, just create a normal VMA. */
1832         if (chunk >= obj->base.size >> PAGE_SHIFT)
1833                 view.type = I915_GGTT_VIEW_NORMAL;
1834
1835         return view;
1836 }
1837
/**
 * i915_gem_fault - fault a page into the GTT
 * @vmf: fault info
 *
 * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
 * from userspace.  The fault handler takes care of binding the object to
 * the GTT (if needed), allocating and programming a fence register (again,
 * only if needed based on whether the old reg is still valid or the object
 * is tiled) and inserting a new PTE into the faulting process.
 *
 * Note that the faulting process may involve evicting existing objects
 * from the GTT and/or fence registers to make room.  So performance may
 * suffer if the GTT working set is large or there are few fence registers
 * left.
 *
 * The current feature set supported by i915_gem_fault() and thus GTT mmaps
 * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
 *
 * Returns a VM_FAULT_* code; the internal errno is translated by the switch
 * at the end of the function.
 */
vm_fault_t i915_gem_fault(struct vm_fault *vmf)
{
/* Chunk size, in pages (1MiB worth), requested when falling back to a partial view */
#define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
	struct vm_area_struct *area = vmf->vma;
	struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
	struct drm_device *dev = obj->base.dev;
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct i915_ggtt *ggtt = &dev_priv->ggtt;
	bool write = area->vm_flags & VM_WRITE;
	struct i915_vma *vma;
	pgoff_t page_offset;
	int ret;

	/* Sanity check that we allow writing into this object */
	if (i915_gem_object_is_readonly(obj) && write)
		return VM_FAULT_SIGBUS;

	/* We don't use vmf->pgoff since that has the fake offset */
	page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;

	trace_i915_gem_object_fault(obj, page_offset, true, write);

	/* Try to flush the object off the GPU first without holding the lock.
	 * Upon acquiring the lock, we will perform our sanity checks and then
	 * repeat the flush holding the lock in the normal manner to catch cases
	 * where we are gazumped.
	 */
	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE,
				   MAX_SCHEDULE_TIMEOUT,
				   NULL);
	if (ret)
		goto err;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		goto err;

	/* Held until err_rpm; asserted below before the userfault is recorded */
	intel_runtime_pm_get(dev_priv);

	ret = i915_mutex_lock_interruptible(dev);
	if (ret)
		goto err_rpm;

	/* Access to snoopable pages through the GTT is incoherent. */
	if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
		ret = -EFAULT;
		goto err_unlock;
	}


	/* Now pin it into the GTT as needed */
	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
				       PIN_MAPPABLE |
				       PIN_NONBLOCK |
				       PIN_NONFAULT);
	if (IS_ERR(vma)) {
		/* Use a partial view if it is bigger than available space */
		struct i915_ggtt_view view =
			compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
		unsigned int flags;

		flags = PIN_MAPPABLE;
		if (view.type == I915_GGTT_VIEW_NORMAL)
			flags |= PIN_NONBLOCK; /* avoid warnings for pinned */

		/*
		 * Userspace is now writing through an untracked VMA, abandon
		 * all hope that the hardware is able to track future writes.
		 */
		obj->frontbuffer_ggtt_origin = ORIGIN_CPU;

		vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
		/*
		 * A zero view.type is presumably I915_GGTT_VIEW_NORMAL (the
		 * case given PIN_NONBLOCK above) -- TODO confirm the enum
		 * value. If that attempt failed, retry as a partial view
		 * without PIN_NONBLOCK.
		 */
		if (IS_ERR(vma) && !view.type) {
			flags = PIN_MAPPABLE;
			view.type = I915_GGTT_VIEW_PARTIAL;
			vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
		}
	}
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto err_unlock;
	}

	ret = i915_gem_object_set_to_gtt_domain(obj, write);
	if (ret)
		goto err_unpin;

	ret = i915_vma_pin_fence(vma);
	if (ret)
		goto err_unpin;

	/* Finally, remap it using the new GTT offset */
	ret = remap_io_mapping(area,
			       area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
			       (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
			       min_t(u64, vma->size, area->vm_end - area->vm_start),
			       &ggtt->iomap);
	if (ret)
		goto err_fence;

	/* Mark as being mmapped into userspace for later revocation */
	assert_rpm_wakelock_held(dev_priv);
	if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
		list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
	GEM_BUG_ON(!obj->userfault_count);

	i915_vma_set_ggtt_write(vma);

	/*
	 * The success path falls through the unwind labels as well: the fence
	 * and vma pins, the mutex, the runtime-pm reference and the page pin
	 * taken above are all dropped before returning.
	 */
err_fence:
	i915_vma_unpin_fence(vma);
err_unpin:
	__i915_vma_unpin(vma);
err_unlock:
	mutex_unlock(&dev->struct_mutex);
err_rpm:
	intel_runtime_pm_put(dev_priv);
	i915_gem_object_unpin_pages(obj);
err:
	/* Translate the errno (0 on success) into a fault code */
	switch (ret) {
	case -EIO:
		/*
		 * We eat errors when the gpu is terminally wedged to avoid
		 * userspace unduly crashing (gl has no provisions for mmaps to
		 * fail). But any other -EIO isn't ours (e.g. swap in failure)
		 * and so needs to be reported.
		 */
		if (!i915_terminally_wedged(&dev_priv->gpu_error))
			return VM_FAULT_SIGBUS;
		/* else: fall through */
	case -EAGAIN:
		/*
		 * EAGAIN means the gpu is hung and we'll wait for the error
		 * handler to reset everything when re-faulting in
		 * i915_mutex_lock_interruptible.
		 */
	case 0:
	case -ERESTARTSYS:
	case -EINTR:
	case -EBUSY:
		/*
		 * EBUSY is ok: this just means that another thread
		 * already did the job.
		 */
		return VM_FAULT_NOPAGE;
	case -ENOMEM:
		return VM_FAULT_OOM;
	case -ENOSPC:
	case -EFAULT:
		return VM_FAULT_SIGBUS;
	default:
		WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
		return VM_FAULT_SIGBUS;
	}
}
2011
2012 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2013 {
2014         struct i915_vma *vma;
2015
2016         GEM_BUG_ON(!obj->userfault_count);
2017
2018         obj->userfault_count = 0;
2019         list_del(&obj->userfault_link);
2020         drm_vma_node_unmap(&obj->base.vma_node,
2021                            obj->base.dev->anon_inode->i_mapping);
2022
2023         for_each_ggtt_vma(vma, obj)
2024                 i915_vma_unset_userfault(vma);
2025 }
2026
2027 /**
2028  * i915_gem_release_mmap - remove physical page mappings
2029  * @obj: obj in question
2030  *
2031  * Preserve the reservation of the mmapping with the DRM core code, but
2032  * relinquish ownership of the pages back to the system.
2033  *
2034  * It is vital that we remove the page mapping if we have mapped a tiled
2035  * object through the GTT and then lose the fence register due to
2036  * resource pressure. Similarly if the object has been moved out of the
2037  * aperture, than pages mapped into userspace must be revoked. Removing the
2038  * mapping will then trigger a page fault on the next user access, allowing
2039  * fixup by i915_gem_fault().
2040  */
2041 void
2042 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2043 {
2044         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2045
2046         /* Serialisation between user GTT access and our code depends upon
2047          * revoking the CPU's PTE whilst the mutex is held. The next user
2048          * pagefault then has to wait until we release the mutex.
2049          *
2050          * Note that RPM complicates somewhat by adding an additional
2051          * requirement that operations to the GGTT be made holding the RPM
2052          * wakeref.
2053          */
2054         lockdep_assert_held(&i915->drm.struct_mutex);
2055         intel_runtime_pm_get(i915);
2056
2057         if (!obj->userfault_count)
2058                 goto out;
2059
2060         __i915_gem_object_release_mmap(obj);
2061
2062         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2063          * memory transactions from userspace before we return. The TLB
2064          * flushing implied above by changing the PTE above *should* be
2065          * sufficient, an extra barrier here just provides us with a bit
2066          * of paranoid documentation about our requirement to serialise
2067          * memory writes before touching registers / GSM.
2068          */
2069         wmb();
2070
2071 out:
2072         intel_runtime_pm_put(i915);
2073 }
2074
2075 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2076 {
2077         struct drm_i915_gem_object *obj, *on;
2078         int i;
2079
2080         /*
2081          * Only called during RPM suspend. All users of the userfault_list
2082          * must be holding an RPM wakeref to ensure that this can not
2083          * run concurrently with themselves (and use the struct_mutex for
2084          * protection between themselves).
2085          */
2086
2087         list_for_each_entry_safe(obj, on,
2088                                  &dev_priv->mm.userfault_list, userfault_link)
2089                 __i915_gem_object_release_mmap(obj);
2090
2091         /* The fence will be lost when the device powers down. If any were
2092          * in use by hardware (i.e. they are pinned), we should not be powering
2093          * down! All other fences will be reacquired by the user upon waking.
2094          */
2095         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2096                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2097
2098                 /* Ideally we want to assert that the fence register is not
2099                  * live at this point (i.e. that no piece of code will be
2100                  * trying to write through fence + GTT, as that both violates
2101                  * our tracking of activity and associated locking/barriers,
2102                  * but also is illegal given that the hw is powered down).
2103                  *
2104                  * Previously we used reg->pin_count as a "liveness" indicator.
2105                  * That is not sufficient, and we need a more fine-grained
2106                  * tool if we want to have a sanity check here.
2107                  */
2108
2109                 if (!reg->vma)
2110                         continue;
2111
2112                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2113                 reg->dirty = true;
2114         }
2115 }
2116
2117 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2118 {
2119         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2120         int err;
2121
2122         err = drm_gem_create_mmap_offset(&obj->base);
2123         if (likely(!err))
2124                 return 0;
2125
2126         /* Attempt to reap some mmap space from dead objects */
2127         do {
2128                 err = i915_gem_wait_for_idle(dev_priv,
2129                                              I915_WAIT_INTERRUPTIBLE,
2130                                              MAX_SCHEDULE_TIMEOUT);
2131                 if (err)
2132                         break;
2133
2134                 i915_gem_drain_freed_objects(dev_priv);
2135                 err = drm_gem_create_mmap_offset(&obj->base);
2136                 if (!err)
2137                         break;
2138
2139         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2140
2141         return err;
2142 }
2143
2144 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2145 {
2146         drm_gem_free_mmap_offset(&obj->base);
2147 }
2148
2149 int
2150 i915_gem_mmap_gtt(struct drm_file *file,
2151                   struct drm_device *dev,
2152                   uint32_t handle,
2153                   uint64_t *offset)
2154 {
2155         struct drm_i915_gem_object *obj;
2156         int ret;
2157
2158         obj = i915_gem_object_lookup(file, handle);
2159         if (!obj)
2160                 return -ENOENT;
2161
2162         ret = i915_gem_object_create_mmap_offset(obj);
2163         if (ret == 0)
2164                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2165
2166         i915_gem_object_put(obj);
2167         return ret;
2168 }
2169
2170 /**
2171  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2172  * @dev: DRM device
2173  * @data: GTT mapping ioctl data
2174  * @file: GEM object info
2175  *
2176  * Simply returns the fake offset to userspace so it can mmap it.
2177  * The mmap call will end up in drm_gem_mmap(), which will set things
2178  * up so we can get faults in the handler above.
2179  *
2180  * The fault handler will take care of binding the object into the GTT
2181  * (since it may have been evicted to make room for something), allocating
2182  * a fence register, and mapping the appropriate aperture address into
2183  * userspace.
2184  */
2185 int
2186 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2187                         struct drm_file *file)
2188 {
2189         struct drm_i915_gem_mmap_gtt *args = data;
2190
2191         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2192 }
2193
2194 /* Immediately discard the backing storage */
2195 static void
2196 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2197 {
2198         i915_gem_object_free_mmap_offset(obj);
2199
2200         if (obj->base.filp == NULL)
2201                 return;
2202
2203         /* Our goal here is to return as much of the memory as
2204          * is possible back to the system as we are called from OOM.
2205          * To do this we must instruct the shmfs to drop all of its
2206          * backing pages, *now*.
2207          */
2208         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2209         obj->mm.madv = __I915_MADV_PURGED;
2210         obj->mm.pages = ERR_PTR(-EFAULT);
2211 }
2212
2213 /* Try to discard unwanted pages */
2214 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2215 {
2216         struct address_space *mapping;
2217
2218         lockdep_assert_held(&obj->mm.lock);
2219         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2220
2221         switch (obj->mm.madv) {
2222         case I915_MADV_DONTNEED:
2223                 i915_gem_object_truncate(obj);
2224         case __I915_MADV_PURGED:
2225                 return;
2226         }
2227
2228         if (obj->base.filp == NULL)
2229                 return;
2230
2231         mapping = obj->base.filp->f_mapping,
2232         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2233 }
2234
/*
 * Move pages to appropriate lru and release the pagevec, decrementing the
 * ref count of those pages.
 *
 * Also offers a reschedule point after each batch, since callers invoke this
 * repeatedly from within a loop over all of an object's pages.
 */
static void check_release_pagevec(struct pagevec *pvec)
{
	check_move_unevictable_pages(pvec);
	__pagevec_release(pvec);
	cond_resched();
}
2245
2246 static void
2247 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2248                               struct sg_table *pages)
2249 {
2250         struct sgt_iter sgt_iter;
2251         struct pagevec pvec;
2252         struct page *page;
2253
2254         __i915_gem_object_release_shmem(obj, pages, true);
2255
2256         i915_gem_gtt_finish_pages(obj, pages);
2257
2258         if (i915_gem_object_needs_bit17_swizzle(obj))
2259                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2260
2261         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2262
2263         pagevec_init(&pvec);
2264         for_each_sgt_page(page, sgt_iter, pages) {
2265                 if (obj->mm.dirty)
2266                         set_page_dirty(page);
2267
2268                 if (obj->mm.madv == I915_MADV_WILLNEED)
2269                         mark_page_accessed(page);
2270
2271                 if (!pagevec_add(&pvec, page))
2272                         check_release_pagevec(&pvec);
2273         }
2274         if (pagevec_count(&pvec))
2275                 check_release_pagevec(&pvec);
2276         obj->mm.dirty = false;
2277
2278         sg_free_table(pages);
2279         kfree(pages);
2280 }
2281
2282 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2283 {
2284         struct radix_tree_iter iter;
2285         void __rcu **slot;
2286
2287         rcu_read_lock();
2288         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2289                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2290         rcu_read_unlock();
2291 }
2292
2293 static struct sg_table *
2294 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2295 {
2296         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2297         struct sg_table *pages;
2298
2299         pages = fetch_and_zero(&obj->mm.pages);
2300         if (!pages)
2301                 return NULL;
2302
2303         spin_lock(&i915->mm.obj_lock);
2304         list_del(&obj->mm.link);
2305         spin_unlock(&i915->mm.obj_lock);
2306
2307         if (obj->mm.mapping) {
2308                 void *ptr;
2309
2310                 ptr = page_mask_bits(obj->mm.mapping);
2311                 if (is_vmalloc_addr(ptr))
2312                         vunmap(ptr);
2313                 else
2314                         kunmap(kmap_to_page(ptr));
2315
2316                 obj->mm.mapping = NULL;
2317         }
2318
2319         __i915_gem_object_reset_page_iter(obj);
2320         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2321
2322         return pages;
2323 }
2324
2325 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2326                                  enum i915_mm_subclass subclass)
2327 {
2328         struct sg_table *pages;
2329
2330         if (i915_gem_object_has_pinned_pages(obj))
2331                 return;
2332
2333         GEM_BUG_ON(obj->bind_count);
2334         if (!i915_gem_object_has_pages(obj))
2335                 return;
2336
2337         /* May be called by shrinker from within get_pages() (on another bo) */
2338         mutex_lock_nested(&obj->mm.lock, subclass);
2339         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2340                 goto unlock;
2341
2342         /*
2343          * ->put_pages might need to allocate memory for the bit17 swizzle
2344          * array, hence protect them from being reaped by removing them from gtt
2345          * lists early.
2346          */
2347         pages = __i915_gem_object_unset_pages(obj);
2348         if (!IS_ERR(pages))
2349                 obj->ops->put_pages(obj, pages);
2350
2351 unlock:
2352         mutex_unlock(&obj->mm.lock);
2353 }
2354
2355 bool i915_sg_trim(struct sg_table *orig_st)
2356 {
2357         struct sg_table new_st;
2358         struct scatterlist *sg, *new_sg;
2359         unsigned int i;
2360
2361         if (orig_st->nents == orig_st->orig_nents)
2362                 return false;
2363
2364         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2365                 return false;
2366
2367         new_sg = new_st.sgl;
2368         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2369                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2370                 sg_dma_address(new_sg) = sg_dma_address(sg);
2371                 sg_dma_len(new_sg) = sg_dma_len(sg);
2372
2373                 new_sg = sg_next(new_sg);
2374         }
2375         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2376
2377         sg_free_table(orig_st);
2378
2379         *orig_st = new_st;
2380         return true;
2381 }
2382
2383 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2384 {
2385         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2386         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2387         unsigned long i;
2388         struct address_space *mapping;
2389         struct sg_table *st;
2390         struct scatterlist *sg;
2391         struct sgt_iter sgt_iter;
2392         struct page *page;
2393         unsigned long last_pfn = 0;     /* suppress gcc warning */
2394         unsigned int max_segment = i915_sg_segment_size();
2395         unsigned int sg_page_sizes;
2396         struct pagevec pvec;
2397         gfp_t noreclaim;
2398         int ret;
2399
2400         /*
2401          * Assert that the object is not currently in any GPU domain. As it
2402          * wasn't in the GTT, there shouldn't be any way it could have been in
2403          * a GPU cache
2404          */
2405         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2406         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2407
2408         /*
2409          * If there's no chance of allocating enough pages for the whole
2410          * object, bail early.
2411          */
2412         if (page_count > totalram_pages())
2413                 return -ENOMEM;
2414
2415         st = kmalloc(sizeof(*st), GFP_KERNEL);
2416         if (st == NULL)
2417                 return -ENOMEM;
2418
2419 rebuild_st:
2420         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2421                 kfree(st);
2422                 return -ENOMEM;
2423         }
2424
2425         /*
2426          * Get the list of pages out of our struct file.  They'll be pinned
2427          * at this point until we release them.
2428          *
2429          * Fail silently without starting the shrinker
2430          */
2431         mapping = obj->base.filp->f_mapping;
2432         mapping_set_unevictable(mapping);
2433         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2434         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2435
2436         sg = st->sgl;
2437         st->nents = 0;
2438         sg_page_sizes = 0;
2439         for (i = 0; i < page_count; i++) {
2440                 const unsigned int shrink[] = {
2441                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2442                         0,
2443                 }, *s = shrink;
2444                 gfp_t gfp = noreclaim;
2445
2446                 do {
2447                         cond_resched();
2448                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2449                         if (likely(!IS_ERR(page)))
2450                                 break;
2451
2452                         if (!*s) {
2453                                 ret = PTR_ERR(page);
2454                                 goto err_sg;
2455                         }
2456
2457                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2458
2459                         /*
2460                          * We've tried hard to allocate the memory by reaping
2461                          * our own buffer, now let the real VM do its job and
2462                          * go down in flames if truly OOM.
2463                          *
2464                          * However, since graphics tend to be disposable,
2465                          * defer the oom here by reporting the ENOMEM back
2466                          * to userspace.
2467                          */
2468                         if (!*s) {
2469                                 /* reclaim and warn, but no oom */
2470                                 gfp = mapping_gfp_mask(mapping);
2471
2472                                 /*
2473                                  * Our bo are always dirty and so we require
2474                                  * kswapd to reclaim our pages (direct reclaim
2475                                  * does not effectively begin pageout of our
2476                                  * buffers on its own). However, direct reclaim
2477                                  * only waits for kswapd when under allocation
2478                                  * congestion. So as a result __GFP_RECLAIM is
2479                                  * unreliable and fails to actually reclaim our
2480                                  * dirty pages -- unless you try over and over
2481                                  * again with !__GFP_NORETRY. However, we still
2482                                  * want to fail this allocation rather than
2483                                  * trigger the out-of-memory killer and for
2484                                  * this we want __GFP_RETRY_MAYFAIL.
2485                                  */
2486                                 gfp |= __GFP_RETRY_MAYFAIL;
2487                         }
2488                 } while (1);
2489
2490                 if (!i ||
2491                     sg->length >= max_segment ||
2492                     page_to_pfn(page) != last_pfn + 1) {
2493                         if (i) {
2494                                 sg_page_sizes |= sg->length;
2495                                 sg = sg_next(sg);
2496                         }
2497                         st->nents++;
2498                         sg_set_page(sg, page, PAGE_SIZE, 0);
2499                 } else {
2500                         sg->length += PAGE_SIZE;
2501                 }
2502                 last_pfn = page_to_pfn(page);
2503
2504                 /* Check that the i965g/gm workaround works. */
2505                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2506         }
2507         if (sg) { /* loop terminated early; short sg table */
2508                 sg_page_sizes |= sg->length;
2509                 sg_mark_end(sg);
2510         }
2511
2512         /* Trim unused sg entries to avoid wasting memory. */
2513         i915_sg_trim(st);
2514
2515         ret = i915_gem_gtt_prepare_pages(obj, st);
2516         if (ret) {
2517                 /*
2518                  * DMA remapping failed? One possible cause is that
2519                  * it could not reserve enough large entries, asking
2520                  * for PAGE_SIZE chunks instead may be helpful.
2521                  */
2522                 if (max_segment > PAGE_SIZE) {
2523                         for_each_sgt_page(page, sgt_iter, st)
2524                                 put_page(page);
2525                         sg_free_table(st);
2526
2527                         max_segment = PAGE_SIZE;
2528                         goto rebuild_st;
2529                 } else {
2530                         dev_warn(&dev_priv->drm.pdev->dev,
2531                                  "Failed to DMA remap %lu pages\n",
2532                                  page_count);
2533                         goto err_pages;
2534                 }
2535         }
2536
2537         if (i915_gem_object_needs_bit17_swizzle(obj))
2538                 i915_gem_object_do_bit_17_swizzle(obj, st);
2539
2540         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2541
2542         return 0;
2543
2544 err_sg:
2545         sg_mark_end(sg);
2546 err_pages:
2547         mapping_clear_unevictable(mapping);
2548         pagevec_init(&pvec);
2549         for_each_sgt_page(page, sgt_iter, st) {
2550                 if (!pagevec_add(&pvec, page))
2551                         check_release_pagevec(&pvec);
2552         }
2553         if (pagevec_count(&pvec))
2554                 check_release_pagevec(&pvec);
2555         sg_free_table(st);
2556         kfree(st);
2557
2558         /*
2559          * shmemfs first checks if there is enough memory to allocate the page
2560          * and reports ENOSPC should there be insufficient, along with the usual
2561          * ENOMEM for a genuine allocation failure.
2562          *
2563          * We use ENOSPC in our driver to mean that we have run out of aperture
2564          * space and so want to translate the error from shmemfs back to our
2565          * usual understanding of ENOMEM.
2566          */
2567         if (ret == -ENOSPC)
2568                 ret = -ENOMEM;
2569
2570         return ret;
2571 }
2572
2573 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2574                                  struct sg_table *pages,
2575                                  unsigned int sg_page_sizes)
2576 {
2577         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2578         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2579         int i;
2580
2581         lockdep_assert_held(&obj->mm.lock);
2582
2583         obj->mm.get_page.sg_pos = pages->sgl;
2584         obj->mm.get_page.sg_idx = 0;
2585
2586         obj->mm.pages = pages;
2587
2588         if (i915_gem_object_is_tiled(obj) &&
2589             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2590                 GEM_BUG_ON(obj->mm.quirked);
2591                 __i915_gem_object_pin_pages(obj);
2592                 obj->mm.quirked = true;
2593         }
2594
2595         GEM_BUG_ON(!sg_page_sizes);
2596         obj->mm.page_sizes.phys = sg_page_sizes;
2597
2598         /*
2599          * Calculate the supported page-sizes which fit into the given
2600          * sg_page_sizes. This will give us the page-sizes which we may be able
2601          * to use opportunistically when later inserting into the GTT. For
2602          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2603          * 64K or 4K pages, although in practice this will depend on a number of
2604          * other factors.
2605          */
2606         obj->mm.page_sizes.sg = 0;
2607         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2608                 if (obj->mm.page_sizes.phys & ~0u << i)
2609                         obj->mm.page_sizes.sg |= BIT(i);
2610         }
2611         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2612
2613         spin_lock(&i915->mm.obj_lock);
2614         list_add(&obj->mm.link, &i915->mm.unbound_list);
2615         spin_unlock(&i915->mm.obj_lock);
2616 }
2617
2618 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2619 {
2620         int err;
2621
2622         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2623                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2624                 return -EFAULT;
2625         }
2626
2627         err = obj->ops->get_pages(obj);
2628         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2629
2630         return err;
2631 }
2632
2633 /* Ensure that the associated pages are gathered from the backing storage
2634  * and pinned into our object. i915_gem_object_pin_pages() may be called
2635  * multiple times before they are released by a single call to
2636  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2637  * either as a result of memory pressure (reaping pages under the shrinker)
2638  * or as the object is itself released.
2639  */
2640 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2641 {
2642         int err;
2643
2644         err = mutex_lock_interruptible(&obj->mm.lock);
2645         if (err)
2646                 return err;
2647
2648         if (unlikely(!i915_gem_object_has_pages(obj))) {
2649                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2650
2651                 err = ____i915_gem_object_get_pages(obj);
2652                 if (err)
2653                         goto unlock;
2654
2655                 smp_mb__before_atomic();
2656         }
2657         atomic_inc(&obj->mm.pages_pin_count);
2658
2659 unlock:
2660         mutex_unlock(&obj->mm.lock);
2661         return err;
2662 }
2663
2664 /* The 'mapping' part of i915_gem_object_pin_map() below */
2665 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2666                                  enum i915_map_type type)
2667 {
2668         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2669         struct sg_table *sgt = obj->mm.pages;
2670         struct sgt_iter sgt_iter;
2671         struct page *page;
2672         struct page *stack_pages[32];
2673         struct page **pages = stack_pages;
2674         unsigned long i = 0;
2675         pgprot_t pgprot;
2676         void *addr;
2677
2678         /* A single page can always be kmapped */
2679         if (n_pages == 1 && type == I915_MAP_WB)
2680                 return kmap(sg_page(sgt->sgl));
2681
2682         if (n_pages > ARRAY_SIZE(stack_pages)) {
2683                 /* Too big for stack -- allocate temporary array instead */
2684                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2685                 if (!pages)
2686                         return NULL;
2687         }
2688
2689         for_each_sgt_page(page, sgt_iter, sgt)
2690                 pages[i++] = page;
2691
2692         /* Check that we have the expected number of pages */
2693         GEM_BUG_ON(i != n_pages);
2694
2695         switch (type) {
2696         default:
2697                 MISSING_CASE(type);
2698                 /* fallthrough to use PAGE_KERNEL anyway */
2699         case I915_MAP_WB:
2700                 pgprot = PAGE_KERNEL;
2701                 break;
2702         case I915_MAP_WC:
2703                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2704                 break;
2705         }
2706         addr = vmap(pages, n_pages, 0, pgprot);
2707
2708         if (pages != stack_pages)
2709                 kvfree(pages);
2710
2711         return addr;
2712 }
2713
2714 /* get, pin, and map the pages of the object into kernel space */
2715 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2716                               enum i915_map_type type)
2717 {
2718         enum i915_map_type has_type;
2719         bool pinned;
2720         void *ptr;
2721         int ret;
2722
2723         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2724                 return ERR_PTR(-ENXIO);
2725
2726         ret = mutex_lock_interruptible(&obj->mm.lock);
2727         if (ret)
2728                 return ERR_PTR(ret);
2729
2730         pinned = !(type & I915_MAP_OVERRIDE);
2731         type &= ~I915_MAP_OVERRIDE;
2732
2733         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2734                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2735                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2736
2737                         ret = ____i915_gem_object_get_pages(obj);
2738                         if (ret)
2739                                 goto err_unlock;
2740
2741                         smp_mb__before_atomic();
2742                 }
2743                 atomic_inc(&obj->mm.pages_pin_count);
2744                 pinned = false;
2745         }
2746         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2747
2748         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2749         if (ptr && has_type != type) {
2750                 if (pinned) {
2751                         ret = -EBUSY;
2752                         goto err_unpin;
2753                 }
2754
2755                 if (is_vmalloc_addr(ptr))
2756                         vunmap(ptr);
2757                 else
2758                         kunmap(kmap_to_page(ptr));
2759
2760                 ptr = obj->mm.mapping = NULL;
2761         }
2762
2763         if (!ptr) {
2764                 ptr = i915_gem_object_map(obj, type);
2765                 if (!ptr) {
2766                         ret = -ENOMEM;
2767                         goto err_unpin;
2768                 }
2769
2770                 obj->mm.mapping = page_pack_bits(ptr, type);
2771         }
2772
2773 out_unlock:
2774         mutex_unlock(&obj->mm.lock);
2775         return ptr;
2776
2777 err_unpin:
2778         atomic_dec(&obj->mm.pages_pin_count);
2779 err_unlock:
2780         ptr = ERR_PTR(ret);
2781         goto out_unlock;
2782 }
2783
2784 static int
2785 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2786                            const struct drm_i915_gem_pwrite *arg)
2787 {
2788         struct address_space *mapping = obj->base.filp->f_mapping;
2789         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2790         u64 remain, offset;
2791         unsigned int pg;
2792
2793         /* Before we instantiate/pin the backing store for our use, we
2794          * can prepopulate the shmemfs filp efficiently using a write into
2795          * the pagecache. We avoid the penalty of instantiating all the
2796          * pages, important if the user is just writing to a few and never
2797          * uses the object on the GPU, and using a direct write into shmemfs
2798          * allows it to avoid the cost of retrieving a page (either swapin
2799          * or clearing-before-use) before it is overwritten.
2800          */
2801         if (i915_gem_object_has_pages(obj))
2802                 return -ENODEV;
2803
2804         if (obj->mm.madv != I915_MADV_WILLNEED)
2805                 return -EFAULT;
2806
2807         /* Before the pages are instantiated the object is treated as being
2808          * in the CPU domain. The pages will be clflushed as required before
2809          * use, and we can freely write into the pages directly. If userspace
2810          * races pwrite with any other operation; corruption will ensue -
2811          * that is userspace's prerogative!
2812          */
2813
2814         remain = arg->size;
2815         offset = arg->offset;
2816         pg = offset_in_page(offset);
2817
2818         do {
2819                 unsigned int len, unwritten;
2820                 struct page *page;
2821                 void *data, *vaddr;
2822                 int err;
2823
2824                 len = PAGE_SIZE - pg;
2825                 if (len > remain)
2826                         len = remain;
2827
2828                 err = pagecache_write_begin(obj->base.filp, mapping,
2829                                             offset, len, 0,
2830                                             &page, &data);
2831                 if (err < 0)
2832                         return err;
2833
2834                 vaddr = kmap(page);
2835                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2836                 kunmap(page);
2837
2838                 err = pagecache_write_end(obj->base.filp, mapping,
2839                                           offset, len, len - unwritten,
2840                                           page, data);
2841                 if (err < 0)
2842                         return err;
2843
2844                 if (unwritten)
2845                         return -EFAULT;
2846
2847                 remain -= len;
2848                 user_data += len;
2849                 offset += len;
2850                 pg = 0;
2851         } while (remain);
2852
2853         return 0;
2854 }
2855
2856 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2857                                         const struct i915_gem_context *ctx)
2858 {
2859         unsigned int score;
2860         unsigned long prev_hang;
2861
2862         if (i915_gem_context_is_banned(ctx))
2863                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2864         else
2865                 score = 0;
2866
2867         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2868         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2869                 score += I915_CLIENT_SCORE_HANG_FAST;
2870
2871         if (score) {
2872                 atomic_add(score, &file_priv->ban_score);
2873
2874                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2875                                  ctx->name, score,
2876                                  atomic_read(&file_priv->ban_score));
2877         }
2878 }
2879
2880 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2881 {
2882         unsigned int score;
2883         bool banned, bannable;
2884
2885         atomic_inc(&ctx->guilty_count);
2886
2887         bannable = i915_gem_context_is_bannable(ctx);
2888         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
2889         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2890
2891         /* Cool contexts don't accumulate client ban score */
2892         if (!bannable)
2893                 return;
2894
2895         if (banned) {
2896                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
2897                                  ctx->name, atomic_read(&ctx->guilty_count),
2898                                  score);
2899                 i915_gem_context_set_banned(ctx);
2900         }
2901
2902         if (!IS_ERR_OR_NULL(ctx->file_priv))
2903                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
2904 }
2905
2906 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2907 {
2908         atomic_inc(&ctx->active_count);
2909 }
2910
2911 struct i915_request *
2912 i915_gem_find_active_request(struct intel_engine_cs *engine)
2913 {
2914         struct i915_request *request, *active = NULL;
2915         unsigned long flags;
2916
2917         /*
2918          * We are called by the error capture, reset and to dump engine
2919          * state at random points in time. In particular, note that neither is
2920          * crucially ordered with an interrupt. After a hang, the GPU is dead
2921          * and we assume that no more writes can happen (we waited long enough
2922          * for all writes that were in transaction to be flushed) - adding an
2923          * extra delay for a recent interrupt is pointless. Hence, we do
2924          * not need an engine->irq_seqno_barrier() before the seqno reads.
2925          * At all other times, we must assume the GPU is still running, but
2926          * we only care about the snapshot of this moment.
2927          */
2928         spin_lock_irqsave(&engine->timeline.lock, flags);
2929         list_for_each_entry(request, &engine->timeline.requests, link) {
2930                 if (__i915_request_completed(request, request->global_seqno))
2931                         continue;
2932
2933                 active = request;
2934                 break;
2935         }
2936         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2937
2938         return active;
2939 }
2940
2941 /*
2942  * Ensure irq handler finishes, and not run again.
2943  * Also return the active request so that we only search for it once.
2944  */
2945 struct i915_request *
2946 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
2947 {
2948         struct i915_request *request;
2949
2950         /*
2951          * During the reset sequence, we must prevent the engine from
2952          * entering RC6. As the context state is undefined until we restart
2953          * the engine, if it does enter RC6 during the reset, the state
2954          * written to the powercontext is undefined and so we may lose
2955          * GPU state upon resume, i.e. fail to restart after a reset.
2956          */
2957         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
2958
2959         request = engine->reset.prepare(engine);
2960         if (request && request->fence.error == -EIO)
2961                 request = ERR_PTR(-EIO); /* Previous reset failed! */
2962
2963         return request;
2964 }
2965
2966 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
2967 {
2968         struct intel_engine_cs *engine;
2969         struct i915_request *request;
2970         enum intel_engine_id id;
2971         int err = 0;
2972
2973         for_each_engine(engine, dev_priv, id) {
2974                 request = i915_gem_reset_prepare_engine(engine);
2975                 if (IS_ERR(request)) {
2976                         err = PTR_ERR(request);
2977                         continue;
2978                 }
2979
2980                 engine->hangcheck.active_request = request;
2981         }
2982
2983         i915_gem_revoke_fences(dev_priv);
2984         intel_uc_sanitize(dev_priv);
2985
2986         return err;
2987 }
2988
2989 static void engine_skip_context(struct i915_request *request)
2990 {
2991         struct intel_engine_cs *engine = request->engine;
2992         struct i915_gem_context *hung_ctx = request->gem_context;
2993         struct i915_timeline *timeline = request->timeline;
2994         unsigned long flags;
2995
2996         GEM_BUG_ON(timeline == &engine->timeline);
2997
2998         spin_lock_irqsave(&engine->timeline.lock, flags);
2999         spin_lock(&timeline->lock);
3000
3001         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3002                 if (request->gem_context == hung_ctx)
3003                         i915_request_skip(request, -EIO);
3004
3005         list_for_each_entry(request, &timeline->requests, link)
3006                 i915_request_skip(request, -EIO);
3007
3008         spin_unlock(&timeline->lock);
3009         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3010 }
3011
3012 /* Returns the request if it was guilty of the hang */
3013 static struct i915_request *
3014 i915_gem_reset_request(struct intel_engine_cs *engine,
3015                        struct i915_request *request,
3016                        bool stalled)
3017 {
3018         /* The guilty request will get skipped on a hung engine.
3019          *
3020          * Users of client default contexts do not rely on logical
3021          * state preserved between batches so it is safe to execute
3022          * queued requests following the hang. Non default contexts
3023          * rely on preserved state, so skipping a batch loses the
3024          * evolution of the state and it needs to be considered corrupted.
3025          * Executing more queued batches on top of corrupted state is
3026          * risky. But we take the risk by trying to advance through
3027          * the queued requests in order to make the client behaviour
3028          * more predictable around resets, by not throwing away random
3029          * amount of batches it has prepared for execution. Sophisticated
3030          * clients can use gem_reset_stats_ioctl and dma fence status
3031          * (exported via sync_file info ioctl on explicit fences) to observe
3032          * when it loses the context state and should rebuild accordingly.
3033          *
3034          * The context ban, and ultimately the client ban, mechanism are safety
3035          * valves if client submission ends up resulting in nothing more than
3036          * subsequent hangs.
3037          */
3038
3039         if (i915_request_completed(request)) {
3040                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3041                           engine->name, request->global_seqno,
3042                           request->fence.context, request->fence.seqno,
3043                           intel_engine_get_seqno(engine));
3044                 stalled = false;
3045         }
3046
3047         if (stalled) {
3048                 i915_gem_context_mark_guilty(request->gem_context);
3049                 i915_request_skip(request, -EIO);
3050
3051                 /* If this context is now banned, skip all pending requests. */
3052                 if (i915_gem_context_is_banned(request->gem_context))
3053                         engine_skip_context(request);
3054         } else {
3055                 /*
3056                  * Since this is not the hung engine, it may have advanced
3057                  * since the hang declaration. Double check by refinding
3058                  * the active request at the time of the reset.
3059                  */
3060                 request = i915_gem_find_active_request(engine);
3061                 if (request) {
3062                         unsigned long flags;
3063
3064                         i915_gem_context_mark_innocent(request->gem_context);
3065                         dma_fence_set_error(&request->fence, -EAGAIN);
3066
3067                         /* Rewind the engine to replay the incomplete rq */
3068                         spin_lock_irqsave(&engine->timeline.lock, flags);
3069                         request = list_prev_entry(request, link);
3070                         if (&request->link == &engine->timeline.requests)
3071                                 request = NULL;
3072                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3073                 }
3074         }
3075
3076         return request;
3077 }
3078
3079 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3080                            struct i915_request *request,
3081                            bool stalled)
3082 {
3083         if (request)
3084                 request = i915_gem_reset_request(engine, request, stalled);
3085
3086         /* Setup the CS to resume from the breadcrumb of the hung request */
3087         engine->reset.reset(engine, request);
3088 }
3089
3090 void i915_gem_reset(struct drm_i915_private *dev_priv,
3091                     unsigned int stalled_mask)
3092 {
3093         struct intel_engine_cs *engine;
3094         enum intel_engine_id id;
3095
3096         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3097
3098         i915_retire_requests(dev_priv);
3099
3100         for_each_engine(engine, dev_priv, id) {
3101                 struct intel_context *ce;
3102
3103                 i915_gem_reset_engine(engine,
3104                                       engine->hangcheck.active_request,
3105                                       stalled_mask & ENGINE_MASK(id));
3106                 ce = fetch_and_zero(&engine->last_retired_context);
3107                 if (ce)
3108                         intel_context_unpin(ce);
3109
3110                 /*
3111                  * Ostensibily, we always want a context loaded for powersaving,
3112                  * so if the engine is idle after the reset, send a request
3113                  * to load our scratch kernel_context.
3114                  *
3115                  * More mysteriously, if we leave the engine idle after a reset,
3116                  * the next userspace batch may hang, with what appears to be
3117                  * an incoherent read by the CS (presumably stale TLB). An
3118                  * empty request appears sufficient to paper over the glitch.
3119                  */
3120                 if (intel_engine_is_idle(engine)) {
3121                         struct i915_request *rq;
3122
3123                         rq = i915_request_alloc(engine,
3124                                                 dev_priv->kernel_context);
3125                         if (!IS_ERR(rq))
3126                                 i915_request_add(rq);
3127                 }
3128         }
3129
3130         i915_gem_restore_fences(dev_priv);
3131 }
3132
3133 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3134 {
3135         engine->reset.finish(engine);
3136
3137         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3138 }
3139
3140 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3141 {
3142         struct intel_engine_cs *engine;
3143         enum intel_engine_id id;
3144
3145         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3146
3147         for_each_engine(engine, dev_priv, id) {
3148                 engine->hangcheck.active_request = NULL;
3149                 i915_gem_reset_finish_engine(engine);
3150         }
3151 }
3152
3153 static void nop_submit_request(struct i915_request *request)
3154 {
3155         unsigned long flags;
3156
3157         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3158                   request->engine->name,
3159                   request->fence.context, request->fence.seqno);
3160         dma_fence_set_error(&request->fence, -EIO);
3161
3162         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3163         __i915_request_submit(request);
3164         intel_engine_write_global_seqno(request->engine, request->global_seqno);
3165         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3166 }
3167
3168 void i915_gem_set_wedged(struct drm_i915_private *i915)
3169 {
3170         struct intel_engine_cs *engine;
3171         enum intel_engine_id id;
3172
3173         GEM_TRACE("start\n");
3174
3175         if (GEM_SHOW_DEBUG()) {
3176                 struct drm_printer p = drm_debug_printer(__func__);
3177
3178                 for_each_engine(engine, i915, id)
3179                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3180         }
3181
3182         if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
3183                 goto out;
3184
3185         /*
3186          * First, stop submission to hw, but do not yet complete requests by
3187          * rolling the global seqno forward (since this would complete requests
3188          * for which we haven't set the fence error to EIO yet).
3189          */
3190         for_each_engine(engine, i915, id)
3191                 i915_gem_reset_prepare_engine(engine);
3192
3193         /* Even if the GPU reset fails, it should still stop the engines */
3194         if (INTEL_GEN(i915) >= 5)
3195                 intel_gpu_reset(i915, ALL_ENGINES);
3196
3197         for_each_engine(engine, i915, id) {
3198                 engine->submit_request = nop_submit_request;
3199                 engine->schedule = NULL;
3200         }
3201         i915->caps.scheduler = 0;
3202
3203         /*
3204          * Make sure no request can slip through without getting completed by
3205          * either this call here to intel_engine_write_global_seqno, or the one
3206          * in nop_submit_request.
3207          */
3208         synchronize_rcu();
3209
3210         /* Mark all executing requests as skipped */
3211         for_each_engine(engine, i915, id)
3212                 engine->cancel_requests(engine);
3213
3214         for_each_engine(engine, i915, id) {
3215                 i915_gem_reset_finish_engine(engine);
3216                 intel_engine_wakeup(engine);
3217         }
3218
3219 out:
3220         GEM_TRACE("end\n");
3221
3222         wake_up_all(&i915->gpu_error.reset_queue);
3223 }
3224
3225 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3226 {
3227         struct i915_timeline *tl;
3228
3229         lockdep_assert_held(&i915->drm.struct_mutex);
3230         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3231                 return true;
3232
3233         if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
3234                 return false;
3235
3236         GEM_TRACE("start\n");
3237
3238         /*
3239          * Before unwedging, make sure that all pending operations
3240          * are flushed and errored out - we may have requests waiting upon
3241          * third party fences. We marked all inflight requests as EIO, and
3242          * every execbuf since returned EIO, for consistency we want all
3243          * the currently pending requests to also be marked as EIO, which
3244          * is done inside our nop_submit_request - and so we must wait.
3245          *
3246          * No more can be submitted until we reset the wedged bit.
3247          */
3248         list_for_each_entry(tl, &i915->gt.timelines, link) {
3249                 struct i915_request *rq;
3250
3251                 rq = i915_gem_active_peek(&tl->last_request,
3252                                           &i915->drm.struct_mutex);
3253                 if (!rq)
3254                         continue;
3255
3256                 /*
3257                  * We can't use our normal waiter as we want to
3258                  * avoid recursively trying to handle the current
3259                  * reset. The basic dma_fence_default_wait() installs
3260                  * a callback for dma_fence_signal(), which is
3261                  * triggered by our nop handler (indirectly, the
3262                  * callback enables the signaler thread which is
3263                  * woken by the nop_submit_request() advancing the seqno
3264                  * and when the seqno passes the fence, the signaler
3265                  * then signals the fence waking us up).
3266                  */
3267                 if (dma_fence_default_wait(&rq->fence, true,
3268                                            MAX_SCHEDULE_TIMEOUT) < 0)
3269                         return false;
3270         }
3271         i915_retire_requests(i915);
3272         GEM_BUG_ON(i915->gt.active_requests);
3273
3274         intel_engines_sanitize(i915, false);
3275
3276         /*
3277          * Undo nop_submit_request. We prevent all new i915 requests from
3278          * being queued (by disallowing execbuf whilst wedged) so having
3279          * waited for all active requests above, we know the system is idle
3280          * and do not have to worry about a thread being inside
3281          * engine->submit_request() as we swap over. So unlike installing
3282          * the nop_submit_request on reset, we can do this from normal
3283          * context and do not require stop_machine().
3284          */
3285         intel_engines_reset_default_submission(i915);
3286         i915_gem_contexts_lost(i915);
3287
3288         GEM_TRACE("end\n");
3289
3290         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3291         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3292
3293         return true;
3294 }
3295
3296 static void
3297 i915_gem_retire_work_handler(struct work_struct *work)
3298 {
3299         struct drm_i915_private *dev_priv =
3300                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3301         struct drm_device *dev = &dev_priv->drm;
3302
3303         /* Come back later if the device is busy... */
3304         if (mutex_trylock(&dev->struct_mutex)) {
3305                 i915_retire_requests(dev_priv);
3306                 mutex_unlock(&dev->struct_mutex);
3307         }
3308
3309         /*
3310          * Keep the retire handler running until we are finally idle.
3311          * We do not need to do this test under locking as in the worst-case
3312          * we queue the retire worker once too often.
3313          */
3314         if (READ_ONCE(dev_priv->gt.awake))
3315                 queue_delayed_work(dev_priv->wq,
3316                                    &dev_priv->gt.retire_work,
3317                                    round_jiffies_up_relative(HZ));
3318 }
3319
3320 static void shrink_caches(struct drm_i915_private *i915)
3321 {
3322         /*
3323          * kmem_cache_shrink() discards empty slabs and reorders partially
3324          * filled slabs to prioritise allocating from the mostly full slabs,
3325          * with the aim of reducing fragmentation.
3326          */
3327         kmem_cache_shrink(i915->priorities);
3328         kmem_cache_shrink(i915->dependencies);
3329         kmem_cache_shrink(i915->requests);
3330         kmem_cache_shrink(i915->luts);
3331         kmem_cache_shrink(i915->vmas);
3332         kmem_cache_shrink(i915->objects);
3333 }
3334
/*
 * Bookkeeping for the deferred cache shrink kicked off by the idle worker.
 * A single allocation is first passed to call_rcu() and then, from the RCU
 * callback (__sleep_rcu), reused as a work item that runs __sleep_work.
 */
struct sleep_rcu_work {
        /* The two stages never overlap, so the heads can share storage. */
        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
        /* Device whose caches are to be shrunk. */
        struct drm_i915_private *i915;
        /* Epoch recorded at queue time; compared again via same_epoch(). */
        unsigned int epoch;
};
3343
3344 static inline bool
3345 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3346 {
3347         /*
3348          * There is a small chance that the epoch wrapped since we started
3349          * sleeping. If we assume that epoch is at least a u32, then it will
3350          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3351          */
3352         return epoch == READ_ONCE(i915->gt.epoch);
3353 }
3354
3355 static void __sleep_work(struct work_struct *work)
3356 {
3357         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3358         struct drm_i915_private *i915 = s->i915;
3359         unsigned int epoch = s->epoch;
3360
3361         kfree(s);
3362         if (same_epoch(i915, epoch))
3363                 shrink_caches(i915);
3364 }
3365
3366 static void __sleep_rcu(struct rcu_head *rcu)
3367 {
3368         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3369         struct drm_i915_private *i915 = s->i915;
3370
3371         destroy_rcu_head(&s->rcu);
3372
3373         if (same_epoch(i915, s->epoch)) {
3374                 INIT_WORK(&s->work, __sleep_work);
3375                 queue_work(i915->wq, &s->work);
3376         } else {
3377                 kfree(s);
3378         }
3379 }
3380
3381 static inline bool
3382 new_requests_since_last_retire(const struct drm_i915_private *i915)
3383 {
3384         return (READ_ONCE(i915->gt.active_requests) ||
3385                 work_pending(&i915->gt.idle_work.work));
3386 }
3387
3388 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3389 {
3390         struct intel_engine_cs *engine;
3391         enum intel_engine_id id;
3392
3393         if (i915_terminally_wedged(&i915->gpu_error))
3394                 return;
3395
3396         GEM_BUG_ON(i915->gt.active_requests);
3397         for_each_engine(engine, i915, id) {
3398                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3399                 GEM_BUG_ON(engine->last_retired_context !=
3400                            to_intel_context(i915->kernel_context, engine));
3401         }
3402 }
3403
/*
 * Delayed-work callback for gt.idle_work.
 *
 * When the GT is awake with no active requests, switch to the kernel
 * context, wait for the engines to idle (bailing out if new work shows
 * up), and then try to park the GT under struct_mutex. The hangcheck
 * worker is cancelled around the attempt and re-armed if parking did not
 * happen. On a successful park, an RCU-deferred cache shrink is scheduled.
 */
static void
i915_gem_idle_work_handler(struct work_struct *work)
{
        struct drm_i915_private *dev_priv =
                container_of(work, typeof(*dev_priv), gt.idle_work.work);
        /* Only overwritten if we actually park; see same_epoch() below. */
        unsigned int epoch = I915_EPOCH_INVALID;
        bool rearm_hangcheck;

        /* Already asleep: nothing to idle. */
        if (!READ_ONCE(dev_priv->gt.awake))
                return;

        /* Still busy: leave idling to a later invocation. */
        if (READ_ONCE(dev_priv->gt.active_requests))
                return;

        /*
         * Flush out the last user context, leaving only the pinned
         * kernel context resident. When we are idling on the kernel_context,
         * no more new requests (with a context switch) are emitted and we
         * can finally rest. A consequence is that the idle work handler is
         * always called at least twice before idling (and if the system is
         * idle that implies a round trip through the retire worker).
         */
        mutex_lock(&dev_priv->drm.struct_mutex);
        i915_gem_switch_to_kernel_context(dev_priv);
        mutex_unlock(&dev_priv->drm.struct_mutex);

        GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
                  READ_ONCE(dev_priv->gt.active_requests));

        /*
         * Wait for last execlists context complete, but bail out in case a
         * new request is submitted. As we don't trust the hardware, we
         * continue on if the wait times out. This is necessary to allow
         * the machine to suspend even if the hardware dies, and we will
         * try to recover in resume (after depriving the hardware of power,
         * it may be in a better mood).
         *
         * Note the first macro argument is a statement that returns from
         * this handler outright when new requests are detected.
         */
        __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
                   intel_engines_are_idle(dev_priv),
                   I915_IDLE_ENGINES_TIMEOUT * 1000,
                   10, 500);

        /* Remember whether hangcheck was pending so it can be restored. */
        rearm_hangcheck =
                cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);

        if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
                /* Currently busy, come back later */
                mod_delayed_work(dev_priv->wq,
                                 &dev_priv->gt.idle_work,
                                 msecs_to_jiffies(50));
                goto out_rearm;
        }

        /*
         * New request retired after this work handler started, extend active
         * period until next instance of the work.
         */
        if (new_requests_since_last_retire(dev_priv))
                goto out_unlock;

        epoch = __i915_gem_park(dev_priv);

        assert_kernel_context_is_current(dev_priv);

        /* Parked: hangcheck must stay cancelled. */
        rearm_hangcheck = false;
out_unlock:
        mutex_unlock(&dev_priv->drm.struct_mutex);

out_rearm:
        if (rearm_hangcheck) {
                GEM_BUG_ON(!dev_priv->gt.awake);
                i915_queue_hangcheck(dev_priv);
        }

        /*
         * When we are idle, it is an opportune time to reap our caches.
         * However, we have many objects that utilise RCU and the ordered
         * i915->wq that this work is executing on. To try and flush any
         * pending frees now we are idle, we first wait for an RCU grace
         * period, and then queue a task (that will run last on the wq) to
         * shrink and re-optimize the caches.
         */
        if (same_epoch(dev_priv, epoch)) {
                /* Allocation failure is tolerated: the shrink is skipped. */
                struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
                if (s) {
                        init_rcu_head(&s->rcu);
                        s->i915 = dev_priv;
                        s->epoch = epoch;
                        call_rcu(&s->rcu, __sleep_rcu);
                }
        }
}
3496
3497 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3498 {
3499         struct drm_i915_private *i915 = to_i915(gem->dev);
3500         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3501         struct drm_i915_file_private *fpriv = file->driver_priv;
3502         struct i915_lut_handle *lut, *ln;
3503
3504         mutex_lock(&i915->drm.struct_mutex);
3505
3506         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3507                 struct i915_gem_context *ctx = lut->ctx;
3508                 struct i915_vma *vma;
3509
3510                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3511                 if (ctx->file_priv != fpriv)
3512                         continue;
3513
3514                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3515                 GEM_BUG_ON(vma->obj != obj);
3516
3517                 /* We allow the process to have multiple handles to the same
3518                  * vma, in the same fd namespace, by virtue of flink/open.
3519                  */
3520                 GEM_BUG_ON(!vma->open_count);
3521                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3522                         i915_vma_close(vma);
3523
3524                 list_del(&lut->obj_link);
3525                 list_del(&lut->ctx_link);
3526
3527                 kmem_cache_free(i915->luts, lut);
3528                 __i915_gem_object_release_unless_active(obj);
3529         }
3530
3531         mutex_unlock(&i915->drm.struct_mutex);
3532 }
3533
3534 static unsigned long to_wait_timeout(s64 timeout_ns)
3535 {
3536         if (timeout_ns < 0)
3537                 return MAX_SCHEDULE_TIMEOUT;
3538
3539         if (timeout_ns == 0)
3540                 return 0;
3541
3542         return nsecs_to_jiffies_timeout(timeout_ns);
3543 }
3544
3545 /**
3546  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3547  * @dev: drm device pointer
3548  * @data: ioctl data blob
3549  * @file: drm file pointer
3550  *
3551  * Returns 0 if successful, else an error is returned with the remaining time in
3552  * the timeout parameter.
3553  *  -ETIME: object is still busy after timeout
3554  *  -ERESTARTSYS: signal interrupted the wait
3555  *  -ENONENT: object doesn't exist
3556  * Also possible, but rare:
3557  *  -EAGAIN: incomplete, restart syscall
3558  *  -ENOMEM: damn
3559  *  -ENODEV: Internal IRQ fail
3560  *  -E?: The add request failed
3561  *
3562  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3563  * non-zero timeout parameter the wait ioctl will wait for the given number of
3564  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3565  * without holding struct_mutex the object may become re-busied before this
3566  * function completes. A similar but shorter * race condition exists in the busy
3567  * ioctl
3568  */
3569 int
3570 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3571 {
3572         struct drm_i915_gem_wait *args = data;
3573         struct drm_i915_gem_object *obj;
3574         ktime_t start;
3575         long ret;
3576
3577         if (args->flags != 0)
3578                 return -EINVAL;
3579
3580         obj = i915_gem_object_lookup(file, args->bo_handle);
3581         if (!obj)
3582                 return -ENOENT;
3583
3584         start = ktime_get();
3585
3586         ret = i915_gem_object_wait(obj,
3587                                    I915_WAIT_INTERRUPTIBLE |
3588                                    I915_WAIT_PRIORITY |
3589                                    I915_WAIT_ALL,
3590                                    to_wait_timeout(args->timeout_ns),
3591                                    to_rps_client(file));
3592
3593         if (args->timeout_ns > 0) {
3594                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3595                 if (args->timeout_ns < 0)
3596                         args->timeout_ns = 0;
3597
3598                 /*
3599                  * Apparently ktime isn't accurate enough and occasionally has a
3600                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3601                  * things up to make the test happy. We allow up to 1 jiffy.
3602                  *
3603                  * This is a regression from the timespec->ktime conversion.
3604                  */
3605                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3606                         args->timeout_ns = 0;
3607
3608                 /* Asked to wait beyond the jiffie/scheduler precision? */
3609                 if (ret == -ETIME && args->timeout_ns)
3610                         ret = -EAGAIN;
3611         }
3612
3613         i915_gem_object_put(obj);
3614         return ret;
3615 }
3616
3617 static long wait_for_timeline(struct i915_timeline *tl,
3618                               unsigned int flags, long timeout)
3619 {
3620         struct i915_request *rq;
3621
3622         rq = i915_gem_active_get_unlocked(&tl->last_request);
3623         if (!rq)
3624                 return timeout;
3625
3626         /*
3627          * "Race-to-idle".
3628          *
3629          * Switching to the kernel context is often used a synchronous
3630          * step prior to idling, e.g. in suspend for flushing all
3631          * current operations to memory before sleeping. These we
3632          * want to complete as quickly as possible to avoid prolonged
3633          * stalls, so allow the gpu to boost to maximum clocks.
3634          */
3635         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3636                 gen6_rps_boost(rq, NULL);
3637
3638         timeout = i915_request_wait(rq, flags, timeout);
3639         i915_request_put(rq);
3640
3641         return timeout;
3642 }
3643
3644 static int wait_for_engines(struct drm_i915_private *i915)
3645 {
3646         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3647                 dev_err(i915->drm.dev,
3648                         "Failed to idle engines, declaring wedged!\n");
3649                 GEM_TRACE_DUMP();
3650                 i915_gem_set_wedged(i915);
3651                 return -EIO;
3652         }
3653
3654         return 0;
3655 }
3656
3657 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3658                            unsigned int flags, long timeout)
3659 {
3660         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3661                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3662                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3663
3664         /* If the device is asleep, we have no requests outstanding */
3665         if (!READ_ONCE(i915->gt.awake))
3666                 return 0;
3667
3668         if (flags & I915_WAIT_LOCKED) {
3669                 struct i915_timeline *tl;
3670                 int err;
3671
3672                 lockdep_assert_held(&i915->drm.struct_mutex);
3673
3674                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3675                         timeout = wait_for_timeline(tl, flags, timeout);
3676                         if (timeout < 0)
3677                                 return timeout;
3678                 }
3679                 if (GEM_SHOW_DEBUG() && !timeout) {
3680                         /* Presume that timeout was non-zero to begin with! */
3681                         dev_warn(&i915->drm.pdev->dev,
3682                                  "Missed idle-completion interrupt!\n");
3683                         GEM_TRACE_DUMP();
3684                 }
3685
3686                 err = wait_for_engines(i915);
3687                 if (err)
3688                         return err;
3689
3690                 i915_retire_requests(i915);
3691                 GEM_BUG_ON(i915->gt.active_requests);
3692         } else {
3693                 struct intel_engine_cs *engine;
3694                 enum intel_engine_id id;
3695
3696                 for_each_engine(engine, i915, id) {
3697                         struct i915_timeline *tl = &engine->timeline;
3698
3699                         timeout = wait_for_timeline(tl, flags, timeout);
3700                         if (timeout < 0)
3701                                 return timeout;
3702                 }
3703         }
3704
3705         return 0;
3706 }
3707
3708 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3709 {
3710         /*
3711          * We manually flush the CPU domain so that we can override and
3712          * force the flush for the display, and perform it asyncrhonously.
3713          */
3714         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3715         if (obj->cache_dirty)
3716                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3717         obj->write_domain = 0;
3718 }
3719
3720 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3721 {
3722         if (!READ_ONCE(obj->pin_global))
3723                 return;
3724
3725         mutex_lock(&obj->base.dev->struct_mutex);
3726         __i915_gem_object_flush_for_display(obj);
3727         mutex_unlock(&obj->base.dev->struct_mutex);
3728 }
3729
3730 /**
3731  * Moves a single object to the WC read, and possibly write domain.
3732  * @obj: object to act on
3733  * @write: ask for write access or read only
3734  *
3735  * This function returns when the move is complete, including waiting on
3736  * flushes to occur.
3737  */
3738 int
3739 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3740 {
3741         int ret;
3742
3743         lockdep_assert_held(&obj->base.dev->struct_mutex);
3744
3745         ret = i915_gem_object_wait(obj,
3746                                    I915_WAIT_INTERRUPTIBLE |
3747                                    I915_WAIT_LOCKED |
3748                                    (write ? I915_WAIT_ALL : 0),
3749                                    MAX_SCHEDULE_TIMEOUT,
3750                                    NULL);
3751         if (ret)
3752                 return ret;
3753
3754         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3755                 return 0;
3756
3757         /* Flush and acquire obj->pages so that we are coherent through
3758          * direct access in memory with previous cached writes through
3759          * shmemfs and that our cache domain tracking remains valid.
3760          * For example, if the obj->filp was moved to swap without us
3761          * being notified and releasing the pages, we would mistakenly
3762          * continue to assume that the obj remained out of the CPU cached
3763          * domain.
3764          */
3765         ret = i915_gem_object_pin_pages(obj);
3766         if (ret)
3767                 return ret;
3768
3769         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3770
3771         /* Serialise direct access to this object with the barriers for
3772          * coherent writes from the GPU, by effectively invalidating the
3773          * WC domain upon first access.
3774          */
3775         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3776                 mb();
3777
3778         /* It should now be out of any other write domains, and we can update
3779          * the domain values for our changes.
3780          */
3781         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3782         obj->read_domains |= I915_GEM_DOMAIN_WC;
3783         if (write) {
3784                 obj->read_domains = I915_GEM_DOMAIN_WC;
3785                 obj->write_domain = I915_GEM_DOMAIN_WC;
3786                 obj->mm.dirty = true;
3787         }
3788
3789         i915_gem_object_unpin_pages(obj);
3790         return 0;
3791 }
3792
3793 /**
3794  * Moves a single object to the GTT read, and possibly write domain.
3795  * @obj: object to act on
3796  * @write: ask for write access or read only
3797  *
3798  * This function returns when the move is complete, including waiting on
3799  * flushes to occur.
3800  */
3801 int
3802 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3803 {
3804         int ret;
3805
3806         lockdep_assert_held(&obj->base.dev->struct_mutex);
3807
3808         ret = i915_gem_object_wait(obj,
3809                                    I915_WAIT_INTERRUPTIBLE |
3810                                    I915_WAIT_LOCKED |
3811                                    (write ? I915_WAIT_ALL : 0),
3812                                    MAX_SCHEDULE_TIMEOUT,
3813                                    NULL);
3814         if (ret)
3815                 return ret;
3816
3817         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3818                 return 0;
3819
3820         /* Flush and acquire obj->pages so that we are coherent through
3821          * direct access in memory with previous cached writes through
3822          * shmemfs and that our cache domain tracking remains valid.
3823          * For example, if the obj->filp was moved to swap without us
3824          * being notified and releasing the pages, we would mistakenly
3825          * continue to assume that the obj remained out of the CPU cached
3826          * domain.
3827          */
3828         ret = i915_gem_object_pin_pages(obj);
3829         if (ret)
3830                 return ret;
3831
3832         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3833
3834         /* Serialise direct access to this object with the barriers for
3835          * coherent writes from the GPU, by effectively invalidating the
3836          * GTT domain upon first access.
3837          */
3838         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3839                 mb();
3840
3841         /* It should now be out of any other write domains, and we can update
3842          * the domain values for our changes.
3843          */
3844         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3845         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3846         if (write) {
3847                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3848                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3849                 obj->mm.dirty = true;
3850         }
3851
3852         i915_gem_object_unpin_pages(obj);
3853         return 0;
3854 }
3855
/**
 * i915_gem_object_set_cache_level - change the cache-level of an object
 * across all VMA.
 * @obj: object to act on
 * @cache_level: new cache level to set for the object
 *
 * After this function returns, the object will be in the new cache-level
 * across all GTT and the contents of the backing storage will be coherent,
 * with respect to the new cache-level. In order to keep the backing storage
 * coherent for all users, we only allow a single cache level to be set
 * globally on the object and prevent it from being changed whilst the
 * hardware is reading from the object. That is if the object is currently
 * on the scanout it will be set to uncached (or equivalent display
 * cache coherency) and all non-MOCS GPU access will also be uncached so
 * that all direct access to the scanout remains coherent.
 *
 * The caller must hold struct_mutex. Returns 0 on success (including when
 * the level is already @cache_level), -EBUSY if a bound VMA is pinned, or
 * the error from unbinding, waiting, dropping a fence or rebinding.
 */
int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                    enum i915_cache_level cache_level)
{
        struct i915_vma *vma;
        int ret;

        lockdep_assert_held(&obj->base.dev->struct_mutex);

        /* Nothing to do if the object already has the requested level. */
        if (obj->cache_level == cache_level)
                return 0;

        /* Inspect the list of currently bound VMA and unbind any that would
         * be invalid given the new cache-level. This is principally to
         * catch the issue of the CS prefetch crossing page boundaries and
         * reading an invalid PTE on older architectures.
         */
restart:
        list_for_each_entry(vma, &obj->vma_list, obj_link) {
                /* VMA without an allocated node are not bound; skip them. */
                if (!drm_mm_node_allocated(&vma->node))
                        continue;

                if (i915_vma_is_pinned(vma)) {
                        DRM_DEBUG("can not change the cache level of pinned objects\n");
                        return -EBUSY;
                }

                /* Open VMA whose placement stays valid can be kept as is. */
                if (!i915_vma_is_closed(vma) &&
                    i915_gem_valid_gtt_space(vma, cache_level))
                        continue;

                ret = i915_vma_unbind(vma);
                if (ret)
                        return ret;

                /* As unbinding may affect other elements in the
                 * obj->vma_list (due to side-effects from retiring
                 * an active vma), play safe and restart the iterator.
                 */
                goto restart;
        }

        /* We can reuse the existing drm_mm nodes but need to change the
         * cache-level on the PTE. We could simply unbind them all and
         * rebind with the correct cache-level on next use. However since
         * we already have a valid slot, dma mapping, pages etc, we may as
         * well rewrite the PTE in the belief that doing so tramples upon
         * less state and so involves less work.
         */
        if (obj->bind_count) {
                /* Before we change the PTE, the GPU must not be accessing it.
                 * If we wait upon the object, we know that all the bound
                 * VMA are no longer active.
                 */
                ret = i915_gem_object_wait(obj,
                                           I915_WAIT_INTERRUPTIBLE |
                                           I915_WAIT_LOCKED |
                                           I915_WAIT_ALL,
                                           MAX_SCHEDULE_TIMEOUT,
                                           NULL);
                if (ret)
                        return ret;

                if (!HAS_LLC(to_i915(obj->base.dev)) &&
                    cache_level != I915_CACHE_NONE) {
                        /* Access to snoopable pages through the GTT is
                         * incoherent and on some machines causes a hard
                         * lockup. Relinquish the CPU mmaping to force
                         * userspace to refault in the pages and we can
                         * then double check if the GTT mapping is still
                         * valid for that pointer access.
                         */
                        i915_gem_release_mmap(obj);

                        /* As we no longer need a fence for GTT access,
                         * we can relinquish it now (and so prevent having
                         * to steal a fence from someone else on the next
                         * fence request). Note GPU activity would have
                         * dropped the fence as all snoopable access is
                         * supposed to be linear.
                         */
                        for_each_ggtt_vma(vma, obj) {
                                ret = i915_vma_put_fence(vma);
                                if (ret)
                                        return ret;
                        }
                } else {
                        /* We either have incoherent backing store and
                         * so no GTT access or the architecture is fully
                         * coherent. In such cases, existing GTT mmaps
                         * ignore the cache bit in the PTE and we can
                         * rewrite it without confusing the GPU or having
                         * to force userspace to fault back in its mmaps.
                         */
                }

                /* Rewrite the PTEs of every still-bound VMA in place. */
                list_for_each_entry(vma, &obj->vma_list, obj_link) {
                        if (!drm_mm_node_allocated(&vma->node))
                                continue;

                        ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
                        if (ret)
                                return ret;
                }
        }

        /* Record the new level on every VMA node and on the object itself. */
        list_for_each_entry(vma, &obj->vma_list, obj_link)
                vma->node.color = cache_level;
        i915_gem_object_set_cache_coherency(obj, cache_level);
        obj->cache_dirty = true; /* Always invalidate stale cachelines */

        return 0;
}
3983
3984 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3985                                struct drm_file *file)
3986 {
3987         struct drm_i915_gem_caching *args = data;
3988         struct drm_i915_gem_object *obj;
3989         int err = 0;
3990
3991         rcu_read_lock();
3992         obj = i915_gem_object_lookup_rcu(file, args->handle);
3993         if (!obj) {
3994                 err = -ENOENT;
3995                 goto out;
3996         }
3997
3998         switch (obj->cache_level) {
3999         case I915_CACHE_LLC:
4000         case I915_CACHE_L3_LLC:
4001                 args->caching = I915_CACHING_CACHED;
4002                 break;
4003
4004         case I915_CACHE_WT:
4005                 args->caching = I915_CACHING_DISPLAY;
4006                 break;
4007
4008         default:
4009                 args->caching = I915_CACHING_NONE;
4010                 break;
4011         }
4012 out:
4013         rcu_read_unlock();
4014         return err;
4015 }
4016
4017 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4018                                struct drm_file *file)
4019 {
4020         struct drm_i915_private *i915 = to_i915(dev);
4021         struct drm_i915_gem_caching *args = data;
4022         struct drm_i915_gem_object *obj;
4023         enum i915_cache_level level;
4024         int ret = 0;
4025
4026         switch (args->caching) {
4027         case I915_CACHING_NONE:
4028                 level = I915_CACHE_NONE;
4029                 break;
4030         case I915_CACHING_CACHED:
4031                 /*
4032                  * Due to a HW issue on BXT A stepping, GPU stores via a
4033                  * snooped mapping may leave stale data in a corresponding CPU
4034                  * cacheline, whereas normally such cachelines would get
4035                  * invalidated.
4036                  */
4037                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4038                         return -ENODEV;
4039
4040                 level = I915_CACHE_LLC;
4041                 break;
4042         case I915_CACHING_DISPLAY:
4043                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4044                 break;
4045         default:
4046                 return -EINVAL;
4047         }
4048
4049         obj = i915_gem_object_lookup(file, args->handle);
4050         if (!obj)
4051                 return -ENOENT;
4052
4053         /*
4054          * The caching mode of proxy object is handled by its generator, and
4055          * not allowed to be changed by userspace.
4056          */
4057         if (i915_gem_object_is_proxy(obj)) {
4058                 ret = -ENXIO;
4059                 goto out;
4060         }
4061
4062         if (obj->cache_level == level)
4063                 goto out;
4064
4065         ret = i915_gem_object_wait(obj,
4066                                    I915_WAIT_INTERRUPTIBLE,
4067                                    MAX_SCHEDULE_TIMEOUT,
4068                                    to_rps_client(file));
4069         if (ret)
4070                 goto out;
4071
4072         ret = i915_mutex_lock_interruptible(dev);
4073         if (ret)
4074                 goto out;
4075
4076         ret = i915_gem_object_set_cache_level(obj, level);
4077         mutex_unlock(&dev->struct_mutex);
4078
4079 out:
4080         i915_gem_object_put(obj);
4081         return ret;
4082 }
4083
4084 /*
4085  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4086  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4087  * (for pageflips). We only flush the caches while preparing the buffer for
4088  * display, the callers are responsible for frontbuffer flush.
4089  */
4090 struct i915_vma *
4091 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4092                                      u32 alignment,
4093                                      const struct i915_ggtt_view *view,
4094                                      unsigned int flags)
4095 {
4096         struct i915_vma *vma;
4097         int ret;
4098
4099         lockdep_assert_held(&obj->base.dev->struct_mutex);
4100
4101         /* Mark the global pin early so that we account for the
4102          * display coherency whilst setting up the cache domains.
4103          */
4104         obj->pin_global++;
4105
4106         /* The display engine is not coherent with the LLC cache on gen6.  As
4107          * a result, we make sure that the pinning that is about to occur is
4108          * done with uncached PTEs. This is lowest common denominator for all
4109          * chipsets.
4110          *
4111          * However for gen6+, we could do better by using the GFDT bit instead
4112          * of uncaching, which would allow us to flush all the LLC-cached data
4113          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4114          */
4115         ret = i915_gem_object_set_cache_level(obj,
4116                                               HAS_WT(to_i915(obj->base.dev)) ?
4117                                               I915_CACHE_WT : I915_CACHE_NONE);
4118         if (ret) {
4119                 vma = ERR_PTR(ret);
4120                 goto err_unpin_global;
4121         }
4122
4123         /* As the user may map the buffer once pinned in the display plane
4124          * (e.g. libkms for the bootup splash), we have to ensure that we
4125          * always use map_and_fenceable for all scanout buffers. However,
4126          * it may simply be too big to fit into mappable, in which case
4127          * put it anyway and hope that userspace can cope (but always first
4128          * try to preserve the existing ABI).
4129          */
4130         vma = ERR_PTR(-ENOSPC);
4131         if ((flags & PIN_MAPPABLE) == 0 &&
4132             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4133                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4134                                                flags |
4135                                                PIN_MAPPABLE |
4136                                                PIN_NONBLOCK);
4137         if (IS_ERR(vma))
4138                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4139         if (IS_ERR(vma))
4140                 goto err_unpin_global;
4141
4142         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4143
4144         __i915_gem_object_flush_for_display(obj);
4145
4146         /* It should now be out of any other write domains, and we can update
4147          * the domain values for our changes.
4148          */
4149         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4150
4151         return vma;
4152
4153 err_unpin_global:
4154         obj->pin_global--;
4155         return vma;
4156 }
4157
4158 void
4159 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4160 {
4161         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4162
4163         if (WARN_ON(vma->obj->pin_global == 0))
4164                 return;
4165
4166         if (--vma->obj->pin_global == 0)
4167                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4168
4169         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4170         i915_gem_object_bump_inactive_ggtt(vma->obj);
4171
4172         i915_vma_unpin(vma);
4173 }
4174
4175 /**
4176  * Moves a single object to the CPU read, and possibly write domain.
4177  * @obj: object to act on
4178  * @write: requesting write or read-only access
4179  *
4180  * This function returns when the move is complete, including waiting on
4181  * flushes to occur.
4182  */
4183 int
4184 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4185 {
4186         int ret;
4187
4188         lockdep_assert_held(&obj->base.dev->struct_mutex);
4189
4190         ret = i915_gem_object_wait(obj,
4191                                    I915_WAIT_INTERRUPTIBLE |
4192                                    I915_WAIT_LOCKED |
4193                                    (write ? I915_WAIT_ALL : 0),
4194                                    MAX_SCHEDULE_TIMEOUT,
4195                                    NULL);
4196         if (ret)
4197                 return ret;
4198
4199         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4200
4201         /* Flush the CPU cache if it's still invalid. */
4202         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4203                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4204                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4205         }
4206
4207         /* It should now be out of any other write domains, and we can update
4208          * the domain values for our changes.
4209          */
4210         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4211
4212         /* If we're writing through the CPU, then the GPU read domains will
4213          * need to be invalidated at next use.
4214          */
4215         if (write)
4216                 __start_cpu_write(obj);
4217
4218         return 0;
4219 }
4220
4221 /* Throttle our rendering by waiting until the ring has completed our requests
4222  * emitted over 20 msec ago.
4223  *
4224  * Note that if we were to use the current jiffies each time around the loop,
4225  * we wouldn't escape the function with any frames outstanding if the time to
4226  * render a frame was over 20ms.
4227  *
4228  * This should get us reasonable parallelism between CPU and GPU but also
4229  * relatively low latency when blocking on a particular request to finish.
4230  */
4231 static int
4232 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4233 {
4234         struct drm_i915_private *dev_priv = to_i915(dev);
4235         struct drm_i915_file_private *file_priv = file->driver_priv;
4236         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4237         struct i915_request *request, *target = NULL;
4238         long ret;
4239
4240         /* ABI: return -EIO if already wedged */
4241         if (i915_terminally_wedged(&dev_priv->gpu_error))
4242                 return -EIO;
4243
4244         spin_lock(&file_priv->mm.lock);
4245         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4246                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4247                         break;
4248
4249                 if (target) {
4250                         list_del(&target->client_link);
4251                         target->file_priv = NULL;
4252                 }
4253
4254                 target = request;
4255         }
4256         if (target)
4257                 i915_request_get(target);
4258         spin_unlock(&file_priv->mm.lock);
4259
4260         if (target == NULL)
4261                 return 0;
4262
4263         ret = i915_request_wait(target,
4264                                 I915_WAIT_INTERRUPTIBLE,
4265                                 MAX_SCHEDULE_TIMEOUT);
4266         i915_request_put(target);
4267
4268         return ret < 0 ? ret : 0;
4269 }
4270
4271 struct i915_vma *
4272 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4273                          const struct i915_ggtt_view *view,
4274                          u64 size,
4275                          u64 alignment,
4276                          u64 flags)
4277 {
4278         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4279         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4280         struct i915_vma *vma;
4281         int ret;
4282
4283         lockdep_assert_held(&obj->base.dev->struct_mutex);
4284
4285         if (flags & PIN_MAPPABLE &&
4286             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4287                 /* If the required space is larger than the available
4288                  * aperture, we will not able to find a slot for the
4289                  * object and unbinding the object now will be in
4290                  * vain. Worse, doing so may cause us to ping-pong
4291                  * the object in and out of the Global GTT and
4292                  * waste a lot of cycles under the mutex.
4293                  */
4294                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4295                         return ERR_PTR(-E2BIG);
4296
4297                 /* If NONBLOCK is set the caller is optimistically
4298                  * trying to cache the full object within the mappable
4299                  * aperture, and *must* have a fallback in place for
4300                  * situations where we cannot bind the object. We
4301                  * can be a little more lax here and use the fallback
4302                  * more often to avoid costly migrations of ourselves
4303                  * and other objects within the aperture.
4304                  *
4305                  * Half-the-aperture is used as a simple heuristic.
4306                  * More interesting would to do search for a free
4307                  * block prior to making the commitment to unbind.
4308                  * That caters for the self-harm case, and with a
4309                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4310                  * we could try to minimise harm to others.
4311                  */
4312                 if (flags & PIN_NONBLOCK &&
4313                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4314                         return ERR_PTR(-ENOSPC);
4315         }
4316
4317         vma = i915_vma_instance(obj, vm, view);
4318         if (unlikely(IS_ERR(vma)))
4319                 return vma;
4320
4321         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4322                 if (flags & PIN_NONBLOCK) {
4323                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4324                                 return ERR_PTR(-ENOSPC);
4325
4326                         if (flags & PIN_MAPPABLE &&
4327                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4328                                 return ERR_PTR(-ENOSPC);
4329                 }
4330
4331                 WARN(i915_vma_is_pinned(vma),
4332                      "bo is already pinned in ggtt with incorrect alignment:"
4333                      " offset=%08x, req.alignment=%llx,"
4334                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4335                      i915_ggtt_offset(vma), alignment,
4336                      !!(flags & PIN_MAPPABLE),
4337                      i915_vma_is_map_and_fenceable(vma));
4338                 ret = i915_vma_unbind(vma);
4339                 if (ret)
4340                         return ERR_PTR(ret);
4341         }
4342
4343         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4344         if (ret)
4345                 return ERR_PTR(ret);
4346
4347         return vma;
4348 }
4349
4350 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4351 {
4352         /* Note that we could alias engines in the execbuf API, but
4353          * that would be very unwise as it prevents userspace from
4354          * fine control over engine selection. Ahem.
4355          *
4356          * This should be something like EXEC_MAX_ENGINE instead of
4357          * I915_NUM_ENGINES.
4358          */
4359         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4360         return 0x10000 << id;
4361 }
4362
4363 static __always_inline unsigned int __busy_write_id(unsigned int id)
4364 {
4365         /* The uABI guarantees an active writer is also amongst the read
4366          * engines. This would be true if we accessed the activity tracking
4367          * under the lock, but as we perform the lookup of the object and
4368          * its activity locklessly we can not guarantee that the last_write
4369          * being active implies that we have set the same engine flag from
4370          * last_read - hence we always set both read and write busy for
4371          * last_write.
4372          */
4373         return id | __busy_read_flag(id);
4374 }
4375
4376 static __always_inline unsigned int
4377 __busy_set_if_active(const struct dma_fence *fence,
4378                      unsigned int (*flag)(unsigned int id))
4379 {
4380         struct i915_request *rq;
4381
4382         /* We have to check the current hw status of the fence as the uABI
4383          * guarantees forward progress. We could rely on the idle worker
4384          * to eventually flush us, but to minimise latency just ask the
4385          * hardware.
4386          *
4387          * Note we only report on the status of native fences.
4388          */
4389         if (!dma_fence_is_i915(fence))
4390                 return 0;
4391
4392         /* opencode to_request() in order to avoid const warnings */
4393         rq = container_of(fence, struct i915_request, fence);
4394         if (i915_request_completed(rq))
4395                 return 0;
4396
4397         return flag(rq->engine->uabi_id);
4398 }
4399
4400 static __always_inline unsigned int
4401 busy_check_reader(const struct dma_fence *fence)
4402 {
4403         return __busy_set_if_active(fence, __busy_read_flag);
4404 }
4405
4406 static __always_inline unsigned int
4407 busy_check_writer(const struct dma_fence *fence)
4408 {
4409         if (!fence)
4410                 return 0;
4411
4412         return __busy_set_if_active(fence, __busy_write_id);
4413 }
4414
4415 int
4416 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4417                     struct drm_file *file)
4418 {
4419         struct drm_i915_gem_busy *args = data;
4420         struct drm_i915_gem_object *obj;
4421         struct reservation_object_list *list;
4422         unsigned int seq;
4423         int err;
4424
4425         err = -ENOENT;
4426         rcu_read_lock();
4427         obj = i915_gem_object_lookup_rcu(file, args->handle);
4428         if (!obj)
4429                 goto out;
4430
4431         /* A discrepancy here is that we do not report the status of
4432          * non-i915 fences, i.e. even though we may report the object as idle,
4433          * a call to set-domain may still stall waiting for foreign rendering.
4434          * This also means that wait-ioctl may report an object as busy,
4435          * where busy-ioctl considers it idle.
4436          *
4437          * We trade the ability to warn of foreign fences to report on which
4438          * i915 engines are active for the object.
4439          *
4440          * Alternatively, we can trade that extra information on read/write
4441          * activity with
4442          *      args->busy =
4443          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4444          * to report the overall busyness. This is what the wait-ioctl does.
4445          *
4446          */
4447 retry:
4448         seq = raw_read_seqcount(&obj->resv->seq);
4449
4450         /* Translate the exclusive fence to the READ *and* WRITE engine */
4451         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4452
4453         /* Translate shared fences to READ set of engines */
4454         list = rcu_dereference(obj->resv->fence);
4455         if (list) {
4456                 unsigned int shared_count = list->shared_count, i;
4457
4458                 for (i = 0; i < shared_count; ++i) {
4459                         struct dma_fence *fence =
4460                                 rcu_dereference(list->shared[i]);
4461
4462                         args->busy |= busy_check_reader(fence);
4463                 }
4464         }
4465
4466         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4467                 goto retry;
4468
4469         err = 0;
4470 out:
4471         rcu_read_unlock();
4472         return err;
4473 }
4474
/*
 * THROTTLE ioctl: thin wrapper that throttles the calling client via
 * i915_gem_ring_throttle(). The ioctl payload (@data) is unused.
 */
int
i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
                        struct drm_file *file_priv)
{
        int ret = i915_gem_ring_throttle(dev, file_priv);

        return ret;
}
4481
4482 int
4483 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4484                        struct drm_file *file_priv)
4485 {
4486         struct drm_i915_private *dev_priv = to_i915(dev);
4487         struct drm_i915_gem_madvise *args = data;
4488         struct drm_i915_gem_object *obj;
4489         int err;
4490
4491         switch (args->madv) {
4492         case I915_MADV_DONTNEED:
4493         case I915_MADV_WILLNEED:
4494             break;
4495         default:
4496             return -EINVAL;
4497         }
4498
4499         obj = i915_gem_object_lookup(file_priv, args->handle);
4500         if (!obj)
4501                 return -ENOENT;
4502
4503         err = mutex_lock_interruptible(&obj->mm.lock);
4504         if (err)
4505                 goto out;
4506
4507         if (i915_gem_object_has_pages(obj) &&
4508             i915_gem_object_is_tiled(obj) &&
4509             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4510                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4511                         GEM_BUG_ON(!obj->mm.quirked);
4512                         __i915_gem_object_unpin_pages(obj);
4513                         obj->mm.quirked = false;
4514                 }
4515                 if (args->madv == I915_MADV_WILLNEED) {
4516                         GEM_BUG_ON(obj->mm.quirked);
4517                         __i915_gem_object_pin_pages(obj);
4518                         obj->mm.quirked = true;
4519                 }
4520         }
4521
4522         if (obj->mm.madv != __I915_MADV_PURGED)
4523                 obj->mm.madv = args->madv;
4524
4525         /* if the object is no longer attached, discard its backing storage */
4526         if (obj->mm.madv == I915_MADV_DONTNEED &&
4527             !i915_gem_object_has_pages(obj))
4528                 i915_gem_object_truncate(obj);
4529
4530         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4531         mutex_unlock(&obj->mm.lock);
4532
4533 out:
4534         i915_gem_object_put(obj);
4535         return err;
4536 }
4537
4538 static void
4539 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4540 {
4541         struct drm_i915_gem_object *obj =
4542                 container_of(active, typeof(*obj), frontbuffer_write);
4543
4544         intel_fb_obj_flush(obj, ORIGIN_CS);
4545 }
4546
4547 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4548                           const struct drm_i915_gem_object_ops *ops)
4549 {
4550         mutex_init(&obj->mm.lock);
4551
4552         INIT_LIST_HEAD(&obj->vma_list);
4553         INIT_LIST_HEAD(&obj->lut_list);
4554         INIT_LIST_HEAD(&obj->batch_pool_link);
4555
4556         init_rcu_head(&obj->rcu);
4557
4558         obj->ops = ops;
4559
4560         reservation_object_init(&obj->__builtin_resv);
4561         obj->resv = &obj->__builtin_resv;
4562
4563         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4564         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4565
4566         obj->mm.madv = I915_MADV_WILLNEED;
4567         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4568         mutex_init(&obj->mm.get_page.lock);
4569
4570         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4571 }
4572
4573 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4574         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4575                  I915_GEM_OBJECT_IS_SHRINKABLE,
4576
4577         .get_pages = i915_gem_object_get_pages_gtt,
4578         .put_pages = i915_gem_object_put_pages_gtt,
4579
4580         .pwrite = i915_gem_object_pwrite_gtt,
4581 };
4582
4583 static int i915_gem_object_create_shmem(struct drm_device *dev,
4584                                         struct drm_gem_object *obj,
4585                                         size_t size)
4586 {
4587         struct drm_i915_private *i915 = to_i915(dev);
4588         unsigned long flags = VM_NORESERVE;
4589         struct file *filp;
4590
4591         drm_gem_private_object_init(dev, obj, size);
4592
4593         if (i915->mm.gemfs)
4594                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4595                                                  flags);
4596         else
4597                 filp = shmem_file_setup("i915", size, flags);
4598
4599         if (IS_ERR(filp))
4600                 return PTR_ERR(filp);
4601
4602         obj->filp = filp;
4603
4604         return 0;
4605 }
4606
4607 struct drm_i915_gem_object *
4608 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4609 {
4610         struct drm_i915_gem_object *obj;
4611         struct address_space *mapping;
4612         unsigned int cache_level;
4613         gfp_t mask;
4614         int ret;
4615
4616         /* There is a prevalence of the assumption that we fit the object's
4617          * page count inside a 32bit _signed_ variable. Let's document this and
4618          * catch if we ever need to fix it. In the meantime, if you do spot
4619          * such a local variable, please consider fixing!
4620          */
4621         if (size >> PAGE_SHIFT > INT_MAX)
4622                 return ERR_PTR(-E2BIG);
4623
4624         if (overflows_type(size, obj->base.size))
4625                 return ERR_PTR(-E2BIG);
4626
4627         obj = i915_gem_object_alloc(dev_priv);
4628         if (obj == NULL)
4629                 return ERR_PTR(-ENOMEM);
4630
4631         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4632         if (ret)
4633                 goto fail;
4634
4635         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4636         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4637                 /* 965gm cannot relocate objects above 4GiB. */
4638                 mask &= ~__GFP_HIGHMEM;
4639                 mask |= __GFP_DMA32;
4640         }
4641
4642         mapping = obj->base.filp->f_mapping;
4643         mapping_set_gfp_mask(mapping, mask);
4644         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4645
4646         i915_gem_object_init(obj, &i915_gem_object_ops);
4647
4648         obj->write_domain = I915_GEM_DOMAIN_CPU;
4649         obj->read_domains = I915_GEM_DOMAIN_CPU;
4650
4651         if (HAS_LLC(dev_priv))
4652                 /* On some devices, we can have the GPU use the LLC (the CPU
4653                  * cache) for about a 10% performance improvement
4654                  * compared to uncached.  Graphics requests other than
4655                  * display scanout are coherent with the CPU in
4656                  * accessing this cache.  This means in this mode we
4657                  * don't need to clflush on the CPU side, and on the
4658                  * GPU side we only need to flush internal caches to
4659                  * get data visible to the CPU.
4660                  *
4661                  * However, we maintain the display planes as UC, and so
4662                  * need to rebind when first used as such.
4663                  */
4664                 cache_level = I915_CACHE_LLC;
4665         else
4666                 cache_level = I915_CACHE_NONE;
4667
4668         i915_gem_object_set_cache_coherency(obj, cache_level);
4669
4670         trace_i915_gem_object_create(obj);
4671
4672         return obj;
4673
4674 fail:
4675         i915_gem_object_free(obj);
4676         return ERR_PTR(ret);
4677 }
4678
4679 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4680 {
4681         /* If we are the last user of the backing storage (be it shmemfs
4682          * pages or stolen etc), we know that the pages are going to be
4683          * immediately released. In this case, we can then skip copying
4684          * back the contents from the GPU.
4685          */
4686
4687         if (obj->mm.madv != I915_MADV_WILLNEED)
4688                 return false;
4689
4690         if (obj->base.filp == NULL)
4691                 return true;
4692
4693         /* At first glance, this looks racy, but then again so would be
4694          * userspace racing mmap against close. However, the first external
4695          * reference to the filp can only be obtained through the
4696          * i915_gem_mmap_ioctl() which safeguards us against the user
4697          * acquiring such a reference whilst we are in the middle of
4698          * freeing the object.
4699          */
4700         return atomic_long_read(&obj->base.filp->f_count) == 1;
4701 }
4702
4703 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4704                                     struct llist_node *freed)
4705 {
4706         struct drm_i915_gem_object *obj, *on;
4707
4708         intel_runtime_pm_get(i915);
4709         llist_for_each_entry_safe(obj, on, freed, freed) {
4710                 struct i915_vma *vma, *vn;
4711
4712                 trace_i915_gem_object_destroy(obj);
4713
4714                 mutex_lock(&i915->drm.struct_mutex);
4715
4716                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4717                 list_for_each_entry_safe(vma, vn,
4718                                          &obj->vma_list, obj_link) {
4719                         GEM_BUG_ON(i915_vma_is_active(vma));
4720                         vma->flags &= ~I915_VMA_PIN_MASK;
4721                         i915_vma_destroy(vma);
4722                 }
4723                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4724                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4725
4726                 /* This serializes freeing with the shrinker. Since the free
4727                  * is delayed, first by RCU then by the workqueue, we want the
4728                  * shrinker to be able to free pages of unreferenced objects,
4729                  * or else we may oom whilst there are plenty of deferred
4730                  * freed objects.
4731                  */
4732                 if (i915_gem_object_has_pages(obj)) {
4733                         spin_lock(&i915->mm.obj_lock);
4734                         list_del_init(&obj->mm.link);
4735                         spin_unlock(&i915->mm.obj_lock);
4736                 }
4737
4738                 mutex_unlock(&i915->drm.struct_mutex);
4739
4740                 GEM_BUG_ON(obj->bind_count);
4741                 GEM_BUG_ON(obj->userfault_count);
4742                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4743                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4744
4745                 if (obj->ops->release)
4746                         obj->ops->release(obj);
4747
4748                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4749                         atomic_set(&obj->mm.pages_pin_count, 0);
4750                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4751                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4752
4753                 if (obj->base.import_attach)
4754                         drm_prime_gem_destroy(&obj->base, NULL);
4755
4756                 reservation_object_fini(&obj->__builtin_resv);
4757                 drm_gem_object_release(&obj->base);
4758                 i915_gem_info_remove_obj(i915, obj->base.size);
4759
4760                 kfree(obj->bit_17);
4761                 i915_gem_object_free(obj);
4762
4763                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4764                 atomic_dec(&i915->mm.free_count);
4765
4766                 if (on)
4767                         cond_resched();
4768         }
4769         intel_runtime_pm_put(i915);
4770 }
4771
4772 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4773 {
4774         struct llist_node *freed;
4775
4776         /* Free the oldest, most stale object to keep the free_list short */
4777         freed = NULL;
4778         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4779                 /* Only one consumer of llist_del_first() allowed */
4780                 spin_lock(&i915->mm.free_lock);
4781                 freed = llist_del_first(&i915->mm.free_list);
4782                 spin_unlock(&i915->mm.free_lock);
4783         }
4784         if (unlikely(freed)) {
4785                 freed->next = NULL;
4786                 __i915_gem_free_objects(i915, freed);
4787         }
4788 }
4789
4790 static void __i915_gem_free_work(struct work_struct *work)
4791 {
4792         struct drm_i915_private *i915 =
4793                 container_of(work, struct drm_i915_private, mm.free_work);
4794         struct llist_node *freed;
4795
4796         /*
4797          * All file-owned VMA should have been released by this point through
4798          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4799          * However, the object may also be bound into the global GTT (e.g.
4800          * older GPUs without per-process support, or for direct access through
4801          * the GTT either for the user or for scanout). Those VMA still need to
4802          * unbound now.
4803          */
4804
4805         spin_lock(&i915->mm.free_lock);
4806         while ((freed = llist_del_all(&i915->mm.free_list))) {
4807                 spin_unlock(&i915->mm.free_lock);
4808
4809                 __i915_gem_free_objects(i915, freed);
4810                 if (need_resched())
4811                         return;
4812
4813                 spin_lock(&i915->mm.free_lock);
4814         }
4815         spin_unlock(&i915->mm.free_lock);
4816 }
4817
4818 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4819 {
4820         struct drm_i915_gem_object *obj =
4821                 container_of(head, typeof(*obj), rcu);
4822         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4823
4824         /*
4825          * We reuse obj->rcu for the freed list, so we had better not treat
4826          * it like a rcu_head from this point forwards. And we expect all
4827          * objects to be freed via this path.
4828          */
4829         destroy_rcu_head(&obj->rcu);
4830
4831         /*
4832          * Since we require blocking on struct_mutex to unbind the freed
4833          * object from the GPU before releasing resources back to the
4834          * system, we can not do that directly from the RCU callback (which may
4835          * be a softirq context), but must instead then defer that work onto a
4836          * kthread. We use the RCU callback rather than move the freed object
4837          * directly onto the work queue so that we can mix between using the
4838          * worker and performing frees directly from subsequent allocations for
4839          * crude but effective memory throttling.
4840          */
4841         if (llist_add(&obj->freed, &i915->mm.free_list))
4842                 queue_work(i915->wq, &i915->mm.free_work);
4843 }
4844
4845 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4846 {
4847         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4848
4849         if (obj->mm.quirked)
4850                 __i915_gem_object_unpin_pages(obj);
4851
4852         if (discard_backing_storage(obj))
4853                 obj->mm.madv = I915_MADV_DONTNEED;
4854
4855         /*
4856          * Before we free the object, make sure any pure RCU-only
4857          * read-side critical sections are complete, e.g.
4858          * i915_gem_busy_ioctl(). For the corresponding synchronized
4859          * lookup see i915_gem_object_lookup_rcu().
4860          */
4861         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4862         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4863 }
4864
4865 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4866 {
4867         lockdep_assert_held(&obj->base.dev->struct_mutex);
4868
4869         if (!i915_gem_object_has_active_reference(obj) &&
4870             i915_gem_object_is_active(obj))
4871                 i915_gem_object_set_active_reference(obj);
4872         else
4873                 i915_gem_object_put(obj);
4874 }
4875
4876 void i915_gem_sanitize(struct drm_i915_private *i915)
4877 {
4878         GEM_TRACE("\n");
4879
4880         mutex_lock(&i915->drm.struct_mutex);
4881
4882         intel_runtime_pm_get(i915);
4883         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4884
4885         /*
4886          * As we have just resumed the machine and woken the device up from
4887          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4888          * back to defaults, recovering from whatever wedged state we left it
4889          * in and so worth trying to use the device once more.
4890          */
4891         if (i915_terminally_wedged(&i915->gpu_error))
4892                 i915_gem_unset_wedged(i915);
4893
4894         /*
4895          * If we inherit context state from the BIOS or earlier occupants
4896          * of the GPU, the GPU may be in an inconsistent state when we
4897          * try to take over. The only way to remove the earlier state
4898          * is by resetting. However, resetting on earlier gen is tricky as
4899          * it may impact the display and we are uncertain about the stability
4900          * of the reset, so this could be applied to even earlier gen.
4901          */
4902         intel_engines_sanitize(i915, false);
4903
4904         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4905         intel_runtime_pm_put(i915);
4906
4907         i915_gem_contexts_lost(i915);
4908         mutex_unlock(&i915->drm.struct_mutex);
4909 }
4910
4911 int i915_gem_suspend(struct drm_i915_private *i915)
4912 {
4913         int ret;
4914
4915         GEM_TRACE("\n");
4916
4917         intel_runtime_pm_get(i915);
4918         intel_suspend_gt_powersave(i915);
4919
4920         mutex_lock(&i915->drm.struct_mutex);
4921
4922         /*
4923          * We have to flush all the executing contexts to main memory so
4924          * that they can saved in the hibernation image. To ensure the last
4925          * context image is coherent, we have to switch away from it. That
4926          * leaves the i915->kernel_context still active when
4927          * we actually suspend, and its image in memory may not match the GPU
4928          * state. Fortunately, the kernel_context is disposable and we do
4929          * not rely on its state.
4930          */
4931         if (!i915_terminally_wedged(&i915->gpu_error)) {
4932                 ret = i915_gem_switch_to_kernel_context(i915);
4933                 if (ret)
4934                         goto err_unlock;
4935
4936                 ret = i915_gem_wait_for_idle(i915,
4937                                              I915_WAIT_INTERRUPTIBLE |
4938                                              I915_WAIT_LOCKED |
4939                                              I915_WAIT_FOR_IDLE_BOOST,
4940                                              MAX_SCHEDULE_TIMEOUT);
4941                 if (ret && ret != -EIO)
4942                         goto err_unlock;
4943
4944                 assert_kernel_context_is_current(i915);
4945         }
4946         i915_retire_requests(i915); /* ensure we flush after wedging */
4947
4948         mutex_unlock(&i915->drm.struct_mutex);
4949
4950         intel_uc_suspend(i915);
4951
4952         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
4953         cancel_delayed_work_sync(&i915->gt.retire_work);
4954
4955         /*
4956          * As the idle_work is rearming if it detects a race, play safe and
4957          * repeat the flush until it is definitely idle.
4958          */
4959         drain_delayed_work(&i915->gt.idle_work);
4960
4961         /*
4962          * Assert that we successfully flushed all the work and
4963          * reset the GPU back to its idle, low power state.
4964          */
4965         WARN_ON(i915->gt.awake);
4966         if (WARN_ON(!intel_engines_are_idle(i915)))
4967                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4968
4969         intel_runtime_pm_put(i915);
4970         return 0;
4971
4972 err_unlock:
4973         mutex_unlock(&i915->drm.struct_mutex);
4974         intel_runtime_pm_put(i915);
4975         return ret;
4976 }
4977
4978 void i915_gem_suspend_late(struct drm_i915_private *i915)
4979 {
4980         struct drm_i915_gem_object *obj;
4981         struct list_head *phases[] = {
4982                 &i915->mm.unbound_list,
4983                 &i915->mm.bound_list,
4984                 NULL
4985         }, **phase;
4986
4987         /*
4988          * Neither the BIOS, ourselves or any other kernel
4989          * expects the system to be in execlists mode on startup,
4990          * so we need to reset the GPU back to legacy mode. And the only
4991          * known way to disable logical contexts is through a GPU reset.
4992          *
4993          * So in order to leave the system in a known default configuration,
4994          * always reset the GPU upon unload and suspend. Afterwards we then
4995          * clean up the GEM state tracking, flushing off the requests and
4996          * leaving the system in a known idle state.
4997          *
4998          * Note that is of the upmost importance that the GPU is idle and
4999          * all stray writes are flushed *before* we dismantle the backing
5000          * storage for the pinned objects.
5001          *
5002          * However, since we are uncertain that resetting the GPU on older
5003          * machines is a good idea, we don't - just in case it leaves the
5004          * machine in an unusable condition.
5005          */
5006
5007         mutex_lock(&i915->drm.struct_mutex);
5008         for (phase = phases; *phase; phase++) {
5009                 list_for_each_entry(obj, *phase, mm.link)
5010                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5011         }
5012         mutex_unlock(&i915->drm.struct_mutex);
5013
5014         intel_uc_sanitize(i915);
5015         i915_gem_sanitize(i915);
5016 }
5017
5018 void i915_gem_resume(struct drm_i915_private *i915)
5019 {
5020         GEM_TRACE("\n");
5021
5022         WARN_ON(i915->gt.awake);
5023
5024         mutex_lock(&i915->drm.struct_mutex);
5025         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5026
5027         i915_gem_restore_gtt_mappings(i915);
5028         i915_gem_restore_fences(i915);
5029
5030         /*
5031          * As we didn't flush the kernel context before suspend, we cannot
5032          * guarantee that the context image is complete. So let's just reset
5033          * it and start again.
5034          */
5035         i915->gt.resume(i915);
5036
5037         if (i915_gem_init_hw(i915))
5038                 goto err_wedged;
5039
5040         intel_uc_resume(i915);
5041
5042         /* Always reload a context for powersaving. */
5043         if (i915_gem_switch_to_kernel_context(i915))
5044                 goto err_wedged;
5045
5046 out_unlock:
5047         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5048         mutex_unlock(&i915->drm.struct_mutex);
5049         return;
5050
5051 err_wedged:
5052         if (!i915_terminally_wedged(&i915->gpu_error)) {
5053                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5054                 i915_gem_set_wedged(i915);
5055         }
5056         goto out_unlock;
5057 }
5058
5059 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5060 {
5061         if (INTEL_GEN(dev_priv) < 5 ||
5062             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5063                 return;
5064
5065         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5066                                  DISP_TILE_SURFACE_SWIZZLING);
5067
5068         if (IS_GEN(dev_priv, 5))
5069                 return;
5070
5071         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5072         if (IS_GEN(dev_priv, 6))
5073                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5074         else if (IS_GEN(dev_priv, 7))
5075                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5076         else if (IS_GEN(dev_priv, 8))
5077                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5078         else
5079                 BUG();
5080 }
5081
5082 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5083 {
5084         I915_WRITE(RING_CTL(base), 0);
5085         I915_WRITE(RING_HEAD(base), 0);
5086         I915_WRITE(RING_TAIL(base), 0);
5087         I915_WRITE(RING_START(base), 0);
5088 }
5089
5090 static void init_unused_rings(struct drm_i915_private *dev_priv)
5091 {
5092         if (IS_I830(dev_priv)) {
5093                 init_unused_ring(dev_priv, PRB1_BASE);
5094                 init_unused_ring(dev_priv, SRB0_BASE);
5095                 init_unused_ring(dev_priv, SRB1_BASE);
5096                 init_unused_ring(dev_priv, SRB2_BASE);
5097                 init_unused_ring(dev_priv, SRB3_BASE);
5098         } else if (IS_GEN(dev_priv, 2)) {
5099                 init_unused_ring(dev_priv, SRB0_BASE);
5100                 init_unused_ring(dev_priv, SRB1_BASE);
5101         } else if (IS_GEN(dev_priv, 3)) {
5102                 init_unused_ring(dev_priv, PRB1_BASE);
5103                 init_unused_ring(dev_priv, PRB2_BASE);
5104         }
5105 }
5106
5107 static int __i915_gem_restart_engines(void *data)
5108 {
5109         struct drm_i915_private *i915 = data;
5110         struct intel_engine_cs *engine;
5111         enum intel_engine_id id;
5112         int err;
5113
5114         for_each_engine(engine, i915, id) {
5115                 err = engine->init_hw(engine);
5116                 if (err) {
5117                         DRM_ERROR("Failed to restart %s (%d)\n",
5118                                   engine->name, err);
5119                         return err;
5120                 }
5121         }
5122
5123         return 0;
5124 }
5125
5126 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5127 {
5128         int ret;
5129
5130         dev_priv->gt.last_init_time = ktime_get();
5131
5132         /* Double layer security blanket, see i915_gem_init() */
5133         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5134
5135         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5136                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5137
5138         if (IS_HASWELL(dev_priv))
5139                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5140                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5141
5142         /* Apply the GT workarounds... */
5143         intel_gt_apply_workarounds(dev_priv);
5144         /* ...and determine whether they are sticking. */
5145         intel_gt_verify_workarounds(dev_priv, "init");
5146
5147         i915_gem_init_swizzling(dev_priv);
5148
5149         /*
5150          * At least 830 can leave some of the unused rings
5151          * "active" (ie. head != tail) after resume which
5152          * will prevent c3 entry. Makes sure all unused rings
5153          * are totally idle.
5154          */
5155         init_unused_rings(dev_priv);
5156
5157         BUG_ON(!dev_priv->kernel_context);
5158         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5159                 ret = -EIO;
5160                 goto out;
5161         }
5162
5163         ret = i915_ppgtt_init_hw(dev_priv);
5164         if (ret) {
5165                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5166                 goto out;
5167         }
5168
5169         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5170         if (ret) {
5171                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5172                 goto out;
5173         }
5174
5175         /* We can't enable contexts until all firmware is loaded */
5176         ret = intel_uc_init_hw(dev_priv);
5177         if (ret) {
5178                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5179                 goto out;
5180         }
5181
5182         intel_mocs_init_l3cc_table(dev_priv);
5183
5184         /* Only when the HW is re-initialised, can we replay the requests */
5185         ret = __i915_gem_restart_engines(dev_priv);
5186         if (ret)
5187                 goto cleanup_uc;
5188
5189         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5190
5191         return 0;
5192
5193 cleanup_uc:
5194         intel_uc_fini_hw(dev_priv);
5195 out:
5196         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5197
5198         return ret;
5199 }
5200
5201 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5202 {
5203         struct i915_gem_context *ctx;
5204         struct intel_engine_cs *engine;
5205         enum intel_engine_id id;
5206         int err;
5207
5208         /*
5209          * As we reset the gpu during very early sanitisation, the current
5210          * register state on the GPU should reflect its defaults values.
5211          * We load a context onto the hw (with restore-inhibit), then switch
5212          * over to a second context to save that default register state. We
5213          * can then prime every new context with that state so they all start
5214          * from the same default HW values.
5215          */
5216
5217         ctx = i915_gem_context_create_kernel(i915, 0);
5218         if (IS_ERR(ctx))
5219                 return PTR_ERR(ctx);
5220
5221         for_each_engine(engine, i915, id) {
5222                 struct i915_request *rq;
5223
5224                 rq = i915_request_alloc(engine, ctx);
5225                 if (IS_ERR(rq)) {
5226                         err = PTR_ERR(rq);
5227                         goto out_ctx;
5228                 }
5229
5230                 err = 0;
5231                 if (engine->init_context)
5232                         err = engine->init_context(rq);
5233
5234                 i915_request_add(rq);
5235                 if (err)
5236                         goto err_active;
5237         }
5238
5239         err = i915_gem_switch_to_kernel_context(i915);
5240         if (err)
5241                 goto err_active;
5242
5243         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5244                 i915_gem_set_wedged(i915);
5245                 err = -EIO; /* Caller will declare us wedged */
5246                 goto err_active;
5247         }
5248
5249         assert_kernel_context_is_current(i915);
5250
5251         /*
5252          * Immediately park the GPU so that we enable powersaving and
5253          * treat it as idle. The next time we issue a request, we will
5254          * unpark and start using the engine->pinned_default_state, otherwise
5255          * it is in limbo and an early reset may fail.
5256          */
5257         __i915_gem_park(i915);
5258
5259         for_each_engine(engine, i915, id) {
5260                 struct i915_vma *state;
5261                 void *vaddr;
5262
5263                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
5264
5265                 state = to_intel_context(ctx, engine)->state;
5266                 if (!state)
5267                         continue;
5268
5269                 /*
5270                  * As we will hold a reference to the logical state, it will
5271                  * not be torn down with the context, and importantly the
5272                  * object will hold onto its vma (making it possible for a
5273                  * stray GTT write to corrupt our defaults). Unmap the vma
5274                  * from the GTT to prevent such accidents and reclaim the
5275                  * space.
5276                  */
5277                 err = i915_vma_unbind(state);
5278                 if (err)
5279                         goto err_active;
5280
5281                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5282                 if (err)
5283                         goto err_active;
5284
5285                 engine->default_state = i915_gem_object_get(state->obj);
5286
5287                 /* Check we can acquire the image of the context state */
5288                 vaddr = i915_gem_object_pin_map(engine->default_state,
5289                                                 I915_MAP_FORCE_WB);
5290                 if (IS_ERR(vaddr)) {
5291                         err = PTR_ERR(vaddr);
5292                         goto err_active;
5293                 }
5294
5295                 i915_gem_object_unpin_map(engine->default_state);
5296         }
5297
5298         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5299                 unsigned int found = intel_engines_has_context_isolation(i915);
5300
5301                 /*
5302                  * Make sure that classes with multiple engine instances all
5303                  * share the same basic configuration.
5304                  */
5305                 for_each_engine(engine, i915, id) {
5306                         unsigned int bit = BIT(engine->uabi_class);
5307                         unsigned int expected = engine->default_state ? bit : 0;
5308
5309                         if ((found & bit) != expected) {
5310                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5311                                           engine->uabi_class, engine->name);
5312                         }
5313                 }
5314         }
5315
5316 out_ctx:
5317         i915_gem_context_set_closed(ctx);
5318         i915_gem_context_put(ctx);
5319         return err;
5320
5321 err_active:
5322         /*
5323          * If we have to abandon now, we expect the engines to be idle
5324          * and ready to be torn-down. First try to flush any remaining
5325          * request, ensure we are pointing at the kernel context and
5326          * then remove it.
5327          */
5328         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5329                 goto out_ctx;
5330
5331         if (WARN_ON(i915_gem_wait_for_idle(i915,
5332                                            I915_WAIT_LOCKED,
5333                                            MAX_SCHEDULE_TIMEOUT)))
5334                 goto out_ctx;
5335
5336         i915_gem_contexts_lost(i915);
5337         goto out_ctx;
5338 }
5339
5340 static int
5341 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
5342 {
5343         struct drm_i915_gem_object *obj;
5344         struct i915_vma *vma;
5345         int ret;
5346
5347         obj = i915_gem_object_create_stolen(i915, size);
5348         if (!obj)
5349                 obj = i915_gem_object_create_internal(i915, size);
5350         if (IS_ERR(obj)) {
5351                 DRM_ERROR("Failed to allocate scratch page\n");
5352                 return PTR_ERR(obj);
5353         }
5354
5355         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5356         if (IS_ERR(vma)) {
5357                 ret = PTR_ERR(vma);
5358                 goto err_unref;
5359         }
5360
5361         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5362         if (ret)
5363                 goto err_unref;
5364
5365         i915->gt.scratch = vma;
5366         return 0;
5367
5368 err_unref:
5369         i915_gem_object_put(obj);
5370         return ret;
5371 }
5372
5373 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5374 {
5375         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5376 }
5377
5378 int i915_gem_init(struct drm_i915_private *dev_priv)
5379 {
5380         int ret;
5381
5382         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5383         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5384                 mkwrite_device_info(dev_priv)->page_sizes =
5385                         I915_GTT_PAGE_SIZE_4K;
5386
5387         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5388
5389         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5390                 dev_priv->gt.resume = intel_lr_context_resume;
5391                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5392         } else {
5393                 dev_priv->gt.resume = intel_legacy_submission_resume;
5394                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5395         }
5396
5397         ret = i915_gem_init_userptr(dev_priv);
5398         if (ret)
5399                 return ret;
5400
5401         ret = intel_uc_init_misc(dev_priv);
5402         if (ret)
5403                 return ret;
5404
5405         ret = intel_wopcm_init(&dev_priv->wopcm);
5406         if (ret)
5407                 goto err_uc_misc;
5408
5409         /* This is just a security blanket to placate dragons.
5410          * On some systems, we very sporadically observe that the first TLBs
5411          * used by the CS may be stale, despite us poking the TLB reset. If
5412          * we hold the forcewake during initialisation these problems
5413          * just magically go away.
5414          */
5415         mutex_lock(&dev_priv->drm.struct_mutex);
5416         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5417
5418         ret = i915_gem_init_ggtt(dev_priv);
5419         if (ret) {
5420                 GEM_BUG_ON(ret == -EIO);
5421                 goto err_unlock;
5422         }
5423
5424         ret = i915_gem_init_scratch(dev_priv,
5425                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5426         if (ret) {
5427                 GEM_BUG_ON(ret == -EIO);
5428                 goto err_ggtt;
5429         }
5430
5431         ret = i915_gem_contexts_init(dev_priv);
5432         if (ret) {
5433                 GEM_BUG_ON(ret == -EIO);
5434                 goto err_scratch;
5435         }
5436
5437         ret = intel_engines_init(dev_priv);
5438         if (ret) {
5439                 GEM_BUG_ON(ret == -EIO);
5440                 goto err_context;
5441         }
5442
5443         intel_init_gt_powersave(dev_priv);
5444
5445         ret = intel_uc_init(dev_priv);
5446         if (ret)
5447                 goto err_pm;
5448
5449         ret = i915_gem_init_hw(dev_priv);
5450         if (ret)
5451                 goto err_uc_init;
5452
5453         /*
5454          * Despite its name intel_init_clock_gating applies both display
5455          * clock gating workarounds; GT mmio workarounds and the occasional
5456          * GT power context workaround. Worse, sometimes it includes a context
5457          * register workaround which we need to apply before we record the
5458          * default HW state for all contexts.
5459          *
5460          * FIXME: break up the workarounds and apply them at the right time!
5461          */
5462         intel_init_clock_gating(dev_priv);
5463
5464         ret = __intel_engines_record_defaults(dev_priv);
5465         if (ret)
5466                 goto err_init_hw;
5467
5468         if (i915_inject_load_failure()) {
5469                 ret = -ENODEV;
5470                 goto err_init_hw;
5471         }
5472
5473         if (i915_inject_load_failure()) {
5474                 ret = -EIO;
5475                 goto err_init_hw;
5476         }
5477
5478         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5479         mutex_unlock(&dev_priv->drm.struct_mutex);
5480
5481         return 0;
5482
5483         /*
5484          * Unwinding is complicated by that we want to handle -EIO to mean
5485          * disable GPU submission but keep KMS alive. We want to mark the
5486          * HW as irrevisibly wedged, but keep enough state around that the
5487          * driver doesn't explode during runtime.
5488          */
5489 err_init_hw:
5490         mutex_unlock(&dev_priv->drm.struct_mutex);
5491
5492         WARN_ON(i915_gem_suspend(dev_priv));
5493         i915_gem_suspend_late(dev_priv);
5494
5495         i915_gem_drain_workqueue(dev_priv);
5496
5497         mutex_lock(&dev_priv->drm.struct_mutex);
5498         intel_uc_fini_hw(dev_priv);
5499 err_uc_init:
5500         intel_uc_fini(dev_priv);
5501 err_pm:
5502         if (ret != -EIO) {
5503                 intel_cleanup_gt_powersave(dev_priv);
5504                 i915_gem_cleanup_engines(dev_priv);
5505         }
5506 err_context:
5507         if (ret != -EIO)
5508                 i915_gem_contexts_fini(dev_priv);
5509 err_scratch:
5510         i915_gem_fini_scratch(dev_priv);
5511 err_ggtt:
5512 err_unlock:
5513         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5514         mutex_unlock(&dev_priv->drm.struct_mutex);
5515
5516 err_uc_misc:
5517         intel_uc_fini_misc(dev_priv);
5518
5519         if (ret != -EIO)
5520                 i915_gem_cleanup_userptr(dev_priv);
5521
5522         if (ret == -EIO) {
5523                 mutex_lock(&dev_priv->drm.struct_mutex);
5524
5525                 /*
5526                  * Allow engine initialisation to fail by marking the GPU as
5527                  * wedged. But we only want to do this where the GPU is angry,
5528                  * for all other failure, such as an allocation failure, bail.
5529                  */
5530                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5531                         i915_load_error(dev_priv,
5532                                         "Failed to initialize GPU, declaring it wedged!\n");
5533                         i915_gem_set_wedged(dev_priv);
5534                 }
5535
5536                 /* Minimal basic recovery for KMS */
5537                 ret = i915_ggtt_enable_hw(dev_priv);
5538                 i915_gem_restore_gtt_mappings(dev_priv);
5539                 i915_gem_restore_fences(dev_priv);
5540                 intel_init_clock_gating(dev_priv);
5541
5542                 mutex_unlock(&dev_priv->drm.struct_mutex);
5543         }
5544
5545         i915_gem_drain_freed_objects(dev_priv);
5546         return ret;
5547 }
5548
5549 void i915_gem_fini(struct drm_i915_private *dev_priv)
5550 {
5551         i915_gem_suspend_late(dev_priv);
5552         intel_disable_gt_powersave(dev_priv);
5553
5554         /* Flush any outstanding unpin_work. */
5555         i915_gem_drain_workqueue(dev_priv);
5556
5557         mutex_lock(&dev_priv->drm.struct_mutex);
5558         intel_uc_fini_hw(dev_priv);
5559         intel_uc_fini(dev_priv);
5560         i915_gem_cleanup_engines(dev_priv);
5561         i915_gem_contexts_fini(dev_priv);
5562         i915_gem_fini_scratch(dev_priv);
5563         mutex_unlock(&dev_priv->drm.struct_mutex);
5564
5565         intel_wa_list_free(&dev_priv->gt_wa_list);
5566
5567         intel_cleanup_gt_powersave(dev_priv);
5568
5569         intel_uc_fini_misc(dev_priv);
5570         i915_gem_cleanup_userptr(dev_priv);
5571
5572         i915_gem_drain_freed_objects(dev_priv);
5573
5574         WARN_ON(!list_empty(&dev_priv->contexts.list));
5575 }
5576
/* MMIO-stage GEM setup: currently just runs i915_gem_sanitize(). */
void i915_gem_init_mmio(struct drm_i915_private *i915)
{
        i915_gem_sanitize(i915);
}
5581
5582 void
5583 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5584 {
5585         struct intel_engine_cs *engine;
5586         enum intel_engine_id id;
5587
5588         for_each_engine(engine, dev_priv, id)
5589                 dev_priv->gt.cleanup_engine(engine);
5590 }
5591
5592 void
5593 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5594 {
5595         int i;
5596
5597         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5598             !IS_CHERRYVIEW(dev_priv))
5599                 dev_priv->num_fence_regs = 32;
5600         else if (INTEL_GEN(dev_priv) >= 4 ||
5601                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5602                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5603                 dev_priv->num_fence_regs = 16;
5604         else
5605                 dev_priv->num_fence_regs = 8;
5606
5607         if (intel_vgpu_active(dev_priv))
5608                 dev_priv->num_fence_regs =
5609                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5610
5611         /* Initialize fence registers to zero */
5612         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5613                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5614
5615                 fence->i915 = dev_priv;
5616                 fence->id = i;
5617                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5618         }
5619         i915_gem_restore_fences(dev_priv);
5620
5621         i915_gem_detect_bit_6_swizzle(dev_priv);
5622 }
5623
5624 static void i915_gem_init__mm(struct drm_i915_private *i915)
5625 {
5626         spin_lock_init(&i915->mm.object_stat_lock);
5627         spin_lock_init(&i915->mm.obj_lock);
5628         spin_lock_init(&i915->mm.free_lock);
5629
5630         init_llist_head(&i915->mm.free_list);
5631
5632         INIT_LIST_HEAD(&i915->mm.unbound_list);
5633         INIT_LIST_HEAD(&i915->mm.bound_list);
5634         INIT_LIST_HEAD(&i915->mm.fence_list);
5635         INIT_LIST_HEAD(&i915->mm.userfault_list);
5636
5637         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5638 }
5639
5640 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5641 {
5642         int err = -ENOMEM;
5643
5644         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5645         if (!dev_priv->objects)
5646                 goto err_out;
5647
5648         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5649         if (!dev_priv->vmas)
5650                 goto err_objects;
5651
5652         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5653         if (!dev_priv->luts)
5654                 goto err_vmas;
5655
5656         dev_priv->requests = KMEM_CACHE(i915_request,
5657                                         SLAB_HWCACHE_ALIGN |
5658                                         SLAB_RECLAIM_ACCOUNT |
5659                                         SLAB_TYPESAFE_BY_RCU);
5660         if (!dev_priv->requests)
5661                 goto err_luts;
5662
5663         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5664                                             SLAB_HWCACHE_ALIGN |
5665                                             SLAB_RECLAIM_ACCOUNT);
5666         if (!dev_priv->dependencies)
5667                 goto err_requests;
5668
5669         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5670         if (!dev_priv->priorities)
5671                 goto err_dependencies;
5672
5673         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5674         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5675         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5676
5677         i915_gem_init__mm(dev_priv);
5678
5679         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5680                           i915_gem_retire_work_handler);
5681         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5682                           i915_gem_idle_work_handler);
5683         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5684         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5685
5686         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5687
5688         spin_lock_init(&dev_priv->fb_tracking.lock);
5689
5690         err = i915_gemfs_init(dev_priv);
5691         if (err)
5692                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5693
5694         return 0;
5695
5696 err_dependencies:
5697         kmem_cache_destroy(dev_priv->dependencies);
5698 err_requests:
5699         kmem_cache_destroy(dev_priv->requests);
5700 err_luts:
5701         kmem_cache_destroy(dev_priv->luts);
5702 err_vmas:
5703         kmem_cache_destroy(dev_priv->vmas);
5704 err_objects:
5705         kmem_cache_destroy(dev_priv->objects);
5706 err_out:
5707         return err;
5708 }
5709
5710 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5711 {
5712         i915_gem_drain_freed_objects(dev_priv);
5713         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5714         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5715         WARN_ON(dev_priv->mm.object_count);
5716         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5717
5718         kmem_cache_destroy(dev_priv->priorities);
5719         kmem_cache_destroy(dev_priv->dependencies);
5720         kmem_cache_destroy(dev_priv->requests);
5721         kmem_cache_destroy(dev_priv->luts);
5722         kmem_cache_destroy(dev_priv->vmas);
5723         kmem_cache_destroy(dev_priv->objects);
5724
5725         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5726         rcu_barrier();
5727
5728         i915_gemfs_fini(dev_priv);
5729 }
5730
/*
 * i915_gem_freeze - shrink GEM objects ahead of a freeze
 * @dev_priv: i915 device
 *
 * Always returns 0.
 */
int i915_gem_freeze(struct drm_i915_private *dev_priv)
{
	/*
	 * Throw away every purgeable object now; userspace is expected to
	 * recreate whatever it still needs once we have resumed.
	 */
	i915_gem_shrink_all(dev_priv);

	return 0;
}
5740
5741 int i915_gem_freeze_late(struct drm_i915_private *i915)
5742 {
5743         struct drm_i915_gem_object *obj;
5744         struct list_head *phases[] = {
5745                 &i915->mm.unbound_list,
5746                 &i915->mm.bound_list,
5747                 NULL
5748         }, **phase;
5749
5750         /*
5751          * Called just before we write the hibernation image.
5752          *
5753          * We need to update the domain tracking to reflect that the CPU
5754          * will be accessing all the pages to create and restore from the
5755          * hibernation, and so upon restoration those pages will be in the
5756          * CPU domain.
5757          *
5758          * To make sure the hibernation image contains the latest state,
5759          * we update that state just before writing out the image.
5760          *
5761          * To try and reduce the hibernation image, we manually shrink
5762          * the objects as well, see i915_gem_freeze()
5763          */
5764
5765         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5766         i915_gem_drain_freed_objects(i915);
5767
5768         mutex_lock(&i915->drm.struct_mutex);
5769         for (phase = phases; *phase; phase++) {
5770                 list_for_each_entry(obj, *phase, mm.link)
5771                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5772         }
5773         mutex_unlock(&i915->drm.struct_mutex);
5774
5775         return 0;
5776 }
5777
5778 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5779 {
5780         struct drm_i915_file_private *file_priv = file->driver_priv;
5781         struct i915_request *request;
5782
5783         /* Clean up our request list when the client is going away, so that
5784          * later retire_requests won't dereference our soon-to-be-gone
5785          * file_priv.
5786          */
5787         spin_lock(&file_priv->mm.lock);
5788         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5789                 request->file_priv = NULL;
5790         spin_unlock(&file_priv->mm.lock);
5791 }
5792
5793 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5794 {
5795         struct drm_i915_file_private *file_priv;
5796         int ret;
5797
5798         DRM_DEBUG("\n");
5799
5800         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5801         if (!file_priv)
5802                 return -ENOMEM;
5803
5804         file->driver_priv = file_priv;
5805         file_priv->dev_priv = i915;
5806         file_priv->file = file;
5807
5808         spin_lock_init(&file_priv->mm.lock);
5809         INIT_LIST_HEAD(&file_priv->mm.request_list);
5810
5811         file_priv->bsd_engine = -1;
5812         file_priv->hang_timestamp = jiffies;
5813
5814         ret = i915_gem_context_open(i915, file);
5815         if (ret)
5816                 kfree(file_priv);
5817
5818         return ret;
5819 }
5820
5821 /**
5822  * i915_gem_track_fb - update frontbuffer tracking
5823  * @old: current GEM buffer for the frontbuffer slots
5824  * @new: new GEM buffer for the frontbuffer slots
5825  * @frontbuffer_bits: bitmask of frontbuffer slots
5826  *
5827  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5828  * from @old and setting them in @new. Both @old and @new can be NULL.
5829  */
5830 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5831                        struct drm_i915_gem_object *new,
5832                        unsigned frontbuffer_bits)
5833 {
5834         /* Control of individual bits within the mask are guarded by
5835          * the owning plane->mutex, i.e. we can never see concurrent
5836          * manipulation of individual bits. But since the bitfield as a whole
5837          * is updated using RMW, we need to use atomics in order to update
5838          * the bits.
5839          */
5840         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5841                      BITS_PER_TYPE(atomic_t));
5842
5843         if (old) {
5844                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5845                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5846         }
5847
5848         if (new) {
5849                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5850                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5851         }
5852 }
5853
5854 /* Allocate a new GEM object and fill it with the supplied data */
5855 struct drm_i915_gem_object *
5856 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5857                                  const void *data, size_t size)
5858 {
5859         struct drm_i915_gem_object *obj;
5860         struct file *file;
5861         size_t offset;
5862         int err;
5863
5864         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5865         if (IS_ERR(obj))
5866                 return obj;
5867
5868         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5869
5870         file = obj->base.filp;
5871         offset = 0;
5872         do {
5873                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5874                 struct page *page;
5875                 void *pgdata, *vaddr;
5876
5877                 err = pagecache_write_begin(file, file->f_mapping,
5878                                             offset, len, 0,
5879                                             &page, &pgdata);
5880                 if (err < 0)
5881                         goto fail;
5882
5883                 vaddr = kmap(page);
5884                 memcpy(vaddr, data, len);
5885                 kunmap(page);
5886
5887                 err = pagecache_write_end(file, file->f_mapping,
5888                                           offset, len, len,
5889                                           page, pgdata);
5890                 if (err < 0)
5891                         goto fail;
5892
5893                 size -= len;
5894                 data += len;
5895                 offset += len;
5896         } while (size);
5897
5898         return obj;
5899
5900 fail:
5901         i915_gem_object_put(obj);
5902         return ERR_PTR(err);
5903 }
5904
5905 struct scatterlist *
5906 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5907                        unsigned int n,
5908                        unsigned int *offset)
5909 {
5910         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5911         struct scatterlist *sg;
5912         unsigned int idx, count;
5913
5914         might_sleep();
5915         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5916         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5917
5918         /* As we iterate forward through the sg, we record each entry in a
5919          * radixtree for quick repeated (backwards) lookups. If we have seen
5920          * this index previously, we will have an entry for it.
5921          *
5922          * Initial lookup is O(N), but this is amortized to O(1) for
5923          * sequential page access (where each new request is consecutive
5924          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5925          * i.e. O(1) with a large constant!
5926          */
5927         if (n < READ_ONCE(iter->sg_idx))
5928                 goto lookup;
5929
5930         mutex_lock(&iter->lock);
5931
5932         /* We prefer to reuse the last sg so that repeated lookup of this
5933          * (or the subsequent) sg are fast - comparing against the last
5934          * sg is faster than going through the radixtree.
5935          */
5936
5937         sg = iter->sg_pos;
5938         idx = iter->sg_idx;
5939         count = __sg_page_count(sg);
5940
5941         while (idx + count <= n) {
5942                 void *entry;
5943                 unsigned long i;
5944                 int ret;
5945
5946                 /* If we cannot allocate and insert this entry, or the
5947                  * individual pages from this range, cancel updating the
5948                  * sg_idx so that on this lookup we are forced to linearly
5949                  * scan onwards, but on future lookups we will try the
5950                  * insertion again (in which case we need to be careful of
5951                  * the error return reporting that we have already inserted
5952                  * this index).
5953                  */
5954                 ret = radix_tree_insert(&iter->radix, idx, sg);
5955                 if (ret && ret != -EEXIST)
5956                         goto scan;
5957
5958                 entry = xa_mk_value(idx);
5959                 for (i = 1; i < count; i++) {
5960                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5961                         if (ret && ret != -EEXIST)
5962                                 goto scan;
5963                 }
5964
5965                 idx += count;
5966                 sg = ____sg_next(sg);
5967                 count = __sg_page_count(sg);
5968         }
5969
5970 scan:
5971         iter->sg_pos = sg;
5972         iter->sg_idx = idx;
5973
5974         mutex_unlock(&iter->lock);
5975
5976         if (unlikely(n < idx)) /* insertion completed by another thread */
5977                 goto lookup;
5978
5979         /* In case we failed to insert the entry into the radixtree, we need
5980          * to look beyond the current sg.
5981          */
5982         while (idx + count <= n) {
5983                 idx += count;
5984                 sg = ____sg_next(sg);
5985                 count = __sg_page_count(sg);
5986         }
5987
5988         *offset = n - idx;
5989         return sg;
5990
5991 lookup:
5992         rcu_read_lock();
5993
5994         sg = radix_tree_lookup(&iter->radix, n);
5995         GEM_BUG_ON(!sg);
5996
5997         /* If this index is in the middle of multi-page sg entry,
5998          * the radix tree will contain a value entry that points
5999          * to the start of that range. We will return the pointer to
6000          * the base page and the offset of this page within the
6001          * sg entry's range.
6002          */
6003         *offset = 0;
6004         if (unlikely(xa_is_value(sg))) {
6005                 unsigned long base = xa_to_value(sg);
6006
6007                 sg = radix_tree_lookup(&iter->radix, base);
6008                 GEM_BUG_ON(!sg);
6009
6010                 *offset = n - base;
6011         }
6012
6013         rcu_read_unlock();
6014
6015         return sg;
6016 }
6017
/*
 * Return the struct page backing page index @n of @obj. The object must be
 * backed by struct pages.
 */
struct page *
i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
{
	unsigned int page_in_sg;
	struct scatterlist *sg;

	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));

	/* Locate the sg entry, then step to the wanted page inside it. */
	sg = i915_gem_object_get_sg(obj, n, &page_in_sg);

	return nth_page(sg_page(sg), page_in_sg);
}
6029
6030 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6031 struct page *
6032 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6033                                unsigned int n)
6034 {
6035         struct page *page;
6036
6037         page = i915_gem_object_get_page(obj, n);
6038         if (!obj->mm.dirty)
6039                 set_page_dirty(page);
6040
6041         return page;
6042 }
6043
6044 dma_addr_t
6045 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6046                                 unsigned long n)
6047 {
6048         struct scatterlist *sg;
6049         unsigned int offset;
6050
6051         sg = i915_gem_object_get_sg(obj, n, &offset);
6052         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6053 }
6054
6055 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6056 {
6057         struct sg_table *pages;
6058         int err;
6059
6060         if (align > obj->base.size)
6061                 return -EINVAL;
6062
6063         if (obj->ops == &i915_gem_phys_ops)
6064                 return 0;
6065
6066         if (obj->ops != &i915_gem_object_ops)
6067                 return -EINVAL;
6068
6069         err = i915_gem_object_unbind(obj);
6070         if (err)
6071                 return err;
6072
6073         mutex_lock(&obj->mm.lock);
6074
6075         if (obj->mm.madv != I915_MADV_WILLNEED) {
6076                 err = -EFAULT;
6077                 goto err_unlock;
6078         }
6079
6080         if (obj->mm.quirked) {
6081                 err = -EFAULT;
6082                 goto err_unlock;
6083         }
6084
6085         if (obj->mm.mapping) {
6086                 err = -EBUSY;
6087                 goto err_unlock;
6088         }
6089
6090         pages = __i915_gem_object_unset_pages(obj);
6091
6092         obj->ops = &i915_gem_phys_ops;
6093
6094         err = ____i915_gem_object_get_pages(obj);
6095         if (err)
6096                 goto err_xfer;
6097
6098         /* Perma-pin (until release) the physical set of pages */
6099         __i915_gem_object_pin_pages(obj);
6100
6101         if (!IS_ERR_OR_NULL(pages))
6102                 i915_gem_object_ops.put_pages(obj, pages);
6103         mutex_unlock(&obj->mm.lock);
6104         return 0;
6105
6106 err_xfer:
6107         obj->ops = &i915_gem_object_ops;
6108         if (!IS_ERR_OR_NULL(pages)) {
6109                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6110
6111                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6112         }
6113 err_unlock:
6114         mutex_unlock(&obj->mm.lock);
6115         return err;
6116 }
6117
6118 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6119 #include "selftests/scatterlist.c"
6120 #include "selftests/mock_gem_device.c"
6121 #include "selftests/huge_gem_object.c"
6122 #include "selftests/huge_pages.c"
6123 #include "selftests/i915_gem_object.c"
6124 #include "selftests/i915_gem_coherency.c"
6125 #include "selftests/i915_gem.c"
6126 #endif