drivers/gpu/drm/scheduler/sched_main.c

   1 /*
   2  * Copyright 2015 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23
  24 /**
  25  * DOC: Overview
  26  *
  27  * The GPU scheduler provides entities which allow userspace to push jobs
  28  * into software queues which are then scheduled on a hardware run queue.
  29  * The software queues have a priority among them. The scheduler selects the entities
  30  * from the run queue using a FIFO. The scheduler provides dependency handling
  31  * features among jobs. The driver is supposed to provide callback functions for
  32  * backend operations to the scheduler like submitting a job to hardware run queue,
  33  * returning the dependencies of a job etc.
  34  *
  35  * The organisation of the scheduler is the following:
  36  *
  37  * 1. Each hw run queue has one scheduler
  38  * 2. Each scheduler has multiple run queues with different priorities
  39  *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
  40  * 3. Each scheduler run queue has a queue of entities to schedule
  41  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
  42  *    the hardware.
  43  *
  44  * The jobs in an entity are always scheduled in the order that they were pushed.
  45  */
  46
  47 #include <linux/kthread.h>
  48 #include <linux/wait.h>
  49 #include <linux/sched.h>
  50 #include <linux/completion.h>
  51 #include <linux/dma-resv.h>
  52 #include <uapi/linux/sched/types.h>
  53
  54 #include <drm/drm_print.h>
  55 #include <drm/drm_gem.h>
  56 #include <drm/drm_syncobj.h>
  57 #include <drm/gpu_scheduler.h>
  58 #include <drm/spsc_queue.h>
  59
  60 #define CREATE_TRACE_POINTS
  61 #include "gpu_scheduler_trace.h"
  62
  63 #define to_drm_sched_job(sched_job)             \
  64                 container_of((sched_job), struct drm_sched_job, queue_node)
  65
  66 int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
  67
  68 /**
  69  * DOC: sched_policy (int)
  70  * Used to override default entities scheduling policy in a run queue.
  71  */
  72 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
  73 module_param_named(sched_policy, drm_sched_policy, int, 0444);
  74
  75 static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
  76                                                             const struct rb_node *b)
  77 {
  78         struct drm_sched_entity *ent_a =  rb_entry((a), struct drm_sched_entity, rb_tree_node);
  79         struct drm_sched_entity *ent_b =  rb_entry((b), struct drm_sched_entity, rb_tree_node);
  80
  81         return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting);
  82 }
  83
  84 static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)
  85 {
  86         struct drm_sched_rq *rq = entity->rq;
  87
  88         if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
  89                 rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
  90                 RB_CLEAR_NODE(&entity->rb_tree_node);
  91         }
  92 }
  93
  94 void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
  95 {
  96         /*
  97          * Both locks need to be grabbed, one to protect from entity->rq change
  98          * for entity from within concurrent drm_sched_entity_select_rq and the
  99          * other to update the rb tree structure.
 100          */
 101         spin_lock(&entity->rq_lock);
 102         spin_lock(&entity->rq->lock);
 103
 104         drm_sched_rq_remove_fifo_locked(entity);
 105
 106         entity->oldest_job_waiting = ts;
 107
 108         rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
 109                       drm_sched_entity_compare_before);
 110
 111         spin_unlock(&entity->rq->lock);
 112         spin_unlock(&entity->rq_lock);
 113 }
 114
 115 /**
 116  * drm_sched_rq_init - initialize a given run queue struct
 117  *
 118  * @sched: scheduler instance to associate with this run queue
 119  * @rq: scheduler run queue
 120  *
 121  * Initializes a scheduler runqueue.
 122  */
 123 static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
 124                               struct drm_sched_rq *rq)
 125 {
 126         spin_lock_init(&rq->lock);
 127         INIT_LIST_HEAD(&rq->entities);
 128         rq->rb_tree_root = RB_ROOT_CACHED;
 129         rq->current_entity = NULL;
 130         rq->sched = sched;
 131 }
 132
 133 /**
 134  * drm_sched_rq_add_entity - add an entity
 135  *
 136  * @rq: scheduler run queue
 137  * @entity: scheduler entity
 138  *
 139  * Adds a scheduler entity to the run queue.
 140  */
 141 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 142                              struct drm_sched_entity *entity)
 143 {
 144         if (!list_empty(&entity->list))
 145                 return;
 146
 147         spin_lock(&rq->lock);
 148
 149         atomic_inc(rq->sched->score);
 150         list_add_tail(&entity->list, &rq->entities);
 151
 152         spin_unlock(&rq->lock);
 153 }
 154
 155 /**
 156  * drm_sched_rq_remove_entity - remove an entity
 157  *
 158  * @rq: scheduler run queue
 159  * @entity: scheduler entity
 160  *
 161  * Removes a scheduler entity from the run queue.
 162  */
 163 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 164                                 struct drm_sched_entity *entity)
 165 {
 166         if (list_empty(&entity->list))
 167                 return;
 168
 169         spin_lock(&rq->lock);
 170
 171         atomic_dec(rq->sched->score);
 172         list_del_init(&entity->list);
 173
 174         if (rq->current_entity == entity)
 175                 rq->current_entity = NULL;
 176
 177         if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
 178                 drm_sched_rq_remove_fifo_locked(entity);
 179
 180         spin_unlock(&rq->lock);
 181 }
 182
 183 /**
 184  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
 185  *
 186  * @rq: scheduler run queue to check.
 187  *
 188  * Try to find a ready entity, returns NULL if none found.
 189  */
 190 static struct drm_sched_entity *
 191 drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
 192 {
 193         struct drm_sched_entity *entity;
 194
 195         spin_lock(&rq->lock);
 196
 197         entity = rq->current_entity;
 198         if (entity) {
 199                 list_for_each_entry_continue(entity, &rq->entities, list) {
 200                         if (drm_sched_entity_is_ready(entity)) {
 201                                 rq->current_entity = entity;
 202                                 reinit_completion(&entity->entity_idle);
 203                                 spin_unlock(&rq->lock);
 204                                 return entity;
 205                         }
 206                 }
 207         }
 208
 209         list_for_each_entry(entity, &rq->entities, list) {
 210
 211                 if (drm_sched_entity_is_ready(entity)) {
 212                         rq->current_entity = entity;
 213                         reinit_completion(&entity->entity_idle);
 214                         spin_unlock(&rq->lock);
 215                         return entity;
 216                 }
 217
 218                 if (entity == rq->current_entity)
 219                         break;
 220         }
 221
 222         spin_unlock(&rq->lock);
 223
 224         return NULL;
 225 }
 226
 227 /**
 228  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
 229  *
 230  * @rq: scheduler run queue to check.
 231  *
 232  * Find oldest waiting ready entity, returns NULL if none found.
 233  */
 234 static struct drm_sched_entity *
 235 drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 236 {
 237         struct rb_node *rb;
 238
 239         spin_lock(&rq->lock);
 240         for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
 241                 struct drm_sched_entity *entity;
 242
 243                 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
 244                 if (drm_sched_entity_is_ready(entity)) {
 245                         rq->current_entity = entity;
 246                         reinit_completion(&entity->entity_idle);
 247                         break;
 248                 }
 249         }
 250         spin_unlock(&rq->lock);
 251
 252         return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
 253 }
 254
 255 /**
 256  * drm_sched_job_done - complete a job
 257  * @s_job: pointer to the job which is done
 258  *
 259  * Finish the job's fence and wake up the worker thread.
 260  */
 261 static void drm_sched_job_done(struct drm_sched_job *s_job)
 262 {
 263         struct drm_sched_fence *s_fence = s_job->s_fence;
 264         struct drm_gpu_scheduler *sched = s_fence->sched;
 265
 266         atomic_dec(&sched->hw_rq_count);
 267         atomic_dec(sched->score);
 268
 269         trace_drm_sched_process_job(s_fence);
 270
 271         dma_fence_get(&s_fence->finished);
 272         drm_sched_fence_finished(s_fence);
 273         dma_fence_put(&s_fence->finished);
 274         wake_up_interruptible(&sched->wake_up_worker);
 275 }
 276
 277 /**
 278  * drm_sched_job_done_cb - the callback for a done job
 279  * @f: fence
 280  * @cb: fence callbacks
 281  */
 282 static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
 283 {
 284         struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
 285
 286         drm_sched_job_done(s_job);
 287 }
 288
 289 /**
 290  * drm_sched_start_timeout - start timeout for reset worker
 291  *
 292  * @sched: scheduler instance to start the worker for
 293  *
 294  * Start the timeout for the given scheduler.
 295  */
 296 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 297 {
 298         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 299             !list_empty(&sched->pending_list))
 300                 queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
 301 }
 302
 303 /**
 304  * drm_sched_fault - immediately start timeout handler
 305  *
 306  * @sched: scheduler where the timeout handling should be started.
 307  *
 308  * Start timeout handling immediately when the driver detects a hardware fault.
 309  */
 310 void drm_sched_fault(struct drm_gpu_scheduler *sched)
 311 {
 312         if (sched->timeout_wq)
 313                 mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
 314 }
 315 EXPORT_SYMBOL(drm_sched_fault);
 316
 317 /**
 318  * drm_sched_suspend_timeout - Suspend scheduler job timeout
 319  *
 320  * @sched: scheduler instance for which to suspend the timeout
 321  *
 322  * Suspend the delayed work timeout for the scheduler. This is done by
 323  * modifying the delayed work timeout to an arbitrary large value,
 324  * MAX_SCHEDULE_TIMEOUT in this case.
 325  *
 326  * Returns the timeout remaining
 327  *
 328  */
 329 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 330 {
 331         unsigned long sched_timeout, now = jiffies;
 332
 333         sched_timeout = sched->work_tdr.timer.expires;
 334
 335         /*
 336          * Modify the timeout to an arbitrarily large value. This also prevents
 337          * the timeout to be restarted when new submissions arrive
 338          */
 339         if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
 340                         && time_after(sched_timeout, now))
 341                 return sched_timeout - now;
 342         else
 343                 return sched->timeout;
 344 }
 345 EXPORT_SYMBOL(drm_sched_suspend_timeout);
 346
 347 /**
 348  * drm_sched_resume_timeout - Resume scheduler job timeout
 349  *
 350  * @sched: scheduler instance for which to resume the timeout
 351  * @remaining: remaining timeout
 352  *
 353  * Resume the delayed work timeout for the scheduler.
 354  */
 355 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 356                 unsigned long remaining)
 357 {
 358         spin_lock(&sched->job_list_lock);
 359
 360         if (list_empty(&sched->pending_list))
 361                 cancel_delayed_work(&sched->work_tdr);
 362         else
 363                 mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
 364
 365         spin_unlock(&sched->job_list_lock);
 366 }
 367 EXPORT_SYMBOL(drm_sched_resume_timeout);
 368
 369 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 370 {
 371         struct drm_gpu_scheduler *sched = s_job->sched;
 372
 373         spin_lock(&sched->job_list_lock);
 374         list_add_tail(&s_job->list, &sched->pending_list);
 375         drm_sched_start_timeout(sched);
 376         spin_unlock(&sched->job_list_lock);
 377 }
 378
 379 static void drm_sched_job_timedout(struct work_struct *work)
 380 {
 381         struct drm_gpu_scheduler *sched;
 382         struct drm_sched_job *job;
 383         enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
 384
 385         sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 386
 387         /* Protects against concurrent deletion in drm_sched_get_cleanup_job */
 388         spin_lock(&sched->job_list_lock);
 389         job = list_first_entry_or_null(&sched->pending_list,
 390                                        struct drm_sched_job, list);
 391
 392         if (job) {
 393                 /*
 394                  * Remove the bad job so it cannot be freed by concurrent
 395                  * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
 396                  * is parked at which point it's safe.
 397                  */
 398                 list_del_init(&job->list);
 399                 spin_unlock(&sched->job_list_lock);
 400
 401                 status = job->sched->ops->timedout_job(job);
 402
 403                 /*
 404                  * Guilty job did complete and hence needs to be manually removed
 405                  * See drm_sched_stop doc.
 406                  */
 407                 if (sched->free_guilty) {
 408                         job->sched->ops->free_job(job);
 409                         sched->free_guilty = false;
 410                 }
 411         } else {
 412                 spin_unlock(&sched->job_list_lock);
 413         }
 414
 415         if (status != DRM_GPU_SCHED_STAT_ENODEV) {
 416                 spin_lock(&sched->job_list_lock);
 417                 drm_sched_start_timeout(sched);
 418                 spin_unlock(&sched->job_list_lock);
 419         }
 420 }
 421
 422 /**
 423  * drm_sched_stop - stop the scheduler
 424  *
 425  * @sched: scheduler instance
 426  * @bad: job which caused the time out
 427  *
 428  * Stop the scheduler and also removes and frees all completed jobs.
 429  * Note: bad job will not be freed as it might be used later and so it's
 430  * callers responsibility to release it manually if it's not part of the
 431  * pending list any more.
 432  *
 433  */
 434 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 435 {
 436         struct drm_sched_job *s_job, *tmp;
 437
 438         kthread_park(sched->thread);
 439
 440         /*
 441          * Reinsert back the bad job here - now it's safe as
 442          * drm_sched_get_cleanup_job cannot race against us and release the
 443          * bad job at this point - we parked (waited for) any in progress
 444          * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
 445          * now until the scheduler thread is unparked.
 446          */
 447         if (bad && bad->sched == sched)
 448                 /*
 449                  * Add at the head of the queue to reflect it was the earliest
 450                  * job extracted.
 451                  */
 452                 list_add(&bad->list, &sched->pending_list);
 453
 454         /*
 455          * Iterate the job list from later to  earlier one and either deactive
 456          * their HW callbacks or remove them from pending list if they already
 457          * signaled.
 458          * This iteration is thread safe as sched thread is stopped.
 459          */
 460         list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
 461                                          list) {
 462                 if (s_job->s_fence->parent &&
 463                     dma_fence_remove_callback(s_job->s_fence->parent,
 464                                               &s_job->cb)) {
 465                         dma_fence_put(s_job->s_fence->parent);
 466                         s_job->s_fence->parent = NULL;
 467                         atomic_dec(&sched->hw_rq_count);
 468                 } else {
 469                         /*
 470                          * remove job from pending_list.
 471                          * Locking here is for concurrent resume timeout
 472                          */
 473                         spin_lock(&sched->job_list_lock);
 474                         list_del_init(&s_job->list);
 475                         spin_unlock(&sched->job_list_lock);
 476
 477                         /*
 478                          * Wait for job's HW fence callback to finish using s_job
 479                          * before releasing it.
 480                          *
 481                          * Job is still alive so fence refcount at least 1
 482                          */
 483                         dma_fence_wait(&s_job->s_fence->finished, false);
 484
 485                         /*
 486                          * We must keep bad job alive for later use during
 487                          * recovery by some of the drivers but leave a hint
 488                          * that the guilty job must be released.
 489                          */
 490                         if (bad != s_job)
 491                                 sched->ops->free_job(s_job);
 492                         else
 493                                 sched->free_guilty = true;
 494                 }
 495         }
 496
 497         /*
 498          * Stop pending timer in flight as we rearm it in  drm_sched_start. This
 499          * avoids the pending timeout work in progress to fire right away after
 500          * this TDR finished and before the newly restarted jobs had a
 501          * chance to complete.
 502          */
 503         cancel_delayed_work(&sched->work_tdr);
 504 }
 505
 506 EXPORT_SYMBOL(drm_sched_stop);
 507
 508 /**
 509  * drm_sched_start - recover jobs after a reset
 510  *
 511  * @sched: scheduler instance
 512  * @full_recovery: proceed with complete sched restart
 513  *
 514  */
 515 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 516 {
 517         struct drm_sched_job *s_job, *tmp;
 518         int r;
 519
 520         /*
 521          * Locking the list is not required here as the sched thread is parked
 522          * so no new jobs are being inserted or removed. Also concurrent
 523          * GPU recovers can't run in parallel.
 524          */
 525         list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 526                 struct dma_fence *fence = s_job->s_fence->parent;
 527
 528                 atomic_inc(&sched->hw_rq_count);
 529
 530                 if (!full_recovery)
 531                         continue;
 532
 533                 if (fence) {
 534                         r = dma_fence_add_callback(fence, &s_job->cb,
 535                                                    drm_sched_job_done_cb);
 536                         if (r == -ENOENT)
 537                                 drm_sched_job_done(s_job);
 538                         else if (r)
 539                                 DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 540                                           r);
 541                 } else
 542                         drm_sched_job_done(s_job);
 543         }
 544
 545         if (full_recovery) {
 546                 spin_lock(&sched->job_list_lock);
 547                 drm_sched_start_timeout(sched);
 548                 spin_unlock(&sched->job_list_lock);
 549         }
 550
 551         kthread_unpark(sched->thread);
 552 }
 553 EXPORT_SYMBOL(drm_sched_start);
 554
 555 /**
 556  * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
 557  *
 558  * @sched: scheduler instance
 559  *
 560  * Re-submitting jobs was a concept AMD came up as cheap way to implement
 561  * recovery after a job timeout.
 562  *
 563  * This turned out to be not working very well. First of all there are many
 564  * problem with the dma_fence implementation and requirements. Either the
 565  * implementation is risking deadlocks with core memory management or violating
 566  * documented implementation details of the dma_fence object.
 567  *
 568  * Drivers can still save and restore their state for recovery operations, but
 569  * we shouldn't make this a general scheduler feature around the dma_fence
 570  * interface.
 571  */
 572 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 573 {
 574         struct drm_sched_job *s_job, *tmp;
 575         uint64_t guilty_context;
 576         bool found_guilty = false;
 577         struct dma_fence *fence;
 578
 579         list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 580                 struct drm_sched_fence *s_fence = s_job->s_fence;
 581
 582                 if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 583                         found_guilty = true;
 584                         guilty_context = s_job->s_fence->scheduled.context;
 585                 }
 586
 587                 if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
 588                         dma_fence_set_error(&s_fence->finished, -ECANCELED);
 589
 590                 fence = sched->ops->run_job(s_job);
 591
 592                 if (IS_ERR_OR_NULL(fence)) {
 593                         if (IS_ERR(fence))
 594                                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
 595
 596                         s_job->s_fence->parent = NULL;
 597                 } else {
 598
 599                         s_job->s_fence->parent = dma_fence_get(fence);
 600
 601                         /* Drop for orignal kref_init */
 602                         dma_fence_put(fence);
 603                 }
 604         }
 605 }
 606 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 607
 608 /**
 609  * drm_sched_job_init - init a scheduler job
 610  * @job: scheduler job to init
 611  * @entity: scheduler entity to use
 612  * @owner: job owner for debugging
 613  *
 614  * Refer to drm_sched_entity_push_job() documentation
 615  * for locking considerations.
 616  *
 617  * Drivers must make sure drm_sched_job_cleanup() if this function returns
 618  * successfully, even when @job is aborted before drm_sched_job_arm() is called.
 619  *
 620  * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
 621  * has died, which can mean that there's no valid runqueue for a @entity.
 622  * This function returns -ENOENT in this case (which probably should be -EIO as
 623  * a more meanigful return value).
 624  *
 625  * Returns 0 for success, negative error code otherwise.
 626  */
 627 int drm_sched_job_init(struct drm_sched_job *job,
 628                        struct drm_sched_entity *entity,
 629                        void *owner)
 630 {
 631         if (!entity->rq)
 632                 return -ENOENT;
 633
 634         job->entity = entity;
 635         job->s_fence = drm_sched_fence_alloc(entity, owner);
 636         if (!job->s_fence)
 637                 return -ENOMEM;
 638
 639         INIT_LIST_HEAD(&job->list);
 640
 641         xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
 642
 643         return 0;
 644 }
 645 EXPORT_SYMBOL(drm_sched_job_init);
 646
 647 /**
 648  * drm_sched_job_arm - arm a scheduler job for execution
 649  * @job: scheduler job to arm
 650  *
 651  * This arms a scheduler job for execution. Specifically it initializes the
 652  * &drm_sched_job.s_fence of @job, so that it can be attached to struct dma_resv
 653  * or other places that need to track the completion of this job.
 654  *
 655  * Refer to drm_sched_entity_push_job() documentation for locking
 656  * considerations.
 657  *
 658  * This can only be called if drm_sched_job_init() succeeded.
 659  */
 660 void drm_sched_job_arm(struct drm_sched_job *job)
 661 {
 662         struct drm_gpu_scheduler *sched;
 663         struct drm_sched_entity *entity = job->entity;
 664
 665         BUG_ON(!entity);
 666         drm_sched_entity_select_rq(entity);
 667         sched = entity->rq->sched;
 668
 669         job->sched = sched;
 670         job->s_priority = entity->rq - sched->sched_rq;
 671         job->id = atomic64_inc_return(&sched->job_id_count);
 672
 673         drm_sched_fence_init(job->s_fence, job->entity);
 674 }
 675 EXPORT_SYMBOL(drm_sched_job_arm);
 676
 677 /**
 678  * drm_sched_job_add_dependency - adds the fence as a job dependency
 679  * @job: scheduler job to add the dependencies to
 680  * @fence: the dma_fence to add to the list of dependencies.
 681  *
 682  * Note that @fence is consumed in both the success and error cases.
 683  *
 684  * Returns:
 685  * 0 on success, or an error on failing to expand the array.
 686  */
 687 int drm_sched_job_add_dependency(struct drm_sched_job *job,
 688                                  struct dma_fence *fence)
 689 {
 690         struct dma_fence *entry;
 691         unsigned long index;
 692         u32 id = 0;
 693         int ret;
 694
 695         if (!fence)
 696                 return 0;
 697
 698         /* Deduplicate if we already depend on a fence from the same context.
 699          * This lets the size of the array of deps scale with the number of
 700          * engines involved, rather than the number of BOs.
 701          */
 702         xa_for_each(&job->dependencies, index, entry) {
 703                 if (entry->context != fence->context)
 704                         continue;
 705
 706                 if (dma_fence_is_later(fence, entry)) {
 707                         dma_fence_put(entry);
 708                         xa_store(&job->dependencies, index, fence, GFP_KERNEL);
 709                 } else {
 710                         dma_fence_put(fence);
 711                 }
 712                 return 0;
 713         }
 714
 715         ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
 716         if (ret != 0)
 717                 dma_fence_put(fence);
 718
 719         return ret;
 720 }
 721 EXPORT_SYMBOL(drm_sched_job_add_dependency);
 722
 723 /**
 724  * drm_sched_job_add_syncobj_dependency - adds a syncobj's fence as a job dependency
 725  * @job: scheduler job to add the dependencies to
 726  * @file: drm file private pointer
 727  * @handle: syncobj handle to lookup
 728  * @point: timeline point
 729  *
 730  * This adds the fence matching the given syncobj to @job.
 731  *
 732  * Returns:
 733  * 0 on success, or an error on failing to expand the array.
 734  */
 735 int drm_sched_job_add_syncobj_dependency(struct drm_sched_job *job,
 736                                          struct drm_file *file,
 737                                          u32 handle,
 738                                          u32 point)
 739 {
 740         struct dma_fence *fence;
 741         int ret;
 742
 743         ret = drm_syncobj_find_fence(file, handle, point, 0, &fence);
 744         if (ret)
 745                 return ret;
 746
 747         return drm_sched_job_add_dependency(job, fence);
 748 }
 749 EXPORT_SYMBOL(drm_sched_job_add_syncobj_dependency);
 750
 751 /**
 752  * drm_sched_job_add_resv_dependencies - add all fences from the resv to the job
 753  * @job: scheduler job to add the dependencies to
 754  * @resv: the dma_resv object to get the fences from
 755  * @usage: the dma_resv_usage to use to filter the fences
 756  *
 757  * This adds all fences matching the given usage from @resv to @job.
 758  * Must be called with the @resv lock held.
 759  *
 760  * Returns:
 761  * 0 on success, or an error on failing to expand the array.
 762  */
 763 int drm_sched_job_add_resv_dependencies(struct drm_sched_job *job,
 764                                         struct dma_resv *resv,
 765                                         enum dma_resv_usage usage)
 766 {
 767         struct dma_resv_iter cursor;
 768         struct dma_fence *fence;
 769         int ret;
 770
 771         dma_resv_assert_held(resv);
 772
 773         dma_resv_for_each_fence(&cursor, resv, usage, fence) {
 774                 /* Make sure to grab an additional ref on the added fence */
 775                 dma_fence_get(fence);
 776                 ret = drm_sched_job_add_dependency(job, fence);
 777                 if (ret) {
 778                         dma_fence_put(fence);
 779                         return ret;
 780                 }
 781         }
 782         return 0;
 783 }
 784 EXPORT_SYMBOL(drm_sched_job_add_resv_dependencies);
 785
 786 /**
 787  * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
 788  *   dependencies
 789  * @job: scheduler job to add the dependencies to
 790  * @obj: the gem object to add new dependencies from.
 791  * @write: whether the job might write the object (so we need to depend on
 792  * shared fences in the reservation object).
 793  *
 794  * This should be called after drm_gem_lock_reservations() on your array of
 795  * GEM objects used in the job but before updating the reservations with your
 796  * own fences.
 797  *
 798  * Returns:
 799  * 0 on success, or an error on failing to expand the array.
 800  */
 801 int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
 802                                             struct drm_gem_object *obj,
 803                                             bool write)
 804 {
 805         return drm_sched_job_add_resv_dependencies(job, obj->resv,
 806                                                    dma_resv_usage_rw(write));
 807 }
 808 EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
 809
 810 /**
 811  * drm_sched_job_cleanup - clean up scheduler job resources
 812  * @job: scheduler job to clean up
 813  *
 814  * Cleans up the resources allocated with drm_sched_job_init().
 815  *
 816  * Drivers should call this from their error unwind code if @job is aborted
 817  * before drm_sched_job_arm() is called.
 818  *
 819  * After that point of no return @job is committed to be executed by the
 820  * scheduler, and this function should be called from the
 821  * &drm_sched_backend_ops.free_job callback.
 822  */
 823 void drm_sched_job_cleanup(struct drm_sched_job *job)
 824 {
 825         struct dma_fence *fence;
 826         unsigned long index;
 827
 828         if (kref_read(&job->s_fence->finished.refcount)) {
 829                 /* drm_sched_job_arm() has been called */
 830                 dma_fence_put(&job->s_fence->finished);
 831         } else {
 832                 /* aborted job before committing to run it */
 833                 drm_sched_fence_free(job->s_fence);
 834         }
 835
 836         job->s_fence = NULL;
 837
 838         xa_for_each(&job->dependencies, index, fence) {
 839                 dma_fence_put(fence);
 840         }
 841         xa_destroy(&job->dependencies);
 842
 843 }
 844 EXPORT_SYMBOL(drm_sched_job_cleanup);
 845
 846 /**
 847  * drm_sched_ready - is the scheduler ready
 848  *
 849  * @sched: scheduler instance
 850  *
 851  * Return true if we can push more jobs to the hw, otherwise false.
 852  */
 853 static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
 854 {
 855         return atomic_read(&sched->hw_rq_count) <
 856                 sched->hw_submission_limit;
 857 }
 858
 859 /**
 860  * drm_sched_wakeup - Wake up the scheduler when it is ready
 861  *
 862  * @sched: scheduler instance
 863  *
 864  */
 865 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
 866 {
 867         if (drm_sched_ready(sched))
 868                 wake_up_interruptible(&sched->wake_up_worker);
 869 }
 870
 871 /**
 872  * drm_sched_select_entity - Select next entity to process
 873  *
 874  * @sched: scheduler instance
 875  *
 876  * Returns the entity to process or NULL if none are found.
 877  */
 878 static struct drm_sched_entity *
 879 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
 880 {
 881         struct drm_sched_entity *entity;
 882         int i;
 883
 884         if (!drm_sched_ready(sched))
 885                 return NULL;
 886
 887         /* Kernel run queue has higher priority than normal run queue*/
 888         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
 889                 entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
 890                         drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
 891                         drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
 892                 if (entity)
 893                         break;
 894         }
 895
 896         return entity;
 897 }
 898
 899 /**
 900  * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
 901  *
 902  * @sched: scheduler instance
 903  *
 904  * Returns the next finished job from the pending list (if there is one)
 905  * ready for it to be destroyed.
 906  */
 907 static struct drm_sched_job *
 908 drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 909 {
 910         struct drm_sched_job *job, *next;
 911
 912         spin_lock(&sched->job_list_lock);
 913
 914         job = list_first_entry_or_null(&sched->pending_list,
 915                                        struct drm_sched_job, list);
 916
 917         if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
 918                 /* remove job from pending_list */
 919                 list_del_init(&job->list);
 920
 921                 /* cancel this job's TO timer */
 922                 cancel_delayed_work(&sched->work_tdr);
 923                 /* make the scheduled timestamp more accurate */
 924                 next = list_first_entry_or_null(&sched->pending_list,
 925                                                 typeof(*next), list);
 926
 927                 if (next) {
 928                         next->s_fence->scheduled.timestamp =
 929                                 job->s_fence->finished.timestamp;
 930                         /* start TO timer for next job */
 931                         drm_sched_start_timeout(sched);
 932                 }
 933         } else {
 934                 job = NULL;
 935         }
 936
 937         spin_unlock(&sched->job_list_lock);
 938
 939         return job;
 940 }
 941
 942 /**
 943  * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
 944  * @sched_list: list of drm_gpu_schedulers
 945  * @num_sched_list: number of drm_gpu_schedulers in the sched_list
 946  *
 947  * Returns pointer of the sched with the least load or NULL if none of the
 948  * drm_gpu_schedulers are ready
 949  */
 950 struct drm_gpu_scheduler *
 951 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 952                      unsigned int num_sched_list)
 953 {
 954         struct drm_gpu_scheduler *sched, *picked_sched = NULL;
 955         int i;
 956         unsigned int min_score = UINT_MAX, num_score;
 957
 958         for (i = 0; i < num_sched_list; ++i) {
 959                 sched = sched_list[i];
 960
 961                 if (!sched->ready) {
 962                         DRM_WARN("scheduler %s is not ready, skipping",
 963                                  sched->name);
 964                         continue;
 965                 }
 966
 967                 num_score = atomic_read(sched->score);
 968                 if (num_score < min_score) {
 969                         min_score = num_score;
 970                         picked_sched = sched;
 971                 }
 972         }
 973
 974         return picked_sched;
 975 }
 976 EXPORT_SYMBOL(drm_sched_pick_best);
 977
 978 /**
 979  * drm_sched_blocked - check if the scheduler is blocked
 980  *
 981  * @sched: scheduler instance
 982  *
 983  * Returns true if blocked, otherwise false.
 984  */
 985 static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
 986 {
 987         if (kthread_should_park()) {
 988                 kthread_parkme();
 989                 return true;
 990         }
 991
 992         return false;
 993 }
 994
 995 /**
 996  * drm_sched_main - main scheduler thread
 997  *
 998  * @param: scheduler instance
 999  *
1000  * Returns 0.
1001  */
1002 static int drm_sched_main(void *param)
1003 {
1004         struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
1005         int r;
1006
1007         sched_set_fifo_low(current);
1008
1009         while (!kthread_should_stop()) {
1010                 struct drm_sched_entity *entity = NULL;
1011                 struct drm_sched_fence *s_fence;
1012                 struct drm_sched_job *sched_job;
1013                 struct dma_fence *fence;
1014                 struct drm_sched_job *cleanup_job = NULL;
1015
1016                 wait_event_interruptible(sched->wake_up_worker,
1017                                          (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
1018                                          (!drm_sched_blocked(sched) &&
1019                                           (entity = drm_sched_select_entity(sched))) ||
1020                                          kthread_should_stop());
1021
1022                 if (cleanup_job)
1023                         sched->ops->free_job(cleanup_job);
1024
1025                 if (!entity)
1026                         continue;
1027
1028                 sched_job = drm_sched_entity_pop_job(entity);
1029
1030                 if (!sched_job) {
1031                         complete_all(&entity->entity_idle);
1032                         continue;
1033                 }
1034
1035                 s_fence = sched_job->s_fence;
1036
1037                 atomic_inc(&sched->hw_rq_count);
1038                 drm_sched_job_begin(sched_job);
1039
1040                 trace_drm_run_job(sched_job, entity);
1041                 fence = sched->ops->run_job(sched_job);
1042                 complete_all(&entity->entity_idle);
1043                 drm_sched_fence_scheduled(s_fence);
1044
1045                 if (!IS_ERR_OR_NULL(fence)) {
1046                         drm_sched_fence_set_parent(s_fence, fence);
1047                         /* Drop for original kref_init of the fence */
1048                         dma_fence_put(fence);
1049
1050                         r = dma_fence_add_callback(fence, &sched_job->cb,
1051                                                    drm_sched_job_done_cb);
1052                         if (r == -ENOENT)
1053                                 drm_sched_job_done(sched_job);
1054                         else if (r)
1055                                 DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
1056                                           r);
1057                 } else {
1058                         if (IS_ERR(fence))
1059                                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
1060
1061                         drm_sched_job_done(sched_job);
1062                 }
1063
1064                 wake_up(&sched->job_scheduled);
1065         }
1066         return 0;
1067 }
1068
1069 /**
1070  * drm_sched_init - Init a gpu scheduler instance
1071  *
1072  * @sched: scheduler instance
1073  * @ops: backend operations for this scheduler
1074  * @hw_submission: number of hw submissions that can be in flight
1075  * @hang_limit: number of times to allow a job to hang before dropping it
1076  * @timeout: timeout value in jiffies for the scheduler
1077  * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
1078  *              used
1079  * @score: optional score atomic shared with other schedulers
1080  * @name: name used for debugging
1081  * @dev: target &struct device
1082  *
1083  * Return 0 on success, otherwise error code.
1084  */
1085 int drm_sched_init(struct drm_gpu_scheduler *sched,
1086                    const struct drm_sched_backend_ops *ops,
1087                    unsigned hw_submission, unsigned hang_limit,
1088                    long timeout, struct workqueue_struct *timeout_wq,
1089                    atomic_t *score, const char *name, struct device *dev)
1090 {
1091         int i, ret;
1092         sched->ops = ops;
1093         sched->hw_submission_limit = hw_submission;
1094         sched->name = name;
1095         sched->timeout = timeout;
1096         sched->timeout_wq = timeout_wq ? : system_wq;
1097         sched->hang_limit = hang_limit;
1098         sched->score = score ? score : &sched->_score;
1099         sched->dev = dev;
1100         for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
1101                 drm_sched_rq_init(sched, &sched->sched_rq[i]);
1102
1103         init_waitqueue_head(&sched->wake_up_worker);
1104         init_waitqueue_head(&sched->job_scheduled);
1105         INIT_LIST_HEAD(&sched->pending_list);
1106         spin_lock_init(&sched->job_list_lock);
1107         atomic_set(&sched->hw_rq_count, 0);
1108         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
1109         atomic_set(&sched->_score, 0);
1110         atomic64_set(&sched->job_id_count, 0);
1111
1112         /* Each scheduler will run on a seperate kernel thread */
1113         sched->thread = kthread_run(drm_sched_main, sched, sched->name);
1114         if (IS_ERR(sched->thread)) {
1115                 ret = PTR_ERR(sched->thread);
1116                 sched->thread = NULL;
1117                 DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
1118                 return ret;
1119         }
1120
1121         sched->ready = true;
1122         return 0;
1123 }
1124 EXPORT_SYMBOL(drm_sched_init);
1125
1126 /**
1127  * drm_sched_fini - Destroy a gpu scheduler
1128  *
1129  * @sched: scheduler instance
1130  *
1131  * Tears down and cleans up the scheduler.
1132  */
1133 void drm_sched_fini(struct drm_gpu_scheduler *sched)
1134 {
1135         struct drm_sched_entity *s_entity;
1136         int i;
1137
1138         if (sched->thread)
1139                 kthread_stop(sched->thread);
1140
1141         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
1142                 struct drm_sched_rq *rq = &sched->sched_rq[i];
1143
1144                 if (!rq)
1145                         continue;
1146
1147                 spin_lock(&rq->lock);
1148                 list_for_each_entry(s_entity, &rq->entities, list)
1149                         /*
1150                          * Prevents reinsertion and marks job_queue as idle,
1151                          * it will removed from rq in drm_sched_entity_fini
1152                          * eventually
1153                          */
1154                         s_entity->stopped = true;
1155                 spin_unlock(&rq->lock);
1156
1157         }
1158
1159         /* Wakeup everyone stuck in drm_sched_entity_flush for this scheduler */
1160         wake_up_all(&sched->job_scheduled);
1161
1162         /* Confirm no work left behind accessing device structures */
1163         cancel_delayed_work_sync(&sched->work_tdr);
1164
1165         sched->ready = false;
1166 }
1167 EXPORT_SYMBOL(drm_sched_fini);
1168
1169 /**
1170  * drm_sched_increase_karma - Update sched_entity guilty flag
1171  *
1172  * @bad: The job guilty of time out
1173  *
1174  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
1175  * limit of the scheduler then the respective sched entity is marked guilty and
1176  * jobs from it will not be scheduled further
1177  */
1178 void drm_sched_increase_karma(struct drm_sched_job *bad)
1179 {
1180         int i;
1181         struct drm_sched_entity *tmp;
1182         struct drm_sched_entity *entity;
1183         struct drm_gpu_scheduler *sched = bad->sched;
1184
1185         /* don't change @bad's karma if it's from KERNEL RQ,
1186          * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
1187          * corrupt but keep in mind that kernel jobs always considered good.
1188          */
1189         if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
1190                 atomic_inc(&bad->karma);
1191
1192                 for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
1193                      i++) {
1194                         struct drm_sched_rq *rq = &sched->sched_rq[i];
1195
1196                         spin_lock(&rq->lock);
1197                         list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
1198                                 if (bad->s_fence->scheduled.context ==
1199                                     entity->fence_context) {
1200                                         if (entity->guilty)
1201                                                 atomic_set(entity->guilty, 1);
1202                                         break;
1203                                 }
1204                         }
1205                         spin_unlock(&rq->lock);
1206                         if (&entity->list != &rq->entities)
1207                                 break;
1208                 }
1209         }
1210 }
1211 EXPORT_SYMBOL(drm_sched_increase_karma);