linux-2.6-microblaze.git: drivers/gpu/drm/i915/gt/intel_migrate.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "intel_context.h"
8 #include "intel_gpu_commands.h"
9 #include "intel_gt.h"
10 #include "intel_gtt.h"
11 #include "intel_migrate.h"
12 #include "intel_ring.h"
13
14 struct insert_pte_data {
15         u64 offset;
16         bool is_lmem;
17 };
18
19 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
20
21 static bool engine_supports_migration(struct intel_engine_cs *engine)
22 {
23         if (!engine)
24                 return false;
25
26         /*
27          * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
28          * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
29          * and of course the ability to do the block transfer (blits).
30          */
31         GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
32
33         return true;
34 }
35
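/*
 * Write the dma address of a page table itself into the migrate VM, so that
 * the GPU can later rewrite that table's PTEs with inline data from the
 * copy engine.
 */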
36 static void insert_pte(struct i915_address_space *vm,
37                        struct i915_page_table *pt,
38                        void *data)
39 {
40         struct insert_pte_data *d = data;
41
42         vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
43                         d->is_lmem ? PTE_LM : 0);
44         d->offset += PAGE_SIZE;
45 }
46
47 static struct i915_address_space *migrate_vm(struct intel_gt *gt)
48 {
49         struct i915_vm_pt_stash stash = {};
50         struct i915_ppgtt *vm;
51         int err;
52         int i;
53
54         /*
55          * We construct a very special VM for use by all migration contexts;
56          * it is kept pinned so that it can be used at any time. As we need
57          * to pre-allocate the page directories for the migration VM, this
58          * limits us to only using a small number of prepared vma.
59          *
60          * To be able to pipeline and reschedule migration operations while
61          * avoiding unnecessary contention on the vm itself, the PTE updates
62          * are inline with the blits. All the blits use the same fixed
63          * addresses, with the backing store redirection being updated on the
64          * fly. Only 2 implicit vma are used for all migration operations.
65          *
66          * We lay the ppGTT out as:
67          *
68          *      [0, CHUNK_SZ) -> first object
69          *      [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
70          *      [2 * CHUNK_SZ, 2 * CHUNK_SZ + (2 * CHUNK_SZ >> 9)) -> PTE
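         *
         * For example, with CHUNK_SZ = 8MiB the two object windows span
         * 16MiB, which needs 16MiB / 4KiB * 8 bytes = 2 * CHUNK_SZ >> 9 =
         * 32KiB of PTE for the third range above.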
71          *
72          * By exposing the dma addresses of the page directories themselves
73          * within the ppGTT, we are then able to rewrite the PTE prior to use.
74          * But the PTE update and subsequent migration operation must be atomic,
75          * i.e. within the same non-preemptible window so that we do not switch
76          * to another migration context that overwrites the PTE.
77          *
78          * TODO: Add support for huge LMEM PTEs
79          */
80
81         vm = i915_ppgtt_create(gt);
82         if (IS_ERR(vm))
83                 return ERR_CAST(vm);
84
85         if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
86                 err = -ENODEV;
87                 goto err_vm;
88         }
89
90         /*
91          * Each engine instance is assigned its own chunk in the VM, so
92          * that we can run multiple instances concurrently
93          */
94         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
95                 struct intel_engine_cs *engine;
96                 u64 base = (u64)i << 32;
97                 struct insert_pte_data d = {};
98                 struct i915_gem_ww_ctx ww;
99                 u64 sz;
100
101                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
102                 if (!engine_supports_migration(engine))
103                         continue;
104
105                 /*
106                  * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
107                  * 4x2 page directories for source/destination.
108                  */
109                 sz = 2 * CHUNK_SZ;
110                 d.offset = base + sz;
111
112                 /*
113                  * We need another page directory setup so that we can write
114                  * the 8x512 PTEs covering the two chunks.
115                  */
116                 sz += (sz >> 12) * sizeof(u64);
117
118                 err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
119                 if (err)
120                         goto err_vm;
121
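                /*
                 * for_i915_gem_ww() re-runs its body after backing off
                 * whenever a lock attempt returns -EDEADLK; err carries the
                 * final result out of the loop.
                 */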
122                 for_i915_gem_ww(&ww, err, true) {
123                         err = i915_vm_lock_objects(&vm->vm, &ww);
124                         if (err)
125                                 continue;
126                         err = i915_vm_map_pt_stash(&vm->vm, &stash);
127                         if (err)
128                                 continue;
129
130                         vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
131                 }
132                 i915_vm_free_pt_stash(&vm->vm, &stash);
133                 if (err)
134                         goto err_vm;
135
136                 /* Now allow the GPU to rewrite the PTE via its own ppGTT */
137                 d.is_lmem = i915_gem_object_is_lmem(vm->vm.scratch[0]);
138                 vm->vm.foreach(&vm->vm, base, base + sz, insert_pte, &d);
139         }
140
141         return &vm->vm;
142
143 err_vm:
144         i915_vm_put(&vm->vm);
145         return ERR_PTR(err);
146 }
147
148 static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
149 {
150         struct intel_engine_cs *engine;
151         int i;
152
153         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
154                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
155                 if (engine_supports_migration(engine))
156                         return engine;
157         }
158
159         return NULL;
160 }
161
162 static struct intel_context *pinned_context(struct intel_gt *gt)
163 {
164         static struct lock_class_key key;
165         struct intel_engine_cs *engine;
166         struct i915_address_space *vm;
167         struct intel_context *ce;
168
169         engine = first_copy_engine(gt);
170         if (!engine)
171                 return ERR_PTR(-ENODEV);
172
173         vm = migrate_vm(gt);
174         if (IS_ERR(vm))
175                 return ERR_CAST(vm);
176
177         ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
178                                                 I915_GEM_HWS_MIGRATE,
179                                                 &key, "migrate");
180         i915_vm_put(vm);
181         return ce;
182 }
183
184 int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
185 {
186         struct intel_context *ce;
187
188         memset(m, 0, sizeof(*m));
189
190         ce = pinned_context(gt);
191         if (IS_ERR(ce))
192                 return PTR_ERR(ce);
193
194         m->context = ce;
195         return 0;
196 }
197
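/*
 * Map a 32-bit random value onto [0, max) with a widening multiply,
 * avoiding a division.
 */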
198 static int random_index(unsigned int max)
199 {
200         return upper_32_bits(mul_u32_u32(get_random_u32(), max));
201 }
202
203 static struct intel_context *__migrate_engines(struct intel_gt *gt)
204 {
205         struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
206         struct intel_engine_cs *engine;
207         unsigned int count, i;
208
209         count = 0;
210         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
211                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
212                 if (engine_supports_migration(engine))
213                         engines[count++] = engine;
214         }
215
216         return intel_context_create(engines[random_index(count)]);
217 }
218
219 struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
220 {
221         struct intel_context *ce;
222
223         /*
224          * We randomly distribute contexts across the engines upon construction,
225          * as they all share the same pinned vm, and so in order to allow
226          * multiple blits to run in parallel, we must construct each blit
227          * to use a different range of the vm for its GTT. This has to be
228          * known at construction, so we cannot use the late greedy load
229          * balancing of the virtual engine.
230          */
231         ce = __migrate_engines(m->context->engine->gt);
232         if (IS_ERR(ce))
233                 return ce;
234
235         ce->ring = NULL;
236         ce->ring_size = SZ_256K;
237
238         i915_vm_put(ce->vm);
239         ce->vm = i915_vm_get(m->context->vm);
240
241         return ce;
242 }
243
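/* A { sg, dma, max } cursor for walking the DMA segments of a scatterlist. */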
244 static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
245 {
246         dma_addr_t addr = sg_dma_address(sg);
247
248         return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
249 }
250
251 static int emit_no_arbitration(struct i915_request *rq)
252 {
253         u32 *cs;
254
255         cs = intel_ring_begin(rq, 2);
256         if (IS_ERR(cs))
257                 return PTR_ERR(cs);
258
259         /* Explicitly disable preemption for this request. */
260         *cs++ = MI_ARB_ON_OFF;
261         *cs++ = MI_NOOP;
262         intel_ring_advance(rq, cs);
263
264         return 0;
265 }
266
267 static int emit_pte(struct i915_request *rq,
268                     struct sgt_dma *it,
269                     enum i915_cache_level cache_level,
270                     bool is_lmem,
271                     u64 offset,
272                     int length)
273 {
274         const u64 encode = rq->context->vm->pte_encode(0, cache_level,
275                                                        is_lmem ? PTE_LM : 0);
276         struct intel_ring *ring = rq->ring;
277         int total = 0;
278         u32 *hdr, *cs;
279         int pkt;
280
281         GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);
282
283         /* Compute the page directory offset for the target address range */
284         offset += (u64)rq->engine->instance << 32;
285         offset >>= 12;
286         offset *= sizeof(u64);
287         offset += 2 * CHUNK_SZ;
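        /*
         * offset is now the byte address, inside the PTE window starting at
         * 2 * CHUNK_SZ, of the PTE for the requested GTT address: one 8-byte
         * entry per 4KiB page.
         */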
288
289         cs = intel_ring_begin(rq, 6);
290         if (IS_ERR(cs))
291                 return PTR_ERR(cs);
292
293         /* Pack as many PTE updates as possible into a single MI command */
294         pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
295         pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
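        /* Cap each packet at 0x400 dwords and at the space left in the ring. */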
296
297         hdr = cs;
298         *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
299         *cs++ = lower_32_bits(offset);
300         *cs++ = upper_32_bits(offset);
301
302         do {
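                /*
                 * The current packet is full: patch its length field, close
                 * it and start a new MI_STORE_DATA_IMM.
                 */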
303                 if (cs - hdr >= pkt) {
304                         *hdr += cs - hdr - 2;
305                         *cs++ = MI_NOOP;
306
307                         ring->emit = (void *)cs - ring->vaddr;
308                         intel_ring_advance(rq, cs);
309                         intel_ring_update_space(ring);
310
311                         cs = intel_ring_begin(rq, 6);
312                         if (IS_ERR(cs))
313                                 return PTR_ERR(cs);
314
315                         pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
316                         pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
317
318                         hdr = cs;
319                         *cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
320                         *cs++ = lower_32_bits(offset);
321                         *cs++ = upper_32_bits(offset);
322                 }
323
324                 *cs++ = lower_32_bits(encode | it->dma);
325                 *cs++ = upper_32_bits(encode | it->dma);
326
327                 offset += 8;
328                 total += I915_GTT_PAGE_SIZE;
329
330                 it->dma += I915_GTT_PAGE_SIZE;
331                 if (it->dma >= it->max) {
332                         it->sg = __sg_next(it->sg);
333                         if (!it->sg || sg_dma_len(it->sg) == 0)
334                                 break;
335
336                         it->dma = sg_dma_address(it->sg);
337                         it->max = it->dma + sg_dma_len(it->sg);
338                 }
339         } while (total < length);
340
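        /*
         * Patch the final packet's length field now that the number of
         * emitted qwords is known; the MI_NOOP pads it to an even number
         * of dwords.
         */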
341         *hdr += cs - hdr - 2;
342         *cs++ = MI_NOOP;
343
344         ring->emit = (void *)cs - ring->vaddr;
345         intel_ring_advance(rq, cs);
346         intel_ring_update_space(ring);
347
348         return total;
349 }
350
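/*
 * Wa_1209644611: on graphics version 11 the fast-copy blit cannot be used
 * for these transfer heights, so emit_copy() falls back to the legacy
 * XY_SRC_COPY path instead.
 */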
351 static bool wa_1209644611_applies(int ver, u32 size)
352 {
353         u32 height = size >> PAGE_SHIFT;
354
355         if (ver != 11)
356                 return false;
357
358         return height % 4 == 3 && height <= 8;
359 }
360
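/*
 * Blit one chunk from the source window at offset 0 to the destination
 * window at CHUNK_SZ, preferring the fast-copy command where available.
 */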
361 static int emit_copy(struct i915_request *rq, int size)
362 {
363         const int ver = GRAPHICS_VER(rq->engine->i915);
364         u32 instance = rq->engine->instance;
365         u32 *cs;
366
367         cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
368         if (IS_ERR(cs))
369                 return PTR_ERR(cs);
370
371         if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
372                 *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
373                 *cs++ = BLT_DEPTH_32 | PAGE_SIZE;
374                 *cs++ = 0;
375                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
376                 *cs++ = CHUNK_SZ; /* dst offset */
377                 *cs++ = instance;
378                 *cs++ = 0;
379                 *cs++ = PAGE_SIZE;
380                 *cs++ = 0; /* src offset */
381                 *cs++ = instance;
382         } else if (ver >= 8) {
383                 *cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
384                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
385                 *cs++ = 0;
386                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
387                 *cs++ = CHUNK_SZ; /* dst offset */
388                 *cs++ = instance;
389                 *cs++ = 0;
390                 *cs++ = PAGE_SIZE;
391                 *cs++ = 0; /* src offset */
392                 *cs++ = instance;
393         } else {
394                 GEM_BUG_ON(instance);
395                 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
396                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
397                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
398                 *cs++ = CHUNK_SZ; /* dst offset */
399                 *cs++ = PAGE_SIZE;
400                 *cs++ = 0; /* src offset */
401         }
402
403         intel_ring_advance(rq, cs);
404         return 0;
405 }
406
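/*
 * Copy src to dst in CHUNK_SZ pieces: each request binds the next chunk of
 * both scatterlists into the fixed VM windows, blits it, and hands back the
 * last request in *out for the caller to wait on.
 */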
407 int
408 intel_context_migrate_copy(struct intel_context *ce,
409                            struct dma_fence *await,
410                            struct scatterlist *src,
411                            enum i915_cache_level src_cache_level,
412                            bool src_is_lmem,
413                            struct scatterlist *dst,
414                            enum i915_cache_level dst_cache_level,
415                            bool dst_is_lmem,
416                            struct i915_request **out)
417 {
418         struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
419         struct i915_request *rq;
420         int err;
421
422         GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
423         *out = NULL;
424
425         GEM_BUG_ON(ce->ring->size < SZ_64K);
426
427         do {
428                 int len;
429
430                 rq = i915_request_create(ce);
431                 if (IS_ERR(rq)) {
432                         err = PTR_ERR(rq);
433                         goto out_ce;
434                 }
435
436                 if (await) {
437                         err = i915_request_await_dma_fence(rq, await);
438                         if (err)
439                                 goto out_rq;
440
441                         if (rq->engine->emit_init_breadcrumb) {
442                                 err = rq->engine->emit_init_breadcrumb(rq);
443                                 if (err)
444                                         goto out_rq;
445                         }
446
447                         await = NULL;
448                 }
449
450                 /* The PTE updates + copy must not be interrupted. */
451                 err = emit_no_arbitration(rq);
452                 if (err)
453                         goto out_rq;
454
455                 len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, 0,
456                                CHUNK_SZ);
457                 if (len <= 0) {
458                         err = len;
459                         goto out_rq;
460                 }
461
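                /* Bind the same number of pages into the destination window. */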
462                 err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
463                                CHUNK_SZ, len);
464                 if (err < 0)
465                         goto out_rq;
466                 if (err < len) {
467                         err = -EINVAL;
468                         goto out_rq;
469                 }
470
471                 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
472                 if (err)
473                         goto out_rq;
474
475                 err = emit_copy(rq, len);
476
477                 /* Arbitration is re-enabled between requests. */
478 out_rq:
479                 if (*out)
480                         i915_request_put(*out);
481                 *out = i915_request_get(rq);
482                 i915_request_add(rq);
483                 if (err || !it_src.sg || !sg_dma_len(it_src.sg))
484                         break;
485
486                 cond_resched();
487         } while (1);
488
489 out_ce:
490         return err;
491 }
492
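/* Fill the window at offset 0 with a 32-bit value using a colour-fill blit. */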
493 static int emit_clear(struct i915_request *rq, int size, u32 value)
494 {
495         const int ver = GRAPHICS_VER(rq->engine->i915);
496         u32 instance = rq->engine->instance;
497         u32 *cs;
498
499         GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
500
501         cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
502         if (IS_ERR(cs))
503                 return PTR_ERR(cs);
504
505         if (ver >= 8) {
506                 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
507                 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
508                 *cs++ = 0;
509                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
510                 *cs++ = 0; /* offset */
511                 *cs++ = instance;
512                 *cs++ = value;
513                 *cs++ = MI_NOOP;
514         } else {
515                 GEM_BUG_ON(instance);
516                 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
517                 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
518                 *cs++ = 0;
519                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
520                 *cs++ = 0;
521                 *cs++ = value;
522         }
523
524         intel_ring_advance(rq, cs);
525         return 0;
526 }
527
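/*
 * Clear the pages described by sg to value, one CHUNK_SZ window per request,
 * handing back the last request in *out.
 */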
528 int
529 intel_context_migrate_clear(struct intel_context *ce,
530                             struct dma_fence *await,
531                             struct scatterlist *sg,
532                             enum i915_cache_level cache_level,
533                             bool is_lmem,
534                             u32 value,
535                             struct i915_request **out)
536 {
537         struct sgt_dma it = sg_sgt(sg);
538         struct i915_request *rq;
539         int err;
540
541         GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
542         *out = NULL;
543
544         GEM_BUG_ON(ce->ring->size < SZ_64K);
545
546         do {
547                 int len;
548
549                 rq = i915_request_create(ce);
550                 if (IS_ERR(rq)) {
551                         err = PTR_ERR(rq);
552                         goto out_ce;
553                 }
554
555                 if (await) {
556                         err = i915_request_await_dma_fence(rq, await);
557                         if (err)
558                                 goto out_rq;
559
560                         if (rq->engine->emit_init_breadcrumb) {
561                                 err = rq->engine->emit_init_breadcrumb(rq);
562                                 if (err)
563                                         goto out_rq;
564                         }
565
566                         await = NULL;
567                 }
568
569                 /* The PTE updates + clear must not be interrupted. */
570                 err = emit_no_arbitration(rq);
571                 if (err)
572                         goto out_rq;
573
574                 len = emit_pte(rq, &it, cache_level, is_lmem, 0, CHUNK_SZ);
575                 if (len <= 0) {
576                         err = len;
577                         goto out_rq;
578                 }
579
580                 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
581                 if (err)
582                         goto out_rq;
583
584                 err = emit_clear(rq, len, value);
585
586                 /* Arbitration is re-enabled between requests. */
587 out_rq:
588                 if (*out)
589                         i915_request_put(*out);
590                 *out = i915_request_get(rq);
591                 i915_request_add(rq);
592                 if (err || !it.sg || !sg_dma_len(it.sg))
593                         break;
594
595                 cond_resched();
596         } while (1);
597
598 out_ce:
599         return err;
600 }
601
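/*
 * Prefer a fresh context so that multiple copies can run in parallel; if
 * that fails, fall back to the always-pinned migrate context.
 */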
602 int intel_migrate_copy(struct intel_migrate *m,
603                        struct i915_gem_ww_ctx *ww,
604                        struct dma_fence *await,
605                        struct scatterlist *src,
606                        enum i915_cache_level src_cache_level,
607                        bool src_is_lmem,
608                        struct scatterlist *dst,
609                        enum i915_cache_level dst_cache_level,
610                        bool dst_is_lmem,
611                        struct i915_request **out)
612 {
613         struct intel_context *ce;
614         int err;
615
616         *out = NULL;
617         if (!m->context)
618                 return -ENODEV;
619
620         ce = intel_migrate_create_context(m);
621         if (IS_ERR(ce))
622                 ce = intel_context_get(m->context);
623         GEM_BUG_ON(IS_ERR(ce));
624
625         err = intel_context_pin_ww(ce, ww);
626         if (err)
627                 goto out;
628
629         err = intel_context_migrate_copy(ce, await,
630                                          src, src_cache_level, src_is_lmem,
631                                          dst, dst_cache_level, dst_is_lmem,
632                                          out);
633
634         intel_context_unpin(ce);
635 out:
636         intel_context_put(ce);
637         return err;
638 }
639
640 int
641 intel_migrate_clear(struct intel_migrate *m,
642                     struct i915_gem_ww_ctx *ww,
643                     struct dma_fence *await,
644                     struct scatterlist *sg,
645                     enum i915_cache_level cache_level,
646                     bool is_lmem,
647                     u32 value,
648                     struct i915_request **out)
649 {
650         struct intel_context *ce;
651         int err;
652
653         *out = NULL;
654         if (!m->context)
655                 return -ENODEV;
656
657         ce = intel_migrate_create_context(m);
658         if (IS_ERR(ce))
659                 ce = intel_context_get(m->context);
660         GEM_BUG_ON(IS_ERR(ce));
661
662         err = intel_context_pin_ww(ce, ww);
663         if (err)
664                 goto out;
665
666         err = intel_context_migrate_clear(ce, await, sg, cache_level,
667                                           is_lmem, value, out);
668
669         intel_context_unpin(ce);
670 out:
671         intel_context_put(ce);
672         return err;
673 }
674
675 void intel_migrate_fini(struct intel_migrate *m)
676 {
677         struct intel_context *ce;
678
679         ce = fetch_and_zero(&m->context);
680         if (!ce)
681                 return;
682
683         intel_engine_destroy_pinned_context(ce);
684 }
685
686 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
687 #include "selftest_migrate.c"
688 #endif