// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include "xe_hw_engine.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"

#include <linux/sizes.h>
#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include "gt/intel_gpu_commands.h"
/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @eng: Default engine used for migration */
	struct xe_engine *eng;
	/** @gt: Backpointer to the gt this struct xe_migrate belongs to. */
	struct xe_gt *gt;
	/** @job_mutex: Timeline mutex for @eng. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/**
	 * @cleared_bo: Zeroed out bo used as a source for CCS metadata clears
	 */
	struct xe_bo *cleared_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_vram_ofs: VM offset of @cleared_bo. */
	u64 cleared_vram_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
};
#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define NUM_PT_PER_BLIT (MAX_PREEMPTDISABLE_TRANSFER / SZ_2M)
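
/*
 * A single level-0 page table holds 512 4 KiB PTEs and hence maps 2 MiB,
 * so one MAX_PREEMPTDISABLE_TRANSFER-sized (8 MiB) blit consumes
 * NUM_PT_PER_BLIT = SZ_8M / SZ_2M = 4 page-table slots.
 */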
/**
 * xe_gt_migrate_engine() - Get this gt's migrate engine.
 * @gt: The gt.
 *
 * Returns the default migrate engine of this gt.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_engine *xe_gt_migrate_engine(struct xe_gt *gt)
{
	return gt->migrate->eng;
}
static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;
	struct ww_acquire_ctx ww;

	xe_vm_lock(m->eng->vm, &ww, 0, false);
	xe_bo_unpin(m->pt_bo);
	if (m->cleared_bo)
		xe_bo_unpin(m->cleared_bo);
	xe_vm_unlock(m->eng->vm, &ww);

	dma_fence_put(m->fence);
	if (m->cleared_bo)
		xe_bo_put(m->cleared_bo);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->eng->vm);
	xe_engine_put(m->eng);
}
static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_BUG_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}
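
/*
 * xe_migrate_vm_addr() example: with xe_pt_shift(1) == 21, slot 0 at
 * level 0 maps VA 2 MiB and slot 1 maps VA 4 MiB, i.e. each slot owns
 * one 2 MiB-aligned window of the migration VM.
 */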
static u64 xe_migrate_vram_ofs(u64 addr)
{
	return addr + (256ULL << xe_pt_shift(2));
}
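
/*
 * xe_pt_shift(2) == 30, so the offset added above is 256ULL << 30, i.e.
 * 256 GiB: VRAM is identity-mapped into the migration VM starting at the
 * 256 GiB mark by the 1 GiB PDEs written in xe_migrate_prepare_vm().
 */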
/*
 * For flat CCS clearing we need a cleared chunk of memory to copy from,
 * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be buggy
 * (it clears only 14 bytes in each chunk of 16).
 * If clearing the main surface one can use the part of the main surface
 * already cleared, but for clearing as part of copying non-compressed
 * data out of system memory, we don't readily have a cleared part of
 * VRAM to copy from, so create one to use for that case.
 */
static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct xe_vm *vm)
{
	struct xe_gt *gt = m->gt;
	struct xe_device *xe = vm->xe;
	size_t cleared_size;
	u64 vram_addr;
	bool is_vram;

	if (!xe_device_has_flat_ccs(xe))
		return 0;

	cleared_size = xe_device_ccs_bytes(xe, MAX_PREEMPTDISABLE_TRANSFER);
	cleared_size = PAGE_ALIGN(cleared_size);
	m->cleared_bo = xe_bo_create_pin_map(xe, gt, vm, cleared_size,
					     ttm_bo_type_kernel,
					     XE_BO_CREATE_VRAM_IF_DGFX(gt) |
					     XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(m->cleared_bo))
		return PTR_ERR(m->cleared_bo);

	xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00, cleared_size);
	vram_addr = xe_bo_addr(m->cleared_bo, 0, GEN8_PAGE_SIZE, &is_vram);
	XE_BUG_ON(!is_vram);
	m->cleared_vram_ofs = xe_migrate_vram_ofs(vram_addr);

	return 0;
}
static int xe_migrate_prepare_vm(struct xe_gt *gt, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	u8 id = gt->info.id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_device *xe = gt_to_xe(m->gt);
	struct xe_bo *bo, *batch = gt->kernel_bb_pool.bo;
	u64 entry;
	int ret;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/GEN8_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * GEN8_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	XE_BUG_ON(m->batch_base_ofs + batch->size >= SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, m->gt, vm,
				  num_entries * GEN8_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(m->gt) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	ret = xe_migrate_create_cleared_bo(m, vm);
	if (ret) {
		xe_bo_put(bo);
		return ret;
	}

	entry = gen8_pde_encode(bo, bo->size - GEN8_PAGE_SIZE, XE_CACHE_WB);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * GEN8_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = gen8_pte_encode(NULL, bo, i * GEN8_PAGE_SIZE,
					XE_CACHE_WB, 0, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAGS_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		XE_BUG_ON(xe->info.supports_usm);

		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * GEN8_PAGE_SIZE;
		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAGS_64K ? GEN8_64K_PAGE_SIZE :
		     GEN8_PAGE_SIZE) {
			entry = gen8_pte_encode(NULL, batch, i,
						XE_CACHE_WB, 0, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
	} else {
		bool is_lmem;
		u64 batch_addr = xe_bo_addr(batch, 0, GEN8_PAGE_SIZE, &is_lmem);

		m->batch_base_ofs = xe_migrate_vram_ofs(batch_addr);

		if (xe->info.supports_usm) {
			batch = gt->usm.bb_pool.bo;
			batch_addr = xe_bo_addr(batch, 0, GEN8_PAGE_SIZE,
						&is_lmem);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAGS_64K && level == 1)
			flags = GEN12_PDE_64K;

		entry = gen8_pde_encode(bo, map_ofs + (level - 1) *
					GEN8_PAGE_SIZE, XE_CACHE_WB);
		xe_map_wr(xe, &bo->vmap, map_ofs + GEN8_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = gen8_pde_encode(bo, i * GEN8_PAGE_SIZE,
					XE_CACHE_WB);

		xe_map_wr(xe, &bo->vmap, map_ofs + GEN8_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + GEN8_PAGE_SIZE * level + 256 * 8;
		flags = GEN8_PAGE_RW | GEN8_PAGE_PRESENT | PPAT_CACHED |
			GEN12_PPGTT_PTE_LM | GEN8_PDPE_PS_1G;

		/*
		 * Use 1GB pages; it shouldn't matter that the physical
		 * amount of vram is smaller, as long as we don't access
		 * the mappings beyond it.
		 */
		for (pos = 0; pos < xe->mem.vram.size; pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm points to itself; with a few writes
	 * and flushes, other parts of the VM can then be used for copying
	 * and clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work with, each scratch PT is put in slot
	 * (1 + PT #) everywhere; this allows lockless updates to scratch
	 * pages by using the different addresses in the VM.
	 */
#define NUM_VMUSA_UNIT_PER_PAGE 32
#define VM_SA_UPDATE_UNIT_SIZE (GEN8_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
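	/*
	 * Each remaining 4 KiB page is carved into NUM_VMUSA_UNIT_PER_PAGE
	 * (32) suballocation units of GEN8_PAGE_SIZE / 32 == 128 bytes,
	 * i.e. 16 qword PTE writes per unit, handed out below for
	 * GPU-assisted page-table update jobs.
	 */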
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (map_ofs / GEN8_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @gt: Back-pointer to the gt we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_gt *gt)
{
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_migrate *m;
	struct xe_vm *vm;
	struct ww_acquire_ctx ww;
	int err;

	XE_BUG_ON(xe_gt_is_media_type(gt));

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->gt = gt;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_GT_ID(gt));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, &ww, 0, false);
	err = xe_migrate_prepare_vm(gt, m, vm);
	xe_vm_unlock(vm, &ww);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.supports_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(gt,
							   XE_ENGINE_CLASS_COPY,
							   gt->usm.reserved_bcs_instance,
							   false);

		if (!hwe) {
			xe_vm_close_and_put(vm);
			return ERR_PTR(-EINVAL);
		}

		m->eng = xe_engine_create(xe, vm,
					  BIT(hwe->logical_instance), 1,
					  hwe, ENGINE_FLAG_KERNEL);
	} else {
		m->eng = xe_engine_create_class(xe, gt, vm,
						XE_ENGINE_CLASS_COPY,
						ENGINE_FLAG_KERNEL);
	}
	if (IS_ERR(m->eng)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->eng);
	}

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	return m;
}
static void emit_arb_clear(struct xe_bb *bb)
{
	bb->cs[bb->len++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
}
static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
{
	/*
	 * For VRAM we use identity mapped pages so we are limited to current
	 * cursor size. For system we program the pages ourselves so we have
	 * no limit.
	 */
	return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
		     mem_type_is_vram(cur->mem_type) ? cur->size :
		     cur->remaining);
}
static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (!is_vram) {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u64 num_4k_pages = DIV_ROUND_UP(size, GEN8_PAGE_SIZE);

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);

		/* PTE's */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	} else {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(cur->start);
		cmds += cmd_size;
	}

	return cmds;
}
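
/*
 * pte_update_size() worst case for a full 2 MiB sysmem chunk: 512 PTEs
 * split into DIV_ROUND_UP(512, 0x1ff) == 2 MI_STORE_DATA_IMM commands
 * (3 dwords of header each) plus 512 * 2 dwords of PTE payload, i.e.
 * 1030 dwords before the blit command itself.
 */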
static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram,
		     struct xe_res_cursor *cur,
		     u32 size, struct xe_bo *bo)
{
	u32 ptes;
	u64 ofs = at_pt * GEN8_PAGE_SIZE;
	u64 cur_ofs;

	/*
	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
	 * we're only emitting VRAM PTEs during sanity tests, so when
	 * that's moved to a Kunit test, we should condition VRAM PTEs
	 * on running tests.
	 */

	ptes = DIV_ROUND_UP(size, GEN8_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(0x1ffU, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
			(chunk * 2 + 1);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr;

			XE_BUG_ON(cur->start & (PAGE_SIZE - 1));

			if (is_vram) {
				addr = cur->start;

				/* Is this a 64K PTE entry? */
				if ((m->eng->vm->flags & XE_VM_FLAGS_64K) &&
				    !(cur_ofs & (16 * 8 - 1))) {
					XE_WARN_ON(!IS_ALIGNED(addr, SZ_64K));
					addr |= GEN12_PTE_PS64;
				}

				addr |= GEN12_PPGTT_PTE_LM;
			} else {
				addr = xe_res_dma(cur);
			}
			addr |= PPAT_CACHED | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, PAGE_SIZE);
			cur_ofs += 8;
		}
	}
}
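
/*
 * XY_CTRL_SURF_COPY_BLT distinguishes "direct" addresses, which point at
 * CCS data itself, from "indirect" ones, which point at a main surface
 * whose associated CCS is resolved by the blitter; the src/dst access
 * type bits below encode that choice.
 */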
#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 mocs = xe_mocs_index_to_value(gt->mocs.uc_index);

	num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
				    NUM_CCS_BYTES_PER_BLOCK);
	XE_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER);
	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);

	bb->len = cs - bb->cs;
}
#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	XE_BUG_ON(size / pitch > S16_MAX);
	XE_BUG_ON(pitch / 4 > S16_MAX);
	XE_BUG_ON(pitch > U16_MAX);

	bb->cs[bb->len++] = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = BLT_DEPTH_32 | pitch;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}
static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}
static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}
static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_vram,
			       u64 dst_ofs, bool dst_is_vram, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_vram) {
		/*
		 * If the bo doesn't have any CCS metadata attached, we still
		 * need to clear it for security reasons.
		 */
		emit_copy_ccs(gt, bb, dst_ofs, true, m->cleared_vram_ofs, false,
			      dst_size);
		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_vram)
			src_ofs = ccs_ofs;
		else if (!dst_is_vram)
			dst_ofs = ccs_ofs;

		/*
		 * At the moment, we don't support copying CCS metadata from
		 * system to system.
		 */
		XE_BUG_ON(!src_is_vram && !dst_is_vram);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
			      src_is_vram, dst_size);
		flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}
/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @bo: The buffer object @src is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 * It's currently not possible to copy between two system resources,
 * since that would require two TTM page-vectors.
 * TODO: Eliminate the @bo argument and supply two TTM page-vectors.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst)
{
	struct xe_gt *gt = m->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) && xe_bo_needs_ccs_pages(bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &dst_it);
	else
		xe_res_first(dst, 0, bo->size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_get_sg(bo), xe_bo_ccs_pages_start(bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;
		bool usm = xe->info.supports_usm;

		src_L0 = xe_migrate_res_sizes(&src_it);
		dst_L0 = xe_migrate_res_sizes(&dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      NUM_PT_PER_BLIT);

		batch_size += pte_update_size(m, dst_is_vram, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      NUM_PT_PER_BLIT, NUM_PT_PER_BLIT);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * NUM_PT_PER_BLIT,
						      NUM_PT_PER_BLIT);
		}

		/* Add copy commands size here */
		batch_size += EMIT_COPY_DW +
			(xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		/* Preemption is enabled again by the ring ops. */
		if (!src_is_vram || !dst_is_vram)
			emit_arb_clear(bb);

		if (!src_is_vram)
			emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
				 bo);
		else
			xe_res_next(&src_it, src_L0);

		if (!dst_is_vram)
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
				 bo);
		else
			xe_res_next(&dst_it, src_L0);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, bo);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, GEN8_PAGE_SIZE);
		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_vram,
						  dst_L0_ofs, dst_is_vram,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->eng, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}
static int emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		      u32 size, u32 pitch, u32 value, bool is_vram)
{
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;
	u32 mocs = xe_mocs_index_to_value(gt->mocs.uc_index);

	if (GRAPHICS_VERx100(gt->xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
		(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = value;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	XE_BUG_ON(cs - bb->cs != len + bb->len);

	bb->len += len;

	return 0;
}

/**
 * xe_migrate_clear() - Clear content of TTM resources.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 * @value: Clear value.
 *
 * Clear the contents of @dst to @value. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst,
				   u32 value)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u64 clear_L0;
		u32 flush_flags = 0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;
		bool usm = xe->info.supports_usm;

		clear_L0 = xe_migrate_res_sizes(&src_it);
		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					XY_FAST_COLOR_BLT_DW, 0, NUM_PT_PER_BLIT);
		if (xe_device_has_flat_ccs(xe) && clear_vram)
			batch_size += EMIT_COPY_CCS_DW;

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;

		/* TODO: Add dependencies here */

		/* Preemption is enabled again by the ring ops. */
		if (!clear_vram) {
			emit_arb_clear(bb);
			emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
				 bo);
		} else {
			xe_res_next(&src_it, clear_L0);
		}
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_clear(gt, bb, clear_L0_ofs, clear_L0, GEN8_PAGE_SIZE,
			   value, clear_vram);
		if (xe_device_has_flat_ccs(xe) && clear_vram) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_vram_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->eng, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}
static void write_pgtable(struct xe_gt *gt, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the BUG_ON.
	 */
	XE_BUG_ON(update->qwords > 0x1ff);
	if (!ppgtt_ofs) {
		bool is_lmem;

		ppgtt_ofs = xe_migrate_vram_ofs(xe_bo_addr(update->pt_bo, 0,
							   GEN8_PAGE_SIZE,
							   &is_lmem));
		XE_BUG_ON(!is_lmem);
	}

	do {
		u64 addr = ppgtt_ofs + ofs * 8;
		chunk = min(update->qwords, 0x1ffU);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
			(chunk * 2 + 1);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, gt, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}
struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->eng->vm);
}
static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	/* Wait on BO moves for 10 ms, then fall back to GPU job */
	if (bo) {
		long wait;

		wait = dma_resv_wait_timeout(bo->ttm.base.resv,
					     DMA_RESV_USAGE_KERNEL,
					     true, HZ / 100);
		if (wait <= 0)
			return ERR_PTR(-ETIME);
	}
	if (wait_vm) {
		long wait;

		wait = dma_resv_wait_timeout(&vm->resv,
					     DMA_RESV_USAGE_BOOKKEEP,
					     true, HZ / 100);
		if (wait <= 0)
			return ERR_PTR(-ETIME);
	}

	if (ops->pre_commit) {
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->gt, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	trace_xe_vm_cpu_bind(vm);
	xe_device_wmb(vm->xe);

	fence = dma_fence_get_stub();

	return fence;
}
static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
{
	u32 i;

	for (i = 0; i < num_syncs; i++) {
		struct dma_fence *fence = syncs[i].fence;

		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				       &fence->flags))
			return false;
	}

	return true;
}
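
/*
 * An engine counts as idle when it has never emitted a job on its kernel
 * timeline (next_seqno is still at its initial value) or when the ring
 * has caught up with the last emitted seqno.
 */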
static bool engine_is_idle(struct xe_engine *e)
{
	return !e || e->lrc[0].fence_ctx.next_seqno == 1 ||
		xe_lrc_seqno(&e->lrc[0]) == e->lrc[0].fence_ctx.next_seqno;
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @vm: The vm we'll be updating.
 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
 * @eng: The engine to be used for the update or NULL if the default
 * migration engine is to be used.
 * @updates: An array of update descriptors.
 * @num_updates: Number of descriptors in @updates.
 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
 * will block the engine timeline.
 * @num_syncs: Number of entries in @syncs.
 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
 * pointers to callback functions and, if subclassed, private arguments to
 * those.
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_vm *vm,
			   struct xe_bo *bo,
			   struct xe_engine *eng,
			   const struct xe_vm_pgtable_update *updates,
			   u32 num_updates,
			   struct xe_sync_entry *syncs, u32 num_syncs,
			   struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_gt *gt = m->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_vma *vma = pt_update->vma;
	struct xe_bb *bb;
	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
	u64 addr;
	int err = 0;
	bool usm = !eng && xe->info.supports_usm;
	bool first_munmap_rebind = vma && vma->first_munmap_rebind;

	/* Use the CPU if no in syncs and engine is idle */
	if (no_in_syncs(syncs, num_syncs) && engine_is_idle(eng)) {
		fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
						       num_updates,
						       first_munmap_rebind,
						       pt_update);
		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
			return fence;
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size = 2;
	else
		batch_size = 6 + num_updates * 2;

	for (i = 0; i < num_updates; i++) {
		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);

		/* align noop + MI_STORE_DATA_IMM cmd prefix */
		batch_size += 4 * num_cmds + updates[i].qwords * 2;
	}

	/*
	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
	 *
	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
	 * Should be reasonably bound..
	 */
	XE_BUG_ON(batch_size >= SZ_128K);

	bb = xe_bb_new(gt, batch_size, !eng && xe->info.supports_usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (eng) {
			XE_BUG_ON(num_updates > NUM_VMUSA_WRITES_PER_UNIT);

			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				sa_bo = NULL;
				goto err;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Preemption is enabled again by the ring ops. */
		emit_arb_clear(bb);

		/* Map our PT's to gtt */
		bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(21) |
			(num_updates * 2 + 1);
		bb->cs[bb->len++] = ppgtt_ofs * GEN8_PAGE_SIZE + page_ofs;
		bb->cs[bb->len++] = 0; /* upper_32_bits */

		for (i = 0; i < num_updates; i++) {
			struct xe_bo *pt_bo = updates[i].pt_bo;

			BUG_ON(pt_bo->size != SZ_4K);

			addr = gen8_pte_encode(NULL, pt_bo, 0, XE_CACHE_WB,
					       0, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * GEN8_PAGE_SIZE;
		for (i = 0; i < num_updates; i++)
			write_pgtable(m->gt, bb, addr + i * GEN8_PAGE_SIZE,
				      &updates[i], pt_update);
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		/* Preemption is enabled again by the ring ops. */
		emit_arb_clear(bb);
		for (i = 0; i < num_updates; i++)
			write_pgtable(m->gt, bb, 0, &updates[i], pt_update);
	}

	if (!eng)
		mutex_lock(&m->job_mutex);

	job = xe_bb_create_migration_job(eng ?: m->eng, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err;
	}

	/* Wait on BO move */
	if (bo) {
		err = job_add_deps(job, bo->ttm.base.resv,
				   DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_job;
	}

	/*
	 * Munmap style VM unbind, need to wait for all jobs to be complete /
	 * trigger preempts before moving forward
	 */
	if (first_munmap_rebind) {
		err = job_add_deps(job, &vm->resv,
				   DMA_RESV_USAGE_BOOKKEEP);
		if (err)
			goto err_job;
	}

	for (i = 0; !err && i < num_syncs; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);
	if (err)
		goto err_job;

	if (ops->pre_commit) {
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}

	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (!eng)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err:
	if (!eng)
		mutex_unlock(&m->job_mutex);
	xe_bb_free(bb, NULL);

	drm_suballoc_free(sa_bo, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif