2 * SPDX-License-Identifier: MIT
4 * Copyright © 2017-2018 Intel Corporation
7 #include <linux/prime_numbers.h>
9 #include "../i915_selftest.h"
10 #include "i915_random.h"
12 #include "igt_flush_test.h"
13 #include "mock_gem_device.h"
14 #include "mock_timeline.h"
/*
 * hwsp_page() - return the struct page backing a timeline's HWSP.
 *
 * The hardware status page (HWSP) lives in the GEM object attached to
 * tl->hwsp_ggtt, which must already have its pages pinned. Only the
 * first sg entry is consulted — assumes the HWSP object is backed by a
 * single page (NOTE(review): not visible here; confirm at allocation
 * site).
 */
16 static struct page *hwsp_page(struct i915_timeline *tl)
18 struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;
20 GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
21 return sg_page(obj->mm.pages->sgl);
/*
 * hwsp_cacheline() - compute a system-unique cacheline index for a
 * timeline's HWSP slot.
 *
 * Combines the kernel virtual address of the backing page with the
 * timeline's byte offset into that page, scaled to cacheline units.
 * Used below as a radix-tree key so that two timelines sharing the
 * same cacheline can be detected as a duplicate allocation.
 */
24 static unsigned long hwsp_cacheline(struct i915_timeline *tl)
26 unsigned long address = (unsigned long)page_address(hwsp_page(tl));
28 return (address + tl->hwsp_offset) / CACHELINE_BYTES;
/* Number of HWSP cacheline slots that fit in one page. */
31 #define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)
/*
 * Shared state for the mock_hwsp_freelist() selftest:
 * @i915:       mock device the timelines are created against
 * @cachelines: radix tree keyed by hwsp_cacheline() — detects two live
 *              timelines being handed the same HWSP cacheline
 * @history:    ring buffer of the @max most recently created timelines
 * @count:      running total of timelines created
 * @max:        capacity of @history
 * @prng:       deterministic random state, seeded from the selftest seed
 */
33 struct mock_hwsp_freelist {
34 struct drm_i915_private *i915;
35 struct radix_tree_root cachelines;
36 struct i915_timeline **history;
37 unsigned long count, max;
38 struct rnd_state prng;
/*
 * __mock_hwsp_record() - install @tl into a history slot, retiring any
 * previous occupant.
 *
 * Atomically exchanges the slot's old timeline for @tl; the evicted
 * timeline (if any) has its cacheline bookkeeping removed from the
 * radix tree and its reference dropped. Passing tl == NULL simply
 * retires the slot.
 */
45 static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
47 struct i915_timeline *tl)
49 tl = xchg(&state->history[idx], tl);
51 radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
52 i915_timeline_put(tl);
/*
 * __mock_hwsp_timeline() - create a batch of timelines, checking that
 * no two live timelines are handed the same HWSP cacheline.
 *
 * Each new timeline's cacheline index is inserted into the radix tree;
 * -EEXIST from radix_tree_insert() means the HWSP allocator handed out
 * a slot that is still in use — a duplicate allocation. Successfully
 * created timelines are recorded into the history ring (evicting and
 * freeing older ones). Depending on the flags (SHUFFLE), the history
 * is randomly permuted and a random number of entries retired, to
 * exercise the allocator's freelist in varied orders.
 * NOTE(review): the count/flags parameters and loop structure are on
 * elided lines — behavior above the slot recycling is inferred; verify
 * against the full source.
 */
56 static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
60 struct i915_timeline *tl;
64 unsigned long cacheline;
67 tl = i915_timeline_create(state->i915, NULL);
71 cacheline = hwsp_cacheline(tl);
72 err = radix_tree_insert(&state->cachelines, cacheline, tl);
75 pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
78 i915_timeline_put(tl);
82 idx = state->count++ % state->max;
83 __mock_hwsp_record(state, idx, tl);
/* Optionally shuffle the live timelines so frees hit the allocator in random order. */
87 i915_prandom_shuffle(state->history,
88 sizeof(*state->history),
89 min(state->count, state->max),
/* Retire a random number of the most recent entries back to the freelist. */
92 count = i915_prandom_u32_max_state(min(state->count, state->max),
95 idx = --state->count % state->max;
96 __mock_hwsp_record(state, idx, NULL);
/*
 * mock_hwsp_freelist() - mock-device selftest for the HWSP allocator.
 *
 * Creates batches of timelines (sizes swept over prime numbers up to
 * twice a page's worth of cachelines) and checks via the radix tree
 * that no two live timelines ever share an HWSP cacheline; then frees
 * some and repeats. Runs each phase in the phases[] table (at least
 * the SHUFFLE variant is visible here).
 */
102 static int mock_hwsp_freelist(void *arg)
104 struct mock_hwsp_freelist state;
110 { "shuffled", SHUFFLE },
116 INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
117 state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
119 state.i915 = mock_gem_device();
124 * Create a bunch of timelines and check that their HWSP do not overlap.
125 * Free some, and try again.
/* History sized so the ring itself occupies exactly one page. */
128 state.max = PAGE_SIZE / sizeof(*state.history);
130 state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
131 if (!state.history) {
/* struct_mutex serialises timeline creation/destruction on the mock device. */
136 mutex_lock(&state.i915->drm.struct_mutex);
137 for (p = phases; p->name; p++) {
138 pr_debug("%s(%s)\n", __func__, p->name);
139 for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
140 err = __mock_hwsp_timeline(&state, na, p->flags);
/* Retire every remaining slot before tearing the state down. */
147 for (na = 0; na < state.max; na++)
148 __mock_hwsp_record(&state, na, NULL);
149 mutex_unlock(&state.i915->drm.struct_mutex);
150 kfree(state.history);
152 drm_dev_put(&state.i915->drm);
/*
 * __igt_sync() - apply one step of a sync-point test vector.
 *
 * Checks that __i915_timeline_sync_is_later(ctx, seqno) matches the
 * vector's expected answer, reporting a mismatch with the pass name
 * (@name) and step name (p->name); then records the seqno with
 * __i915_timeline_sync_set() so subsequent steps observe it.
 * NOTE(review): whether the set is unconditional or gated on a vector
 * field is on an elided line — confirm against the full source.
 */
163 static int __igt_sync(struct i915_timeline *tl,
165 const struct __igt_sync *p,
170 if (__i915_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
171 pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
172 name, p->name, ctx, p->seqno, yesno(p->expected));
177 ret = __i915_timeline_sync_set(tl, ctx, p->seqno);
/*
 * igt_sync() - functional test of timeline sync-point tracking.
 *
 * The pass[] table drives a seqno sequence through one context id,
 * covering monotonic advance, repeats, the signed boundary at INT_MAX,
 * and the full u32 wrap (UINT_MAX -> 0 -> "unwrap"); each entry states
 * whether sync_is_later must already report the point as passed.
 *
 * Two sweeps over context ids of the form BIT(order) + offset
 * (offset in {-1, 0, +1}) probe the id-indexing structure near
 * power-of-two boundaries:
 *   pass "1": for each table entry, visit every context id;
 *   pass "2": for each context id, replay the entire table.
 */
185 static int igt_sync(void *arg)
187 const struct __igt_sync pass[] = {
188 { "unset", 0, false, false },
189 { "new", 0, false, true },
190 { "0a", 0, true, true },
191 { "1a", 1, false, true },
192 { "1b", 1, true, true },
193 { "0b", 0, true, false },
194 { "2a", 2, false, true },
195 { "4", 4, false, true },
196 { "INT_MAX", INT_MAX, false, true },
197 { "INT_MAX-1", INT_MAX-1, true, false },
198 { "INT_MAX+1", (u32)INT_MAX+1, false, true },
199 { "INT_MAX", INT_MAX, true, false },
200 { "UINT_MAX", UINT_MAX, false, true },
201 { "wrap", 0, false, true },
202 { "unwrap", UINT_MAX, true, false },
205 struct i915_timeline tl;
209 mock_timeline_init(&tl, 0);
210 for (p = pass; p->name; p++) {
211 for (order = 1; order < 64; order++) {
/* offset ranges over -1..0 (and +1 once order > 1 avoids overlap with the next power of two). */
212 for (offset = -1; offset <= (order > 1); offset++) {
213 u64 ctx = BIT_ULL(order) + offset;
215 ret = __igt_sync(&tl, ctx, p, "1");
221 mock_timeline_fini(&tl);
/* Second sweep: fresh timeline, iteration order inverted. */
223 mock_timeline_init(&tl, 0);
224 for (order = 1; order < 64; order++) {
225 for (offset = -1; offset <= (order > 1); offset++) {
226 u64 ctx = BIT_ULL(order) + offset;
228 for (p = pass; p->name; p++) {
229 ret = __igt_sync(&tl, ctx, p, "2");
237 mock_timeline_fini(&tl);
/* Pick a uniformly random engine id in [0, I915_NUM_ENGINES). */
241 static unsigned int random_engine(struct rnd_state *rnd)
243 return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
/*
 * bench_sync() - micro-benchmark of timeline sync-point tracking.
 *
 * Each phase runs for ~100ms (jiffies + HZ/10) and reports ns/op.
 * The prng's own cost is measured first and stored as prng32_1M (cost
 * of 2^20 u32 draws), then subtracted from phases that consume random
 * numbers, so the reported figures approximate the tracking structure
 * alone. Phases: random u64 insert, random u64 lookup, in-order
 * insert, in-order lookup, mixed engine-id lookup-or-insert, and a
 * Fibonacci sweep of cyclic id patterns to expose phase changes in the
 * underlying structure.
 */
246 static int bench_sync(void *arg)
248 struct rnd_state prng;
249 struct i915_timeline tl;
250 unsigned long end_time, count;
253 int order, last_order;
255 mock_timeline_init(&tl, 0);
257 /* Lookups from cache are very fast and so the random number generation
258 * and the loop itself becomes a significant factor in the per-iteration
259 * timings. We try to compensate the results by measuring the overhead
260 * of the prng and subtract it from the reported results.
262 prandom_seed_state(&prng, i915_selftest.random_seed);
265 end_time = jiffies + HZ/10;
269 /* Make sure the compiler doesn't optimise away the prng call */
270 WRITE_ONCE(x, prandom_u32_state(&prng));
273 } while (!time_after(jiffies, end_time));
274 kt = ktime_sub(ktime_get(), kt);
275 pr_debug("%s: %lu random evaluations, %lluns/prng\n",
276 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
/* Cost of 2^20 prng draws; later phases subtract multiples of this. */
277 prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);
279 /* Benchmark (only) setting random context ids */
280 prandom_seed_state(&prng, i915_selftest.random_seed);
283 end_time = jiffies + HZ/10;
285 u64 id = i915_prandom_u64_state(&prng);
287 __i915_timeline_sync_set(&tl, id, 0);
289 } while (!time_after(jiffies, end_time));
290 kt = ktime_sub(ktime_get(), kt);
/* Each u64 id costs two u32 prng draws; remove that overhead. */
291 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
292 pr_info("%s: %lu random insertions, %lluns/insert\n",
293 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
295 /* Benchmark looking up the exact same context ids as we just set */
/* Reseed so the lookup stream replays the insertion stream exactly. */
296 prandom_seed_state(&prng, i915_selftest.random_seed);
300 u64 id = i915_prandom_u64_state(&prng);
302 if (!__i915_timeline_sync_is_later(&tl, id, 0)) {
303 mock_timeline_fini(&tl);
304 pr_err("Lookup of %llu failed\n", id);
308 kt = ktime_sub(ktime_get(), kt);
309 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
310 pr_info("%s: %lu random lookups, %lluns/lookup\n",
311 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
313 mock_timeline_fini(&tl);
/* Fresh timeline for the sequential-id phases. */
316 mock_timeline_init(&tl, 0);
318 /* Benchmark setting the first N (in order) contexts */
321 end_time = jiffies + HZ/10;
323 __i915_timeline_sync_set(&tl, count++, 0);
324 } while (!time_after(jiffies, end_time));
325 kt = ktime_sub(ktime_get(), kt);
326 pr_info("%s: %lu in-order insertions, %lluns/insert\n",
327 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
329 /* Benchmark looking up the exact same context ids as we just set */
333 if (!__i915_timeline_sync_is_later(&tl, end_time, 0)) {
334 pr_err("Lookup of %lu failed\n", end_time);
335 mock_timeline_fini(&tl);
339 kt = ktime_sub(ktime_get(), kt);
340 pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
341 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
343 mock_timeline_fini(&tl);
346 mock_timeline_init(&tl, 0);
348 /* Benchmark searching for a random context id and maybe changing it */
349 prandom_seed_state(&prng, i915_selftest.random_seed);
352 end_time = jiffies + HZ/10;
354 u32 id = random_engine(&prng);
355 u32 seqno = prandom_u32_state(&prng);
/* Lookup first, insert only on miss — models the common runtime pattern. */
357 if (!__i915_timeline_sync_is_later(&tl, id, seqno))
358 __i915_timeline_sync_set(&tl, id, seqno);
361 } while (!time_after(jiffies, end_time));
362 kt = ktime_sub(ktime_get(), kt);
363 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
364 pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
365 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
366 mock_timeline_fini(&tl);
369 /* Benchmark searching for a known context id and changing the seqno */
/* Fibonacci step through orders: 1, 1, 2, 3, 5, ... up to 32 bits. */
370 for (last_order = 1, order = 1; order < 32;
371 ({ int tmp = last_order; last_order = order; order += tmp; })) {
372 unsigned int mask = BIT(order) - 1;
374 mock_timeline_init(&tl, 0);
378 end_time = jiffies + HZ/10;
380 /* Without assuming too many details of the underlying
381 * implementation, try to identify its phase-changes
/* Cycle through 2^order ids, shifted so consecutive ids are far apart. */
384 u64 id = (u64)(count & mask) << order;
386 __i915_timeline_sync_is_later(&tl, id, 0);
387 __i915_timeline_sync_set(&tl, id, 0);
390 } while (!time_after(jiffies, end_time));
391 kt = ktime_sub(ktime_get(), kt);
392 pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
393 __func__, count, order,
394 (long long)div64_ul(ktime_to_ns(kt), count));
395 mock_timeline_fini(&tl);
/*
 * i915_timeline_mock_selftests() - entry point for the mock-device
 * timeline tests (no hardware required). Registers mock_hwsp_freelist
 * here; further SUBTEST entries are on elided lines.
 */
402 int i915_timeline_mock_selftests(void)
404 static const struct i915_subtest tests[] = {
405 SUBTEST(mock_hwsp_freelist),
410 return i915_subtests(tests, NULL);
/*
 * emit_ggtt_store_dw() - emit commands into @rq's ring that store
 * @value to the GGTT address @addr.
 *
 * Reserves 4 dwords and selects the MI_STORE_DWORD encoding for the
 * request's hardware generation: gen8+ and gen4+ both use
 * MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT (with different address
 * layouts on the elided lines); older gens fall back to
 * MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL.
 */
413 static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
417 cs = intel_ring_begin(rq, 4);
421 if (INTEL_GEN(rq->i915) >= 8) {
422 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
426 } else if (INTEL_GEN(rq->i915) >= 4) {
427 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
432 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
438 intel_ring_advance(rq, cs);
/*
 * tl_write() - submit a request on @engine that writes @value into
 * @tl's HWSP slot.
 *
 * Pins the timeline (so hwsp_offset is a valid GGTT address for the
 * duration), allocates a request on the engine's kernel context, emits
 * the GGTT store and submits it. Returns the request (caller observes
 * completion) or an ERR_PTR on failure. Caller must hold
 * struct_mutex — asserted below because the returned request reference
 * is not explicitly taken ("lazy rq refs").
 */
443 static struct i915_request *
444 tl_write(struct i915_timeline *tl, struct intel_engine_cs *engine, u32 value)
446 struct i915_request *rq;
449 lockdep_assert_held(&tl->i915->drm.struct_mutex); /* lazy rq refs */
451 err = i915_timeline_pin(tl);
457 rq = i915_request_alloc(engine, engine->i915->kernel_context);
461 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
462 i915_request_add(rq);
467 i915_timeline_unpin(tl);
470 pr_err("Failed to write to timeline!\n");
/*
 * checked_i915_timeline_create() - create a timeline and verify its
 * HWSP breadcrumb was initialised to the timeline's current seqno.
 *
 * Returns the new timeline, or ERR_PTR(-EINVAL) (after dropping the
 * reference) if the stored breadcrumb does not match tl->seqno.
 */
474 static struct i915_timeline *
475 checked_i915_timeline_create(struct drm_i915_private *i915)
477 struct i915_timeline *tl;
479 tl = i915_timeline_create(i915, NULL);
483 if (*tl->hwsp_seqno != tl->seqno) {
484 pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
485 *tl->hwsp_seqno, tl->seqno);
486 i915_timeline_put(tl);
487 return ERR_PTR(-EINVAL);
/*
 * live_hwsp_engine() - live (real hardware) test: per-engine HWSP
 * independence.
 *
 * For each engine capable of storing a dword, creates NUM_TIMELINES
 * timelines and submits a write of the timeline's global index into
 * its HWSP slot. After flushing, verifies each timeline's HWSP holds
 * its own index — i.e. no write leaked into a neighbouring slot.
 * Outer loop is per-engine, so each engine's batch of timelines is
 * written back-to-back (contrast live_hwsp_alternate()).
 */
493 static int live_hwsp_engine(void *arg)
495 #define NUM_TIMELINES 4096
496 struct drm_i915_private *i915 = arg;
497 struct i915_timeline **timelines;
498 struct intel_engine_cs *engine;
499 enum intel_engine_id id;
500 intel_wakeref_t wakeref;
501 unsigned long count, n;
505 * Create a bunch of timelines and check we can write
506 * independently to each of their breadcrumb slots.
509 timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
515 mutex_lock(&i915->drm.struct_mutex);
516 wakeref = intel_runtime_pm_get(i915);
519 for_each_engine(engine, i915, id) {
520 if (!intel_engine_can_store_dword(engine))
523 for (n = 0; n < NUM_TIMELINES; n++) {
524 struct i915_timeline *tl;
525 struct i915_request *rq;
527 tl = checked_i915_timeline_create(i915);
/* Write the global index so every slot's expected value is unique. */
533 rq = tl_write(tl, engine, count);
535 i915_timeline_put(tl);
540 timelines[count++] = tl;
/* Wait for all writes to land before inspecting the HWSPs. */
545 if (igt_flush_test(i915, I915_WAIT_LOCKED))
548 for (n = 0; n < count; n++) {
549 struct i915_timeline *tl = timelines[n];
551 if (!err && *tl->hwsp_seqno != n) {
552 pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
556 i915_timeline_put(tl);
559 intel_runtime_pm_put(i915, wakeref);
560 mutex_unlock(&i915->drm.struct_mutex);
/*
 * live_hwsp_alternate() - as live_hwsp_engine(), but with the loop
 * nesting inverted: for each timeline index, every engine writes in
 * turn. Adjacent HWSP writes therefore come from different engines,
 * stressing cross-engine independence of neighbouring breadcrumb
 * slots rather than per-engine batches.
 */
568 static int live_hwsp_alternate(void *arg)
570 #define NUM_TIMELINES 4096
571 struct drm_i915_private *i915 = arg;
572 struct i915_timeline **timelines;
573 struct intel_engine_cs *engine;
574 enum intel_engine_id id;
575 intel_wakeref_t wakeref;
576 unsigned long count, n;
580 * Create a bunch of timelines and check we can write
581 * independently to each of their breadcrumb slots with adjacent
585 timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
591 mutex_lock(&i915->drm.struct_mutex);
592 wakeref = intel_runtime_pm_get(i915);
/* Note: engines iterate in the INNER loop here, unlike live_hwsp_engine(). */
595 for (n = 0; n < NUM_TIMELINES; n++) {
596 for_each_engine(engine, i915, id) {
597 struct i915_timeline *tl;
598 struct i915_request *rq;
600 if (!intel_engine_can_store_dword(engine))
603 tl = checked_i915_timeline_create(i915);
609 rq = tl_write(tl, engine, count);
611 i915_timeline_put(tl);
616 timelines[count++] = tl;
621 if (igt_flush_test(i915, I915_WAIT_LOCKED))
624 for (n = 0; n < count; n++) {
625 struct i915_timeline *tl = timelines[n];
627 if (!err && *tl->hwsp_seqno != n) {
628 pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
632 i915_timeline_put(tl);
635 intel_runtime_pm_put(i915, wakeref);
636 mutex_unlock(&i915->drm.struct_mutex);
/*
 * live_hwsp_wrap() - live test: the old HWSP cacheline must survive a
 * seqno wrap.
 *
 * Forces a wrap by requesting two seqnos around the u32 boundary
 * (verified by the GEM_BUG_ON that seqno[1] < seqno[0]); the wrap must
 * allocate a fresh HWSP cacheline (second GEM_BUG_ON). Each engine
 * then stores both seqnos via their respective hwsp_seqno pointers and
 * we check both values landed — proving the pre-wrap cacheline stayed
 * valid for in-flight ("foreign") GPU references.
 */
644 static int live_hwsp_wrap(void *arg)
646 struct drm_i915_private *i915 = arg;
647 struct intel_engine_cs *engine;
648 struct i915_timeline *tl;
649 enum intel_engine_id id;
650 intel_wakeref_t wakeref;
654 * Across a seqno wrap, we need to keep the old cacheline alive for
655 * foreign GPU references.
658 mutex_lock(&i915->drm.struct_mutex);
659 wakeref = intel_runtime_pm_get(i915);
661 tl = i915_timeline_create(i915, NULL);
/* The wrap machinery only exists for timelines with their own cacheline. */
666 if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
669 err = i915_timeline_pin(tl);
673 for_each_engine(engine, i915, id) {
674 const u32 *hwsp_seqno[2];
675 struct i915_request *rq;
678 if (!intel_engine_can_store_dword(engine))
681 rq = i915_request_alloc(engine, i915->kernel_context);
689 err = i915_timeline_get_seqno(tl, rq, &seqno[0]);
691 i915_request_add(rq);
694 pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
695 seqno[0], tl->hwsp_offset);
697 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
699 i915_request_add(rq);
/* Capture the pre-wrap cacheline pointer before requesting the next seqno. */
702 hwsp_seqno[0] = tl->hwsp_seqno;
704 err = i915_timeline_get_seqno(tl, rq, &seqno[1]);
706 i915_request_add(rq);
709 pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
710 seqno[1], tl->hwsp_offset);
712 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
714 i915_request_add(rq);
717 hwsp_seqno[1] = tl->hwsp_seqno;
719 /* With wrap should come a new hwsp */
720 GEM_BUG_ON(seqno[1] >= seqno[0]);
721 GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);
723 i915_request_add(rq);
725 if (i915_request_wait(rq, I915_WAIT_LOCKED, HZ / 5) < 0) {
726 pr_err("Wait for timeline writes timed out!\n");
/* Both cachelines must hold the value their store targeted. */
731 if (*hwsp_seqno[0] != seqno[0] || *hwsp_seqno[1] != seqno[1]) {
732 pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
733 *hwsp_seqno[0], *hwsp_seqno[1],
739 i915_retire_requests(i915); /* recycle HWSP */
743 if (igt_flush_test(i915, I915_WAIT_LOCKED))
746 i915_timeline_unpin(tl);
748 i915_timeline_put(tl);
750 intel_runtime_pm_put(i915, wakeref);
751 mutex_unlock(&i915->drm.struct_mutex);
/*
 * live_hwsp_recycle() - live test: HWSP slot recycling between
 * serial timelines.
 *
 * One timeline at a time per engine: create, write the running count
 * into its HWSP, wait for completion, verify the stored value, then
 * destroy it and park the timelines to push the slot back to the
 * freelist — so the next iteration likely reuses the same cacheline.
 * A stale value after reuse would indicate the GPU and allocator
 * disagree about slot ownership. Repeats until the IGT timeout.
 */
756 static int live_hwsp_recycle(void *arg)
758 struct drm_i915_private *i915 = arg;
759 struct intel_engine_cs *engine;
760 enum intel_engine_id id;
761 intel_wakeref_t wakeref;
766 * Check seqno writes into one timeline at a time. We expect to
767 * recycle the breadcrumb slot between iterations and neither
768 * want to confuse ourselves or the GPU.
771 mutex_lock(&i915->drm.struct_mutex);
772 wakeref = intel_runtime_pm_get(i915);
775 for_each_engine(engine, i915, id) {
776 IGT_TIMEOUT(end_time);
778 if (!intel_engine_can_store_dword(engine))
782 struct i915_timeline *tl;
783 struct i915_request *rq;
785 tl = checked_i915_timeline_create(i915);
791 rq = tl_write(tl, engine, count);
793 i915_timeline_put(tl);
798 if (i915_request_wait(rq,
801 pr_err("Wait for timeline writes timed out!\n");
802 i915_timeline_put(tl);
807 if (*tl->hwsp_seqno != count) {
808 pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
809 count, *tl->hwsp_seqno);
813 i915_timeline_put(tl);
819 i915_timelines_park(i915); /* Encourage recycling! */
820 } while (!__igt_timeout(end_time, NULL));
824 if (igt_flush_test(i915, I915_WAIT_LOCKED))
826 intel_runtime_pm_put(i915, wakeref);
827 mutex_unlock(&i915->drm.struct_mutex);
/*
 * i915_timeline_live_selftests() - entry point for the live (real
 * hardware) timeline tests. Skipped early if the GPU is terminally
 * wedged, since the tests require request submission.
 */
832 int i915_timeline_live_selftests(struct drm_i915_private *i915)
834 static const struct i915_subtest tests[] = {
835 SUBTEST(live_hwsp_recycle),
836 SUBTEST(live_hwsp_engine),
837 SUBTEST(live_hwsp_alternate),
838 SUBTEST(live_hwsp_wrap),
841 if (i915_terminally_wedged(i915))
844 return i915_subtests(tests, i915);