+/*
+ * This is a collision-retry read-side/write-side 'lock', a lot like a
+ * seqcount, however this allows multiple write-sides to hold it at
+ * once. Conceptually the write side is protecting the values of the PTEs in
+ * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
+ * writer exists.
+ *
+ * Note that the core mm creates nested invalidate_range_start()/end() regions
+ * within the same thread, and runs invalidate_range_start()/end() in parallel
+ * on multiple CPUs. This is designed to not reduce concurrency or block
+ * progress on the mm side.
+ *
+ * As a secondary function, holding the full write side also serves to prevent
+ * writers for the itree, this is an optimization to avoid extra locking
+ * during invalidate_range_start/end notifiers.
+ *
+ * The write side has two states, fully excluded:
+ * - mm->active_invalidate_ranges != 0
+ * - mnn->invalidate_seq & 1 == True (odd)
+ * - some range on the mm_struct is being invalidated
+ * - the itree is not allowed to change
+ *
+ * And partially excluded:
+ * - mm->active_invalidate_ranges != 0
+ * - mnn->invalidate_seq & 1 == False (even)
+ * - some range on the mm_struct is being invalidated
+ * - the itree is allowed to change
+ *
+ * Operations on mmu_notifier_mm->invalidate_seq (under spinlock):
+ * seq |= 1 # Begin writing
+ * seq++ # Release the writing state
+ * seq & 1 # True if a writer exists
+ *
+ * The later state avoids some expensive work on inv_end in the common case of
+ * no mni monitoring the VA.
+ */
+static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm)
+{
+ lockdep_assert_held(&mmn_mm->lock);
+ return mmn_mm->invalidate_seq & 1;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm,
+ const struct mmu_notifier_range *range,
+ unsigned long *seq)
+{
+ struct interval_tree_node *node;
+ struct mmu_interval_notifier *res = NULL;
+
+ spin_lock(&mmn_mm->lock);
+ mmn_mm->active_invalidate_ranges++;
+ node = interval_tree_iter_first(&mmn_mm->itree, range->start,
+ range->end - 1);
+ if (node) {
+ mmn_mm->invalidate_seq |= 1;
+ res = container_of(node, struct mmu_interval_notifier,
+ interval_tree);
+ }
+
+ *seq = mmn_mm->invalidate_seq;
+ spin_unlock(&mmn_mm->lock);
+ return res;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_next(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_next(&mni->interval_tree, range->start,
+ range->end - 1);
+ if (!node)
+ return NULL;
+ return container_of(node, struct mmu_interval_notifier, interval_tree);
+}
+
+static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
+{
+ struct mmu_interval_notifier *mni;
+ struct hlist_node *next;
+
+ spin_lock(&mmn_mm->lock);
+ if (--mmn_mm->active_invalidate_ranges ||
+ !mn_itree_is_invalidating(mmn_mm)) {
+ spin_unlock(&mmn_mm->lock);
+ return;
+ }
+
+ /* Make invalidate_seq even */
+ mmn_mm->invalidate_seq++;
+
+ /*
+ * The inv_end incorporates a deferred mechanism like rtnl_unlock().
+ * Adds and removes are queued until the final inv_end happens then
+ * they are progressed. This arrangement for tree updates is used to
+ * avoid using a blocking lock during invalidate_range_start.
+ */
+ hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
+ deferred_item) {
+ if (RB_EMPTY_NODE(&mni->interval_tree.rb))
+ interval_tree_insert(&mni->interval_tree,
+ &mmn_mm->itree);
+ else
+ interval_tree_remove(&mni->interval_tree,
+ &mmn_mm->itree);
+ hlist_del(&mni->deferred_item);
+ }
+ spin_unlock(&mmn_mm->lock);
+
+ wake_up_all(&mmn_mm->wq);
+}
+
+/**
+ * mmu_interval_read_begin - Begin a read side critical section against a VA
+ * range
+ * mni: The range to use
+ *
+ * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
+ * collision-retry scheme similar to seqcount for the VA range under mni. If
+ * the mm invokes invalidation during the critical section then
+ * mmu_interval_read_retry() will return true.
+ *
+ * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
+ * require a blocking context. The critical region formed by this can sleep,
+ * and the required 'user_lock' can also be a sleeping lock.
+ *
+ * The caller is required to provide a 'user_lock' to serialize both teardown
+ * and setup.
+ *
+ * The return value should be passed to mmu_interval_read_retry().
+ */
+unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
+{
+ struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm;
+ unsigned long seq;
+ bool is_invalidating;
+
+ /*
+ * If the mni has a different seq value under the user_lock than we
+ * started with then it has collided.
+ *
+ * If the mni currently has the same seq value as the mmn_mm seq, then
+ * it is currently between invalidate_start/end and is colliding.
+ *
+ * The locking looks broadly like this:
+ * mn_tree_invalidate_start(): mmu_interval_read_begin():
+ * spin_lock
+ * seq = READ_ONCE(mni->invalidate_seq);
+ * seq == mmn_mm->invalidate_seq
+ * spin_unlock
+ * spin_lock
+ * seq = ++mmn_mm->invalidate_seq
+ * spin_unlock
+ * op->invalidate_range():
+ * user_lock
+ * mmu_interval_set_seq()
+ * mni->invalidate_seq = seq
+ * user_unlock
+ *
+ * [Required: mmu_interval_read_retry() == true]
+ *
+ * mn_itree_inv_end():
+ * spin_lock
+ * seq = ++mmn_mm->invalidate_seq
+ * spin_unlock
+ *
+ * user_lock
+ * mmu_interval_read_retry():
+ * mni->invalidate_seq != seq
+ * user_unlock
+ *
+ * Barriers are not needed here as any races here are closed by an
+ * eventual mmu_interval_read_retry(), which provides a barrier via the
+ * user_lock.
+ */
+ spin_lock(&mmn_mm->lock);
+ /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+ seq = READ_ONCE(mni->invalidate_seq);
+ is_invalidating = seq == mmn_mm->invalidate_seq;
+ spin_unlock(&mmn_mm->lock);
+
+ /*
+ * mni->invalidate_seq must always be set to an odd value via
+ * mmu_interval_set_seq() using the provided cur_seq from
+ * mn_itree_inv_start_range(). This ensures that if seq does wrap we
+ * will always clear the below sleep in some reasonable time as
+ * mmn_mm->invalidate_seq is even in the idle state.
+ */
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ if (is_invalidating)
+ wait_event(mmn_mm->wq,
+ READ_ONCE(mmn_mm->invalidate_seq) != seq);
+
+ /*
+ * Notice that mmu_interval_read_retry() can already be true at this
+ * point, avoiding loops here allows the caller to provide a global
+ * time bound.
+ */
+
+ return seq;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
+
+static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier_range range = {
+ .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
+ .event = MMU_NOTIFY_RELEASE,
+ .mm = mm,
+ .start = 0,
+ .end = ULONG_MAX,
+ };
+ struct mmu_interval_notifier *mni;
+ unsigned long cur_seq;
+ bool ret;
+
+ for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni;
+ mni = mn_itree_inv_next(mni, &range)) {
+ ret = mni->ops->invalidate(mni, &range, cur_seq);
+ WARN_ON(!ret);
+ }
+
+ mn_itree_inv_end(mmn_mm);
+}
+