riscv: Improve zacas fully-ordered cmpxchg()
author     Alexandre Ghiti <alexghiti@rivosinc.com>
           Sun, 3 Nov 2024 14:51:46 +0000 (15:51 +0100)
committer  Palmer Dabbelt <palmer@rivosinc.com>
           Mon, 11 Nov 2024 15:33:13 +0000 (07:33 -0800)
The current fully-ordered cmpxchgXX() implementation results in:

  amocas.X.rl     a5,a4,(s1)
  fence           rw,rw

This provides enough synchronization, but the amocas instruction can
carry the full ordering itself, so we can use the following better
mapping instead and drop the trailing fence:

  amocas.X.aqrl   a5,a4,(s1)
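
For reference, here is a minimal user-space sketch of the improved
mapping (the helper name is illustrative, not part of the patch; it
assumes a riscv64 toolchain and CPU with the Zacas extension, e.g.
-march=rv64gc_zacas):

  /* One amocas.w.aqrl replaces the old amocas.w.rl + fence rw,rw pair. */
  static inline unsigned int cas_w_fully_ordered(unsigned int *p,
                                                 unsigned int old,
                                                 unsigned int new)
  {
          unsigned int r = old;

          __asm__ __volatile__ (
                  "       amocas.w.aqrl %0, %z2, %1\n"
                  : "+&r" (r), "+A" (*p)
                  : "rJ" (new)
                  : "memory");

          return r;       /* equals 'old' iff the swap succeeded */
  }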

Suggested-by: Andrea Parri <andrea@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20241103145153.105097-7-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
arch/riscv/include/asm/cmpxchg.h

index 1f4cd12..052418a 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
  * store NEW in MEM.  Return the initial value in MEM.  Success is
  * indicated by comparing RETURN with OLD.
  */
-
-#define __arch_cmpxchg_masked(sc_sfx, cas_sfx, prepend, append, r, p, o, n)    \
+#define __arch_cmpxchg_masked(sc_sfx, cas_sfx,                                 \
+                             sc_prepend, sc_append,                            \
+                             cas_prepend, cas_append,                          \
+                             r, p, o, n)                                       \
 ({                                                                             \
        if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&                               \
            IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) &&                               \
                r = o;                                                          \
                                                                                \
                __asm__ __volatile__ (                                          \
-                       prepend                                                 \
+                       cas_prepend                                             \
                        "       amocas" cas_sfx " %0, %z2, %1\n"                \
-                       append                                                  \
+                       cas_append                                              \
                        : "+&r" (r), "+A" (*(p))                                \
                        : "rJ" (n)                                              \
                        : "memory");                                            \
                ulong __rc;                                                     \
                                                                                \
                __asm__ __volatile__ (                                          \
-                       prepend                                                 \
+                       sc_prepend                                              \
                        "0:     lr.w %0, %2\n"                                  \
                        "       and  %1, %0, %z5\n"                             \
                        "       bne  %1, %z3, 1f\n"                             \
                        "       or   %1, %1, %z4\n"                             \
                        "       sc.w" sc_sfx " %1, %1, %2\n"                    \
                        "       bnez %1, 0b\n"                                  \
-                       append                                                  \
+                       sc_append                                               \
                        "1:\n"                                                  \
                        : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))      \
                        : "rJ" ((long)__oldx), "rJ" (__newx),                   \
        }                                                                       \
 })
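
The split sc_prepend/sc_append arguments decorate the LR/SC fallback
taken when Zabha/Zacas are not available, which emulates a byte or
halfword cmpxchg on the aligned 32-bit word containing the object. A
compilable sketch of that fallback for the byte case (hypothetical
helper; the barrier slots are shown as comments):

  static inline unsigned char cas_b_lrsc(unsigned char *p,
                                         unsigned char old,
                                         unsigned char new)
  {
          unsigned int *p32 = (unsigned int *)((unsigned long)p & ~0x3UL);
          unsigned long s = ((unsigned long)p & 0x3) * 8;   /* bit offset */
          unsigned long mask = 0xfful << s;
          unsigned long o32 = (unsigned long)old << s;
          unsigned long n32 = (unsigned long)new << s;
          unsigned long ret, rc;

          __asm__ __volatile__ (
                  /* sc_prepend slot: a release barrier would go here */
                  "0:     lr.w %0, %2\n"
                  "       and  %1, %0, %z5\n"     /* isolate the target byte */
                  "       bne  %1, %z3, 1f\n"     /* mismatch: bail out      */
                  "       and  %1, %0, %z6\n"     /* keep the other bytes    */
                  "       or   %1, %1, %z4\n"     /* splice in the new byte  */
                  "       sc.w %1, %1, %2\n"
                  "       bnez %1, 0b\n"          /* store failed: retry     */
                  /* sc_append slot: an acquire/full barrier would go here */
                  "1:\n"
                  : "=&r" (ret), "=&r" (rc), "+A" (*p32)
                  : "rJ" (o32), "rJ" (n32), "rJ" (mask), "rJ" (~mask)
                  : "memory");

          return (unsigned char)((ret & mask) >> s);
  }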
 
-#define __arch_cmpxchg(lr_sfx, sc_cas_sfx, prepend, append, r, p, co, o, n)    \
+#define __arch_cmpxchg(lr_sfx, sc_sfx, cas_sfx,                        \
+                      sc_prepend, sc_append,                           \
+                      cas_prepend, cas_append,                         \
+                      r, p, co, o, n)                                  \
 ({                                                                     \
        if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) &&                       \
            riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) {        \
                r = o;                                                  \
                                                                        \
                __asm__ __volatile__ (                                  \
-                       prepend                                         \
-                       "       amocas" sc_cas_sfx " %0, %z2, %1\n"     \
-                       append                                          \
+                       cas_prepend                                     \
+                       "       amocas" cas_sfx " %0, %z2, %1\n"        \
+                       cas_append                                      \
                        : "+&r" (r), "+A" (*(p))                        \
                        : "rJ" (n)                                      \
                        : "memory");                                    \
                register unsigned int __rc;                             \
                                                                        \
                __asm__ __volatile__ (                                  \
-                       prepend                                         \
+                       sc_prepend                                      \
                        "0:     lr" lr_sfx " %0, %2\n"                  \
                        "       bne  %0, %z3, 1f\n"                     \
-                       "       sc" sc_cas_sfx " %1, %z4, %2\n"         \
+                       "       sc" sc_sfx " %1, %z4, %2\n"             \
                        "       bnez %1, 0b\n"                          \
-                       append                                          \
+                       sc_append                                       \
                        "1:\n"                                          \
                        : "=&r" (r), "=&r" (__rc), "+A" (*(p))          \
                        : "rJ" (co o), "rJ" (n)                         \
        }                                                               \
 })
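
On hardware without Zacas, the fully-ordered case keeps the proven
sc.w.rl + fence rw,rw sequence, now passed in through the sc_* arguments
(the suffix is concatenated onto the mnemonic, e.g. ".w" sc_sfx gives
"sc.w.rl"). A sketch of what that path expands to for a 32-bit object
(hypothetical helper):

  static inline unsigned int cas_w_lrsc_fully_ordered(unsigned int *p,
                                                      unsigned int old,
                                                      unsigned int new)
  {
          unsigned int r, rc;

          __asm__ __volatile__ (
                  /* SC_PREPEND(""): nothing emitted before the loop */
                  "0:     lr.w %0, %2\n"
                  "       bne  %0, %z3, 1f\n"
                  "       sc.w.rl %1, %z4, %2\n"  /* SC_SFX(".rl") */
                  "       bnez %1, 0b\n"
                  "       fence rw, rw\n"         /* SC_APPEND(RISCV_FULL_BARRIER) */
                  "1:\n"
                  : "=&r" (r), "=&r" (rc), "+A" (*p)
                  : "rJ" ((long)(int)old), "rJ" (new)   /* sign-extend like lr.w */
                  : "memory");

          return r;
  }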
 
-#define _arch_cmpxchg(ptr, old, new, sc_cas_sfx, prepend, append)      \
+#define _arch_cmpxchg(ptr, old, new, sc_sfx, cas_sfx,                  \
+                     sc_prepend, sc_append,                            \
+                     cas_prepend, cas_append)                          \
 ({                                                                     \
        __typeof__(ptr) __ptr = (ptr);                                  \
        __typeof__(*(__ptr)) __old = (old);                             \
                                                                        \
        switch (sizeof(*__ptr)) {                                       \
        case 1:                                                         \
-               __arch_cmpxchg_masked(sc_cas_sfx, ".b" sc_cas_sfx,      \
-                                       prepend, append,                \
-                                       __ret, __ptr, __old, __new);    \
+               __arch_cmpxchg_masked(sc_sfx, ".b" cas_sfx,             \
+                                     sc_prepend, sc_append,            \
+                                     cas_prepend, cas_append,          \
+                                     __ret, __ptr, __old, __new);      \
                break;                                                  \
        case 2:                                                         \
-               __arch_cmpxchg_masked(sc_cas_sfx, ".h" sc_cas_sfx,      \
-                                       prepend, append,                \
-                                       __ret, __ptr, __old, __new);    \
+               __arch_cmpxchg_masked(sc_sfx, ".h" cas_sfx,             \
+                                     sc_prepend, sc_append,            \
+                                     cas_prepend, cas_append,          \
+                                     __ret, __ptr, __old, __new);      \
                break;                                                  \
        case 4:                                                         \
-               __arch_cmpxchg(".w", ".w" sc_cas_sfx, prepend, append,  \
-                               __ret, __ptr, (long), __old, __new);    \
+               __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx,         \
+                              sc_prepend, sc_append,                   \
+                              cas_prepend, cas_append,                 \
+                              __ret, __ptr, (long), __old, __new);     \
                break;                                                  \
        case 8:                                                         \
-               __arch_cmpxchg(".d", ".d" sc_cas_sfx, prepend, append,  \
-                               __ret, __ptr, /**/, __old, __new);      \
+               __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx,         \
+                              sc_prepend, sc_append,                   \
+                              cas_prepend, cas_append,                 \
+                              __ret, __ptr, /**/, __old, __new);       \
                break;                                                  \
        default:                                                        \
                BUILD_BUG();                                            \
        (__typeof__(*(__ptr)))__ret;                                    \
 })
 
+/*
+ * These macros are here to improve the readability of the arch_cmpxchg_XXX()
+ * macros.
+ */
+#define SC_SFX(x)      x
+#define CAS_SFX(x)     x
+#define SC_PREPEND(x)  x
+#define SC_APPEND(x)   x
+#define CAS_PREPEND(x) x
+#define CAS_APPEND(x)  x
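
These wrappers expand to their argument unchanged; their only job is to
name the positional parameter at each call site. The same identity-macro
trick in a self-contained (and entirely made-up) example:

  #include <stdio.h>

  /* Zero cost after preprocessing, but the call site reads as if it had
   * named arguments, just like SC_SFX()/CAS_APPEND() below. */
  #define WIDTH(x)  x
  #define HEIGHT(x) x

  static int area(int w, int h) { return w * h; }

  int main(void)
  {
          printf("%d\n", area(WIDTH(3), HEIGHT(4)));  /* expands to area(3, 4) */
          return 0;
  }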
+
 #define arch_cmpxchg_relaxed(ptr, o, n)                                        \
-       _arch_cmpxchg((ptr), (o), (n), "", "", "")
+       _arch_cmpxchg((ptr), (o), (n),                                  \
+                     SC_SFX(""), CAS_SFX(""),                          \
+                     SC_PREPEND(""), SC_APPEND(""),                    \
+                     CAS_PREPEND(""), CAS_APPEND(""))
 
 #define arch_cmpxchg_acquire(ptr, o, n)                                        \
-       _arch_cmpxchg((ptr), (o), (n), "", "", RISCV_ACQUIRE_BARRIER)
+       _arch_cmpxchg((ptr), (o), (n),                                  \
+                     SC_SFX(""), CAS_SFX(""),                          \
+                     SC_PREPEND(""), SC_APPEND(RISCV_ACQUIRE_BARRIER), \
+                     CAS_PREPEND(""), CAS_APPEND(RISCV_ACQUIRE_BARRIER))
 
 #define arch_cmpxchg_release(ptr, o, n)                                        \
-       _arch_cmpxchg((ptr), (o), (n), "", RISCV_RELEASE_BARRIER, "")
+       _arch_cmpxchg((ptr), (o), (n),                                  \
+                     SC_SFX(""), CAS_SFX(""),                          \
+                     SC_PREPEND(RISCV_RELEASE_BARRIER), SC_APPEND(""), \
+                     CAS_PREPEND(RISCV_RELEASE_BARRIER), CAS_APPEND(""))
 
 #define arch_cmpxchg(ptr, o, n)                                                \
-       _arch_cmpxchg((ptr), (o), (n), ".rl", "", "     fence rw, rw\n")
+       _arch_cmpxchg((ptr), (o), (n),                                  \
+                     SC_SFX(".rl"), CAS_SFX(".aqrl"),                  \
+                     SC_PREPEND(""), SC_APPEND(RISCV_FULL_BARRIER),    \
+                     CAS_PREPEND(""), CAS_APPEND(""))
 
 #define arch_cmpxchg_local(ptr, o, n)                                  \
        arch_cmpxchg_relaxed((ptr), (o), (n))
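
To close, a hypothetical caller showing how the variants line up with
the mappings above (the lock protocol and helper names are made up for
illustration):

  /* 0 = unlocked, 1 = locked */
  static int try_take(int *lock)
  {
          /* success implies acquire: plain CAS + RISCV_ACQUIRE_BARRIER */
          return arch_cmpxchg_acquire(lock, 0, 1) == 0;
  }

  static void drop(int *lock)
  {
          /* release: RISCV_RELEASE_BARRIER emitted before the plain CAS */
          (void)arch_cmpxchg_release(lock, 1, 0);
  }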