riscv: Implement xchg8/16() using Zabha
author Alexandre Ghiti <alexghiti@rivosinc.com>
Sun, 3 Nov 2024 14:51:48 +0000 (15:51 +0100)
committer Palmer Dabbelt <palmer@rivosinc.com>
Mon, 11 Nov 2024 15:33:15 +0000 (07:33 -0800)
This adds runtime support for Zabha in xchg8/16() operations.
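
A caller-visible sketch of the effect (illustrative only, not part of the patch;
the function and variable below are hypothetical): with this change, a byte- or
halfword-sized xchg() is emitted as a single amoswap.b/amoswap.h when
CONFIG_RISCV_ISA_ZABHA is enabled and the CPU advertises Zabha at runtime, and
it falls back to the masked lr.w/sc.w loop otherwise.

    /* Hypothetical usage example, not from the patch. */
    #include <linux/atomic.h>
    #include <linux/types.h>

    static u8 example_flag;

    static void example_xchg8(void)
    {
            /*
             * With Zabha this compiles to a single amoswap.b on the byte;
             * without it, the masked lr.w/sc.w fallback operates on the
             * naturally aligned 32-bit word containing example_flag.
             */
            u8 old = xchg(&example_flag, 1);

            (void)old;
    }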

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20241103145153.105097-9-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
arch/riscv/include/asm/cmpxchg.h

index f95929f..4cadc56 100644
 #include <asm/insn-def.h>
 #include <asm/cpufeature-macros.h>
 
-#define __arch_xchg_masked(sc_sfx, prepend, append, r, p, n)           \
-({                                                                     \
-       u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3);                     \
-       ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE;  \
-       ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
-                       << __s;                                         \
-       ulong __newx = (ulong)(n) << __s;                               \
-       ulong __retx;                                                   \
-       ulong __rc;                                                     \
-                                                                       \
-       __asm__ __volatile__ (                                          \
-              prepend                                                  \
-              "0:      lr.w %0, %2\n"                                  \
-              "        and  %1, %0, %z4\n"                             \
-              "        or   %1, %1, %z3\n"                             \
-              "        sc.w" sc_sfx " %1, %1, %2\n"                    \
-              "        bnez %1, 0b\n"                                  \
-              append                                                   \
-              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
-              : "rJ" (__newx), "rJ" (~__mask)                          \
-              : "memory");                                             \
-                                                                       \
-       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
+#define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append,               \
+                          swap_append, r, p, n)                                \
+({                                                                             \
+       if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&                               \
+           riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA)) {                \
+               __asm__ __volatile__ (                                          \
+                       prepend                                                 \
+                       "       amoswap" swap_sfx " %0, %z2, %1\n"              \
+                       swap_append                                             \
+                       : "=&r" (r), "+A" (*(p))                                \
+                       : "rJ" (n)                                              \
+                       : "memory");                                            \
+       } else {                                                                \
+               u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3);                     \
+               ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE;  \
+               ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)   \
+                               << __s;                                         \
+               ulong __newx = (ulong)(n) << __s;                               \
+               ulong __retx;                                                   \
+               ulong __rc;                                                     \
+                                                                               \
+               __asm__ __volatile__ (                                          \
+                      prepend                                                  \
+                      "0:      lr.w %0, %2\n"                                  \
+                      "        and  %1, %0, %z4\n"                             \
+                      "        or   %1, %1, %z3\n"                             \
+                      "        sc.w" sc_sfx " %1, %1, %2\n"                    \
+                      "        bnez %1, 0b\n"                                  \
+                      sc_append                                                \
+                      : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
+                      : "rJ" (__newx), "rJ" (~__mask)                          \
+                      : "memory");                                             \
+                                                                               \
+               r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
+       }                                                                       \
 })
 
 #define __arch_xchg(sfx, prepend, append, r, p, n)                     \
                                                                        \
        switch (sizeof(*__ptr)) {                                       \
        case 1:                                                         \
+               __arch_xchg_masked(sc_sfx, ".b" swap_sfx,               \
+                                  prepend, sc_append, swap_append,     \
+                                  __ret, __ptr, __new);                \
+               break;                                                  \
        case 2:                                                         \
-               __arch_xchg_masked(sc_sfx, prepend, sc_append,          \
+               __arch_xchg_masked(sc_sfx, ".h" swap_sfx,               \
+                                  prepend, sc_append, swap_append,     \
                                   __ret, __ptr, __new);                \
                break;                                                  \
        case 4:                                                         \
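
As a side note on the fallback path (an illustrative userspace sketch, not
kernel code; names are made up): the lr.w/sc.w branch operates on the naturally
aligned 32-bit word containing the byte/halfword, and the __ptr32b/__s/__mask
computation above reduces to the following arithmetic.

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Reproduce the shift/mask computation of the lr.w/sc.w fallback for a
     * sub-word object of 'size' bytes (1 or 2) at address 'addr'.
     */
    static void show_mask(uintptr_t addr, size_t size)
    {
            uintptr_t word  = addr & ~(uintptr_t)0x3;             /* __ptr32b */
            unsigned  shift = (addr & (0x4 - size)) * 8;          /* __s      */
            uint32_t  mask  = ((1u << (size * 8)) - 1) << shift;  /* __mask   */

            printf("word=%#lx shift=%u mask=%#x\n",
                   (unsigned long)word, shift, (unsigned int)mask);
    }

    int main(void)
    {
            show_mask(0x1002, 2);  /* halfword at offset 2: shift 16, mask 0xffff0000 */
            show_mask(0x1001, 1);  /* byte at offset 1:     shift 8,  mask 0x0000ff00 */
            return 0;
    }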