drivers/gpu/drm/drm_cache.c

   1 /**************************************************************************
   2  *
   3  * Copyright (c) 2006-2007 Tungsten Graphics, Inc., Cedar Park, TX., USA
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  21  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  22  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  23  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  24  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27 /*
  28  * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
  29  */
  30 #include <linux/cc_platform.h>
  31 #include <linux/export.h>
  32 #include <linux/highmem.h>
  33 #include <linux/ioport.h>
  34 #include <linux/iosys-map.h>
  35 #include <xen/xen.h>
  36
  37 #include <drm/drm_cache.h>
  38
  39 /* A small bounce buffer that fits on the stack. */
  40 #define MEMCPY_BOUNCE_SIZE 128
  41
  42 #if defined(CONFIG_X86)
  43 #include <asm/smp.h>
  44
  45 /*
  46  * clflushopt is an unordered instruction which needs fencing with mfence or
  47  * sfence to avoid ordering issues.  For drm_clflush_page this fencing happens
  48  * in the caller.
  49  */
  50 static void
  51 drm_clflush_page(struct page *page)
  52 {
  53         uint8_t *page_virtual;
  54         unsigned int i;
  55         const int size = boot_cpu_data.x86_clflush_size;
  56
  57         if (unlikely(page == NULL))
  58                 return;
  59
  60         page_virtual = kmap_atomic(page);
  61         for (i = 0; i < PAGE_SIZE; i += size)
  62                 clflushopt(page_virtual + i);
  63         kunmap_atomic(page_virtual);
  64 }
  65
  66 static void drm_cache_flush_clflush(struct page *pages[],
  67                                     unsigned long num_pages)
  68 {
  69         unsigned long i;
  70
  71         mb(); /*Full memory barrier used before so that CLFLUSH is ordered*/
  72         for (i = 0; i < num_pages; i++)
  73                 drm_clflush_page(*pages++);
  74         mb(); /*Also used after CLFLUSH so that all cache is flushed*/
  75 }
  76 #endif
  77
  78 /**
  79  * drm_clflush_pages - Flush dcache lines of a set of pages.
  80  * @pages: List of pages to be flushed.
  81  * @num_pages: Number of pages in the array.
  82  *
  83  * Flush every data cache line entry that points to an address belonging
  84  * to a page in the array.
  85  */
  86 void
  87 drm_clflush_pages(struct page *pages[], unsigned long num_pages)
  88 {
  89
  90 #if defined(CONFIG_X86)
  91         if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
  92                 drm_cache_flush_clflush(pages, num_pages);
  93                 return;
  94         }
  95
  96         if (wbinvd_on_all_cpus())
  97                 pr_err("Timed out waiting for cache flush\n");
  98
  99 #elif defined(__powerpc__)
 100         unsigned long i;
 101
 102         for (i = 0; i < num_pages; i++) {
 103                 struct page *page = pages[i];
 104                 void *page_virtual;
 105
 106                 if (unlikely(page == NULL))
 107                         continue;
 108
 109                 page_virtual = kmap_atomic(page);
 110                 flush_dcache_range((unsigned long)page_virtual,
 111                                    (unsigned long)page_virtual + PAGE_SIZE);
 112                 kunmap_atomic(page_virtual);
 113         }
 114 #else
 115         WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
 116 #endif
 117 }
 118 EXPORT_SYMBOL(drm_clflush_pages);
 119
 120 /**
 121  * drm_clflush_sg - Flush dcache lines pointing to a scather-gather.
 122  * @st: struct sg_table.
 123  *
 124  * Flush every data cache line entry that points to an address in the
 125  * sg.
 126  */
 127 void
 128 drm_clflush_sg(struct sg_table *st)
 129 {
 130 #if defined(CONFIG_X86)
 131         if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
 132                 struct sg_page_iter sg_iter;
 133
 134                 mb(); /*CLFLUSH is ordered only by using memory barriers*/
 135                 for_each_sgtable_page(st, &sg_iter, 0)
 136                         drm_clflush_page(sg_page_iter_page(&sg_iter));
 137                 mb(); /*Make sure that all cache line entry is flushed*/
 138
 139                 return;
 140         }
 141
 142         if (wbinvd_on_all_cpus())
 143                 pr_err("Timed out waiting for cache flush\n");
 144 #else
 145         WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
 146 #endif
 147 }
 148 EXPORT_SYMBOL(drm_clflush_sg);
 149
 150 /**
 151  * drm_clflush_virt_range - Flush dcache lines of a region
 152  * @addr: Initial kernel memory address.
 153  * @length: Region size.
 154  *
 155  * Flush every data cache line entry that points to an address in the
 156  * region requested.
 157  */
 158 void
 159 drm_clflush_virt_range(void *addr, unsigned long length)
 160 {
 161 #if defined(CONFIG_X86)
 162         if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
 163                 const int size = boot_cpu_data.x86_clflush_size;
 164                 void *end = addr + length;
 165
 166                 addr = (void *)(((unsigned long)addr) & -size);
 167                 mb(); /*CLFLUSH is only ordered with a full memory barrier*/
 168                 for (; addr < end; addr += size)
 169                         clflushopt(addr);
 170                 clflushopt(end - 1); /* force serialisation */
 171                 mb(); /*Ensure that every data cache line entry is flushed*/
 172                 return;
 173         }
 174
 175         if (wbinvd_on_all_cpus())
 176                 pr_err("Timed out waiting for cache flush\n");
 177 #else
 178         WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
 179 #endif
 180 }
 181 EXPORT_SYMBOL(drm_clflush_virt_range);
 182
 183 bool drm_need_swiotlb(int dma_bits)
 184 {
 185         struct resource *tmp;
 186         resource_size_t max_iomem = 0;
 187
 188         /*
 189          * Xen paravirtual hosts require swiotlb regardless of requested dma
 190          * transfer size.
 191          *
 192          * NOTE: Really, what it requires is use of the dma_alloc_coherent
 193          *       allocator used in ttm_dma_populate() instead of
 194          *       ttm_populate_and_map_pages(), which bounce buffers so much in
 195          *       Xen it leads to swiotlb buffer exhaustion.
 196          */
 197         if (xen_pv_domain())
 198                 return true;
 199
 200         /*
 201          * Enforce dma_alloc_coherent when memory encryption is active as well
 202          * for the same reasons as for Xen paravirtual hosts.
 203          */
 204         if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
 205                 return true;
 206
 207         for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling)
 208                 max_iomem = max(max_iomem,  tmp->end);
 209
 210         return max_iomem > ((u64)1 << dma_bits);
 211 }
 212 EXPORT_SYMBOL(drm_need_swiotlb);
 213
 214 static void memcpy_fallback(struct iosys_map *dst,
 215                             const struct iosys_map *src,
 216                             unsigned long len)
 217 {
 218         if (!dst->is_iomem && !src->is_iomem) {
 219                 memcpy(dst->vaddr, src->vaddr, len);
 220         } else if (!src->is_iomem) {
 221                 iosys_map_memcpy_to(dst, 0, src->vaddr, len);
 222         } else if (!dst->is_iomem) {
 223                 memcpy_fromio(dst->vaddr, src->vaddr_iomem, len);
 224         } else {
 225                 /*
 226                  * Bounce size is not performance tuned, but using a
 227                  * bounce buffer like this is significantly faster than
 228                  * resorting to ioreadxx() + iowritexx().
 229                  */
 230                 char bounce[MEMCPY_BOUNCE_SIZE];
 231                 void __iomem *_src = src->vaddr_iomem;
 232                 void __iomem *_dst = dst->vaddr_iomem;
 233
 234                 while (len >= MEMCPY_BOUNCE_SIZE) {
 235                         memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
 236                         memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
 237                         _src += MEMCPY_BOUNCE_SIZE;
 238                         _dst += MEMCPY_BOUNCE_SIZE;
 239                         len -= MEMCPY_BOUNCE_SIZE;
 240                 }
 241                 if (len) {
 242                         memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
 243                         memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
 244                 }
 245         }
 246 }
 247
 248 #ifdef CONFIG_X86
 249
/*
 * Static key that gates the movntdqa copy path in drm_memcpy_from_wc().
 * Defaults to false; drm_memcpy_init_early() may enable it.
 */
static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
 251
 252 static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
 253 {
 254         kernel_fpu_begin();
 255
 256         while (len >= 4) {
 257                 asm("movntdqa   (%0), %%xmm0\n"
 258                     "movntdqa 16(%0), %%xmm1\n"
 259                     "movntdqa 32(%0), %%xmm2\n"
 260                     "movntdqa 48(%0), %%xmm3\n"
 261                     "movaps %%xmm0,   (%1)\n"
 262                     "movaps %%xmm1, 16(%1)\n"
 263                     "movaps %%xmm2, 32(%1)\n"
 264                     "movaps %%xmm3, 48(%1)\n"
 265                     :: "r" (src), "r" (dst) : "memory");
 266                 src += 64;
 267                 dst += 64;
 268                 len -= 4;
 269         }
 270         while (len--) {
 271                 asm("movntdqa (%0), %%xmm0\n"
 272                     "movaps %%xmm0, (%1)\n"
 273                     :: "r" (src), "r" (dst) : "memory");
 274                 src += 16;
 275                 dst += 16;
 276         }
 277
 278         kernel_fpu_end();
 279 }
 280
 281 /*
 282  * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
 283  * non-temporal instructions where available. Note that all arguments
 284  * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
 285  * of 16.
 286  */
 287 static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
 288 {
 289         if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
 290                 memcpy(dst, src, len);
 291         else if (likely(len))
 292                 __memcpy_ntdqa(dst, src, len >> 4);
 293 }
 294
 295 /**
 296  * drm_memcpy_from_wc - Perform the fastest available memcpy from a source
 297  * that may be WC.
 298  * @dst: The destination pointer
 299  * @src: The source pointer
 300  * @len: The size of the area o transfer in bytes
 301  *
 302  * Tries an arch optimized memcpy for prefetching reading out of a WC region,
 303  * and if no such beast is available, falls back to a normal memcpy.
 304  */
 305 void drm_memcpy_from_wc(struct iosys_map *dst,
 306                         const struct iosys_map *src,
 307                         unsigned long len)
 308 {
 309         if (WARN_ON(in_interrupt())) {
 310                 memcpy_fallback(dst, src, len);
 311                 return;
 312         }
 313
 314         if (static_branch_likely(&has_movntdqa)) {
 315                 __drm_memcpy_from_wc(dst->is_iomem ?
 316                                      (void __force *)dst->vaddr_iomem :
 317                                      dst->vaddr,
 318                                      src->is_iomem ?
 319                                      (void const __force *)src->vaddr_iomem :
 320                                      src->vaddr,
 321                                      len);
 322                 return;
 323         }
 324
 325         memcpy_fallback(dst, src, len);
 326 }
 327 EXPORT_SYMBOL(drm_memcpy_from_wc);
 328
 329 /*
 330  * drm_memcpy_init_early - One time initialization of the WC memcpy code
 331  */
 332 void drm_memcpy_init_early(void)
 333 {
 334         /*
 335          * Some hypervisors (e.g. KVM) don't support VEX-prefix instructions
 336          * emulation. So don't enable movntdqa in hypervisor guest.
 337          */
 338         if (static_cpu_has(X86_FEATURE_XMM4_1) &&
 339             !boot_cpu_has(X86_FEATURE_HYPERVISOR))
 340                 static_branch_enable(&has_movntdqa);
 341 }
 342 #else
 343 void drm_memcpy_from_wc(struct iosys_map *dst,
 344                         const struct iosys_map *src,
 345                         unsigned long len)
 346 {
 347         WARN_ON(in_interrupt());
 348
 349         memcpy_fallback(dst, src, len);
 350 }
 351 EXPORT_SYMBOL(drm_memcpy_from_wc);
 352
/*
 * drm_memcpy_init_early - One time initialization of the WC memcpy code
 *
 * Intentionally empty: without CONFIG_X86 drm_memcpy_from_wc() always uses
 * the fallback copy, so there is nothing to set up.
 */
void drm_memcpy_init_early(void)
{
}
 356 #endif /* CONFIG_X86 */