Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Nov 2017 17:51:57 +0000 (09:51 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Nov 2017 17:51:57 +0000 (09:51 -0800)
diff --git a/MAINTAINERS b/MAINTAINERS

index 540762a..e04d108 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4208,7 +4208,7 @@ L:        linux-i2c@vger.kernel.org
  S:     Maintained
  F:     drivers/i2c/busses/i2c-diolan-u2c.c
  
-DIRECT ACCESS (DAX)
+FILESYSTEM DIRECT ACCESS (DAX)
  M:     Matthew Wilcox <mawilcox@microsoft.com>
  M:     Ross Zwisler <ross.zwisler@linux.intel.com>
  L:     linux-fsdevel@vger.kernel.org
@@ -4217,6 +4217,12 @@ F:       fs/dax.c
  F:     include/linux/dax.h
  F:     include/trace/events/fs_dax.h
  
+DEVICE DIRECT ACCESS (DAX)
+M:     Dan Williams <dan.j.williams@intel.com>
+L:     linux-nvdimm@lists.01.org
+S:     Supported
+F:     drivers/dax/
+
  DIRECTORY NOTIFICATION (DNOTIFY)
  M:     Jan Kara <jack@suse.cz>
  R:     Amir Goldstein <amir73il@gmail.com>
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h

index 6bf7300..2dbdf59 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@
  
  #define MAP_SHARED     0x01            /* Share changes */
  #define MAP_PRIVATE    0x02            /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
  #define MAP_TYPE       0x0f            /* Mask for type of mapping (OSF/1 is _wrong_) */
  #define MAP_FIXED      0x100           /* Interpret addr exactly */
  #define MAP_ANONYMOUS  0x10            /* don't use a file */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h

index 20c3df7..606e02c 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -29,6 +29,7 @@
   */
  #define MAP_SHARED     0x001           /* Share changes */
  #define MAP_PRIVATE    0x002           /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003      /* share + validate extension flags */
  #define MAP_TYPE       0x00f           /* Mask for type of mapping */
  #define MAP_FIXED      0x010           /* Interpret addr exactly */
  
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h

index d1af0d7..80510ba 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@
  
  #define MAP_SHARED     0x01            /* Share changes */
  #define MAP_PRIVATE    0x02            /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
  #define MAP_TYPE       0x03            /* Mask for type of mapping */
  #define MAP_FIXED      0x04            /* Interpret addr exactly */
  #define MAP_ANONYMOUS  0x10            /* don't use a file */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h

index 2bfe590..3e9d01a 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -36,6 +36,7 @@
   */
  #define MAP_SHARED     0x001           /* Share changes */
  #define MAP_PRIVATE    0x002           /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003      /* share + validate extension flags */
  #define MAP_TYPE       0x00f           /* Mask for type of mapping */
  #define MAP_FIXED      0x010           /* Interpret addr exactly */
  
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c

index 9c2c49b..ff2580e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -183,13 +183,33 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
         return 0;
  }
  
-static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status)
+#define ACPI_LABELS_LOCKED 3
+
+static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
+               u32 status)
  {
+       struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
         switch (cmd) {
         case ND_CMD_GET_CONFIG_SIZE:
+               /*
+                * In the _LSI, _LSR, _LSW case the locked status is
+                * communicated via the read/write commands
+                */
+               if (nfit_mem->has_lsi)
+                       break;
+
                 if (status >> 16 & ND_CONFIG_LOCKED)
                         return -EACCES;
                 break;
+       case ND_CMD_GET_CONFIG_DATA:
+               if (nfit_mem->has_lsr && status == ACPI_LABELS_LOCKED)
+                       return -EACCES;
+               break;
+       case ND_CMD_SET_CONFIG_DATA:
+               if (nfit_mem->has_lsw && status == ACPI_LABELS_LOCKED)
+                       return -EACCES;
+               break;
         default:
                 break;
         }
@@ -205,13 +225,182 @@ static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
  {
         if (!nvdimm)
                 return xlat_bus_status(buf, cmd, status);
-       return xlat_nvdimm_status(buf, cmd, status);
+       return xlat_nvdimm_status(nvdimm, buf, cmd, status);
+}
+
+/* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */
+static union acpi_object *pkg_to_buf(union acpi_object *pkg)
+{
+       int i;
+       void *dst;
+       size_t size = 0;
+       union acpi_object *buf = NULL;
+
+       if (pkg->type != ACPI_TYPE_PACKAGE) {
+               WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+                               pkg->type);
+               goto err;
+       }
+
+       for (i = 0; i < pkg->package.count; i++) {
+               union acpi_object *obj = &pkg->package.elements[i];
+
+               if (obj->type == ACPI_TYPE_INTEGER)
+                       size += 4;
+               else if (obj->type == ACPI_TYPE_BUFFER)
+                       size += obj->buffer.length;
+               else {
+                       WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+                                       obj->type);
+                       goto err;
+               }
+       }
+
+       buf = ACPI_ALLOCATE(sizeof(*buf) + size);
+       if (!buf)
+               goto err;
+
+       dst = buf + 1;
+       buf->type = ACPI_TYPE_BUFFER;
+       buf->buffer.length = size;
+       buf->buffer.pointer = dst;
+       for (i = 0; i < pkg->package.count; i++) {
+               union acpi_object *obj = &pkg->package.elements[i];
+
+               if (obj->type == ACPI_TYPE_INTEGER) {
+                       memcpy(dst, &obj->integer.value, 4);
+                       dst += 4;
+               } else if (obj->type == ACPI_TYPE_BUFFER) {
+                       memcpy(dst, obj->buffer.pointer, obj->buffer.length);
+                       dst += obj->buffer.length;
+               }
+       }
+err:
+       ACPI_FREE(pkg);
+       return buf;
+}
+
+static union acpi_object *int_to_buf(union acpi_object *integer)
+{
+       union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
+       void *dst = NULL;
+
+       if (!buf)
+               goto err;
+
+       if (integer->type != ACPI_TYPE_INTEGER) {
+               WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
+                               integer->type);
+               goto err;
+       }
+
+       dst = buf + 1;
+       buf->type = ACPI_TYPE_BUFFER;
+       buf->buffer.length = 4;
+       buf->buffer.pointer = dst;
+       memcpy(dst, &integer->integer.value, 4);
+err:
+       ACPI_FREE(integer);
+       return buf;
+}
+
+static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset,
+               u32 len, void *data)
+{
+       acpi_status rc;
+       struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+       struct acpi_object_list input = {
+               .count = 3,
+               .pointer = (union acpi_object []) {
+                       [0] = {
+                               .integer.type = ACPI_TYPE_INTEGER,
+                               .integer.value = offset,
+                       },
+                       [1] = {
+                               .integer.type = ACPI_TYPE_INTEGER,
+                               .integer.value = len,
+                       },
+                       [2] = {
+                               .buffer.type = ACPI_TYPE_BUFFER,
+                               .buffer.pointer = data,
+                               .buffer.length = len,
+                       },
+               },
+       };
+
+       rc = acpi_evaluate_object(handle, "_LSW", &input, &buf);
+       if (ACPI_FAILURE(rc))
+               return NULL;
+       return int_to_buf(buf.pointer);
+}
+
+static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset,
+               u32 len)
+{
+       acpi_status rc;
+       struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+       struct acpi_object_list input = {
+               .count = 2,
+               .pointer = (union acpi_object []) {
+                       [0] = {
+                               .integer.type = ACPI_TYPE_INTEGER,
+                               .integer.value = offset,
+                       },
+                       [1] = {
+                               .integer.type = ACPI_TYPE_INTEGER,
+                               .integer.value = len,
+                       },
+               },
+       };
+
+       rc = acpi_evaluate_object(handle, "_LSR", &input, &buf);
+       if (ACPI_FAILURE(rc))
+               return NULL;
+       return pkg_to_buf(buf.pointer);
+}
+
+static union acpi_object *acpi_label_info(acpi_handle handle)
+{
+       acpi_status rc;
+       struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+
+       rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf);
+       if (ACPI_FAILURE(rc))
+               return NULL;
+       return pkg_to_buf(buf.pointer);
+}
+
+static u8 nfit_dsm_revid(unsigned family, unsigned func)
+{
+       static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
+               [NVDIMM_FAMILY_INTEL] = {
+                       [NVDIMM_INTEL_GET_MODES] = 2,
+                       [NVDIMM_INTEL_GET_FWINFO] = 2,
+                       [NVDIMM_INTEL_START_FWUPDATE] = 2,
+                       [NVDIMM_INTEL_SEND_FWUPDATE] = 2,
+                       [NVDIMM_INTEL_FINISH_FWUPDATE] = 2,
+                       [NVDIMM_INTEL_QUERY_FWUPDATE] = 2,
+                       [NVDIMM_INTEL_SET_THRESHOLD] = 2,
+                       [NVDIMM_INTEL_INJECT_ERROR] = 2,
+               },
+       };
+       u8 id;
+
+       if (family > NVDIMM_FAMILY_MAX)
+               return 0;
+       if (func > 31)
+               return 0;
+       id = revid_table[family][func];
+       if (id == 0)
+               return 1; /* default */
+       return id;
  }
  
  int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
                 unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
  {
         struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+       struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
         union acpi_object in_obj, in_buf, *out_obj;
         const struct nd_cmd_desc *desc = NULL;
         struct device *dev = acpi_desc->dev;
@@ -235,7 +424,6 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
         }
  
         if (nvdimm) {
-               struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
                 struct acpi_device *adev = nfit_mem->adev;
  
                 if (!adev)
@@ -294,7 +482,29 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
                         in_buf.buffer.pointer,
                         min_t(u32, 256, in_buf.buffer.length), true);
  
-       out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj);
+       /* call the BIOS, prefer the named methods over _DSM if available */
+       if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
+               out_obj = acpi_label_info(handle);
+       else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
+               struct nd_cmd_get_config_data_hdr *p = buf;
+
+               out_obj = acpi_label_read(handle, p->in_offset, p->in_length);
+       } else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA
+                       && nfit_mem->has_lsw) {
+               struct nd_cmd_set_config_hdr *p = buf;
+
+               out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
+                               p->in_buf);
+       } else {
+               u8 revid;
+
+               if (nvdimm)
+                       revid = nfit_dsm_revid(nfit_mem->family, func);
+               else
+                       revid = 1;
+               out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj);
+       }
+
         if (!out_obj) {
                 dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
                                 cmd_name);
@@ -356,8 +566,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
          * Set fw_status for all the commands with a known format to be
          * later interpreted by xlat_status().
          */
-       if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR)
-                       || (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR)))
+       if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP
+                                       && cmd <= ND_CMD_CLEAR_ERROR)
+                               || (nvdimm && cmd >= ND_CMD_SMART
+                                       && cmd <= ND_CMD_VENDOR)))
                 fw_status = *(u32 *) out_obj->buffer.pointer;
  
         if (offset + in_buf.buffer.length < buf_len) {
@@ -1431,6 +1643,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
  {
         struct acpi_device *adev, *adev_dimm;
         struct device *dev = acpi_desc->dev;
+       union acpi_object *obj;
         unsigned long dsm_mask;
         const guid_t *guid;
         int i;
@@ -1463,7 +1676,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
          * different command sets.  Note, that checking for function0 (bit0)
          * tells us if any commands are reachable through this GUID.
          */
-       for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
+       for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
                 if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
                         if (family < 0 || i == default_dsm_family)
                                 family = i;
@@ -1473,7 +1686,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
         if (override_dsm_mask && !disable_vendor_specific)
                 dsm_mask = override_dsm_mask;
         else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
-               dsm_mask = 0x3fe;
+               dsm_mask = NVDIMM_INTEL_CMDMASK;
                 if (disable_vendor_specific)
                         dsm_mask &= ~(1 << ND_CMD_VENDOR);
         } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
@@ -1493,9 +1706,32 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
  
         guid = to_nfit_uuid(nfit_mem->family);
         for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
-               if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i))
+               if (acpi_check_dsm(adev_dimm->handle, guid,
+                                       nfit_dsm_revid(nfit_mem->family, i),
+                                       1ULL << i))
                         set_bit(i, &nfit_mem->dsm_mask);
  
+       obj = acpi_label_info(adev_dimm->handle);
+       if (obj) {
+               ACPI_FREE(obj);
+               nfit_mem->has_lsi = 1;
+               dev_dbg(dev, "%s: has _LSI\n", dev_name(&adev_dimm->dev));
+       }
+
+       obj = acpi_label_read(adev_dimm->handle, 0, 0);
+       if (obj) {
+               ACPI_FREE(obj);
+               nfit_mem->has_lsr = 1;
+               dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
+       }
+
+       obj = acpi_label_write(adev_dimm->handle, 0, 0, NULL);
+       if (obj) {
+               ACPI_FREE(obj);
+               nfit_mem->has_lsw = 1;
+               dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
+       }
+
         return 0;
  }
  
@@ -1571,8 +1807,21 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
                  * userspace interface.
                  */
                 cmd_mask = 1UL << ND_CMD_CALL;
-               if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
-                       cmd_mask |= nfit_mem->dsm_mask;
+               if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
+                       /*
+                        * These commands have a 1:1 correspondence
+                        * between DSM payload and libnvdimm ioctl
+                        * payload format.
+                        */
+                       cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
+               }
+
+               if (nfit_mem->has_lsi)
+                       set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
+               if (nfit_mem->has_lsr)
+                       set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
+               if (nfit_mem->has_lsw)
+                       set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
  
                 flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
                         : NULL;
@@ -1645,6 +1894,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
         int i;
  
         nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
+       nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
         adev = to_acpi_dev(acpi_desc);
         if (!adev)
                 return;
@@ -2239,7 +2489,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc,
                 if (ars_status->out_length
                                 < 44 + sizeof(struct nd_ars_record) * (i + 1))
                         break;
-               rc = nvdimm_bus_add_poison(nvdimm_bus,
+               rc = nvdimm_bus_add_badrange(nvdimm_bus,
                                 ars_status->records[i].err_address,
                                 ars_status->records[i].length);
                 if (rc)
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c

index feeb95d..b929214 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
                         continue;
  
                 /* If this fails due to an -ENOMEM, there is little we can do */
-               nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+               nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
                                 ALIGN(mce->addr, L1_CACHE_BYTES),
                                 L1_CACHE_BYTES);
                 nvdimm_region_notify(nfit_spa->nd_region,
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h

index 54292db..f0cf18b 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -24,7 +24,7 @@
  /* ACPI 6.1 */
  #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
  
-/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
+/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
  #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
  
  /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
@@ -38,6 +38,37 @@
                 | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
                 | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
  
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+
+#define NVDIMM_STANDARD_CMDMASK \
+(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
+ | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
+ | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
+ | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
+
+/*
+ * Command numbers that the kernel needs to know about to handle
+ * non-default DSM revision ids
+ */
+enum nvdimm_family_cmds {
+       NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
+       NVDIMM_INTEL_GET_MODES = 11,
+       NVDIMM_INTEL_GET_FWINFO = 12,
+       NVDIMM_INTEL_START_FWUPDATE = 13,
+       NVDIMM_INTEL_SEND_FWUPDATE = 14,
+       NVDIMM_INTEL_FINISH_FWUPDATE = 15,
+       NVDIMM_INTEL_QUERY_FWUPDATE = 16,
+       NVDIMM_INTEL_SET_THRESHOLD = 17,
+       NVDIMM_INTEL_INJECT_ERROR = 18,
+};
+
+#define NVDIMM_INTEL_CMDMASK \
+(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
+ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
+ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
+ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
+ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
+
  enum nfit_uuids {
         /* for simplicity alias the uuid index with the family id */
         NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
@@ -140,6 +171,9 @@ struct nfit_mem {
         struct resource *flush_wpq;
         unsigned long dsm_mask;
         int family;
+       u32 has_lsi:1;
+       u32 has_lsr:1;
+       u32 has_lsw:1;
  };
  
  struct acpi_nfit_desc {
@@ -167,6 +201,7 @@ struct acpi_nfit_desc {
         unsigned int init_complete:1;
         unsigned long dimm_cmd_force_en;
         unsigned long bus_cmd_force_en;
+       unsigned long bus_nfit_cmd_force_en;
         int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
                         void *iobuf, u64 len, int rw);
  };
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index 923b417..40579d0 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -302,7 +302,6 @@ config BLK_DEV_SX8
  
  config BLK_DEV_RAM
         tristate "RAM block device support"
-       select DAX if BLK_DEV_RAM_DAX
         ---help---
           Saying Y here will allow you to use a portion of your RAM memory as
           a block device, so that you can make file systems on it, read and
@@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE
           The default value is 4096 kilobytes. Only change this if you know
           what you are doing.
  
-config BLK_DEV_RAM_DAX
-       bool "Support Direct Access (DAX) to RAM block devices"
-       depends on BLK_DEV_RAM && FS_DAX
-       default n
-       help
-         Support filesystems using DAX to access RAM block devices.  This
-         avoids double-buffering data in the page cache before copying it
-         to the block device.  Answering Y will slightly enlarge the kernel,
-         and will prevent RAM block device backing store memory from being
-         allocated from highmem (only a problem for highmem systems).
-
  config CDROM_PKTCDVD
         tristate "Packet writing on CD/DVD media (DEPRECATED)"
         depends on !UML
diff --git a/drivers/block/brd.c b/drivers/block/brd.c

index 588360d..8028a3a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -21,11 +21,6 @@
  #include <linux/fs.h>
  #include <linux/slab.h>
  #include <linux/backing-dev.h>
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-#include <linux/pfn_t.h>
-#include <linux/dax.h>
-#include <linux/uio.h>
-#endif
  
  #include <linux/uaccess.h>
  
@@ -45,9 +40,6 @@ struct brd_device {
  
         struct request_queue    *brd_queue;
         struct gendisk          *brd_disk;
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-       struct dax_device       *dax_dev;
-#endif
         struct list_head        brd_list;
  
         /*
@@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
          * restriction might be able to be lifted.
          */
         gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_RAM_DAX
-       gfp_flags |= __GFP_HIGHMEM;
-#endif
         page = alloc_page(gfp_flags);
         if (!page)
                 return NULL;
@@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
         return err;
  }
  
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
-               long nr_pages, void **kaddr, pfn_t *pfn)
-{
-       struct page *page;
-
-       if (!brd)
-               return -ENODEV;
-       page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
-       if (!page)
-               return -ENOSPC;
-       *kaddr = page_address(page);
-       *pfn = page_to_pfn_t(page);
-
-       return 1;
-}
-
-static long brd_dax_direct_access(struct dax_device *dax_dev,
-               pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
-{
-       struct brd_device *brd = dax_get_private(dax_dev);
-
-       return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
-}
-
-static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
-               void *addr, size_t bytes, struct iov_iter *i)
-{
-       return copy_from_iter(addr, bytes, i);
-}
-
-static const struct dax_operations brd_dax_ops = {
-       .direct_access = brd_dax_direct_access,
-       .copy_from_iter = brd_dax_copy_from_iter,
-};
-#endif
-
  static const struct block_device_operations brd_fops = {
         .owner =                THIS_MODULE,
         .rw_page =              brd_rw_page,
@@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i)
         set_capacity(disk, rd_size * 2);
         disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
  
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-       queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-       brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
-       if (!brd->dax_dev)
-               goto out_free_inode;
-#endif
-
-
         return brd;
  
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-out_free_inode:
-       kill_dax(brd->dax_dev);
-       put_dax(brd->dax_dev);
-#endif
  out_free_queue:
         blk_cleanup_queue(brd->brd_queue);
  out_free_dev:
@@ -505,10 +444,6 @@ out:
  static void brd_del_one(struct brd_device *brd)
  {
         list_del(&brd->brd_list);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-       kill_dax(brd->dax_dev);
-       put_dax(brd->dax_dev);
-#endif
         del_gendisk(brd->brd_disk);
         brd_free(brd);
  }
diff --git a/drivers/dax/device.c b/drivers/dax/device.c

index e9f3b3e..6833ada 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
                 unsigned long size)
  {
         struct resource *res;
-       phys_addr_t phys;
+       /* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
+       phys_addr_t uninitialized_var(phys);
         int i;
  
         for (i = 0; i < dev_dax->num_resources; i++) {
diff --git a/drivers/dax/super.c b/drivers/dax/super.c

index 557b937..3ec8046 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
         long len;
  
         if (blocksize != PAGE_SIZE) {
-               pr_err("VFS (%s): error: unsupported blocksize for dax\n",
+               pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
                                 sb->s_id);
                 return -EINVAL;
         }
  
         err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
         if (err) {
-               pr_err("VFS (%s): error: unaligned partition for dax\n",
+               pr_debug("VFS (%s): error: unaligned partition for dax\n",
                                 sb->s_id);
                 return err;
         }
  
         dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
         if (!dax_dev) {
-               pr_err("VFS (%s): error: device does not support dax\n",
+               pr_debug("VFS (%s): error: device does not support dax\n",
                                 sb->s_id);
                 return -EOPNOTSUPP;
         }
@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
         put_dax(dax_dev);
  
         if (len < 1) {
-               pr_err("VFS (%s): error: dax access failed (%ld)",
+               pr_debug("VFS (%s): error: dax access failed (%ld)\n",
                                 sb->s_id, len);
                 return len < 0 ? len : -EIO;
         }
@@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
  void arch_wb_cache_pmem(void *addr, size_t size);
  void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
  {
-       if (unlikely(!dax_alive(dax_dev)))
-               return;
-
         if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
                 return;
  
@@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
         struct inode *inode;
  
         dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
+       if (!dax_dev)
+               return NULL;
+
         inode = &dax_dev->inode;
         inode->i_rdev = 0;
         return inode;
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile

index 447e0e1..70d5f3a 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o
  libnvdimm-y += region.o
  libnvdimm-y += namespace_devs.o
  libnvdimm-y += label.o
+libnvdimm-y += badrange.o
  libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
  libnvdimm-$(CONFIG_BTT) += btt_devs.o
  libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c

new file mode 100644

index 0000000..e068d72
--- /dev/null
+++ b/drivers/nvdimm/badrange.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+/**
+ * badrange_init() - prepare an empty badrange tracker for use
+ * @badrange: instance whose spinlock and entry list get initialized
+ */
+void badrange_init(struct badrange *badrange)
+{
+       spin_lock_init(&badrange->lock);
+       INIT_LIST_HEAD(&badrange->list);
+}
+EXPORT_SYMBOL_GPL(badrange_init);
+
+/*
+ * Fill in @bre with the range (@addr, @length) and queue it at the tail
+ * of @badrange's list.  The caller must hold badrange->lock.
+ */
+static void append_badrange_entry(struct badrange *badrange,
+               struct badrange_entry *bre, u64 addr, u64 length)
+{
+       lockdep_assert_held(&badrange->lock);
+
+       bre->length = length;
+       bre->start = addr;
+       list_add_tail(&bre->list, &badrange->list);
+}
+
+/*
+ * Allocate a zeroed entry using the gfp @flags and append it to
+ * @badrange as the range (@addr, @length).
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.  The append
+ * step asserts that badrange->lock is held.
+ */
+static int alloc_and_append_badrange_entry(struct badrange *badrange,
+               u64 addr, u64 length, gfp_t flags)
+{
+       struct badrange_entry *bre = kzalloc(sizeof(*bre), flags);
+
+       if (bre) {
+               append_badrange_entry(badrange, bre, addr, length);
+               return 0;
+       }
+
+       return -ENOMEM;
+}
+
+/*
+ * Record the range (@addr, @length) in @badrange.
+ *
+ * Entered and exited with badrange->lock held; the lock is dropped
+ * around the GFP_KERNEL allocation and re-taken before the list is
+ * examined.  Returns 0 when the range was appended or an existing entry
+ * with the same start was updated, -ENOMEM when a new entry was needed
+ * but could not be allocated.
+ */
+static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
+{
+       struct badrange_entry *fresh, *cur;
+
+       /* preallocate outside the spinlock; the entry may end up unused */
+       spin_unlock(&badrange->lock);
+       fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
+       spin_lock(&badrange->lock);
+
+       /*
+        * There is a chance this is a duplicate, check for those first.
+        * This will be the common case as ARS_STATUS returns all known
+        * errors in the SPA space, and we can't query it per region.
+        * An empty list simply falls through to the append below.
+        */
+       list_for_each_entry(cur, &badrange->list, list) {
+               if (cur->start != addr)
+                       continue;
+
+               /* same start: take the latest length, drop the spare */
+               cur->length = length;
+               kfree(fresh);
+               return 0;
+       }
+
+       /*
+        * Not a duplicate or a simple length update: add the entry as
+        * is, as any overlapping ranges will get resolved when the list
+        * is consumed and converted to badblocks
+        */
+       if (!fresh)
+               return -ENOMEM;
+       append_badrange_entry(badrange, fresh, addr, length);
+
+       return 0;
+}
+
+/**
+ * badrange_add() - record a bad address range
+ * @badrange: tracker to update
+ * @addr: start address of the range
+ * @length: length of the range
+ *
+ * Holds badrange->lock around add_badrange() and passes its result
+ * through: 0 on success, -ENOMEM if a needed entry could not be
+ * allocated.
+ */
+int badrange_add(struct badrange *badrange, u64 addr, u64 length)
+{
+       int ret;
+
+       spin_lock(&badrange->lock);
+       ret = add_badrange(badrange, addr, length);
+       spin_unlock(&badrange->lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(badrange_add);
+
+/**
+ * badrange_forget() - drop a cleared address range from the tracker
+ * @badrange: tracker to update
+ * @start: first address of the range that was cleared
+ * @len: number of bytes that were cleared
+ *
+ * Every entry that intersects [@start, @start + @len - 1] is deleted,
+ * trimmed, or split so that the cleared bytes are no longer recorded.
+ * A zero @len is a no-op.
+ */
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+               unsigned int len)
+{
+       struct list_head *badrange_list = &badrange->list;
+       u64 clr_end = start + len - 1;
+       struct badrange_entry *bre, *next;
+
+       /*
+        * Nothing was cleared.  Bail out before clr_end is used: with
+        * len == 0 it is start - 1, which wraps around for start == 0
+        * and would then match (and delete) every entry.
+        */
+       if (!len)
+               return;
+
+       spin_lock(&badrange->lock);
+
+       /*
+        * [start, clr_end] is the badrange interval being cleared.
+        * [bre->start, bre_end] is the badrange_list entry we're comparing
+        * the above interval against. The badrange list entry may need
+        * to be modified (update either start or length), deleted, or
+        * split into two based on the overlap characteristics
+        */
+
+       list_for_each_entry_safe(bre, next, badrange_list, list) {
+               u64 bre_end = bre->start + bre->length - 1;
+
+               /* Skip intervals with no intersection */
+               if (bre_end < start)
+                       continue;
+               if (bre->start > clr_end)
+                       continue;
+               /* Delete completely overlapped badrange entries */
+               if ((bre->start >= start) && (bre_end <= clr_end)) {
+                       list_del(&bre->list);
+                       kfree(bre);
+                       continue;
+               }
+               /*
+                * Adjust start point of partially cleared entries.  The
+                * comparison is inclusive so that a clear ending exactly
+                * on the entry's first byte (clr_end == bre->start) still
+                * trims that byte instead of matching no case at all.
+                */
+               if ((start <= bre->start) && (clr_end >= bre->start)) {
+                       bre->length -= clr_end - bre->start + 1;
+                       bre->start = clr_end + 1;
+                       continue;
+               }
+               /* Adjust bre->length for partial clearing at the tail end */
+               if ((bre->start < start) && (bre_end <= clr_end)) {
+                       /* bre->start remains the same */
+                       bre->length = start - bre->start;
+                       continue;
+               }
+               /*
+                * If clearing in the middle of an entry, we split it into
+                * two by modifying the current entry to represent one half of
+                * the split, and adding a new entry for the second half.
+                */
+               if ((bre->start < start) && (bre_end > clr_end)) {
+                       u64 new_start = clr_end + 1;
+                       u64 new_len = bre_end - new_start + 1;
+
+                       /*
+                        * Add new entry covering the right half.  The
+                        * result is deliberately not checked: if this
+                        * GFP_NOWAIT allocation fails, the right half
+                        * is simply no longer tracked.
+                        */
+                       alloc_and_append_badrange_entry(badrange, new_start,
+                                       new_len, GFP_NOWAIT);
+                       /* Adjust this entry to cover the left half */
+                       bre->length = start - bre->start;
+                       continue;
+               }
+       }
+       spin_unlock(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_forget);
+
+/*
+ * Mark @num sectors starting at sector @s as bad in @bb, logging the
+ * corresponding byte range at debug level first.
+ */
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+       dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
+                       (u64) s * 512, (u64) num * 512);
+
+       if (!badblocks_set(bb, s, num, 1))
+               return;
+
+       /* this isn't an error as the hardware will still throw an exception */
+       dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+                       __func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb:                badblocks instance to populate
+ * @ns_offset: namespace offset where the error range begins (in bytes)
+ * @len:       number of bytes of badrange to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ *
+ * The byte range is widened to whole 512-byte sectors and handed to
+ * set_badblock() in pieces of at most INT_MAX sectors each.
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+       const unsigned int sector_size = 512;
+       sector_t first, last;
+       u64 left;
+       u32 rem;
+
+       /* round the start down and the end up to a sector boundary */
+       first = div_u64(ns_offset, sector_size);
+       last = div_u64_rem(ns_offset + len, sector_size, &rem);
+       if (rem)
+               last++;
+       left = last - first;
+
+       /*
+        * set_badblock() takes an int count, so cap each call at INT_MAX
+        * sectors.  A range that fits results in exactly one call.
+        */
+       do {
+               int chunk = min_t(u64, left, INT_MAX);
+
+               set_badblock(bb, first, chunk);
+               first += chunk;
+               left -= chunk;
+       } while (left);
+}
+
+/*
+ * Walk @badrange and report to @bb the part of every entry that falls
+ * inside the resource @res, expressed as a byte offset and length
+ * relative to res->start.
+ */
+static void badblocks_populate(struct badrange *badrange,
+               struct badblocks *bb, const struct resource *res)
+{
+       struct badrange_entry *bre;
+
+       list_for_each_entry(bre, &badrange->list, list) {
+               u64 bre_end = bre->start + bre->length - 1;
+               u64 offset, len;
+
+               /* Discard intervals with no intersection */
+               if (bre_end < res->start || bre->start > res->end)
+                       continue;
+
+               if (bre->start >= res->start) {
+                       /* Overlap begins after the start of the namespace */
+                       offset = bre->start - res->start;
+                       if (bre_end <= res->end)
+                               len = bre->length;
+                       else
+                               len = res->start + resource_size(res)
+                                       - bre->start;
+               } else {
+                       /* Badrange starts before the namespace */
+                       offset = 0;
+                       if (bre_end < res->end)
+                               len = bre->start + bre->length - res->start;
+                       else
+                               len = resource_size(res);
+               }
+
+               __add_badblock_range(bb, offset, len);
+       }
+}
+
+/**
+ * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
+ * @nd_region: parent region of the range to interrogate
+ * @bb: badblocks instance to populate
+ * @res: resource range to consider
+ *
+ * The badrange list generated during bus initialization may contain
+ * multiple, possibly overlapping physical address ranges.  Compare each
+ * of these ranges to the resource range currently being initialized,
+ * and add badblocks entries for all matching sub-ranges.
+ *
+ * Warns once and does nothing when @nd_region fails the is_memory() check.
+ */
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+               struct badblocks *bb, const struct resource *res)
+{
+       struct device *dev = &nd_region->dev;
+       struct nvdimm_bus *nvdimm_bus;
+
+       if (!is_memory(dev)) {
+               dev_WARN_ONCE(dev, 1,
+                               "%s only valid for pmem regions\n", __func__);
+               return;
+       }
+
+       /* the badrange list lives on the bus; walk it under the bus lock */
+       nvdimm_bus = walk_to_nvdimm_bus(dev);
+       nvdimm_bus_lock(&nvdimm_bus->dev);
+       badblocks_populate(&nvdimm_bus->badrange, bb, res);
+       nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c

index baf2839..0a5e6cd 100644 (file)
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
   * General Public License for more details.
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/libnvdimm.h>
  #include <linux/sched/mm.h>
  #include <linux/vmalloc.h>
  #include <linux/uaccess.h>
@@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
                 phys_addr_t phys, u64 cleared)
  {
         if (cleared > 0)
-               nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+               badrange_forget(&nvdimm_bus->badrange, phys, cleared);
  
         if (cleared > 0 && cleared / 512)
                 nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
@@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
                 return NULL;
         INIT_LIST_HEAD(&nvdimm_bus->list);
         INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
-       INIT_LIST_HEAD(&nvdimm_bus->poison_list);
         init_waitqueue_head(&nvdimm_bus->probe_wait);
         nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
         mutex_init(&nvdimm_bus->reconfig_mutex);
-       spin_lock_init(&nvdimm_bus->poison_lock);
+       badrange_init(&nvdimm_bus->badrange);
         if (nvdimm_bus->id < 0) {
                 kfree(nvdimm_bus);
                 return NULL;
@@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
         return 0;
  }
  
-static void free_poison_list(struct list_head *poison_list)
+static void free_badrange_list(struct list_head *badrange_list)
  {
-       struct nd_poison *pl, *next;
+       struct badrange_entry *bre, *next;
  
-       list_for_each_entry_safe(pl, next, poison_list, list) {
-               list_del(&pl->list);
-               kfree(pl);
+       list_for_each_entry_safe(bre, next, badrange_list, list) {
+               list_del(&bre->list);
+               kfree(bre);
         }
-       list_del_init(poison_list);
+       list_del_init(badrange_list);
  }
  
  static int nd_bus_remove(struct device *dev)
@@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
         nd_synchronize();
         device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
  
-       spin_lock(&nvdimm_bus->poison_lock);
-       free_poison_list(&nvdimm_bus->poison_list);
-       spin_unlock(&nvdimm_bus->poison_lock);
+       spin_lock(&nvdimm_bus->badrange.lock);
+       free_badrange_list(&nvdimm_bus->badrange.list);
+       spin_unlock(&nvdimm_bus->badrange.lock);
  
         nvdimm_bus_destroy_ndctl(nvdimm_bus);
  
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c

index bb71f0c..1dc5276 100644 (file)
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
  };
  EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
  
-static void set_badblock(struct badblocks *bb, sector_t s, int num)
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
  {
-       dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
-                       (u64) s * 512, (u64) num * 512);
-       /* this isn't an error as the hardware will still throw an exception */
-       if (badblocks_set(bb, s, num, 1))
-               dev_info_once(bb->dev, "%s: failed for sector %llx\n",
-                               __func__, (u64) s);
+       return badrange_add(&nvdimm_bus->badrange, addr, length);
  }
-
-/**
- * __add_badblock_range() - Convert a physical address range to bad sectors
- * @bb:                badblocks instance to populate
- * @ns_offset: namespace offset where the error range begins (in bytes)
- * @len:       number of bytes of poison to be added
- *
- * This assumes that the range provided with (ns_offset, len) is within
- * the bounds of physical addresses for this namespace, i.e. lies in the
- * interval [ns_start, ns_start + ns_size)
- */
-static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
-{
-       const unsigned int sector_size = 512;
-       sector_t start_sector, end_sector;
-       u64 num_sectors;
-       u32 rem;
-
-       start_sector = div_u64(ns_offset, sector_size);
-       end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
-       if (rem)
-               end_sector++;
-       num_sectors = end_sector - start_sector;
-
-       if (unlikely(num_sectors > (u64)INT_MAX)) {
-               u64 remaining = num_sectors;
-               sector_t s = start_sector;
-
-               while (remaining) {
-                       int done = min_t(u64, remaining, INT_MAX);
-
-                       set_badblock(bb, s, done);
-                       remaining -= done;
-                       s += done;
-               }
-       } else
-               set_badblock(bb, start_sector, num_sectors);
-}
-
-static void badblocks_populate(struct list_head *poison_list,
-               struct badblocks *bb, const struct resource *res)
-{
-       struct nd_poison *pl;
-
-       if (list_empty(poison_list))
-               return;
-
-       list_for_each_entry(pl, poison_list, list) {
-               u64 pl_end = pl->start + pl->length - 1;
-
-               /* Discard intervals with no intersection */
-               if (pl_end < res->start)
-                       continue;
-               if (pl->start >  res->end)
-                       continue;
-               /* Deal with any overlap after start of the namespace */
-               if (pl->start >= res->start) {
-                       u64 start = pl->start;
-                       u64 len;
-
-                       if (pl_end <= res->end)
-                               len = pl->length;
-                       else
-                               len = res->start + resource_size(res)
-                                       - pl->start;
-                       __add_badblock_range(bb, start - res->start, len);
-                       continue;
-               }
-               /* Deal with overlap for poison starting before the namespace */
-               if (pl->start < res->start) {
-                       u64 len;
-
-                       if (pl_end < res->end)
-                               len = pl->start + pl->length - res->start;
-                       else
-                               len = resource_size(res);
-                       __add_badblock_range(bb, 0, len);
-               }
-       }
-}
-
-/**
- * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
- * @region: parent region of the range to interrogate
- * @bb: badblocks instance to populate
- * @res: resource range to consider
- *
- * The poison list generated during bus initialization may contain
- * multiple, possibly overlapping physical address ranges.  Compare each
- * of these ranges to the resource range currently being initialized,
- * and add badblocks entries for all matching sub-ranges
- */
-void nvdimm_badblocks_populate(struct nd_region *nd_region,
-               struct badblocks *bb, const struct resource *res)
-{
-       struct nvdimm_bus *nvdimm_bus;
-       struct list_head *poison_list;
-
-       if (!is_memory(&nd_region->dev)) {
-               dev_WARN_ONCE(&nd_region->dev, 1,
-                               "%s only valid for pmem regions\n", __func__);
-               return;
-       }
-       nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-       poison_list = &nvdimm_bus->poison_list;
-
-       nvdimm_bus_lock(&nvdimm_bus->dev);
-       badblocks_populate(poison_list, bb, res);
-       nvdimm_bus_unlock(&nvdimm_bus->dev);
-}
-EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
-
-static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
-               struct nd_poison *pl, u64 addr, u64 length)
-{
-       lockdep_assert_held(&nvdimm_bus->poison_lock);
-       pl->start = addr;
-       pl->length = length;
-       list_add_tail(&pl->list, &nvdimm_bus->poison_list);
-}
-
-static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
-                       gfp_t flags)
-{
-       struct nd_poison *pl;
-
-       pl = kzalloc(sizeof(*pl), flags);
-       if (!pl)
-               return -ENOMEM;
-
-       append_poison_entry(nvdimm_bus, pl, addr, length);
-       return 0;
-}
-
-static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-       struct nd_poison *pl, *pl_new;
-
-       spin_unlock(&nvdimm_bus->poison_lock);
-       pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
-       spin_lock(&nvdimm_bus->poison_lock);
-
-       if (list_empty(&nvdimm_bus->poison_list)) {
-               if (!pl_new)
-                       return -ENOMEM;
-               append_poison_entry(nvdimm_bus, pl_new, addr, length);
-               return 0;
-       }
-
-       /*
-        * There is a chance this is a duplicate, check for those first.
-        * This will be the common case as ARS_STATUS returns all known
-        * errors in the SPA space, and we can't query it per region
-        */
-       list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
-               if (pl->start == addr) {
-                       /* If length has changed, update this list entry */
-                       if (pl->length != length)
-                               pl->length = length;
-                       kfree(pl_new);
-                       return 0;
-               }
-
-       /*
-        * If not a duplicate or a simple length update, add the entry as is,
-        * as any overlapping ranges will get resolved when the list is consumed
-        * and converted to badblocks
-        */
-       if (!pl_new)
-               return -ENOMEM;
-       append_poison_entry(nvdimm_bus, pl_new, addr, length);
-
-       return 0;
-}
-
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-       int rc;
-
-       spin_lock(&nvdimm_bus->poison_lock);
-       rc = bus_add_poison(nvdimm_bus, addr, length);
-       spin_unlock(&nvdimm_bus->poison_lock);
-
-       return rc;
-}
-EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
-
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
-               unsigned int len)
-{
-       struct list_head *poison_list = &nvdimm_bus->poison_list;
-       u64 clr_end = start + len - 1;
-       struct nd_poison *pl, *next;
-
-       spin_lock(&nvdimm_bus->poison_lock);
-       WARN_ON_ONCE(list_empty(poison_list));
-
-       /*
-        * [start, clr_end] is the poison interval being cleared.
-        * [pl->start, pl_end] is the poison_list entry we're comparing
-        * the above interval against. The poison list entry may need
-        * to be modified (update either start or length), deleted, or
-        * split into two based on the overlap characteristics
-        */
-
-       list_for_each_entry_safe(pl, next, poison_list, list) {
-               u64 pl_end = pl->start + pl->length - 1;
-
-               /* Skip intervals with no intersection */
-               if (pl_end < start)
-                       continue;
-               if (pl->start >  clr_end)
-                       continue;
-               /* Delete completely overlapped poison entries */
-               if ((pl->start >= start) && (pl_end <= clr_end)) {
-                       list_del(&pl->list);
-                       kfree(pl);
-                       continue;
-               }
-               /* Adjust start point of partially cleared entries */
-               if ((start <= pl->start) && (clr_end > pl->start)) {
-                       pl->length -= clr_end - pl->start + 1;
-                       pl->start = clr_end + 1;
-                       continue;
-               }
-               /* Adjust pl->length for partial clearing at the tail end */
-               if ((pl->start < start) && (pl_end <= clr_end)) {
-                       /* pl->start remains the same */
-                       pl->length = start - pl->start;
-                       continue;
-               }
-               /*
-                * If clearing in the middle of an entry, we split it into
-                * two by modifying the current entry to represent one half of
-                * the split, and adding a new entry for the second half.
-                */
-               if ((pl->start < start) && (pl_end > clr_end)) {
-                       u64 new_start = clr_end + 1;
-                       u64 new_len = pl_end - new_start + 1;
-
-                       /* Add new entry covering the right half */
-                       add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
-                       /* Adjust this entry to cover the left half */
-                       pl->length = start - pl->start;
-                       continue;
-               }
-       }
-       spin_unlock(&nvdimm_bus->poison_lock);
-}
-EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
  
  #ifdef CONFIG_BLK_DEV_INTEGRITY
  int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c

index e0f0e3c..f8913b8 100644 (file)
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
                 goto err;
  
         rc = nvdimm_init_config_data(ndd);
+       if (rc == -EACCES)
+               nvdimm_set_locked(dev);
         if (rc)
                 goto err;
  
@@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev)
         rc = nd_label_reserve_dpa(ndd);
         if (ndd->ns_current >= 0)
                 nvdimm_set_aliasing(dev);
+       nvdimm_clear_locked(dev);
         nvdimm_bus_unlock(dev);
  
         if (rc)
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c

index f0d1b7e..097794d 100644 (file)
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
         set_bit(NDD_LOCKED, &nvdimm->flags);
  }
  
+/* Clear the NDD_LOCKED flag bit of the nvdimm that to_nvdimm() yields for @dev. */
+void nvdimm_clear_locked(struct device *dev)
+{
+       clear_bit(NDD_LOCKED, &to_nvdimm(dev)->flags);
+}
+
  static void nvdimm_release(struct device *dev)
  {
         struct nvdimm *nvdimm = to_nvdimm(dev);
@@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
  }
  static DEVICE_ATTR_RO(commands);
  
+/*
+ * sysfs 'flags' attribute: print "alias " when NDD_ALIASING is set and
+ * "lock " when NDD_LOCKED is set, followed by a newline.
+ */
+static ssize_t flags_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nvdimm *nvdimm = to_nvdimm(dev);
+       const char *alias = "";
+       const char *lock = "";
+
+       if (test_bit(NDD_ALIASING, &nvdimm->flags))
+               alias = "alias ";
+       if (test_bit(NDD_LOCKED, &nvdimm->flags))
+               lock = "lock ";
+
+       return sprintf(buf, "%s%s\n", alias, lock);
+}
+static DEVICE_ATTR_RO(flags);
+
  static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                 char *buf)
  {
@@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);
  
  static struct attribute *nvdimm_attributes[] = {
         &dev_attr_state.attr,
+       &dev_attr_flags.attr,
         &dev_attr_commands.attr,
         &dev_attr_available_slots.attr,
         NULL,
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c

index 9c5f108..de66c02 100644 (file)
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
         nsindex = to_namespace_index(ndd, 0);
         memset(nsindex, 0, ndd->nsarea.config_size);
         for (i = 0; i < 2; i++) {
-               int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);
+               int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
  
                 if (rc)
                         return rc;
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c

index 3e4d1e7..bb3ba8c 100644 (file)
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
         if (a == &dev_attr_resource.attr) {
                 if (is_namespace_blk(dev))
                         return 0;
-               return a->mode;
+               return 0400;
         }
  
         if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
@@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
   * @nspm: target namespace to create
   * @nd_label: target pmem namespace label to evaluate
   */
-struct device *create_namespace_pmem(struct nd_region *nd_region,
+static struct device *create_namespace_pmem(struct nd_region *nd_region,
                 struct nd_namespace_index *nsindex,
                 struct nd_namespace_label *nd_label)
  {
@@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
         return i;
  }
  
-struct device *create_namespace_blk(struct nd_region *nd_region,
+static struct device *create_namespace_blk(struct nd_region *nd_region,
                 struct nd_namespace_label *nd_label, int count)
  {
  
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h

index 86bc19a..79274ea 100644 (file)
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -29,10 +29,9 @@ struct nvdimm_bus {
         struct list_head list;
         struct device dev;
         int id, probe_active;
-       struct list_head poison_list;
         struct list_head mapping_list;
         struct mutex reconfig_mutex;
-       spinlock_t poison_lock;
+       struct badrange badrange;
  };
  
  struct nvdimm {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h

index 9c758a9..e958f37 100644 (file)
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -34,12 +34,6 @@ enum {
         NVDIMM_IO_ATOMIC = 1,
  };
  
-struct nd_poison {
-       u64 start;
-       u64 length;
-       struct list_head list;
-};
-
  struct nvdimm_drvdata {
         struct device *dev;
         int nslabel_size;
@@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
                 unsigned int len);
  void nvdimm_set_aliasing(struct device *dev);
  void nvdimm_set_locked(struct device *dev);
+void nvdimm_clear_locked(struct device *dev);
  struct nd_btt *to_nd_btt(struct device *dev);
  
  struct nd_gen_sb {
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c

index 9576c44..65cc171 100644 (file)
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
         NULL,
  };
  
+/*
+ * Attribute visibility callback: the 'resource' attribute is reported
+ * with mode 0400, every other attribute keeps its declared mode.
+ */
+static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+       return a == &dev_attr_resource.attr ? 0400 : a->mode;
+}
+
  struct attribute_group nd_pfn_attribute_group = {
         .attrs = nd_pfn_attributes,
+       .is_visible = pfn_visible,
  };
  
  static const struct attribute_group *nd_pfn_attribute_groups[] = {
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c

index 829d760..abaf38c 100644 (file)
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
         if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
                 return 0;
  
-       if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
-               return 0;
+       if (a == &dev_attr_resource.attr) {
+               if (is_nd_pmem(dev))
+                       return 0400;
+               else
+                       return 0;
+       }
  
         if (a == &dev_attr_deep_flush.attr) {
                 int has_flush = nvdimm_has_flush(nd_region);
diff --git a/fs/dax.c b/fs/dax.c

index 3652b26..9598159 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  static void *dax_insert_mapping_entry(struct address_space *mapping,
                                       struct vm_fault *vmf,
                                       void *entry, sector_t sector,
-                                     unsigned long flags)
+                                     unsigned long flags, bool dirty)
  {
         struct radix_tree_root *page_tree = &mapping->page_tree;
         void *new_entry;
         pgoff_t index = vmf->pgoff;
  
-       if (vmf->flags & FAULT_FLAG_WRITE)
+       if (dirty)
                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  
         if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                 entry = new_entry;
         }
  
-       if (vmf->flags & FAULT_FLAG_WRITE)
+       if (dirty)
                 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  
         spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  
-static int dax_insert_mapping(struct address_space *mapping,
-               struct block_device *bdev, struct dax_device *dax_dev,
-               sector_t sector, size_t size, void *entry,
-               struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
  {
-       unsigned long vaddr = vmf->address;
-       void *ret, *kaddr;
+       return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+                        pfn_t *pfnp)
+{
+       const sector_t sector = dax_iomap_sector(iomap, pos);
         pgoff_t pgoff;
+       void *kaddr;
         int id, rc;
-       pfn_t pfn;
+       long length;
  
-       rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+       rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
         if (rc)
                 return rc;
-
         id = dax_read_lock();
-       rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-       if (rc < 0) {
-               dax_read_unlock(id);
-               return rc;
+       length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+                                  &kaddr, pfnp);
+       if (length < 0) {
+               rc = length;
+               goto out;
         }
+       rc = -EINVAL;
+       if (PFN_PHYS(length) < size)
+               goto out;
+       if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+               goto out;
+       /* For larger pages we need devmap */
+       if (length > 1 && !pfn_t_devmap(*pfnp))
+               goto out;
+       rc = 0;
+out:
         dax_read_unlock(id);
-
-       ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-       if (IS_ERR(ret))
-               return PTR_ERR(ret);
-
-       trace_dax_insert_mapping(mapping->host, vmf, ret);
-       if (vmf->flags & FAULT_FLAG_WRITE)
-               return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-       else
-               return vm_insert_mixed(vma, vaddr, pfn);
+       return rc;
  }
  
  /*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
         }
  
         entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-                       RADIX_DAX_ZERO_PAGE);
+                       RADIX_DAX_ZERO_PAGE, false);
         if (IS_ERR(entry2)) {
                 ret = VM_FAULT_SIGBUS;
                 goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
  }
  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-       return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
  static loff_t
  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                 struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
         return VM_FAULT_SIGBUS;
  }
  
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+               struct vm_area_struct *vma, struct iomap *iomap)
+{
+       return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+               && (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
                                const struct iomap_ops *ops)
  {
-       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       struct vm_area_struct *vma = vmf->vma;
+       struct address_space *mapping = vma->vm_file->f_mapping;
         struct inode *inode = mapping->host;
         unsigned long vaddr = vmf->address;
         loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-       sector_t sector;
         struct iomap iomap = { 0 };
         unsigned flags = IOMAP_FAULT;
         int error, major = 0;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+       bool sync;
         int vmf_ret = 0;
         void *entry;
+       pfn_t pfn;
  
         trace_dax_pte_fault(inode, vmf, vmf_ret);
         /*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                 goto out;
         }
  
-       if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+       if (write && !vmf->cow_page)
                 flags |= IOMAP_WRITE;
  
         entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                 goto error_finish_iomap;
         }
  
-       sector = dax_iomap_sector(&iomap, pos);
-
         if (vmf->cow_page) {
+               sector_t sector = dax_iomap_sector(&iomap, pos);
+
                 switch (iomap.type) {
                 case IOMAP_HOLE:
                 case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                 goto finish_iomap;
         }
  
+       sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
         switch (iomap.type) {
         case IOMAP_MAPPED:
                 if (iomap.flags & IOMAP_F_NEW) {
                         count_vm_event(PGMAJFAULT);
-                       count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+                       count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
                         major = VM_FAULT_MAJOR;
                 }
-               error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-                               sector, PAGE_SIZE, entry, vmf->vma, vmf);
+               error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+               if (error < 0)
+                       goto error_finish_iomap;
+
+               entry = dax_insert_mapping_entry(mapping, vmf, entry,
+                                                dax_iomap_sector(&iomap, pos),
+                                                0, write && !sync);
+               if (IS_ERR(entry)) {
+                       error = PTR_ERR(entry);
+                       goto error_finish_iomap;
+               }
+
+               /*
+                * If we are doing synchronous page fault and inode needs fsync,
+                * we can insert PTE into page tables only after that happens.
+                * Skip insertion for now and return the pfn so that caller can
+                * insert it after fsync is done.
+                */
+               if (sync) {
+                       if (WARN_ON_ONCE(!pfnp)) {
+                               error = -EIO;
+                               goto error_finish_iomap;
+                       }
+                       *pfnp = pfn;
+                       vmf_ret = VM_FAULT_NEEDDSYNC | major;
+                       goto finish_iomap;
+               }
+               trace_dax_insert_mapping(inode, vmf, entry);
+               if (write)
+                       error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+               else
+                       error = vm_insert_mixed(vma, vaddr, pfn);
+
                 /* -EBUSY is fine, somebody else faulted on the same PTE */
                 if (error == -EBUSY)
                         error = 0;
                 break;
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
-               if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+               if (!write) {
                         vmf_ret = dax_load_hole(mapping, entry, vmf);
                         goto finish_iomap;
                 }
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
  }
  
  #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-               loff_t pos, void *entry)
-{
-       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-       const sector_t sector = dax_iomap_sector(iomap, pos);
-       struct dax_device *dax_dev = iomap->dax_dev;
-       struct block_device *bdev = iomap->bdev;
-       struct inode *inode = mapping->host;
-       const size_t size = PMD_SIZE;
-       void *ret = NULL, *kaddr;
-       long length = 0;
-       pgoff_t pgoff;
-       pfn_t pfn = {};
-       int id;
-
-       if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-               goto fallback;
-
-       id = dax_read_lock();
-       length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-       if (length < 0)
-               goto unlock_fallback;
-       length = PFN_PHYS(length);
-
-       if (length < size)
-               goto unlock_fallback;
-       if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-               goto unlock_fallback;
-       if (!pfn_t_devmap(pfn))
-               goto unlock_fallback;
-       dax_read_unlock(id);
-
-       ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-                       RADIX_DAX_PMD);
-       if (IS_ERR(ret))
-               goto fallback;
-
-       trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-                       pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-       dax_read_unlock(id);
-fallback:
-       trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-       return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
  
  static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
                 void *entry)
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
                 goto fallback;
  
         ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
         if (IS_ERR(ret))
                 goto fallback;
  
@@ -1310,13 +1314,14 @@ fallback:
         return VM_FAULT_FALLBACK;
  }
  
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                                const struct iomap_ops *ops)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct address_space *mapping = vma->vm_file->f_mapping;
         unsigned long pmd_addr = vmf->address & PMD_MASK;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
+       bool sync;
         unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
         struct inode *inode = mapping->host;
         int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
         void *entry;
         loff_t pos;
         int error;
+       pfn_t pfn;
  
         /*
          * Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
          * this is a reliable test.
          */
         pgoff = linear_page_index(vma, pmd_addr);
-       max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+       max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  
         trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
  
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
         if ((pmd_addr + PMD_SIZE) > vma->vm_end)
                 goto fallback;
  
-       if (pgoff > max_pgoff) {
+       if (pgoff >= max_pgoff) {
                 result = VM_FAULT_SIGBUS;
                 goto out;
         }
  
         /* If the PMD would extend beyond the file size */
-       if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+       if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
                 goto fallback;
  
         /*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
         if (iomap.offset + iomap.length < pos + PMD_SIZE)
                 goto finish_iomap;
  
+       sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
         switch (iomap.type) {
         case IOMAP_MAPPED:
-               result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+               error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+               if (error < 0)
+                       goto finish_iomap;
+
+               entry = dax_insert_mapping_entry(mapping, vmf, entry,
+                                               dax_iomap_sector(&iomap, pos),
+                                               RADIX_DAX_PMD, write && !sync);
+               if (IS_ERR(entry))
+                       goto finish_iomap;
+
+               /*
+                * If we are doing synchronous page fault and inode needs fsync,
+                * we can insert PMD into page tables only after that happens.
+                * Skip insertion for now and return the pfn so that caller can
+                * insert it after fsync is done.
+                */
+               if (sync) {
+                       if (WARN_ON_ONCE(!pfnp))
+                               goto finish_iomap;
+                       *pfnp = pfn;
+                       result = VM_FAULT_NEEDDSYNC;
+                       goto finish_iomap;
+               }
+
+               trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+                                           write);
                 break;
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
         return result;
  }
  #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                                const struct iomap_ops *ops)
  {
         return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  /**
   * dax_iomap_fault - handle a page fault on a DAX file
   * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
   *
   * When a page fault occurs, filesystems may call this helper in
   * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
   * successfully.
   */
  int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-                   const struct iomap_ops *ops)
+                   pfn_t *pfnp, const struct iomap_ops *ops)
  {
         switch (pe_size) {
         case PE_SIZE_PTE:
-               return dax_iomap_pte_fault(vmf, ops);
+               return dax_iomap_pte_fault(vmf, pfnp, ops);
         case PE_SIZE_PMD:
-               return dax_iomap_pmd_fault(vmf, ops);
+               return dax_iomap_pmd_fault(vmf, pfnp, ops);
         default:
                 return VM_FAULT_FALLBACK;
         }
  }
  EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts a writeable PTE or PMD entry into the page tables for
+ * an mmapped DAX file.  It also takes care of marking the corresponding radix
+ * tree entry as dirty.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+                                 enum page_entry_size pe_size,
+                                 pfn_t pfn)
+{
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       void *entry, **slot;
+       pgoff_t index = vmf->pgoff;
+       int vmf_ret, error;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* Did we race with someone splitting entry or so? */
+       if (!entry ||
+           (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+           (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+               put_unlocked_mapping_entry(mapping, index, entry);
+               spin_unlock_irq(&mapping->tree_lock);
+               trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+                                                     VM_FAULT_NOPAGE);
+               return VM_FAULT_NOPAGE;
+       }
+       radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+       entry = lock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       switch (pe_size) {
+       case PE_SIZE_PTE:
+               error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+               vmf_ret = dax_fault_return(error);
+               break;
+#ifdef CONFIG_FS_DAX_PMD
+       case PE_SIZE_PMD:
+               vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+                       pfn, true);
+               break;
+#endif
+       default:
+               vmf_ret = VM_FAULT_FALLBACK;
+       }
+       put_locked_mapping_entry(mapping, index);
+       trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+       return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles insertion of the appropriate
+ * page table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+                         pfn_t pfn)
+{
+       int err;
+       loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+       size_t len = 0;
+
+       if (pe_size == PE_SIZE_PTE)
+               len = PAGE_SIZE;
+       else if (pe_size == PE_SIZE_PMD)
+               len = PMD_SIZE;
+       else
+               WARN_ON_ONCE(1);
+       err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+       if (err)
+               return VM_FAULT_SIGBUS;
+       return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c

index c67b486..2da6769 100644 (file)
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
         }
         down_read(&ei->dax_sem);
  
-       ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
+       ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
  
         up_read(&ei->dax_sem);
         if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index ad204d2..a0ae27b 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
  #include <linux/quotaops.h>
  #include <linux/pagevec.h>
  #include <linux/uio.h>
+#include <linux/mman.h>
  #include "ext4.h"
  #include "ext4_jbd2.h"
  #include "xattr.h"
@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
          */
         bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                 (vmf->vma->vm_flags & VM_SHARED);
+       pfn_t pfn;
  
         if (write) {
                 sb_start_pagefault(sb);
@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
                 down_read(&EXT4_I(inode)->i_mmap_sem);
                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                                EXT4_DATA_TRANS_BLOCKS(sb));
+               if (IS_ERR(handle)) {
+                       up_read(&EXT4_I(inode)->i_mmap_sem);
+                       sb_end_pagefault(sb);
+                       return VM_FAULT_SIGBUS;
+               }
         } else {
                 down_read(&EXT4_I(inode)->i_mmap_sem);
         }
-       if (!IS_ERR(handle))
-               result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
-       else
-               result = VM_FAULT_SIGBUS;
+       result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
         if (write) {
-               if (!IS_ERR(handle))
-                       ext4_journal_stop(handle);
+               ext4_journal_stop(handle);
+               /* Handling synchronous page fault? */
+               if (result & VM_FAULT_NEEDDSYNC)
+                       result = dax_finish_sync_fault(vmf, pe_size, pfn);
                 up_read(&EXT4_I(inode)->i_mmap_sem);
                 sb_end_pagefault(sb);
         } else {
@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
         if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                 return -EIO;
  
+       /*
+        * We don't support synchronous mappings for non-DAX files. At least
+        * until someone comes up with a sensible use case.
+        */
+       if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+               return -EOPNOTSUPP;
+
         file_accessed(file);
         if (IS_DAX(file_inode(file))) {
                 vma->vm_ops = &ext4_dax_vm_ops;
@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
         .compat_ioctl   = ext4_compat_ioctl,
  #endif
         .mmap           = ext4_file_mmap,
+       .mmap_supported_flags = MAP_SYNC,
         .open           = ext4_file_open,
         .release        = ext4_release_file,
         .fsync          = ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 8d2b582..0992d76 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
                 return try_to_free_buffers(page);
  }
  
+static bool ext4_inode_datasync_dirty(struct inode *inode)
+{
+       journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+       if (journal)
+               return !jbd2_transaction_committed(journal,
+                                       EXT4_I(inode)->i_datasync_tid);
+       /* Any metadata buffers to write? */
+       if (!list_empty(&inode->i_mapping->private_list))
+               return true;
+       return inode->i_state & I_DIRTY_DATASYNC;
+}
+
  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                             unsigned flags, struct iomap *iomap)
  {
@@ -3497,6 +3510,8 @@ retry:
         }
  
         iomap->flags = 0;
+       if (ext4_inode_datasync_dirty(inode))
+               iomap->flags |= IOMAP_F_DIRTY;
         iomap->bdev = inode->i_sb->s_bdev;
         iomap->dax_dev = sbi->s_daxdev;
         iomap->offset = first_block << blkbits;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c

index d2a85c9..67546c7 100644 (file)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
         return err;
  }
  
+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+       int ret = 1;
+
+       read_lock(&journal->j_state_lock);
+       if (journal->j_running_transaction &&
+           journal->j_running_transaction->t_tid == tid)
+               ret = 0;
+       if (journal->j_committing_transaction &&
+           journal->j_committing_transaction->t_tid == tid)
+               ret = 0;
+       read_unlock(&journal->j_state_lock);
+       return ret;
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
  /*
   * When this function returns the transaction corresponding to tid
   * will be completed.  If the transaction has currently running, start
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c

index 875231c..339e4c1 100644 (file)
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                 [ilog2(VM_ACCOUNT)]     = "ac",
                 [ilog2(VM_NORESERVE)]   = "nr",
                 [ilog2(VM_HUGETLB)]     = "ht",
+               [ilog2(VM_SYNC)]        = "sf",
                 [ilog2(VM_ARCH_1)]      = "ar",
                 [ilog2(VM_WIPEONFORK)]  = "wf",
                 [ilog2(VM_DONTDUMP)]    = "dd",
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 1814687..8601275 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
  #include <linux/backing-dev.h>
+#include <linux/mman.h>
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
  
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
         if (IS_DAX(inode)) {
-               ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+               pfn_t pfn;
+
+               ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+               if (ret & VM_FAULT_NEEDDSYNC)
+                       ret = dax_finish_sync_fault(vmf, pe_size, pfn);
         } else {
                 if (write_fault)
                         ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
  }
  
  /*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing so handle it as a standard write fault.
   */
  static int
  xfs_filemap_pfn_mkwrite(
         struct vm_fault         *vmf)
  {
  
-       struct inode            *inode = file_inode(vmf->vma->vm_file);
-       struct xfs_inode        *ip = XFS_I(inode);
-       int                     ret = VM_FAULT_NOPAGE;
-       loff_t                  size;
-
-       trace_xfs_filemap_pfn_mkwrite(ip);
-
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vmf->vma->vm_file);
-
-       /* check if the faulting page hasn't raced with truncate */
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size)
-               ret = VM_FAULT_SIGBUS;
-       else if (IS_DAX(inode))
-               ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
-       return ret;
-
+       return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
  }
  
  static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1136,6 +1120,13 @@ xfs_file_mmap(
         struct file     *filp,
         struct vm_area_struct *vma)
  {
+       /*
+        * We don't support synchronous mappings for non-DAX files. At least
+        * until someone comes up with a sensible use case.
+        */
+       if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+               return -EOPNOTSUPP;
+
         file_accessed(filp);
         vma->vm_ops = &xfs_file_vm_ops;
         if (IS_DAX(file_inode(filp)))
@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
         .compat_ioctl   = xfs_file_compat_ioctl,
  #endif
         .mmap           = xfs_file_mmap,
+       .mmap_supported_flags = MAP_SYNC,
         .open           = xfs_file_open,
         .release        = xfs_file_release,
         .fsync          = xfs_file_fsync,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index 18077e2..33eb4fb 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -34,6 +34,7 @@
  #include "xfs_error.h"
  #include "xfs_trans.h"
  #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
  #include "xfs_iomap.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
                 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
         }
  
+       if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+                               & ~XFS_ILOG_TIMESTAMP))
+               iomap->flags |= IOMAP_F_DIRTY;
+
         xfs_bmbt_to_iomap(ip, iomap, &imap);
  
         if (shared)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 515ba04..d718a10 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
  DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
  DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
  
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
  TRACE_EVENT(xfs_filemap_fault,
         TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
                  bool write_fault),
diff --git a/include/linux/dax.h b/include/linux/dax.h

index 895e16f..5258346 100644 (file)
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
  ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                 const struct iomap_ops *ops);
  int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-                   const struct iomap_ops *ops);
+                   pfn_t *pfnp, const struct iomap_ops *ops);
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+                         pfn_t pfn);
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                       pgoff_t index);
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 2690864..a2b5d64 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1702,6 +1702,7 @@ struct file_operations {
         long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
         long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
         int (*mmap) (struct file *, struct vm_area_struct *);
+       unsigned long mmap_supported_flags;
         int (*open) (struct inode *, struct file *);
         int (*flush) (struct file *, fl_owner_t id);
         int (*release) (struct inode *, struct file *);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h

index ca10767..19a07de 100644 (file)
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -21,9 +21,13 @@ struct vm_fault;
  
  /*
   * Flags for all iomap mappings:
+ *
+ * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
+ * written data and requires fdatasync to commit them to persistent storage.
   */
  #define IOMAP_F_NEW            0x01    /* blocks have been newly allocated */
  #define IOMAP_F_BOUNDARY       0x02    /* mapping ends at metadata boundary */
+#define IOMAP_F_DIRTY          0x04    /* uncommitted metadata */
  
  /*
   * Flags that only need to be reported for IOMAP_REPORT requests:
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h

index 606b6bc..296d1e0 100644 (file)
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
  int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
  int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
  int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_transaction_committed(journal_t *journal, tid_t tid);
  int jbd2_complete_transaction(journal_t *journal, tid_t tid);
  int jbd2_log_do_checkpoint(journal_t *journal);
  int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h

index 3eaad2f..f8109dd 100644 (file)
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -18,6 +18,18 @@
  #include <linux/sizes.h>
  #include <linux/types.h>
  #include <linux/uuid.h>
+#include <linux/spinlock.h>
+
+struct badrange_entry {
+       u64 start;
+       u64 length;
+       struct list_head list;
+};
+
+struct badrange {
+       struct list_head list;
+       spinlock_t lock;
+};
  
  enum {
         /* when a dimm supports both PMEM and BLK access a label is required */
@@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
  
  }
  
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-               phys_addr_t start, unsigned int len);
+void badrange_init(struct badrange *badrange);
+int badrange_add(struct badrange *badrange, u64 addr, u64 length);
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+               unsigned int len);
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
+               u64 length);
  struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
                 struct nvdimm_bus_descriptor *nfit_desc);
  void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
diff --git a/include/linux/mm.h b/include/linux/mm.h

index c7b1d61..ee07314 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp);
  #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
  #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
  #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
+#define VM_SYNC                0x00800000      /* Synchronous page faults */
  #define VM_ARCH_1      0x01000000      /* Architecture-specific flag */
  #define VM_WIPEONFORK  0x02000000      /* Wipe VMA contents in child. */
  #define VM_DONTDUMP    0x04000000      /* Do not include in the core dump */
@@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
  #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
  #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
  #define VM_FAULT_DONE_COW   0x1000     /* ->fault has fully handled COW */
-
-#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+#define VM_FAULT_NEEDDSYNC  0x2000     /* ->fault did not modify page tables
+                                        * and needs fsync() to complete (for
+                                        * synchronous page faults in DAX) */
  
  #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
                          VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
         { VM_FAULT_LOCKED,              "LOCKED" }, \
         { VM_FAULT_RETRY,               "RETRY" }, \
         { VM_FAULT_FALLBACK,            "FALLBACK" }, \
-       { VM_FAULT_DONE_COW,            "DONE_COW" }
+       { VM_FAULT_DONE_COW,            "DONE_COW" }, \
+       { VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" }
  
  /* Encode hstate index for a hwpoisoned large page */
  #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
diff --git a/include/linux/mman.h b/include/linux/mman.h

index 7c87b66..6a4d1ca 100644 (file)
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -8,6 +8,48 @@
  #include <linux/atomic.h>
  #include <uapi/linux/mman.h>
  
+/*
+ * Arrange for legacy / undefined architecture specific flags to be
+ * ignored by mmap handling code.
+ */
+#ifndef MAP_32BIT
+#define MAP_32BIT 0
+#endif
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB 0
+#endif
+#ifndef MAP_HUGE_1GB
+#define MAP_HUGE_1GB 0
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+#ifndef MAP_SYNC
+#define MAP_SYNC 0
+#endif
+
+/*
+ * The historical set of flags that all mmap implementations implicitly
+ * support when a ->mmap_validate() op is not provided in file_operations.
+ */
+#define LEGACY_MAP_MASK (MAP_SHARED \
+               | MAP_PRIVATE \
+               | MAP_FIXED \
+               | MAP_ANONYMOUS \
+               | MAP_DENYWRITE \
+               | MAP_EXECUTABLE \
+               | MAP_UNINITIALIZED \
+               | MAP_GROWSDOWN \
+               | MAP_LOCKED \
+               | MAP_NORESERVE \
+               | MAP_POPULATE \
+               | MAP_NONBLOCK \
+               | MAP_STACK \
+               | MAP_HUGETLB \
+               | MAP_32BIT \
+               | MAP_HUGE_2MB \
+               | MAP_HUGE_1GB)
+
  extern int sysctl_overcommit_memory;
  extern int sysctl_overcommit_ratio;
  extern unsigned long sysctl_overcommit_kbytes;
@@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot)
   * ("bit1" and "bit2" must be single bits)
   */
  #define _calc_vm_trans(x, bit1, bit2) \
+  ((!(bit1) || !(bit2)) ? 0 : \
    ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
-   : ((x) & (bit1)) / ((bit1) / (bit2)))
+   : ((x) & (bit1)) / ((bit1) / (bit2))))
  
  /*
   * Combine the mmap "prot" argument into "vm_flags" used internally.
@@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags)
  {
         return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
                _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-              _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+              _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
+              _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      );
  }
  
  unsigned long vm_commit_limit(void);
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h

index 8a8df54..97b09fc 100644 (file)
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
         TP_ARGS(inode, vmf, length, pfn, radix_entry))
  
  DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
-DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
  
  DECLARE_EVENT_CLASS(dax_pte_fault_class,
         TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
@@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
  DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
  DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
  DEFINE_PTE_FAULT_EVENT(dax_load_hole);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);
  
  TRACE_EVENT(dax_insert_mapping,
         TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h

index 6d319c4..f8b134f 100644 (file)
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@
  
  #define MAP_SHARED     0x01            /* Share changes */
  #define MAP_PRIVATE    0x02            /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
  #define MAP_TYPE       0x0f            /* Mask for type of mapping */
  #define MAP_FIXED      0x10            /* Interpret addr exactly */
  #define MAP_ANONYMOUS  0x20            /* don't use a file */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h

index 2dffcbf..653687d 100644 (file)
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,6 +13,7 @@
  #define MAP_NONBLOCK   0x10000         /* do not block on IO */
  #define MAP_STACK      0x20000         /* give out an address that is best suited for process/thread stacks */
  #define MAP_HUGETLB    0x40000         /* create a huge page mapping */
+#define MAP_SYNC       0x80000         /* perform synchronous page faults for the mapping */
  
  /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
  
diff --git a/mm/mmap.c b/mm/mmap.c

index 680506f..924839f 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
  
         if (file) {
                 struct inode *inode = file_inode(file);
+               unsigned long flags_mask;
+
+               flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
  
                 switch (flags & MAP_TYPE) {
                 case MAP_SHARED:
+                       /*
+                        * Force use of MAP_SHARED_VALIDATE with non-legacy
+                        * flags. E.g. MAP_SYNC is dangerous to use with
+                        * MAP_SHARED as you don't know which consistency model
+                        * you will get. We silently ignore unsupported flags
+                        * with MAP_SHARED to preserve backward compatibility.
+                        */
+                       flags &= LEGACY_MAP_MASK;
+                       /* fall through */
+               case MAP_SHARED_VALIDATE:
+                       if (flags & ~flags_mask)
+                               return -EOPNOTSUPP;
                         if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
                                 return -EACCES;
  
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h

index 6d319c4..f8b134f 100644 (file)
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@
  
  #define MAP_SHARED     0x01            /* Share changes */
  #define MAP_PRIVATE    0x02            /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
  #define MAP_TYPE       0x0f            /* Mask for type of mapping */
  #define MAP_FIXED      0x10            /* Interpret addr exactly */
  #define MAP_ANONYMOUS  0x20            /* don't use a file */
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild

index 65368d9..db33b28 100644 (file)
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
  libnvdimm-y += $(NVDIMM_SRC)/region.o
  libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
  libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-y += $(NVDIMM_SRC)/badrange.o
  libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
  libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
  libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c

index bef419d..7217b2b 100644 (file)
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -168,8 +168,12 @@ struct nfit_test {
                 spinlock_t lock;
         } ars_state;
         struct device *dimm_dev[NUM_DCR];
+       struct badrange badrange;
+       struct work_struct work;
  };
  
+static struct workqueue_struct *nfit_wq;
+
  static struct nfit_test *to_nfit_test(struct device *dev)
  {
         struct platform_device *pdev = to_platform_device(dev);
@@ -234,48 +238,68 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
         return rc;
  }
  
-#define NFIT_TEST_ARS_RECORDS 4
  #define NFIT_TEST_CLEAR_ERR_UNIT 256
  
  static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
                 unsigned int buf_len)
  {
+       int ars_recs;
+
         if (buf_len < sizeof(*nd_cmd))
                 return -EINVAL;
  
+       /* for testing, only store up to n records that fit within 4k */
+       ars_recs = SZ_4K / sizeof(struct nd_ars_record);
+
         nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
-               + NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
+               + ars_recs * sizeof(struct nd_ars_record);
         nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
         nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
  
         return 0;
  }
  
-/*
- * Initialize the ars_state to return an ars_result 1 second in the future with
- * a 4K error range in the middle of the requested address range.
- */
-static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len)
+static void post_ars_status(struct ars_state *ars_state,
+               struct badrange *badrange, u64 addr, u64 len)
  {
         struct nd_cmd_ars_status *ars_status;
         struct nd_ars_record *ars_record;
+       struct badrange_entry *be;
+       u64 end = addr + len - 1;
+       int i = 0;
  
         ars_state->deadline = jiffies + 1*HZ;
         ars_status = ars_state->ars_status;
         ars_status->status = 0;
-       ars_status->out_length = sizeof(struct nd_cmd_ars_status)
-               + sizeof(struct nd_ars_record);
         ars_status->address = addr;
         ars_status->length = len;
         ars_status->type = ND_ARS_PERSISTENT;
-       ars_status->num_records = 1;
-       ars_record = &ars_status->records[0];
-       ars_record->handle = 0;
-       ars_record->err_address = addr + len / 2;
-       ars_record->length = SZ_4K;
+
+       spin_lock(&badrange->lock);
+       list_for_each_entry(be, &badrange->list, list) {
+               u64 be_end = be->start + be->length - 1;
+               u64 rstart, rend;
+
+               /* skip entries outside the range */
+               if (be_end < addr || be->start > end)
+                       continue;
+
+               rstart = (be->start < addr) ? addr : be->start;
+               rend = (be_end < end) ? be_end : end;
+               ars_record = &ars_status->records[i];
+               ars_record->handle = 0;
+               ars_record->err_address = rstart;
+               ars_record->length = rend - rstart + 1;
+               i++;
+       }
+       spin_unlock(&badrange->lock);
+       ars_status->num_records = i;
+       ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+               + i * sizeof(struct nd_ars_record);
  }
  
-static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
+static int nfit_test_cmd_ars_start(struct nfit_test *t,
+               struct ars_state *ars_state,
                 struct nd_cmd_ars_start *ars_start, unsigned int buf_len,
                 int *cmd_rc)
  {
@@ -289,7 +313,7 @@ static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
         } else {
                 ars_start->status = 0;
                 ars_start->scrub_time = 1;
-               post_ars_status(ars_state, ars_start->address,
+               post_ars_status(ars_state, &t->badrange, ars_start->address,
                                 ars_start->length);
                 *cmd_rc = 0;
         }
@@ -320,7 +344,8 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
         return 0;
  }
  
-static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
+static int nfit_test_cmd_clear_error(struct nfit_test *t,
+               struct nd_cmd_clear_error *clear_err,
                 unsigned int buf_len, int *cmd_rc)
  {
         const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
@@ -330,18 +355,91 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
         if ((clear_err->address & mask) || (clear_err->length & mask))
                 return -EINVAL;
  
-       /*
-        * Report 'all clear' success for all commands even though a new
-        * scrub will find errors again.  This is enough to have the
-        * error removed from the 'badblocks' tracking in the pmem
-        * driver.
-        */
+       badrange_forget(&t->badrange, clear_err->address, clear_err->length);
         clear_err->status = 0;
         clear_err->cleared = clear_err->length;
         *cmd_rc = 0;
         return 0;
  }
  
+struct region_search_spa {
+       u64 addr;
+       struct nd_region *region;
+};
+
+static int is_region_device(struct device *dev)
+{
+       return !strncmp(dev->kobj.name, "region", 6);
+}
+
+static int nfit_test_search_region_spa(struct device *dev, void *data)
+{
+       struct region_search_spa *ctx = data;
+       struct nd_region *nd_region;
+       resource_size_t ndr_end;
+
+       if (!is_region_device(dev))
+               return 0;
+
+       nd_region = to_nd_region(dev);
+       ndr_end = nd_region->ndr_start + nd_region->ndr_size;
+
+       if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) {
+               ctx->region = nd_region;
+               return 1;
+       }
+
+       return 0;
+}
+
+static int nfit_test_search_spa(struct nvdimm_bus *bus,
+               struct nd_cmd_translate_spa *spa)
+{
+       int ret;
+       struct nd_region *nd_region = NULL;
+       struct nvdimm *nvdimm = NULL;
+       struct nd_mapping *nd_mapping = NULL;
+       struct region_search_spa ctx = {
+               .addr = spa->spa,
+               .region = NULL,
+       };
+       u64 dpa;
+
+       ret = device_for_each_child(&bus->dev, &ctx,
+                               nfit_test_search_region_spa);
+
+       if (!ret)
+               return -ENODEV;
+
+       nd_region = ctx.region;
+
+       dpa = ctx.addr - nd_region->ndr_start;
+
+       /*
+        * last dimm is selected for test
+        */
+       nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1];
+       nvdimm = nd_mapping->nvdimm;
+
+       spa->devices[0].nfit_device_handle = handle[nvdimm->id];
+       spa->num_nvdimms = 1;
+       spa->devices[0].dpa = dpa;
+
+       return 0;
+}
+
+static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus,
+               struct nd_cmd_translate_spa *spa, unsigned int buf_len)
+{
+       if (buf_len < spa->translate_length)
+               return -EINVAL;
+
+       if (nfit_test_search_spa(bus, spa) < 0 || !spa->num_nvdimms)
+               spa->status = 2;
+
+       return 0;
+}
+
  static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
  {
         static const struct nd_smart_payload smart_data = {
@@ -378,6 +476,93 @@ static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
         return 0;
  }
  
+static void uc_error_notify(struct work_struct *work)
+{
+       struct nfit_test *t = container_of(work, typeof(*t), work);
+
+       __acpi_nfit_notify(&t->pdev.dev, t, NFIT_NOTIFY_UC_MEMORY_ERROR);
+}
+
+static int nfit_test_cmd_ars_error_inject(struct nfit_test *t,
+               struct nd_cmd_ars_err_inj *err_inj, unsigned int buf_len)
+{
+       int rc;
+
+       if (buf_len != sizeof(*err_inj)) {
+               rc = -EINVAL;
+               goto err;
+       }
+
+       if (err_inj->err_inj_spa_range_length <= 0) {
+               rc = -EINVAL;
+               goto err;
+       }
+
+       rc =  badrange_add(&t->badrange, err_inj->err_inj_spa_range_base,
+                       err_inj->err_inj_spa_range_length);
+       if (rc < 0)
+               goto err;
+
+       if (err_inj->err_inj_options & (1 << ND_ARS_ERR_INJ_OPT_NOTIFY))
+               queue_work(nfit_wq, &t->work);
+
+       err_inj->status = 0;
+       return 0;
+
+err:
+       err_inj->status = NFIT_ARS_INJECT_INVALID;
+       return rc;
+}
+
+static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t,
+               struct nd_cmd_ars_err_inj_clr *err_clr, unsigned int buf_len)
+{
+       int rc;
+
+       if (buf_len != sizeof(*err_clr)) {
+               rc = -EINVAL;
+               goto err;
+       }
+
+       if (err_clr->err_inj_clr_spa_range_length <= 0) {
+               rc = -EINVAL;
+               goto err;
+       }
+
+       badrange_forget(&t->badrange, err_clr->err_inj_clr_spa_range_base,
+                       err_clr->err_inj_clr_spa_range_length);
+
+       err_clr->status = 0;
+       return 0;
+
+err:
+       err_clr->status = NFIT_ARS_INJECT_INVALID;
+       return rc;
+}
+
+static int nfit_test_cmd_ars_inject_status(struct nfit_test *t,
+               struct nd_cmd_ars_err_inj_stat *err_stat,
+               unsigned int buf_len)
+{
+       struct badrange_entry *be;
+       int max = SZ_4K / sizeof(struct nd_error_stat_query_record);
+       int i = 0;
+
+       err_stat->status = 0;
+       spin_lock(&t->badrange.lock);
+       list_for_each_entry(be, &t->badrange.list, list) {
+               err_stat->record[i].err_inj_stat_spa_range_base = be->start;
+               err_stat->record[i].err_inj_stat_spa_range_length = be->length;
+               i++;
+               if (i > max)
+                       break;
+       }
+       spin_unlock(&t->badrange.lock);
+       err_stat->inj_err_rec_count = i;
+
+       return 0;
+}
+
  static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
                 struct nvdimm *nvdimm, unsigned int cmd, void *buf,
                 unsigned int buf_len, int *cmd_rc)
@@ -449,6 +634,38 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
                 }
         } else {
                 struct ars_state *ars_state = &t->ars_state;
+               struct nd_cmd_pkg *call_pkg = buf;
+
+               if (!nd_desc)
+                       return -ENOTTY;
+
+               if (cmd == ND_CMD_CALL) {
+                       func = call_pkg->nd_command;
+
+                       buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out;
+                       buf = (void *) call_pkg->nd_payload;
+
+                       switch (func) {
+                       case NFIT_CMD_TRANSLATE_SPA:
+                               rc = nfit_test_cmd_translate_spa(
+                                       acpi_desc->nvdimm_bus, buf, buf_len);
+                               return rc;
+                       case NFIT_CMD_ARS_INJECT_SET:
+                               rc = nfit_test_cmd_ars_error_inject(t, buf,
+                                       buf_len);
+                               return rc;
+                       case NFIT_CMD_ARS_INJECT_CLEAR:
+                               rc = nfit_test_cmd_ars_inject_clear(t, buf,
+                                       buf_len);
+                               return rc;
+                       case NFIT_CMD_ARS_INJECT_GET:
+                               rc = nfit_test_cmd_ars_inject_status(t, buf,
+                                       buf_len);
+                               return rc;
+                       default:
+                               return -ENOTTY;
+                       }
+               }
  
                 if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
                         return -ENOTTY;
@@ -458,15 +675,15 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
                         rc = nfit_test_cmd_ars_cap(buf, buf_len);
                         break;
                 case ND_CMD_ARS_START:
-                       rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len,
-                                       cmd_rc);
+                       rc = nfit_test_cmd_ars_start(t, ars_state, buf,
+                                       buf_len, cmd_rc);
                         break;
                 case ND_CMD_ARS_STATUS:
                         rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
                                         cmd_rc);
                         break;
                 case ND_CMD_CLEAR_ERROR:
-                       rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
+                       rc = nfit_test_cmd_clear_error(t, buf, buf_len, cmd_rc);
                         break;
                 default:
                         return -ENOTTY;
@@ -566,10 +783,9 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
  
  static int ars_state_init(struct device *dev, struct ars_state *ars_state)
  {
+       /* for testing, only store up to n records that fit within 4k */
         ars_state->ars_status = devm_kzalloc(dev,
-                       sizeof(struct nd_cmd_ars_status)
-                       + sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS,
-                       GFP_KERNEL);
+                       sizeof(struct nd_cmd_ars_status) + SZ_4K, GFP_KERNEL);
         if (!ars_state->ars_status)
                 return -ENOMEM;
         spin_lock_init(&ars_state->lock);
@@ -1419,7 +1635,8 @@ static void nfit_test0_setup(struct nfit_test *t)
                                 + i * sizeof(u64);
         }
  
-       post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
+       post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+                       SPA0_SIZE);
  
         acpi_desc = &t->acpi_desc;
         set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
@@ -1430,7 +1647,12 @@ static void nfit_test0_setup(struct nfit_test *t)
         set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
         set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
         set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
+       set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
         set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en);
+       set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
+       set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en);
+       set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en);
+       set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en);
  }
  
  static void nfit_test1_setup(struct nfit_test *t)
@@ -1520,7 +1742,8 @@ static void nfit_test1_setup(struct nfit_test *t)
         dcr->code = NFIT_FIC_BYTE;
         dcr->windows = 0;
  
-       post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
+       post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+                       SPA2_SIZE);
  
         acpi_desc = &t->acpi_desc;
         set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
@@ -1589,6 +1812,7 @@ static int nfit_ctl_test(struct device *dev)
         unsigned long mask, cmd_size, offset;
         union {
                 struct nd_cmd_get_config_size cfg_size;
+               struct nd_cmd_clear_error clear_err;
                 struct nd_cmd_ars_status ars_stat;
                 struct nd_cmd_ars_cap ars_cap;
                 char buf[sizeof(struct nd_cmd_ars_status)
@@ -1613,10 +1837,15 @@ static int nfit_ctl_test(struct device *dev)
                         .cmd_mask = 1UL << ND_CMD_ARS_CAP
                                 | 1UL << ND_CMD_ARS_START
                                 | 1UL << ND_CMD_ARS_STATUS
-                               | 1UL << ND_CMD_CLEAR_ERROR,
+                               | 1UL << ND_CMD_CLEAR_ERROR
+                               | 1UL << ND_CMD_CALL,
                         .module = THIS_MODULE,
                         .provider_name = "ACPI.NFIT",
                         .ndctl = acpi_nfit_ctl,
+                       .bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA
+                               | 1UL << NFIT_CMD_ARS_INJECT_SET
+                               | 1UL << NFIT_CMD_ARS_INJECT_CLEAR
+                               | 1UL << NFIT_CMD_ARS_INJECT_GET,
                 },
                 .dev = &adev->dev,
         };
@@ -1767,6 +1996,23 @@ static int nfit_ctl_test(struct device *dev)
                 return -EIO;
         }
  
+       /* test clear error */
+       cmd_size = sizeof(cmds.clear_err);
+       cmds.clear_err = (struct nd_cmd_clear_error) {
+               .length = 512,
+               .cleared = 512,
+       };
+       rc = setup_result(cmds.buf, cmd_size);
+       if (rc)
+               return rc;
+       rc = acpi_nfit_ctl(&acpi_desc->nd_desc, NULL, ND_CMD_CLEAR_ERROR,
+                       cmds.buf, cmd_size, &cmd_rc);
+       if (rc < 0 || cmd_rc) {
+               dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
+                               __func__, __LINE__, rc, cmd_rc);
+               return -EIO;
+       }
+
         return 0;
  }
  
@@ -1915,6 +2161,10 @@ static __init int nfit_test_init(void)
  
         nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
  
+       nfit_wq = create_singlethread_workqueue("nfit");
+       if (!nfit_wq)
+               return -ENOMEM;
+
         nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm");
         if (IS_ERR(nfit_test_dimm)) {
                 rc = PTR_ERR(nfit_test_dimm);
@@ -1931,6 +2181,7 @@ static __init int nfit_test_init(void)
                         goto err_register;
                 }
                 INIT_LIST_HEAD(&nfit_test->resources);
+               badrange_init(&nfit_test->badrange);
                 switch (i) {
                 case 0:
                         nfit_test->num_pm = NUM_PM;
@@ -1966,6 +2217,7 @@ static __init int nfit_test_init(void)
                         goto err_register;
  
                 instances[i] = nfit_test;
+               INIT_WORK(&nfit_test->work, uc_error_notify);
         }
  
         rc = platform_driver_register(&nfit_test_driver);
@@ -1974,6 +2226,7 @@ static __init int nfit_test_init(void)
         return 0;
  
   err_register:
+       destroy_workqueue(nfit_wq);
         for (i = 0; i < NUM_NFITS; i++)
                 if (instances[i])
                         platform_device_unregister(&instances[i]->pdev);
@@ -1989,6 +2242,8 @@ static __exit void nfit_test_exit(void)
  {
         int i;
  
+       flush_workqueue(nfit_wq);
+       destroy_workqueue(nfit_wq);
         for (i = 0; i < NUM_NFITS; i++)
                 platform_device_unregister(&instances[i]->pdev);
         platform_driver_unregister(&nfit_test_driver);
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h

index d3d63dd..113b446 100644 (file)
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -32,6 +32,58 @@ struct nfit_test_resource {
         void *buf;
  };
  
+#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA  2
+#define NFIT_ARS_INJECT_INVALID 2
+
+enum err_inj_options {
+       ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
+};
+
+/* nfit commands */
+enum nfit_cmd_num {
+       NFIT_CMD_TRANSLATE_SPA = 5,
+       NFIT_CMD_ARS_INJECT_SET = 7,
+       NFIT_CMD_ARS_INJECT_CLEAR = 8,
+       NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
+struct nd_cmd_translate_spa {
+       __u64 spa;
+       __u32 status;
+       __u8  flags;
+       __u8  _reserved[3];
+       __u64 translate_length;
+       __u32 num_nvdimms;
+       struct nd_nvdimm_device {
+               __u32 nfit_device_handle;
+               __u32 _reserved;
+               __u64 dpa;
+       } __packed devices[0];
+
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+       __u64 err_inj_spa_range_base;
+       __u64 err_inj_spa_range_length;
+       __u8  err_inj_options;
+       __u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+       __u64 err_inj_clr_spa_range_base;
+       __u64 err_inj_clr_spa_range_length;
+       __u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+       __u32 status;
+       __u32 inj_err_rec_count;
+       struct nd_error_stat_query_record {
+               __u64 err_inj_stat_spa_range_base;
+               __u64 err_inj_stat_spa_range_length;
+       } __packed record[0];
+} __packed;
+
  union acpi_object;
  typedef void *acpi_handle;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 17 Nov 2017 17:51:57 +0000 (09:51 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 17 Nov 2017 17:51:57 +0000 (09:51 -0800)
MAINTAINERS		patch \| blob \| history
arch/alpha/include/uapi/asm/mman.h		patch \| blob \| history
arch/mips/include/uapi/asm/mman.h		patch \| blob \| history
arch/parisc/include/uapi/asm/mman.h		patch \| blob \| history
arch/xtensa/include/uapi/asm/mman.h		patch \| blob \| history
drivers/acpi/nfit/core.c		patch \| blob \| history
drivers/acpi/nfit/mce.c		patch \| blob \| history
drivers/acpi/nfit/nfit.h		patch \| blob \| history
drivers/block/Kconfig		patch \| blob \| history
drivers/block/brd.c		patch \| blob \| history
drivers/dax/device.c		patch \| blob \| history
drivers/dax/super.c		patch \| blob \| history
drivers/nvdimm/Makefile		patch \| blob \| history
drivers/nvdimm/badrange.c	[new file with mode: 0644]	patch \| blob
drivers/nvdimm/bus.c		patch \| blob \| history
drivers/nvdimm/core.c		patch \| blob \| history
drivers/nvdimm/dimm.c		patch \| blob \| history
drivers/nvdimm/dimm_devs.c		patch \| blob \| history
drivers/nvdimm/label.c		patch \| blob \| history
drivers/nvdimm/namespace_devs.c		patch \| blob \| history
drivers/nvdimm/nd-core.h		patch \| blob \| history
drivers/nvdimm/nd.h		patch \| blob \| history
drivers/nvdimm/pfn_devs.c		patch \| blob \| history
drivers/nvdimm/region_devs.c		patch \| blob \| history
fs/dax.c		patch \| blob \| history
fs/ext2/file.c		patch \| blob \| history
fs/ext4/file.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/jbd2/journal.c		patch \| blob \| history
fs/proc/task_mmu.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history
include/linux/dax.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/iomap.h		patch \| blob \| history
include/linux/jbd2.h		patch \| blob \| history
include/linux/libnvdimm.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mman.h		patch \| blob \| history
include/trace/events/fs_dax.h		patch \| blob \| history
include/uapi/asm-generic/mman-common.h		patch \| blob \| history
include/uapi/asm-generic/mman.h		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
tools/include/uapi/asm-generic/mman-common.h		patch \| blob \| history
tools/testing/nvdimm/Kbuild		patch \| blob \| history
tools/testing/nvdimm/test/nfit.c		patch \| blob \| history
tools/testing/nvdimm/test/nfit_test.h		patch \| blob \| history