Merge tag 'fsnotify_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 17:04:31 +0000 (10:04 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 17:04:31 +0000 (10:04 -0700)
Pull fsnotify updates from Jan Kara:
 "fsnotify speedups when notification actually isn't used and support
  for identifying processes which caused fanotify events through pidfd
  instead of normal pid"

* tag 'fsnotify_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  fsnotify: optimize the case of no marks of any type
  fsnotify: count all objects with attached connectors
  fsnotify: count s_fsnotify_inode_refs for attached connectors
  fsnotify: replace igrab() with ihold() on attach connector
  fanotify: add pidfd support to the fanotify API
  fanotify: introduce a generic info record copying helper
  fanotify: minor cosmetic adjustments to fid labels
  kernel/pid.c: implement additional checks upon pidfd_create() parameters
  kernel/pid.c: remove static qualifier from pidfd_create()

fs/notify/fanotify/fanotify_user.c
fs/notify/fsnotify.c
fs/notify/fsnotify.h
fs/notify/mark.c
include/linux/fanotify.h
include/linux/fs.h
include/linux/fsnotify.h
include/linux/pid.h
include/uapi/linux/fanotify.h
kernel/pid.c

index 28b67cb..6facdf4 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/fanotify.h>
 #include <linux/fcntl.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
@@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly;
 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 
 #define FANOTIFY_EVENT_ALIGN 4
-#define FANOTIFY_INFO_HDR_LEN \
+#define FANOTIFY_FID_INFO_HDR_LEN \
        (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+       sizeof(struct fanotify_event_info_pidfd)
 
 static int fanotify_fid_info_len(int fh_len, int name_len)
 {
@@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
        if (name_len)
                info_len += name_len + 1;
 
-       return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
+       return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
+                      FANOTIFY_EVENT_ALIGN);
 }
 
-static int fanotify_event_info_len(unsigned int fid_mode,
+static int fanotify_event_info_len(unsigned int info_mode,
                                   struct fanotify_event *event)
 {
        struct fanotify_info *info = fanotify_event_info(event);
@@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode,
 
        if (dir_fh_len) {
                info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
-       } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
+       } else if ((info_mode & FAN_REPORT_NAME) &&
+                  (event->mask & FAN_ONDIR)) {
                /*
                 * With group flag FAN_REPORT_NAME, if name was not recorded in
                 * event on a directory, we will report the name ".".
@@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode,
                dot_len = 1;
        }
 
+       if (info_mode & FAN_REPORT_PIDFD)
+               info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
        if (fh_len)
                info_len += fanotify_fid_info_len(fh_len, dot_len);
 
@@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
        size_t event_size = FAN_EVENT_METADATA_LEN;
        struct fanotify_event *event = NULL;
        struct fsnotify_event *fsn_event;
-       unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+       unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 
        pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
@@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
                goto out;
 
        event = FANOTIFY_E(fsn_event);
-       if (fid_mode)
-               event_size += fanotify_event_info_len(fid_mode, event);
+       if (info_mode)
+               event_size += fanotify_event_info_len(info_mode, event);
 
        if (event_size > count) {
                event = ERR_PTR(-EINVAL);
@@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group,
        return -ENOENT;
 }
 
-static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
-                            int info_type, const char *name, size_t name_len,
-                            char __user *buf, size_t count)
+static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+                                int info_type, const char *name,
+                                size_t name_len,
+                                char __user *buf, size_t count)
 {
        struct fanotify_event_info_fid info = { };
        struct file_handle handle = { };
@@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
        return info_len;
 }
 
+static int copy_pidfd_info_to_user(int pidfd,
+                                  char __user *buf,
+                                  size_t count)
+{
+       struct fanotify_event_info_pidfd info = { };
+       size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+       if (WARN_ON_ONCE(info_len > count))
+               return -EFAULT;
+
+       info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
+       info.hdr.len = info_len;
+       info.pidfd = pidfd;
+
+       if (copy_to_user(buf, &info, info_len))
+               return -EFAULT;
+
+       return info_len;
+}
+
+static int copy_info_records_to_user(struct fanotify_event *event,
+                                    struct fanotify_info *info,
+                                    unsigned int info_mode, int pidfd,
+                                    char __user *buf, size_t count)
+{
+       int ret, total_bytes = 0, info_type = 0;
+       unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
+       unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
+
+       /*
+        * Event info records order is: dir fid + name, child fid, pidfd.
+        */
+       if (fanotify_event_dir_fh_len(event)) {
+               info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
+                                            FAN_EVENT_INFO_TYPE_DFID;
+               ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+                                           fanotify_info_dir_fh(info),
+                                           info_type,
+                                           fanotify_info_name(info),
+                                           info->name_len, buf, count);
+               if (ret < 0)
+                       return ret;
+
+               buf += ret;
+               count -= ret;
+               total_bytes += ret;
+       }
+
+       if (fanotify_event_object_fh_len(event)) {
+               const char *dot = NULL;
+               int dot_len = 0;
+
+               if (fid_mode == FAN_REPORT_FID || info_type) {
+                       /*
+                        * With only group flag FAN_REPORT_FID only type FID is
+                        * reported. Second info record type is always FID.
+                        */
+                       info_type = FAN_EVENT_INFO_TYPE_FID;
+               } else if ((fid_mode & FAN_REPORT_NAME) &&
+                          (event->mask & FAN_ONDIR)) {
+                       /*
+                        * With group flag FAN_REPORT_NAME, if name was not
+                        * recorded in an event on a directory, report the name
+                        * "." with info type DFID_NAME.
+                        */
+                       info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
+                       dot = ".";
+                       dot_len = 1;
+               } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
+                          (event->mask & FAN_ONDIR)) {
+                       /*
+                        * With group flag FAN_REPORT_DIR_FID, a single info
+                        * record has type DFID for directory entry modification
+                        * event and for event on a directory.
+                        */
+                       info_type = FAN_EVENT_INFO_TYPE_DFID;
+               } else {
+                       /*
+                        * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
+                        * a single info record has type FID for event on a
+                        * non-directory, when there is no directory to report.
+                        * For example, on FAN_DELETE_SELF event.
+                        */
+                       info_type = FAN_EVENT_INFO_TYPE_FID;
+               }
+
+               ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+                                           fanotify_event_object_fh(event),
+                                           info_type, dot, dot_len,
+                                           buf, count);
+               if (ret < 0)
+                       return ret;
+
+               buf += ret;
+               count -= ret;
+               total_bytes += ret;
+       }
+
+       if (pidfd_mode) {
+               ret = copy_pidfd_info_to_user(pidfd, buf, count);
+               if (ret < 0)
+                       return ret;
+
+               buf += ret;
+               count -= ret;
+               total_bytes += ret;
+       }
+
+       return total_bytes;
+}
+
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
                                  struct fanotify_event *event,
                                  char __user *buf, size_t count)
@@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        struct fanotify_event_metadata metadata;
        struct path *path = fanotify_event_path(event);
        struct fanotify_info *info = fanotify_event_info(event);
-       unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+       unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
+       unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
        struct file *f = NULL;
-       int ret, fd = FAN_NOFD;
-       int info_type = 0;
+       int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
 
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
        metadata.event_len = FAN_EVENT_METADATA_LEN +
-                               fanotify_event_info_len(fid_mode, event);
+                               fanotify_event_info_len(info_mode, event);
        metadata.metadata_len = FAN_EVENT_METADATA_LEN;
        metadata.vers = FANOTIFY_METADATA_VERSION;
        metadata.reserved = 0;
@@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        }
        metadata.fd = fd;
 
+       if (pidfd_mode) {
+               /*
+                * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
+                * exclusion is ever lifted. At the time of incorporating pidfd
+                * support within fanotify, the pidfd API only supported the
+                * creation of pidfds for thread-group leaders.
+                */
+               WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
+
+               /*
+                * The PIDTYPE_TGID check for an event->pid is performed
+                * preemptively in an attempt to catch out cases where the event
+                * listener reads events after the event generating process has
+                * already terminated. Report FAN_NOPIDFD to the event listener
+                * in those cases, with all other pidfd creation errors being
+                * reported as FAN_EPIDFD.
+                */
+               if (metadata.pid == 0 ||
+                   !pid_has_task(event->pid, PIDTYPE_TGID)) {
+                       pidfd = FAN_NOPIDFD;
+               } else {
+                       pidfd = pidfd_create(event->pid, 0);
+                       if (pidfd < 0)
+                               pidfd = FAN_EPIDFD;
+               }
+       }
+
        ret = -EFAULT;
        /*
         * Sanity check copy size in case get_one_event() and
@@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        if (f)
                fd_install(fd, f);
 
-       /* Event info records order is: dir fid + name, child fid */
-       if (fanotify_event_dir_fh_len(event)) {
-               info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
-                                            FAN_EVENT_INFO_TYPE_DFID;
-               ret = copy_info_to_user(fanotify_event_fsid(event),
-                                       fanotify_info_dir_fh(info),
-                                       info_type, fanotify_info_name(info),
-                                       info->name_len, buf, count);
+       if (info_mode) {
+               ret = copy_info_records_to_user(event, info, info_mode, pidfd,
+                                               buf, count);
                if (ret < 0)
                        goto out_close_fd;
-
-               buf += ret;
-               count -= ret;
-       }
-
-       if (fanotify_event_object_fh_len(event)) {
-               const char *dot = NULL;
-               int dot_len = 0;
-
-               if (fid_mode == FAN_REPORT_FID || info_type) {
-                       /*
-                        * With only group flag FAN_REPORT_FID only type FID is
-                        * reported. Second info record type is always FID.
-                        */
-                       info_type = FAN_EVENT_INFO_TYPE_FID;
-               } else if ((fid_mode & FAN_REPORT_NAME) &&
-                          (event->mask & FAN_ONDIR)) {
-                       /*
-                        * With group flag FAN_REPORT_NAME, if name was not
-                        * recorded in an event on a directory, report the
-                        * name "." with info type DFID_NAME.
-                        */
-                       info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
-                       dot = ".";
-                       dot_len = 1;
-               } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
-                          (event->mask & FAN_ONDIR)) {
-                       /*
-                        * With group flag FAN_REPORT_DIR_FID, a single info
-                        * record has type DFID for directory entry modification
-                        * event and for event on a directory.
-                        */
-                       info_type = FAN_EVENT_INFO_TYPE_DFID;
-               } else {
-                       /*
-                        * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
-                        * a single info record has type FID for event on a
-                        * non-directory, when there is no directory to report.
-                        * For example, on FAN_DELETE_SELF event.
-                        */
-                       info_type = FAN_EVENT_INFO_TYPE_FID;
-               }
-
-               ret = copy_info_to_user(fanotify_event_fsid(event),
-                                       fanotify_event_object_fh(event),
-                                       info_type, dot, dot_len, buf, count);
-               if (ret < 0)
-                       goto out_close_fd;
-
-               buf += ret;
-               count -= ret;
        }
 
        return metadata.event_len;
@@ -537,6 +628,10 @@ out_close_fd:
                put_unused_fd(fd);
                fput(f);
        }
+
+       if (pidfd >= 0)
+               close_fd(pidfd);
+
        return ret;
 }
 
@@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 #endif
                return -EINVAL;
 
+       /*
+        * A pidfd can only be returned for a thread-group leader; thus
+        * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
+        * exclusive.
+        */
+       if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
+               return -EINVAL;
+
        if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
                return -EINVAL;
 
@@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void)
                                     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
        BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
-       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
        BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
        fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
index 30d422b..963e6ce 100644 (file)
@@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
 
        if (iput_inode)
                iput(iput_inode);
-       /* Wait for outstanding inode references from connectors */
-       wait_var_event(&sb->s_fsnotify_inode_refs,
-                      !atomic_long_read(&sb->s_fsnotify_inode_refs));
 }
 
 void fsnotify_sb_delete(struct super_block *sb)
 {
        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);
+       /* Wait for outstanding object references from connectors */
+       wait_var_event(&sb->s_fsnotify_connectors,
+                      !atomic_long_read(&sb->s_fsnotify_connectors));
 }
 
 /*
index ff2063e..87d8a50 100644 (file)
@@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb(
        return container_of(conn->obj, struct super_block, s_fsnotify_marks);
 }
 
+static inline struct super_block *fsnotify_connector_sb(
+                               struct fsnotify_mark_connector *conn)
+{
+       switch (conn->type) {
+       case FSNOTIFY_OBJ_TYPE_INODE:
+               return fsnotify_conn_inode(conn)->i_sb;
+       case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+               return fsnotify_conn_mount(conn)->mnt.mnt_sb;
+       case FSNOTIFY_OBJ_TYPE_SB:
+               return fsnotify_conn_sb(conn);
+       default:
+               return NULL;
+       }
+}
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
index d32ab34..95006d1 100644 (file)
@@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
        }
 }
 
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+       ihold(inode);
+       atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+       struct super_block *sb = inode->i_sb;
+
+       iput(inode);
+       if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+               wake_up_var(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+       struct super_block *sb = fsnotify_connector_sb(conn);
+
+       if (sb)
+               atomic_long_inc(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+       struct super_block *sb = fsnotify_connector_sb(conn);
+
+       if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+               wake_up_var(&sb->s_fsnotify_connectors);
+}
+
 static void *fsnotify_detach_connector_from_object(
                                        struct fsnotify_mark_connector *conn,
                                        unsigned int *type)
@@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object(
        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
                inode = fsnotify_conn_inode(conn);
                inode->i_fsnotify_mask = 0;
-               atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
                fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
                fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
        }
 
+       fsnotify_put_sb_connectors(conn);
        rcu_assign_pointer(*(conn->obj), NULL);
        conn->obj = NULL;
        conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
@@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
 /* Drop object reference originally held by a connector */
 static void fsnotify_drop_object(unsigned int type, void *objp)
 {
-       struct inode *inode;
-       struct super_block *sb;
-
        if (!objp)
                return;
        /* Currently only inode references are passed to be dropped */
        if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
                return;
-       inode = objp;
-       sb = inode->i_sb;
-       iput(inode);
-       if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
-               wake_up_var(&sb->s_fsnotify_inode_refs);
+       fsnotify_put_inode_ref(objp);
 }
 
 void fsnotify_put_mark(struct fsnotify_mark *mark)
@@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                conn->fsid.val[0] = conn->fsid.val[1] = 0;
                conn->flags = 0;
        }
-       if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
-               inode = igrab(fsnotify_conn_inode(conn));
+       if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+               inode = fsnotify_conn_inode(conn);
+               fsnotify_get_inode_ref(inode);
+       }
+       fsnotify_get_sb_connectors(conn);
+
        /*
         * cmpxchg() provides the barrier so that readers of *connp can see
         * only initialized structure
@@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
        if (cmpxchg(connp, NULL, conn)) {
                /* Someone else created list structure for us */
                if (inode)
-                       iput(inode);
+                       fsnotify_put_inode_ref(inode);
                kmem_cache_free(fsnotify_mark_connector_cachep, conn);
        }
 
index a16dbec..eec3b7c 100644 (file)
@@ -27,6 +27,8 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
 
 #define FANOTIFY_FID_BITS      (FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
 
+#define FANOTIFY_INFO_MODES    (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
+
 /*
  * fanotify_init() flags that require CAP_SYS_ADMIN.
  * We do not allow unprivileged groups to request permission events.
@@ -35,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
  */
 #define FANOTIFY_ADMIN_INIT_FLAGS      (FANOTIFY_PERM_CLASSES | \
                                         FAN_REPORT_TID | \
+                                        FAN_REPORT_PIDFD | \
                                         FAN_UNLIMITED_QUEUE | \
                                         FAN_UNLIMITED_MARKS)
 
index 6405742..bea8ec5 100644 (file)
@@ -1507,8 +1507,11 @@ struct super_block {
        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;
 
-       /* Pending fsnotify inode refs */
-       atomic_long_t s_fsnotify_inode_refs;
+       /*
+        * Number of inode/mount/sb objects that are being watched. Note that
+        * inode objects are currently double-accounted.
+        */
+       atomic_long_t s_fsnotify_connectors;
 
        /* Being remounted read-only */
        int s_readonly_remount;
index f8acddc..12d3a7d 100644 (file)
@@ -30,6 +30,9 @@ static inline void fsnotify_name(struct inode *dir, __u32 mask,
                                 struct inode *child,
                                 const struct qstr *name, u32 cookie)
 {
+       if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0)
+               return;
+
        fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie);
 }
 
@@ -41,6 +44,9 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
 
 static inline void fsnotify_inode(struct inode *inode, __u32 mask)
 {
+       if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0)
+               return;
+
        if (S_ISDIR(inode->i_mode))
                mask |= FS_ISDIR;
 
@@ -53,6 +59,9 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
 {
        struct inode *inode = d_inode(dentry);
 
+       if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0)
+               return 0;
+
        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;
 
index fa10acb..af308e1 100644 (file)
@@ -78,6 +78,7 @@ struct file;
 
 extern struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
+int pidfd_create(struct pid *pid, unsigned int flags);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
index fbf9c5c..64553df 100644 (file)
@@ -51,6 +51,7 @@
 #define FAN_ENABLE_AUDIT       0x00000040
 
 /* Flags to determine fanotify event format */
+#define FAN_REPORT_PIDFD       0x00000080      /* Report pidfd for event->pid */
 #define FAN_REPORT_TID         0x00000100      /* event->pid is thread id */
 #define FAN_REPORT_FID         0x00000200      /* Report unique file id */
 #define FAN_REPORT_DIR_FID     0x00000400      /* Report unique directory id */
@@ -123,6 +124,7 @@ struct fanotify_event_metadata {
 #define FAN_EVENT_INFO_TYPE_FID                1
 #define FAN_EVENT_INFO_TYPE_DFID_NAME  2
 #define FAN_EVENT_INFO_TYPE_DFID       3
+#define FAN_EVENT_INFO_TYPE_PIDFD      4
 
 /* Variable length info record following event metadata */
 struct fanotify_event_info_header {
@@ -148,6 +150,15 @@ struct fanotify_event_info_fid {
        unsigned char handle[0];
 };
 
+/*
+ * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
+ * It holds a pidfd for the pid that was responsible for generating an event.
+ */
+struct fanotify_event_info_pidfd {
+       struct fanotify_event_info_header hdr;
+       __s32 pidfd;
+};
+
 struct fanotify_response {
        __s32 fd;
        __u32 response;
@@ -160,6 +171,8 @@ struct fanotify_response {
 
 /* No fd set in event */
 #define FAN_NOFD       -1
+#define FAN_NOPIDFD    FAN_NOFD
+#define FAN_EPIDFD     -2
 
 /* Helper functions to deal with fanotify_event_metadata buffers */
 #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
index ebdf9c6..efe87db 100644 (file)
@@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
  * Note, that this function can only be called after the fd table has
  * been unshared to avoid leaking the pidfd to the new process.
  *
+ * This symbol should not be explicitly exported to loadable modules.
+ *
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
  */
-static int pidfd_create(struct pid *pid, unsigned int flags)
+int pidfd_create(struct pid *pid, unsigned int flags)
 {
        int fd;
 
+       if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+               return -EINVAL;
+
+       if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+               return -EINVAL;
+
        fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
                              flags | O_RDWR | O_CLOEXEC);
        if (fd < 0)
@@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
        if (!p)
                return -ESRCH;
 
-       if (pid_has_task(p, PIDTYPE_TGID))
-               fd = pidfd_create(p, flags);
-       else
-               fd = -EINVAL;
+       fd = pidfd_create(p, flags);
 
        put_pid(p);
        return fd;