fs/notify/fanotify/fanotify_user.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/fanotify.h>
   3 #include <linux/fcntl.h>
   4 #include <linux/fdtable.h>
   5 #include <linux/file.h>
   6 #include <linux/fs.h>
   7 #include <linux/anon_inodes.h>
   8 #include <linux/fsnotify_backend.h>
   9 #include <linux/init.h>
  10 #include <linux/mount.h>
  11 #include <linux/namei.h>
  12 #include <linux/poll.h>
  13 #include <linux/security.h>
  14 #include <linux/syscalls.h>
  15 #include <linux/slab.h>
  16 #include <linux/types.h>
  17 #include <linux/uaccess.h>
  18 #include <linux/compat.h>
  19 #include <linux/sched/signal.h>
  20 #include <linux/memcontrol.h>
  21 #include <linux/statfs.h>
  22 #include <linux/exportfs.h>
  23
  24 #include <asm/ioctls.h>
  25
  26 #include "../../mount.h"
  27 #include "../fdinfo.h"
  28 #include "fanotify.h"
  29
/*
 * Compile-time default values for the fanotify limits.  Going by their names
 * they cover queued events, the legacy per-group marks limit and groups per
 * user; the code that applies them is outside this excerpt.
 */
#define FANOTIFY_DEFAULT_MAX_EVENTS     16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS  8192
#define FANOTIFY_DEFAULT_MAX_GROUPS     128

/*
 * The legacy fanotify marks limit (8192) is per group.  A tunable limit of
 * marks per user, similar to inotify, was introduced later.  Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (8192 * 128 = 1M) also happens to match the increased
 * limit of inotify max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
        (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST (2 * sizeof(struct inode))

/*
 * Configurable via /proc/sys/fs/fanotify/ (backs the "max_queued_events"
 * entry of fanotify_table when CONFIG_SYSCTL is enabled).
 */
static int fanotify_max_queued_events __read_mostly;
  53
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/*
 * sysctl table exposing three fanotify tunables: max_user_groups,
 * max_user_marks and max_queued_events.
 *
 * Every entry has mode 0644 and is handled by proc_dointvec_minmax with
 * extra1 set to SYSCTL_ZERO and no extra2 (presumably: minimum 0, no explicit
 * maximum - confirm against proc_dointvec_minmax).  The empty entry at the
 * end terminates the table.
 *
 * NOTE(review): the two ucount_max[] entries use .maxlen = sizeof(int) with an
 * int handler; confirm that this matches the element type of
 * init_user_ns.ucount_max, which is not visible from this file.
 */
struct ctl_table fanotify_table[] = {
        {
                .procname       = "max_user_groups",
                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
        },
        {
                .procname       = "max_user_marks",
                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
        },
        {
                .procname       = "max_queued_events",
                .data           = &fanotify_max_queued_events,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO
        },
        { }
};
#endif /* CONFIG_SYSCTL */
  86
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS                          ( \
                O_ACCMODE       | O_APPEND      | O_NONBLOCK    | \
                __O_SYNC        | O_DSYNC       | O_CLOEXEC     | \
                O_LARGEFILE     | O_NOATIME     )

/* Defined outside this file. */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/*
 * Slab caches; going by their names, one for marks and one per event flavour
 * (fid, path, permission).  They are not set up in this part of the file.
 */
struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;

/* Info record lengths are rounded up to a multiple of this many bytes. */
#define FANOTIFY_EVENT_ALIGN 4
/* Fixed part of a fid info record: the fid info struct plus the file_handle header. */
#define FANOTIFY_FID_INFO_HDR_LEN \
        (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
/* A pidfd info record has no variable part. */
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
        sizeof(struct fanotify_event_info_pidfd)
 112
 113 static int fanotify_fid_info_len(int fh_len, int name_len)
 114 {
 115         int info_len = fh_len;
 116
 117         if (name_len)
 118                 info_len += name_len + 1;
 119
 120         return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
 121                        FANOTIFY_EVENT_ALIGN);
 122 }
 123
 124 static int fanotify_event_info_len(unsigned int info_mode,
 125                                    struct fanotify_event *event)
 126 {
 127         struct fanotify_info *info = fanotify_event_info(event);
 128         int dir_fh_len = fanotify_event_dir_fh_len(event);
 129         int fh_len = fanotify_event_object_fh_len(event);
 130         int info_len = 0;
 131         int dot_len = 0;
 132
 133         if (dir_fh_len) {
 134                 info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
 135         } else if ((info_mode & FAN_REPORT_NAME) &&
 136                    (event->mask & FAN_ONDIR)) {
 137                 /*
 138                  * With group flag FAN_REPORT_NAME, if name was not recorded in
 139                  * event on a directory, we will report the name ".".
 140                  */
 141                 dot_len = 1;
 142         }
 143
 144         if (info_mode & FAN_REPORT_PIDFD)
 145                 info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
 146
 147         if (fh_len)
 148                 info_len += fanotify_fid_info_len(fh_len, dot_len);
 149
 150         return info_len;
 151 }
 152
 153 /*
 154  * Remove an hashed event from merge hash table.
 155  */
 156 static void fanotify_unhash_event(struct fsnotify_group *group,
 157                                   struct fanotify_event *event)
 158 {
 159         assert_spin_locked(&group->notification_lock);
 160
 161         pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
 162                  group, event, fanotify_event_hash_bucket(group, event));
 163
 164         if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
 165                 return;
 166
 167         hlist_del_init(&event->merge_list);
 168 }
 169
 170 /*
 171  * Get an fanotify notification event if one exists and is small
 172  * enough to fit in "count". Return an error pointer if the count
 173  * is not large enough. When permission event is dequeued, its state is
 174  * updated accordingly.
 175  */
 176 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
 177                                             size_t count)
 178 {
 179         size_t event_size = FAN_EVENT_METADATA_LEN;
 180         struct fanotify_event *event = NULL;
 181         struct fsnotify_event *fsn_event;
 182         unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 183
 184         pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 185
 186         spin_lock(&group->notification_lock);
 187         fsn_event = fsnotify_peek_first_event(group);
 188         if (!fsn_event)
 189                 goto out;
 190
 191         event = FANOTIFY_E(fsn_event);
 192         if (info_mode)
 193                 event_size += fanotify_event_info_len(info_mode, event);
 194
 195         if (event_size > count) {
 196                 event = ERR_PTR(-EINVAL);
 197                 goto out;
 198         }
 199
 200         /*
 201          * Held the notification_lock the whole time, so this is the
 202          * same event we peeked above.
 203          */
 204         fsnotify_remove_first_event(group);
 205         if (fanotify_is_perm_event(event->mask))
 206                 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
 207         if (fanotify_is_hashed_event(event->mask))
 208                 fanotify_unhash_event(group, event);
 209 out:
 210         spin_unlock(&group->notification_lock);
 211         return event;
 212 }
 213
 214 static int create_fd(struct fsnotify_group *group, struct path *path,
 215                      struct file **file)
 216 {
 217         int client_fd;
 218         struct file *new_file;
 219
 220         client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 221         if (client_fd < 0)
 222                 return client_fd;
 223
 224         /*
 225          * we need a new file handle for the userspace program so it can read even if it was
 226          * originally opened O_WRONLY.
 227          */
 228         new_file = dentry_open(path,
 229                                group->fanotify_data.f_flags | FMODE_NONOTIFY,
 230                                current_cred());
 231         if (IS_ERR(new_file)) {
 232                 /*
 233                  * we still send an event even if we can't open the file.  this
 234                  * can happen when say tasks are gone and we try to open their
 235                  * /proc files or we try to open a WRONLY file like in sysfs
 236                  * we just send the errno to userspace since there isn't much
 237                  * else we can do.
 238                  */
 239                 put_unused_fd(client_fd);
 240                 client_fd = PTR_ERR(new_file);
 241         } else {
 242                 *file = new_file;
 243         }
 244
 245         return client_fd;
 246 }
 247
 248 /*
 249  * Finish processing of permission event by setting it to ANSWERED state and
 250  * drop group->notification_lock.
 251  */
 252 static void finish_permission_event(struct fsnotify_group *group,
 253                                     struct fanotify_perm_event *event,
 254                                     unsigned int response)
 255                                     __releases(&group->notification_lock)
 256 {
 257         bool destroy = false;
 258
 259         assert_spin_locked(&group->notification_lock);
 260         event->response = response;
 261         if (event->state == FAN_EVENT_CANCELED)
 262                 destroy = true;
 263         else
 264                 event->state = FAN_EVENT_ANSWERED;
 265         spin_unlock(&group->notification_lock);
 266         if (destroy)
 267                 fsnotify_destroy_event(group, &event->fae.fse);
 268 }
 269
 270 static int process_access_response(struct fsnotify_group *group,
 271                                    struct fanotify_response *response_struct)
 272 {
 273         struct fanotify_perm_event *event;
 274         int fd = response_struct->fd;
 275         int response = response_struct->response;
 276
 277         pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
 278                  fd, response);
 279         /*
 280          * make sure the response is valid, if invalid we do nothing and either
 281          * userspace can send a valid response or we will clean it up after the
 282          * timeout
 283          */
 284         switch (response & ~FAN_AUDIT) {
 285         case FAN_ALLOW:
 286         case FAN_DENY:
 287                 break;
 288         default:
 289                 return -EINVAL;
 290         }
 291
 292         if (fd < 0)
 293                 return -EINVAL;
 294
 295         if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
 296                 return -EINVAL;
 297
 298         spin_lock(&group->notification_lock);
 299         list_for_each_entry(event, &group->fanotify_data.access_list,
 300                             fae.fse.list) {
 301                 if (event->fd != fd)
 302                         continue;
 303
 304                 list_del_init(&event->fae.fse.list);
 305                 finish_permission_event(group, event, response);
 306                 wake_up(&group->fanotify_data.access_waitq);
 307                 return 0;
 308         }
 309         spin_unlock(&group->notification_lock);
 310
 311         return -ENOENT;
 312 }
 313
 314 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 315                                  int info_type, const char *name,
 316                                  size_t name_len,
 317                                  char __user *buf, size_t count)
 318 {
 319         struct fanotify_event_info_fid info = { };
 320         struct file_handle handle = { };
 321         unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
 322         size_t fh_len = fh ? fh->len : 0;
 323         size_t info_len = fanotify_fid_info_len(fh_len, name_len);
 324         size_t len = info_len;
 325
 326         pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
 327                  __func__, fh_len, name_len, info_len, count);
 328
 329         if (!fh_len)
 330                 return 0;
 331
 332         if (WARN_ON_ONCE(len < sizeof(info) || len > count))
 333                 return -EFAULT;
 334
 335         /*
 336          * Copy event info fid header followed by variable sized file handle
 337          * and optionally followed by variable sized filename.
 338          */
 339         switch (info_type) {
 340         case FAN_EVENT_INFO_TYPE_FID:
 341         case FAN_EVENT_INFO_TYPE_DFID:
 342                 if (WARN_ON_ONCE(name_len))
 343                         return -EFAULT;
 344                 break;
 345         case FAN_EVENT_INFO_TYPE_DFID_NAME:
 346                 if (WARN_ON_ONCE(!name || !name_len))
 347                         return -EFAULT;
 348                 break;
 349         default:
 350                 return -EFAULT;
 351         }
 352
 353         info.hdr.info_type = info_type;
 354         info.hdr.len = len;
 355         info.fsid = *fsid;
 356         if (copy_to_user(buf, &info, sizeof(info)))
 357                 return -EFAULT;
 358
 359         buf += sizeof(info);
 360         len -= sizeof(info);
 361         if (WARN_ON_ONCE(len < sizeof(handle)))
 362                 return -EFAULT;
 363
 364         handle.handle_type = fh->type;
 365         handle.handle_bytes = fh_len;
 366         if (copy_to_user(buf, &handle, sizeof(handle)))
 367                 return -EFAULT;
 368
 369         buf += sizeof(handle);
 370         len -= sizeof(handle);
 371         if (WARN_ON_ONCE(len < fh_len))
 372                 return -EFAULT;
 373
 374         /*
 375          * For an inline fh and inline file name, copy through stack to exclude
 376          * the copy from usercopy hardening protections.
 377          */
 378         fh_buf = fanotify_fh_buf(fh);
 379         if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
 380                 memcpy(bounce, fh_buf, fh_len);
 381                 fh_buf = bounce;
 382         }
 383         if (copy_to_user(buf, fh_buf, fh_len))
 384                 return -EFAULT;
 385
 386         buf += fh_len;
 387         len -= fh_len;
 388
 389         if (name_len) {
 390                 /* Copy the filename with terminating null */
 391                 name_len++;
 392                 if (WARN_ON_ONCE(len < name_len))
 393                         return -EFAULT;
 394
 395                 if (copy_to_user(buf, name, name_len))
 396                         return -EFAULT;
 397
 398                 buf += name_len;
 399                 len -= name_len;
 400         }
 401
 402         /* Pad with 0's */
 403         WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
 404         if (len > 0 && clear_user(buf, len))
 405                 return -EFAULT;
 406
 407         return info_len;
 408 }
 409
 410 static int copy_pidfd_info_to_user(int pidfd,
 411                                    char __user *buf,
 412                                    size_t count)
 413 {
 414         struct fanotify_event_info_pidfd info = { };
 415         size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
 416
 417         if (WARN_ON_ONCE(info_len > count))
 418                 return -EFAULT;
 419
 420         info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
 421         info.hdr.len = info_len;
 422         info.pidfd = pidfd;
 423
 424         if (copy_to_user(buf, &info, info_len))
 425                 return -EFAULT;
 426
 427         return info_len;
 428 }
 429
 430 static int copy_info_records_to_user(struct fanotify_event *event,
 431                                      struct fanotify_info *info,
 432                                      unsigned int info_mode, int pidfd,
 433                                      char __user *buf, size_t count)
 434 {
 435         int ret, total_bytes = 0, info_type = 0;
 436         unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
 437         unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 438
 439         /*
 440          * Event info records order is as follows: dir fid + name, child fid.
 441          */
 442         if (fanotify_event_dir_fh_len(event)) {
 443                 info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
 444                                              FAN_EVENT_INFO_TYPE_DFID;
 445                 ret = copy_fid_info_to_user(fanotify_event_fsid(event),
 446                                             fanotify_info_dir_fh(info),
 447                                             info_type,
 448                                             fanotify_info_name(info),
 449                                             info->name_len, buf, count);
 450                 if (ret < 0)
 451                         return ret;
 452
 453                 buf += ret;
 454                 count -= ret;
 455                 total_bytes += ret;
 456         }
 457
 458         if (fanotify_event_object_fh_len(event)) {
 459                 const char *dot = NULL;
 460                 int dot_len = 0;
 461
 462                 if (fid_mode == FAN_REPORT_FID || info_type) {
 463                         /*
 464                          * With only group flag FAN_REPORT_FID only type FID is
 465                          * reported. Second info record type is always FID.
 466                          */
 467                         info_type = FAN_EVENT_INFO_TYPE_FID;
 468                 } else if ((fid_mode & FAN_REPORT_NAME) &&
 469                            (event->mask & FAN_ONDIR)) {
 470                         /*
 471                          * With group flag FAN_REPORT_NAME, if name was not
 472                          * recorded in an event on a directory, report the name
 473                          * "." with info type DFID_NAME.
 474                          */
 475                         info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
 476                         dot = ".";
 477                         dot_len = 1;
 478                 } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
 479                            (event->mask & FAN_ONDIR)) {
 480                         /*
 481                          * With group flag FAN_REPORT_DIR_FID, a single info
 482                          * record has type DFID for directory entry modification
 483                          * event and for event on a directory.
 484                          */
 485                         info_type = FAN_EVENT_INFO_TYPE_DFID;
 486                 } else {
 487                         /*
 488                          * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
 489                          * a single info record has type FID for event on a
 490                          * non-directory, when there is no directory to report.
 491                          * For example, on FAN_DELETE_SELF event.
 492                          */
 493                         info_type = FAN_EVENT_INFO_TYPE_FID;
 494                 }
 495
 496                 ret = copy_fid_info_to_user(fanotify_event_fsid(event),
 497                                             fanotify_event_object_fh(event),
 498                                             info_type, dot, dot_len,
 499                                             buf, count);
 500                 if (ret < 0)
 501                         return ret;
 502
 503                 buf += ret;
 504                 count -= ret;
 505                 total_bytes += ret;
 506         }
 507
 508         if (pidfd_mode) {
 509                 ret = copy_pidfd_info_to_user(pidfd, buf, count);
 510                 if (ret < 0)
 511                         return ret;
 512
 513                 buf += ret;
 514                 count -= ret;
 515                 total_bytes += ret;
 516         }
 517
 518         return total_bytes;
 519 }
 520
 521 static ssize_t copy_event_to_user(struct fsnotify_group *group,
 522                                   struct fanotify_event *event,
 523                                   char __user *buf, size_t count)
 524 {
 525         struct fanotify_event_metadata metadata;
 526         struct path *path = fanotify_event_path(event);
 527         struct fanotify_info *info = fanotify_event_info(event);
 528         unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 529         unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 530         struct file *f = NULL;
 531         int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
 532
 533         pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 534
 535         metadata.event_len = FAN_EVENT_METADATA_LEN +
 536                                 fanotify_event_info_len(info_mode, event);
 537         metadata.metadata_len = FAN_EVENT_METADATA_LEN;
 538         metadata.vers = FANOTIFY_METADATA_VERSION;
 539         metadata.reserved = 0;
 540         metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
 541         metadata.pid = pid_vnr(event->pid);
 542         /*
 543          * For an unprivileged listener, event->pid can be used to identify the
 544          * events generated by the listener process itself, without disclosing
 545          * the pids of other processes.
 546          */
 547         if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 548             task_tgid(current) != event->pid)
 549                 metadata.pid = 0;
 550
 551         /*
 552          * For now, fid mode is required for an unprivileged listener and
 553          * fid mode does not report fd in events.  Keep this check anyway
 554          * for safety in case fid mode requirement is relaxed in the future
 555          * to allow unprivileged listener to get events with no fd and no fid.
 556          */
 557         if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 558             path && path->mnt && path->dentry) {
 559                 fd = create_fd(group, path, &f);
 560                 if (fd < 0)
 561                         return fd;
 562         }
 563         metadata.fd = fd;
 564
 565         if (pidfd_mode) {
 566                 /*
 567                  * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
 568                  * exclusion is ever lifted. At the time of incoporating pidfd
 569                  * support within fanotify, the pidfd API only supported the
 570                  * creation of pidfds for thread-group leaders.
 571                  */
 572                 WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
 573
 574                 /*
 575                  * The PIDTYPE_TGID check for an event->pid is performed
 576                  * preemptively in an attempt to catch out cases where the event
 577                  * listener reads events after the event generating process has
 578                  * already terminated. Report FAN_NOPIDFD to the event listener
 579                  * in those cases, with all other pidfd creation errors being
 580                  * reported as FAN_EPIDFD.
 581                  */
 582                 if (metadata.pid == 0 ||
 583                     !pid_has_task(event->pid, PIDTYPE_TGID)) {
 584                         pidfd = FAN_NOPIDFD;
 585                 } else {
 586                         pidfd = pidfd_create(event->pid, 0);
 587                         if (pidfd < 0)
 588                                 pidfd = FAN_EPIDFD;
 589                 }
 590         }
 591
 592         ret = -EFAULT;
 593         /*
 594          * Sanity check copy size in case get_one_event() and
 595          * event_len sizes ever get out of sync.
 596          */
 597         if (WARN_ON_ONCE(metadata.event_len > count))
 598                 goto out_close_fd;
 599
 600         if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
 601                 goto out_close_fd;
 602
 603         buf += FAN_EVENT_METADATA_LEN;
 604         count -= FAN_EVENT_METADATA_LEN;
 605
 606         if (fanotify_is_perm_event(event->mask))
 607                 FANOTIFY_PERM(event)->fd = fd;
 608
 609         if (f)
 610                 fd_install(fd, f);
 611
 612         if (info_mode) {
 613                 ret = copy_info_records_to_user(event, info, info_mode, pidfd,
 614                                                 buf, count);
 615                 if (ret < 0)
 616                         goto out_close_fd;
 617         }
 618
 619         return metadata.event_len;
 620
 621 out_close_fd:
 622         if (fd != FAN_NOFD) {
 623                 put_unused_fd(fd);
 624                 fput(f);
 625         }
 626
 627         if (pidfd >= 0)
 628                 close_fd(pidfd);
 629
 630         return ret;
 631 }
 632
/* fanotify userspace file descriptor functions */
 634 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
 635 {
 636         struct fsnotify_group *group = file->private_data;
 637         __poll_t ret = 0;
 638
 639         poll_wait(file, &group->notification_waitq, wait);
 640         spin_lock(&group->notification_lock);
 641         if (!fsnotify_notify_queue_is_empty(group))
 642                 ret = EPOLLIN | EPOLLRDNORM;
 643         spin_unlock(&group->notification_lock);
 644
 645         return ret;
 646 }
 647
 648 static ssize_t fanotify_read(struct file *file, char __user *buf,
 649                              size_t count, loff_t *pos)
 650 {
 651         struct fsnotify_group *group;
 652         struct fanotify_event *event;
 653         char __user *start;
 654         int ret;
 655         DEFINE_WAIT_FUNC(wait, woken_wake_function);
 656
 657         start = buf;
 658         group = file->private_data;
 659
 660         pr_debug("%s: group=%p\n", __func__, group);
 661
 662         add_wait_queue(&group->notification_waitq, &wait);
 663         while (1) {
 664                 /*
 665                  * User can supply arbitrarily large buffer. Avoid softlockups
 666                  * in case there are lots of available events.
 667                  */
 668                 cond_resched();
 669                 event = get_one_event(group, count);
 670                 if (IS_ERR(event)) {
 671                         ret = PTR_ERR(event);
 672                         break;
 673                 }
 674
 675                 if (!event) {
 676                         ret = -EAGAIN;
 677                         if (file->f_flags & O_NONBLOCK)
 678                                 break;
 679
 680                         ret = -ERESTARTSYS;
 681                         if (signal_pending(current))
 682                                 break;
 683
 684                         if (start != buf)
 685                                 break;
 686
 687                         wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 688                         continue;
 689                 }
 690
 691                 ret = copy_event_to_user(group, event, buf, count);
 692                 if (unlikely(ret == -EOPENSTALE)) {
 693                         /*
 694                          * We cannot report events with stale fd so drop it.
 695                          * Setting ret to 0 will continue the event loop and
 696                          * do the right thing if there are no more events to
 697                          * read (i.e. return bytes read, -EAGAIN or wait).
 698                          */
 699                         ret = 0;
 700                 }
 701
 702                 /*
 703                  * Permission events get queued to wait for response.  Other
 704                  * events can be destroyed now.
 705                  */
 706                 if (!fanotify_is_perm_event(event->mask)) {
 707                         fsnotify_destroy_event(group, &event->fse);
 708                 } else {
 709                         if (ret <= 0) {
 710                                 spin_lock(&group->notification_lock);
 711                                 finish_permission_event(group,
 712                                         FANOTIFY_PERM(event), FAN_DENY);
 713                                 wake_up(&group->fanotify_data.access_waitq);
 714                         } else {
 715                                 spin_lock(&group->notification_lock);
 716                                 list_add_tail(&event->fse.list,
 717                                         &group->fanotify_data.access_list);
 718                                 spin_unlock(&group->notification_lock);
 719                         }
 720                 }
 721                 if (ret < 0)
 722                         break;
 723                 buf += ret;
 724                 count -= ret;
 725         }
 726         remove_wait_queue(&group->notification_waitq, &wait);
 727
 728         if (start != buf && ret != -EFAULT)
 729                 ret = buf - start;
 730         return ret;
 731 }
 732
 733 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 734 {
 735         struct fanotify_response response = { .fd = -1, .response = -1 };
 736         struct fsnotify_group *group;
 737         int ret;
 738
 739         if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
 740                 return -EINVAL;
 741
 742         group = file->private_data;
 743
 744         if (count < sizeof(response))
 745                 return -EINVAL;
 746
 747         count = sizeof(response);
 748
 749         pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
 750
 751         if (copy_from_user(&response, buf, count))
 752                 return -EFAULT;
 753
 754         ret = process_access_response(group, &response);
 755         if (ret < 0)
 756                 count = ret;
 757
 758         return count;
 759 }
 760
 761 static int fanotify_release(struct inode *ignored, struct file *file)
 762 {
 763         struct fsnotify_group *group = file->private_data;
 764         struct fsnotify_event *fsn_event;
 765
 766         /*
 767          * Stop new events from arriving in the notification queue. since
 768          * userspace cannot use fanotify fd anymore, no event can enter or
 769          * leave access_list by now either.
 770          */
 771         fsnotify_group_stop_queueing(group);
 772
 773         /*
 774          * Process all permission events on access_list and notification queue
 775          * and simulate reply from userspace.
 776          */
 777         spin_lock(&group->notification_lock);
 778         while (!list_empty(&group->fanotify_data.access_list)) {
 779                 struct fanotify_perm_event *event;
 780
 781                 event = list_first_entry(&group->fanotify_data.access_list,
 782                                 struct fanotify_perm_event, fae.fse.list);
 783                 list_del_init(&event->fae.fse.list);
 784                 finish_permission_event(group, event, FAN_ALLOW);
 785                 spin_lock(&group->notification_lock);
 786         }
 787
 788         /*
 789          * Destroy all non-permission events. For permission events just
 790          * dequeue them and set the response. They will be freed once the
 791          * response is consumed and fanotify_get_response() returns.
 792          */
 793         while ((fsn_event = fsnotify_remove_first_event(group))) {
 794                 struct fanotify_event *event = FANOTIFY_E(fsn_event);
 795
 796                 if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
 797                         spin_unlock(&group->notification_lock);
 798                         fsnotify_destroy_event(group, fsn_event);
 799                 } else {
 800                         finish_permission_event(group, FANOTIFY_PERM(event),
 801                                                 FAN_ALLOW);
 802                 }
 803                 spin_lock(&group->notification_lock);
 804         }
 805         spin_unlock(&group->notification_lock);
 806
 807         /* Response for all permission events it set, wakeup waiters */
 808         wake_up(&group->fanotify_data.access_waitq);
 809
 810         /* matches the fanotify_init->fsnotify_alloc_group */
 811         fsnotify_destroy_group(group);
 812
 813         return 0;
 814 }
 815
 816 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 817 {
 818         struct fsnotify_group *group;
 819         struct fsnotify_event *fsn_event;
 820         void __user *p;
 821         int ret = -ENOTTY;
 822         size_t send_len = 0;
 823
 824         group = file->private_data;
 825
 826         p = (void __user *) arg;
 827
 828         switch (cmd) {
 829         case FIONREAD:
 830                 spin_lock(&group->notification_lock);
 831                 list_for_each_entry(fsn_event, &group->notification_list, list)
 832                         send_len += FAN_EVENT_METADATA_LEN;
 833                 spin_unlock(&group->notification_lock);
 834                 ret = put_user(send_len, (int __user *) p);
 835                 break;
 836         }
 837
 838         return ret;
 839 }
 840
 841 static const struct file_operations fanotify_fops = {
 842         .show_fdinfo    = fanotify_show_fdinfo,
 843         .poll           = fanotify_poll,
 844         .read           = fanotify_read,
 845         .write          = fanotify_write,
 846         .fasync         = NULL,
 847         .release        = fanotify_release,
 848         .unlocked_ioctl = fanotify_ioctl,
 849         .compat_ioctl   = compat_ptr_ioctl,
 850         .llseek         = noop_llseek,
 851 };
 852
 853 static int fanotify_find_path(int dfd, const char __user *filename,
 854                               struct path *path, unsigned int flags, __u64 mask,
 855                               unsigned int obj_type)
 856 {
 857         int ret;
 858
 859         pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
 860                  dfd, filename, flags);
 861
 862         if (filename == NULL) {
 863                 struct fd f = fdget(dfd);
 864
 865                 ret = -EBADF;
 866                 if (!f.file)
 867                         goto out;
 868
 869                 ret = -ENOTDIR;
 870                 if ((flags & FAN_MARK_ONLYDIR) &&
 871                     !(S_ISDIR(file_inode(f.file)->i_mode))) {
 872                         fdput(f);
 873                         goto out;
 874                 }
 875
 876                 *path = f.file->f_path;
 877                 path_get(path);
 878                 fdput(f);
 879         } else {
 880                 unsigned int lookup_flags = 0;
 881
 882                 if (!(flags & FAN_MARK_DONT_FOLLOW))
 883                         lookup_flags |= LOOKUP_FOLLOW;
 884                 if (flags & FAN_MARK_ONLYDIR)
 885                         lookup_flags |= LOOKUP_DIRECTORY;
 886
 887                 ret = user_path_at(dfd, filename, lookup_flags, path);
 888                 if (ret)
 889                         goto out;
 890         }
 891
 892         /* you can only watch an inode if you have read permissions on it */
 893         ret = path_permission(path, MAY_READ);
 894         if (ret) {
 895                 path_put(path);
 896                 goto out;
 897         }
 898
 899         ret = security_path_notify(path, mask, obj_type);
 900         if (ret)
 901                 path_put(path);
 902
 903 out:
 904         return ret;
 905 }
 906
 907 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 908                                             __u32 mask, unsigned int flags,
 909                                             __u32 umask, int *destroy)
 910 {
 911         __u32 oldmask = 0;
 912
 913         /* umask bits cannot be removed by user */
 914         mask &= ~umask;
 915         spin_lock(&fsn_mark->lock);
 916         if (!(flags & FAN_MARK_IGNORED_MASK)) {
 917                 oldmask = fsn_mark->mask;
 918                 fsn_mark->mask &= ~mask;
 919         } else {
 920                 fsn_mark->ignored_mask &= ~mask;
 921         }
 922         /*
 923          * We need to keep the mark around even if remaining mask cannot
 924          * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
 925          * changes to the mask.
 926          * Destroy mark when only umask bits remain.
 927          */
 928         *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
 929         spin_unlock(&fsn_mark->lock);
 930
 931         return mask & oldmask;
 932 }
 933
 934 static int fanotify_remove_mark(struct fsnotify_group *group,
 935                                 fsnotify_connp_t *connp, __u32 mask,
 936                                 unsigned int flags, __u32 umask)
 937 {
 938         struct fsnotify_mark *fsn_mark = NULL;
 939         __u32 removed;
 940         int destroy_mark;
 941
 942         mutex_lock(&group->mark_mutex);
 943         fsn_mark = fsnotify_find_mark(connp, group);
 944         if (!fsn_mark) {
 945                 mutex_unlock(&group->mark_mutex);
 946                 return -ENOENT;
 947         }
 948
 949         removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 950                                                  umask, &destroy_mark);
 951         if (removed & fsnotify_conn_mask(fsn_mark->connector))
 952                 fsnotify_recalc_mask(fsn_mark->connector);
 953         if (destroy_mark)
 954                 fsnotify_detach_mark(fsn_mark);
 955         mutex_unlock(&group->mark_mutex);
 956         if (destroy_mark)
 957                 fsnotify_free_mark(fsn_mark);
 958
 959         /* matches the fsnotify_find_mark() */
 960         fsnotify_put_mark(fsn_mark);
 961         return 0;
 962 }
 963
 964 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 965                                          struct vfsmount *mnt, __u32 mask,
 966                                          unsigned int flags, __u32 umask)
 967 {
 968         return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
 969                                     mask, flags, umask);
 970 }
 971
 972 static int fanotify_remove_sb_mark(struct fsnotify_group *group,
 973                                    struct super_block *sb, __u32 mask,
 974                                    unsigned int flags, __u32 umask)
 975 {
 976         return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
 977                                     flags, umask);
 978 }
 979
 980 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 981                                       struct inode *inode, __u32 mask,
 982                                       unsigned int flags, __u32 umask)
 983 {
 984         return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
 985                                     flags, umask);
 986 }
 987
 988 static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 989                                        __u32 mask,
 990                                        unsigned int flags)
 991 {
 992         __u32 oldmask = -1;
 993
 994         spin_lock(&fsn_mark->lock);
 995         if (!(flags & FAN_MARK_IGNORED_MASK)) {
 996                 oldmask = fsn_mark->mask;
 997                 fsn_mark->mask |= mask;
 998         } else {
 999                 fsn_mark->ignored_mask |= mask;
1000                 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
1001                         fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
1002         }
1003         spin_unlock(&fsn_mark->lock);
1004
1005         return mask & ~oldmask;
1006 }
1007
1008 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
1009                                                    fsnotify_connp_t *connp,
1010                                                    unsigned int type,
1011                                                    __kernel_fsid_t *fsid)
1012 {
1013         struct ucounts *ucounts = group->fanotify_data.ucounts;
1014         struct fsnotify_mark *mark;
1015         int ret;
1016
1017         /*
1018          * Enforce per user marks limits per user in all containing user ns.
1019          * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
1020          * in the limited groups account.
1021          */
1022         if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
1023             !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
1024                 return ERR_PTR(-ENOSPC);
1025
1026         mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
1027         if (!mark) {
1028                 ret = -ENOMEM;
1029                 goto out_dec_ucounts;
1030         }
1031
1032         fsnotify_init_mark(mark, group);
1033         ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
1034         if (ret) {
1035                 fsnotify_put_mark(mark);
1036                 goto out_dec_ucounts;
1037         }
1038
1039         return mark;
1040
1041 out_dec_ucounts:
1042         if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
1043                 dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
1044         return ERR_PTR(ret);
1045 }
1046
1047
1048 static int fanotify_add_mark(struct fsnotify_group *group,
1049                              fsnotify_connp_t *connp, unsigned int type,
1050                              __u32 mask, unsigned int flags,
1051                              __kernel_fsid_t *fsid)
1052 {
1053         struct fsnotify_mark *fsn_mark;
1054         __u32 added;
1055
1056         mutex_lock(&group->mark_mutex);
1057         fsn_mark = fsnotify_find_mark(connp, group);
1058         if (!fsn_mark) {
1059                 fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
1060                 if (IS_ERR(fsn_mark)) {
1061                         mutex_unlock(&group->mark_mutex);
1062                         return PTR_ERR(fsn_mark);
1063                 }
1064         }
1065         added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
1066         if (added & ~fsnotify_conn_mask(fsn_mark->connector))
1067                 fsnotify_recalc_mask(fsn_mark->connector);
1068         mutex_unlock(&group->mark_mutex);
1069
1070         fsnotify_put_mark(fsn_mark);
1071         return 0;
1072 }
1073
1074 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
1075                                       struct vfsmount *mnt, __u32 mask,
1076                                       unsigned int flags, __kernel_fsid_t *fsid)
1077 {
1078         return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
1079                                  FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
1080 }
1081
1082 static int fanotify_add_sb_mark(struct fsnotify_group *group,
1083                                 struct super_block *sb, __u32 mask,
1084                                 unsigned int flags, __kernel_fsid_t *fsid)
1085 {
1086         return fanotify_add_mark(group, &sb->s_fsnotify_marks,
1087                                  FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
1088 }
1089
1090 static int fanotify_add_inode_mark(struct fsnotify_group *group,
1091                                    struct inode *inode, __u32 mask,
1092                                    unsigned int flags, __kernel_fsid_t *fsid)
1093 {
1094         pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
1095
1096         /*
1097          * If some other task has this inode open for write we should not add
1098          * an ignored mark, unless that ignored mark is supposed to survive
1099          * modification changes anyway.
1100          */
1101         if ((flags & FAN_MARK_IGNORED_MASK) &&
1102             !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1103             inode_is_open_for_write(inode))
1104                 return 0;
1105
1106         return fanotify_add_mark(group, &inode->i_fsnotify_marks,
1107                                  FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
1108 }
1109
1110 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1111 {
1112         struct fanotify_event *oevent;
1113
1114         oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1115         if (!oevent)
1116                 return NULL;
1117
1118         fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1119         oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1120
1121         return &oevent->fse;
1122 }
1123
1124 static struct hlist_head *fanotify_alloc_merge_hash(void)
1125 {
1126         struct hlist_head *hash;
1127
1128         hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1129                        GFP_KERNEL_ACCOUNT);
1130         if (!hash)
1131                 return NULL;
1132
1133         __hash_init(hash, FANOTIFY_HTABLE_SIZE);
1134
1135         return hash;
1136 }
1137
1138 /* fanotify syscalls */
1139 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
1140 {
1141         struct fsnotify_group *group;
1142         int f_flags, fd;
1143         unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
1144         unsigned int class = flags & FANOTIFY_CLASS_BITS;
1145         unsigned int internal_flags = 0;
1146
1147         pr_debug("%s: flags=%x event_f_flags=%x\n",
1148                  __func__, flags, event_f_flags);
1149
1150         if (!capable(CAP_SYS_ADMIN)) {
1151                 /*
1152                  * An unprivileged user can setup an fanotify group with
1153                  * limited functionality - an unprivileged group is limited to
1154                  * notification events with file handles and it cannot use
1155                  * unlimited queue/marks.
1156                  */
1157                 if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
1158                         return -EPERM;
1159
1160                 /*
1161                  * Setting the internal flag FANOTIFY_UNPRIV on the group
1162                  * prevents setting mount/filesystem marks on this group and
1163                  * prevents reporting pid and open fd in events.
1164                  */
1165                 internal_flags |= FANOTIFY_UNPRIV;
1166         }
1167
1168 #ifdef CONFIG_AUDITSYSCALL
1169         if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
1170 #else
1171         if (flags & ~FANOTIFY_INIT_FLAGS)
1172 #endif
1173                 return -EINVAL;
1174
1175         /*
1176          * A pidfd can only be returned for a thread-group leader; thus
1177          * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
1178          * exclusive.
1179          */
1180         if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
1181                 return -EINVAL;
1182
1183         if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
1184                 return -EINVAL;
1185
1186         switch (event_f_flags & O_ACCMODE) {
1187         case O_RDONLY:
1188         case O_RDWR:
1189         case O_WRONLY:
1190                 break;
1191         default:
1192                 return -EINVAL;
1193         }
1194
1195         if (fid_mode && class != FAN_CLASS_NOTIF)
1196                 return -EINVAL;
1197
1198         /*
1199          * Child name is reported with parent fid so requires dir fid.
1200          * We can report both child fid and dir fid with or without name.
1201          */
1202         if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
1203                 return -EINVAL;
1204
1205         f_flags = O_RDWR | FMODE_NONOTIFY;
1206         if (flags & FAN_CLOEXEC)
1207                 f_flags |= O_CLOEXEC;
1208         if (flags & FAN_NONBLOCK)
1209                 f_flags |= O_NONBLOCK;
1210
1211         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
1212         group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
1213         if (IS_ERR(group)) {
1214                 return PTR_ERR(group);
1215         }
1216
1217         /* Enforce groups limits per user in all containing user ns */
1218         group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1219                                                   current_euid(),
1220                                                   UCOUNT_FANOTIFY_GROUPS);
1221         if (!group->fanotify_data.ucounts) {
1222                 fd = -EMFILE;
1223                 goto out_destroy_group;
1224         }
1225
1226         group->fanotify_data.flags = flags | internal_flags;
1227         group->memcg = get_mem_cgroup_from_mm(current->mm);
1228
1229         group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
1230         if (!group->fanotify_data.merge_hash) {
1231                 fd = -ENOMEM;
1232                 goto out_destroy_group;
1233         }
1234
1235         group->overflow_event = fanotify_alloc_overflow_event();
1236         if (unlikely(!group->overflow_event)) {
1237                 fd = -ENOMEM;
1238                 goto out_destroy_group;
1239         }
1240
1241         if (force_o_largefile())
1242                 event_f_flags |= O_LARGEFILE;
1243         group->fanotify_data.f_flags = event_f_flags;
1244         init_waitqueue_head(&group->fanotify_data.access_waitq);
1245         INIT_LIST_HEAD(&group->fanotify_data.access_list);
1246         switch (class) {
1247         case FAN_CLASS_NOTIF:
1248                 group->priority = FS_PRIO_0;
1249                 break;
1250         case FAN_CLASS_CONTENT:
1251                 group->priority = FS_PRIO_1;
1252                 break;
1253         case FAN_CLASS_PRE_CONTENT:
1254                 group->priority = FS_PRIO_2;
1255                 break;
1256         default:
1257                 fd = -EINVAL;
1258                 goto out_destroy_group;
1259         }
1260
1261         if (flags & FAN_UNLIMITED_QUEUE) {
1262                 fd = -EPERM;
1263                 if (!capable(CAP_SYS_ADMIN))
1264                         goto out_destroy_group;
1265                 group->max_events = UINT_MAX;
1266         } else {
1267                 group->max_events = fanotify_max_queued_events;
1268         }
1269
1270         if (flags & FAN_UNLIMITED_MARKS) {
1271                 fd = -EPERM;
1272                 if (!capable(CAP_SYS_ADMIN))
1273                         goto out_destroy_group;
1274         }
1275
1276         if (flags & FAN_ENABLE_AUDIT) {
1277                 fd = -EPERM;
1278                 if (!capable(CAP_AUDIT_WRITE))
1279                         goto out_destroy_group;
1280         }
1281
1282         fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
1283         if (fd < 0)
1284                 goto out_destroy_group;
1285
1286         return fd;
1287
1288 out_destroy_group:
1289         fsnotify_destroy_group(group);
1290         return fd;
1291 }
1292
1293 /* Check if filesystem can encode a unique fid */
1294 static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
1295 {
1296         __kernel_fsid_t root_fsid;
1297         int err;
1298
1299         /*
1300          * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
1301          */
1302         err = vfs_get_fsid(path->dentry, fsid);
1303         if (err)
1304                 return err;
1305
1306         if (!fsid->val[0] && !fsid->val[1])
1307                 return -ENODEV;
1308
1309         /*
1310          * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1311          * which uses a different fsid than sb root.
1312          */
1313         err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
1314         if (err)
1315                 return err;
1316
1317         if (root_fsid.val[0] != fsid->val[0] ||
1318             root_fsid.val[1] != fsid->val[1])
1319                 return -EXDEV;
1320
1321         /*
1322          * We need to make sure that the file system supports at least
1323          * encoding a file handle so user can use name_to_handle_at() to
1324          * compare fid returned with event to the file handle of watched
1325          * objects. However, name_to_handle_at() requires that the
1326          * filesystem also supports decoding file handles.
1327          */
1328         if (!path->dentry->d_sb->s_export_op ||
1329             !path->dentry->d_sb->s_export_op->fh_to_dentry)
1330                 return -EOPNOTSUPP;
1331
1332         return 0;
1333 }
1334
1335 static int fanotify_events_supported(struct path *path, __u64 mask)
1336 {
1337         /*
1338          * Some filesystems such as 'proc' acquire unusual locks when opening
1339          * files. For them fanotify permission events have high chances of
1340          * deadlocking the system - open done when reporting fanotify event
1341          * blocks on this "unusual" lock while another process holding the lock
1342          * waits for fanotify permission event to be answered. Just disallow
1343          * permission events for such filesystems.
1344          */
1345         if (mask & FANOTIFY_PERM_EVENTS &&
1346             path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1347                 return -EINVAL;
1348         return 0;
1349 }
1350
1351 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1352                             int dfd, const char  __user *pathname)
1353 {
1354         struct inode *inode = NULL;
1355         struct vfsmount *mnt = NULL;
1356         struct fsnotify_group *group;
1357         struct fd f;
1358         struct path path;
1359         __kernel_fsid_t __fsid, *fsid = NULL;
1360         u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1361         unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1362         bool ignored = flags & FAN_MARK_IGNORED_MASK;
1363         unsigned int obj_type, fid_mode;
1364         u32 umask = 0;
1365         int ret;
1366
1367         pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1368                  __func__, fanotify_fd, flags, dfd, pathname, mask);
1369
1370         /* we only use the lower 32 bits as of right now. */
1371         if (upper_32_bits(mask))
1372                 return -EINVAL;
1373
1374         if (flags & ~FANOTIFY_MARK_FLAGS)
1375                 return -EINVAL;
1376
1377         switch (mark_type) {
1378         case FAN_MARK_INODE:
1379                 obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1380                 break;
1381         case FAN_MARK_MOUNT:
1382                 obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1383                 break;
1384         case FAN_MARK_FILESYSTEM:
1385                 obj_type = FSNOTIFY_OBJ_TYPE_SB;
1386                 break;
1387         default:
1388                 return -EINVAL;
1389         }
1390
1391         switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
1392         case FAN_MARK_ADD:
1393         case FAN_MARK_REMOVE:
1394                 if (!mask)
1395                         return -EINVAL;
1396                 break;
1397         case FAN_MARK_FLUSH:
1398                 if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1399                         return -EINVAL;
1400                 break;
1401         default:
1402                 return -EINVAL;
1403         }
1404
1405         if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1406                 valid_mask |= FANOTIFY_PERM_EVENTS;
1407
1408         if (mask & ~valid_mask)
1409                 return -EINVAL;
1410
1411         /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
1412         if (ignored)
1413                 mask &= ~FANOTIFY_EVENT_FLAGS;
1414
1415         f = fdget(fanotify_fd);
1416         if (unlikely(!f.file))
1417                 return -EBADF;
1418
1419         /* verify that this is indeed an fanotify instance */
1420         ret = -EINVAL;
1421         if (unlikely(f.file->f_op != &fanotify_fops))
1422                 goto fput_and_out;
1423         group = f.file->private_data;
1424
1425         /*
1426          * An unprivileged user is not allowed to setup mount nor filesystem
1427          * marks.  This also includes setting up such marks by a group that
1428          * was initialized by an unprivileged user.
1429          */
1430         ret = -EPERM;
1431         if ((!capable(CAP_SYS_ADMIN) ||
1432              FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
1433             mark_type != FAN_MARK_INODE)
1434                 goto fput_and_out;
1435
1436         /*
1437          * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
1438          * allowed to set permissions events.
1439          */
1440         ret = -EINVAL;
1441         if (mask & FANOTIFY_PERM_EVENTS &&
1442             group->priority == FS_PRIO_0)
1443                 goto fput_and_out;
1444
1445         /*
1446          * Events with data type inode do not carry enough information to report
1447          * event->fd, so we do not allow setting a mask for inode events unless
1448          * group supports reporting fid.
1449          * inode events are not supported on a mount mark, because they do not
1450          * carry enough information (i.e. path) to be filtered by mount point.
1451          */
1452         fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1453         if (mask & FANOTIFY_INODE_EVENTS &&
1454             (!fid_mode || mark_type == FAN_MARK_MOUNT))
1455                 goto fput_and_out;
1456
1457         if (flags & FAN_MARK_FLUSH) {
1458                 ret = 0;
1459                 if (mark_type == FAN_MARK_MOUNT)
1460                         fsnotify_clear_vfsmount_marks_by_group(group);
1461                 else if (mark_type == FAN_MARK_FILESYSTEM)
1462                         fsnotify_clear_sb_marks_by_group(group);
1463                 else
1464                         fsnotify_clear_inode_marks_by_group(group);
1465                 goto fput_and_out;
1466         }
1467
1468         ret = fanotify_find_path(dfd, pathname, &path, flags,
1469                         (mask & ALL_FSNOTIFY_EVENTS), obj_type);
1470         if (ret)
1471                 goto fput_and_out;
1472
1473         if (flags & FAN_MARK_ADD) {
1474                 ret = fanotify_events_supported(&path, mask);
1475                 if (ret)
1476                         goto path_put_and_out;
1477         }
1478
1479         if (fid_mode) {
1480                 ret = fanotify_test_fid(&path, &__fsid);
1481                 if (ret)
1482                         goto path_put_and_out;
1483
1484                 fsid = &__fsid;
1485         }
1486
1487         /* inode held in place by reference to path; group by fget on fd */
1488         if (mark_type == FAN_MARK_INODE)
1489                 inode = path.dentry->d_inode;
1490         else
1491                 mnt = path.mnt;
1492
1493         /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
1494         if (mnt || !S_ISDIR(inode->i_mode)) {
1495                 mask &= ~FAN_EVENT_ON_CHILD;
1496                 umask = FAN_EVENT_ON_CHILD;
1497                 /*
1498                  * If group needs to report parent fid, register for getting
1499                  * events with parent/name info for non-directory.
1500                  */
1501                 if ((fid_mode & FAN_REPORT_DIR_FID) &&
1502                     (flags & FAN_MARK_ADD) && !ignored)
1503                         mask |= FAN_EVENT_ON_CHILD;
1504         }
1505
1506         /* create/update an inode mark */
1507         switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
1508         case FAN_MARK_ADD:
1509                 if (mark_type == FAN_MARK_MOUNT)
1510                         ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1511                                                          flags, fsid);
1512                 else if (mark_type == FAN_MARK_FILESYSTEM)
1513                         ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1514                                                    flags, fsid);
1515                 else
1516                         ret = fanotify_add_inode_mark(group, inode, mask,
1517                                                       flags, fsid);
1518                 break;
1519         case FAN_MARK_REMOVE:
1520                 if (mark_type == FAN_MARK_MOUNT)
1521                         ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
1522                                                             flags, umask);
1523                 else if (mark_type == FAN_MARK_FILESYSTEM)
1524                         ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
1525                                                       flags, umask);
1526                 else
1527                         ret = fanotify_remove_inode_mark(group, inode, mask,
1528                                                          flags, umask);
1529                 break;
1530         default:
1531                 ret = -EINVAL;
1532         }
1533
1534 path_put_and_out:
1535         path_put(&path);
1536 fput_and_out:
1537         fdput(f);
1538         return ret;
1539 }
1540
1541 #ifndef CONFIG_ARCH_SPLIT_ARG64
1542 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1543                               __u64, mask, int, dfd,
1544                               const char  __user *, pathname)
1545 {
1546         return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1547 }
1548 #endif
1549
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark() entry point for CONFIG_ARCH_SPLIT_ARG64 and compat
 * callers.  @mask is declared with SC_ARG64() and read back with SC_VAL64()
 * (presumably a 64-bit value passed as two halves - see the macro
 * definitions) before being handed to do_fanotify_mark().
 */
SYSCALL32_DEFINE6(fanotify_mark,
                                int, fanotify_fd, unsigned int, flags,
                                SC_ARG64(mask), int, dfd,
                                const char  __user *, pathname)
{
        __u64 mask64 = SC_VAL64(__u64, mask);

        return do_fanotify_mark(fanotify_fd, flags, mask64, dfd, pathname);
}
#endif
1560
1561 /*
1562  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1563  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1564  * must result in panic().
1565  */
1566 static int __init fanotify_user_setup(void)
1567 {
1568         struct sysinfo si;
1569         int max_marks;
1570
1571         si_meminfo(&si);
1572         /*
1573          * Allow up to 1% of addressable memory to be accounted for per user
1574          * marks limited to the range [8192, 1048576]. mount and sb marks are
1575          * a lot cheaper than inode marks, but there is no reason for a user
1576          * to have many of those, so calculate by the cost of inode marks.
1577          */
1578         max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1579                     INODE_MARK_COST;
1580         max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1581                                      FANOTIFY_DEFAULT_MAX_USER_MARKS);
1582
1583         BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
1584         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
1585         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1586
1587         fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1588                                          SLAB_PANIC|SLAB_ACCOUNT);
1589         fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1590                                                SLAB_PANIC);
1591         fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1592                                                 SLAB_PANIC);
1593         if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1594                 fanotify_perm_event_cachep =
1595                         KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1596         }
1597
1598         fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1599         init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1600                                         FANOTIFY_DEFAULT_MAX_GROUPS;
1601         init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1602
1603         return 0;
1604 }
1605 device_initcall(fanotify_user_setup);