fs/notify/fanotify/fanotify_user.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/fanotify.h>
   3 #include <linux/fcntl.h>
   4 #include <linux/file.h>
   5 #include <linux/fs.h>
   6 #include <linux/anon_inodes.h>
   7 #include <linux/fsnotify_backend.h>
   8 #include <linux/init.h>
   9 #include <linux/mount.h>
  10 #include <linux/namei.h>
  11 #include <linux/poll.h>
  12 #include <linux/security.h>
  13 #include <linux/syscalls.h>
  14 #include <linux/slab.h>
  15 #include <linux/types.h>
  16 #include <linux/uaccess.h>
  17 #include <linux/compat.h>
  18 #include <linux/sched/signal.h>
  19 #include <linux/memcontrol.h>
  20 #include <linux/statfs.h>
  21 #include <linux/exportfs.h>
  22
  23 #include <asm/ioctls.h>
  24
  25 #include "../../mount.h"
  26 #include "../fdinfo.h"
  27 #include "fanotify.h"
  28
/*
 * Default limits for events, marks and listeners.
 * NOTE(review): the code applying these defaults is not visible in this
 * part of the file - confirm against fanotify_init().
 */
#define FANOTIFY_DEFAULT_MAX_EVENTS     16384
#define FANOTIFY_DEFAULT_MAX_MARKS      8192
#define FANOTIFY_DEFAULT_MAX_LISTENERS  128

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS                          ( \
                O_ACCMODE       | O_APPEND      | O_NONBLOCK    | \
                __O_SYNC        | O_DSYNC       | O_CLOEXEC     | \
                O_LARGEFILE     | O_NOATIME     )

/* fsnotify callbacks for fanotify groups; defined in another file. */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/*
 * Slab caches for marks and for the different event types.
 * fanotify_mark_cache is used by fanotify_add_new_mark() below.
 */
struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;

/* Info records copied to userspace are padded to this many bytes. */
#define FANOTIFY_EVENT_ALIGN 4
/* Fixed part of an info record: fid info header plus file_handle header. */
#define FANOTIFY_INFO_HDR_LEN \
        (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
  56
  57 static int fanotify_fid_info_len(int fh_len, int name_len)
  58 {
  59         int info_len = fh_len;
  60
  61         if (name_len)
  62                 info_len += name_len + 1;
  63
  64         return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
  65 }
  66
  67 static int fanotify_event_info_len(struct fanotify_event *event)
  68 {
  69         int info_len = 0;
  70         int fh_len = fanotify_event_object_fh_len(event);
  71
  72         if (fh_len)
  73                 info_len += fanotify_fid_info_len(fh_len, 0);
  74
  75         if (fanotify_event_name_len(event)) {
  76                 struct fanotify_name_event *fne = FANOTIFY_NE(event);
  77
  78                 info_len += fanotify_fid_info_len(fne->dir_fh.len,
  79                                                   fne->name_len);
  80         }
  81
  82         return info_len;
  83 }
  84
  85 /*
  86  * Get an fanotify notification event if one exists and is small
  87  * enough to fit in "count". Return an error pointer if the count
  88  * is not large enough. When permission event is dequeued, its state is
  89  * updated accordingly.
  90  */
  91 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
  92                                             size_t count)
  93 {
  94         size_t event_size = FAN_EVENT_METADATA_LEN;
  95         struct fanotify_event *event = NULL;
  96
  97         pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
  98
  99         spin_lock(&group->notification_lock);
 100         if (fsnotify_notify_queue_is_empty(group))
 101                 goto out;
 102
 103         if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) {
 104                 event_size += fanotify_event_info_len(
 105                         FANOTIFY_E(fsnotify_peek_first_event(group)));
 106         }
 107
 108         if (event_size > count) {
 109                 event = ERR_PTR(-EINVAL);
 110                 goto out;
 111         }
 112         event = FANOTIFY_E(fsnotify_remove_first_event(group));
 113         if (fanotify_is_perm_event(event->mask))
 114                 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
 115 out:
 116         spin_unlock(&group->notification_lock);
 117         return event;
 118 }
 119
 120 static int create_fd(struct fsnotify_group *group, struct path *path,
 121                      struct file **file)
 122 {
 123         int client_fd;
 124         struct file *new_file;
 125
 126         client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 127         if (client_fd < 0)
 128                 return client_fd;
 129
 130         /*
 131          * we need a new file handle for the userspace program so it can read even if it was
 132          * originally opened O_WRONLY.
 133          */
 134         new_file = dentry_open(path,
 135                                group->fanotify_data.f_flags | FMODE_NONOTIFY,
 136                                current_cred());
 137         if (IS_ERR(new_file)) {
 138                 /*
 139                  * we still send an event even if we can't open the file.  this
 140                  * can happen when say tasks are gone and we try to open their
 141                  * /proc files or we try to open a WRONLY file like in sysfs
 142                  * we just send the errno to userspace since there isn't much
 143                  * else we can do.
 144                  */
 145                 put_unused_fd(client_fd);
 146                 client_fd = PTR_ERR(new_file);
 147         } else {
 148                 *file = new_file;
 149         }
 150
 151         return client_fd;
 152 }
 153
 154 /*
 155  * Finish processing of permission event by setting it to ANSWERED state and
 156  * drop group->notification_lock.
 157  */
 158 static void finish_permission_event(struct fsnotify_group *group,
 159                                     struct fanotify_perm_event *event,
 160                                     unsigned int response)
 161                                     __releases(&group->notification_lock)
 162 {
 163         bool destroy = false;
 164
 165         assert_spin_locked(&group->notification_lock);
 166         event->response = response;
 167         if (event->state == FAN_EVENT_CANCELED)
 168                 destroy = true;
 169         else
 170                 event->state = FAN_EVENT_ANSWERED;
 171         spin_unlock(&group->notification_lock);
 172         if (destroy)
 173                 fsnotify_destroy_event(group, &event->fae.fse);
 174 }
 175
 176 static int process_access_response(struct fsnotify_group *group,
 177                                    struct fanotify_response *response_struct)
 178 {
 179         struct fanotify_perm_event *event;
 180         int fd = response_struct->fd;
 181         int response = response_struct->response;
 182
 183         pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
 184                  fd, response);
 185         /*
 186          * make sure the response is valid, if invalid we do nothing and either
 187          * userspace can send a valid response or we will clean it up after the
 188          * timeout
 189          */
 190         switch (response & ~FAN_AUDIT) {
 191         case FAN_ALLOW:
 192         case FAN_DENY:
 193                 break;
 194         default:
 195                 return -EINVAL;
 196         }
 197
 198         if (fd < 0)
 199                 return -EINVAL;
 200
 201         if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
 202                 return -EINVAL;
 203
 204         spin_lock(&group->notification_lock);
 205         list_for_each_entry(event, &group->fanotify_data.access_list,
 206                             fae.fse.list) {
 207                 if (event->fd != fd)
 208                         continue;
 209
 210                 list_del_init(&event->fae.fse.list);
 211                 finish_permission_event(group, event, response);
 212                 wake_up(&group->fanotify_data.access_waitq);
 213                 return 0;
 214         }
 215         spin_unlock(&group->notification_lock);
 216
 217         return -ENOENT;
 218 }
 219
 220 static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 221                              const char *name, size_t name_len,
 222                              char __user *buf, size_t count)
 223 {
 224         struct fanotify_event_info_fid info = { };
 225         struct file_handle handle = { };
 226         unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
 227         size_t fh_len = fh ? fh->len : 0;
 228         size_t info_len = fanotify_fid_info_len(fh_len, name_len);
 229         size_t len = info_len;
 230
 231         pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
 232                  __func__, fh_len, name_len, info_len, count);
 233
 234         if (!fh_len || (name && !name_len))
 235                 return 0;
 236
 237         if (WARN_ON_ONCE(len < sizeof(info) || len > count))
 238                 return -EFAULT;
 239
 240         /*
 241          * Copy event info fid header followed by variable sized file handle
 242          * and optionally followed by variable sized filename.
 243          */
 244         info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
 245                                         FAN_EVENT_INFO_TYPE_FID;
 246         info.hdr.len = len;
 247         info.fsid = *fsid;
 248         if (copy_to_user(buf, &info, sizeof(info)))
 249                 return -EFAULT;
 250
 251         buf += sizeof(info);
 252         len -= sizeof(info);
 253         if (WARN_ON_ONCE(len < sizeof(handle)))
 254                 return -EFAULT;
 255
 256         handle.handle_type = fh->type;
 257         handle.handle_bytes = fh_len;
 258         if (copy_to_user(buf, &handle, sizeof(handle)))
 259                 return -EFAULT;
 260
 261         buf += sizeof(handle);
 262         len -= sizeof(handle);
 263         if (WARN_ON_ONCE(len < fh_len))
 264                 return -EFAULT;
 265
 266         /*
 267          * For an inline fh and inline file name, copy through stack to exclude
 268          * the copy from usercopy hardening protections.
 269          */
 270         fh_buf = fanotify_fh_buf(fh);
 271         if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
 272                 memcpy(bounce, fh_buf, fh_len);
 273                 fh_buf = bounce;
 274         }
 275         if (copy_to_user(buf, fh_buf, fh_len))
 276                 return -EFAULT;
 277
 278         buf += fh_len;
 279         len -= fh_len;
 280
 281         if (name_len) {
 282                 /* Copy the filename with terminating null */
 283                 name_len++;
 284                 if (WARN_ON_ONCE(len < name_len))
 285                         return -EFAULT;
 286
 287                 if (copy_to_user(buf, name, name_len))
 288                         return -EFAULT;
 289
 290                 buf += name_len;
 291                 len -= name_len;
 292         }
 293
 294         /* Pad with 0's */
 295         WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
 296         if (len > 0 && clear_user(buf, len))
 297                 return -EFAULT;
 298
 299         return info_len;
 300 }
 301
 302 static ssize_t copy_event_to_user(struct fsnotify_group *group,
 303                                   struct fanotify_event *event,
 304                                   char __user *buf, size_t count)
 305 {
 306         struct fanotify_event_metadata metadata;
 307         struct path *path = fanotify_event_path(event);
 308         struct file *f = NULL;
 309         int ret, fd = FAN_NOFD;
 310
 311         pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 312
 313         metadata.event_len = FAN_EVENT_METADATA_LEN +
 314                                         fanotify_event_info_len(event);
 315         metadata.metadata_len = FAN_EVENT_METADATA_LEN;
 316         metadata.vers = FANOTIFY_METADATA_VERSION;
 317         metadata.reserved = 0;
 318         metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
 319         metadata.pid = pid_vnr(event->pid);
 320
 321         if (path && path->mnt && path->dentry) {
 322                 fd = create_fd(group, path, &f);
 323                 if (fd < 0)
 324                         return fd;
 325         }
 326         metadata.fd = fd;
 327
 328         ret = -EFAULT;
 329         /*
 330          * Sanity check copy size in case get_one_event() and
 331          * event_len sizes ever get out of sync.
 332          */
 333         if (WARN_ON_ONCE(metadata.event_len > count))
 334                 goto out_close_fd;
 335
 336         if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
 337                 goto out_close_fd;
 338
 339         buf += FAN_EVENT_METADATA_LEN;
 340         count -= FAN_EVENT_METADATA_LEN;
 341
 342         if (fanotify_is_perm_event(event->mask))
 343                 FANOTIFY_PERM(event)->fd = fd;
 344
 345         if (f)
 346                 fd_install(fd, f);
 347
 348         /* Event info records order is: dir fid + name, child fid */
 349         if (fanotify_event_name_len(event)) {
 350                 struct fanotify_name_event *fne = FANOTIFY_NE(event);
 351
 352                 ret = copy_info_to_user(fanotify_event_fsid(event),
 353                                         fanotify_event_dir_fh(event),
 354                                         fne->name, fne->name_len,
 355                                         buf, count);
 356                 if (ret < 0)
 357                         return ret;
 358
 359                 buf += ret;
 360                 count -= ret;
 361         }
 362
 363         if (fanotify_event_object_fh_len(event)) {
 364                 ret = copy_info_to_user(fanotify_event_fsid(event),
 365                                         fanotify_event_object_fh(event),
 366                                         NULL, 0, buf, count);
 367                 if (ret < 0)
 368                         return ret;
 369
 370                 buf += ret;
 371                 count -= ret;
 372         }
 373
 374         return metadata.event_len;
 375
 376 out_close_fd:
 377         if (fd != FAN_NOFD) {
 378                 put_unused_fd(fd);
 379                 fput(f);
 380         }
 381         return ret;
 382 }
 383
/* fanotify userspace file descriptor functions */
 385 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
 386 {
 387         struct fsnotify_group *group = file->private_data;
 388         __poll_t ret = 0;
 389
 390         poll_wait(file, &group->notification_waitq, wait);
 391         spin_lock(&group->notification_lock);
 392         if (!fsnotify_notify_queue_is_empty(group))
 393                 ret = EPOLLIN | EPOLLRDNORM;
 394         spin_unlock(&group->notification_lock);
 395
 396         return ret;
 397 }
 398
 399 static ssize_t fanotify_read(struct file *file, char __user *buf,
 400                              size_t count, loff_t *pos)
 401 {
 402         struct fsnotify_group *group;
 403         struct fanotify_event *event;
 404         char __user *start;
 405         int ret;
 406         DEFINE_WAIT_FUNC(wait, woken_wake_function);
 407
 408         start = buf;
 409         group = file->private_data;
 410
 411         pr_debug("%s: group=%p\n", __func__, group);
 412
 413         add_wait_queue(&group->notification_waitq, &wait);
 414         while (1) {
 415                 /*
 416                  * User can supply arbitrarily large buffer. Avoid softlockups
 417                  * in case there are lots of available events.
 418                  */
 419                 cond_resched();
 420                 event = get_one_event(group, count);
 421                 if (IS_ERR(event)) {
 422                         ret = PTR_ERR(event);
 423                         break;
 424                 }
 425
 426                 if (!event) {
 427                         ret = -EAGAIN;
 428                         if (file->f_flags & O_NONBLOCK)
 429                                 break;
 430
 431                         ret = -ERESTARTSYS;
 432                         if (signal_pending(current))
 433                                 break;
 434
 435                         if (start != buf)
 436                                 break;
 437
 438                         wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 439                         continue;
 440                 }
 441
 442                 ret = copy_event_to_user(group, event, buf, count);
 443                 if (unlikely(ret == -EOPENSTALE)) {
 444                         /*
 445                          * We cannot report events with stale fd so drop it.
 446                          * Setting ret to 0 will continue the event loop and
 447                          * do the right thing if there are no more events to
 448                          * read (i.e. return bytes read, -EAGAIN or wait).
 449                          */
 450                         ret = 0;
 451                 }
 452
 453                 /*
 454                  * Permission events get queued to wait for response.  Other
 455                  * events can be destroyed now.
 456                  */
 457                 if (!fanotify_is_perm_event(event->mask)) {
 458                         fsnotify_destroy_event(group, &event->fse);
 459                 } else {
 460                         if (ret <= 0) {
 461                                 spin_lock(&group->notification_lock);
 462                                 finish_permission_event(group,
 463                                         FANOTIFY_PERM(event), FAN_DENY);
 464                                 wake_up(&group->fanotify_data.access_waitq);
 465                         } else {
 466                                 spin_lock(&group->notification_lock);
 467                                 list_add_tail(&event->fse.list,
 468                                         &group->fanotify_data.access_list);
 469                                 spin_unlock(&group->notification_lock);
 470                         }
 471                 }
 472                 if (ret < 0)
 473                         break;
 474                 buf += ret;
 475                 count -= ret;
 476         }
 477         remove_wait_queue(&group->notification_waitq, &wait);
 478
 479         if (start != buf && ret != -EFAULT)
 480                 ret = buf - start;
 481         return ret;
 482 }
 483
 484 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 485 {
 486         struct fanotify_response response = { .fd = -1, .response = -1 };
 487         struct fsnotify_group *group;
 488         int ret;
 489
 490         if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
 491                 return -EINVAL;
 492
 493         group = file->private_data;
 494
 495         if (count < sizeof(response))
 496                 return -EINVAL;
 497
 498         count = sizeof(response);
 499
 500         pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
 501
 502         if (copy_from_user(&response, buf, count))
 503                 return -EFAULT;
 504
 505         ret = process_access_response(group, &response);
 506         if (ret < 0)
 507                 count = ret;
 508
 509         return count;
 510 }
 511
 512 static int fanotify_release(struct inode *ignored, struct file *file)
 513 {
 514         struct fsnotify_group *group = file->private_data;
 515
 516         /*
 517          * Stop new events from arriving in the notification queue. since
 518          * userspace cannot use fanotify fd anymore, no event can enter or
 519          * leave access_list by now either.
 520          */
 521         fsnotify_group_stop_queueing(group);
 522
 523         /*
 524          * Process all permission events on access_list and notification queue
 525          * and simulate reply from userspace.
 526          */
 527         spin_lock(&group->notification_lock);
 528         while (!list_empty(&group->fanotify_data.access_list)) {
 529                 struct fanotify_perm_event *event;
 530
 531                 event = list_first_entry(&group->fanotify_data.access_list,
 532                                 struct fanotify_perm_event, fae.fse.list);
 533                 list_del_init(&event->fae.fse.list);
 534                 finish_permission_event(group, event, FAN_ALLOW);
 535                 spin_lock(&group->notification_lock);
 536         }
 537
 538         /*
 539          * Destroy all non-permission events. For permission events just
 540          * dequeue them and set the response. They will be freed once the
 541          * response is consumed and fanotify_get_response() returns.
 542          */
 543         while (!fsnotify_notify_queue_is_empty(group)) {
 544                 struct fanotify_event *event;
 545
 546                 event = FANOTIFY_E(fsnotify_remove_first_event(group));
 547                 if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
 548                         spin_unlock(&group->notification_lock);
 549                         fsnotify_destroy_event(group, &event->fse);
 550                 } else {
 551                         finish_permission_event(group, FANOTIFY_PERM(event),
 552                                                 FAN_ALLOW);
 553                 }
 554                 spin_lock(&group->notification_lock);
 555         }
 556         spin_unlock(&group->notification_lock);
 557
 558         /* Response for all permission events it set, wakeup waiters */
 559         wake_up(&group->fanotify_data.access_waitq);
 560
 561         /* matches the fanotify_init->fsnotify_alloc_group */
 562         fsnotify_destroy_group(group);
 563
 564         return 0;
 565 }
 566
 567 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 568 {
 569         struct fsnotify_group *group;
 570         struct fsnotify_event *fsn_event;
 571         void __user *p;
 572         int ret = -ENOTTY;
 573         size_t send_len = 0;
 574
 575         group = file->private_data;
 576
 577         p = (void __user *) arg;
 578
 579         switch (cmd) {
 580         case FIONREAD:
 581                 spin_lock(&group->notification_lock);
 582                 list_for_each_entry(fsn_event, &group->notification_list, list)
 583                         send_len += FAN_EVENT_METADATA_LEN;
 584                 spin_unlock(&group->notification_lock);
 585                 ret = put_user(send_len, (int __user *) p);
 586                 break;
 587         }
 588
 589         return ret;
 590 }
 591
 592 static const struct file_operations fanotify_fops = {
 593         .show_fdinfo    = fanotify_show_fdinfo,
 594         .poll           = fanotify_poll,
 595         .read           = fanotify_read,
 596         .write          = fanotify_write,
 597         .fasync         = NULL,
 598         .release        = fanotify_release,
 599         .unlocked_ioctl = fanotify_ioctl,
 600         .compat_ioctl   = compat_ptr_ioctl,
 601         .llseek         = noop_llseek,
 602 };
 603
 604 static int fanotify_find_path(int dfd, const char __user *filename,
 605                               struct path *path, unsigned int flags, __u64 mask,
 606                               unsigned int obj_type)
 607 {
 608         int ret;
 609
 610         pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
 611                  dfd, filename, flags);
 612
 613         if (filename == NULL) {
 614                 struct fd f = fdget(dfd);
 615
 616                 ret = -EBADF;
 617                 if (!f.file)
 618                         goto out;
 619
 620                 ret = -ENOTDIR;
 621                 if ((flags & FAN_MARK_ONLYDIR) &&
 622                     !(S_ISDIR(file_inode(f.file)->i_mode))) {
 623                         fdput(f);
 624                         goto out;
 625                 }
 626
 627                 *path = f.file->f_path;
 628                 path_get(path);
 629                 fdput(f);
 630         } else {
 631                 unsigned int lookup_flags = 0;
 632
 633                 if (!(flags & FAN_MARK_DONT_FOLLOW))
 634                         lookup_flags |= LOOKUP_FOLLOW;
 635                 if (flags & FAN_MARK_ONLYDIR)
 636                         lookup_flags |= LOOKUP_DIRECTORY;
 637
 638                 ret = user_path_at(dfd, filename, lookup_flags, path);
 639                 if (ret)
 640                         goto out;
 641         }
 642
 643         /* you can only watch an inode if you have read permissions on it */
 644         ret = inode_permission(path->dentry->d_inode, MAY_READ);
 645         if (ret) {
 646                 path_put(path);
 647                 goto out;
 648         }
 649
 650         ret = security_path_notify(path, mask, obj_type);
 651         if (ret)
 652                 path_put(path);
 653
 654 out:
 655         return ret;
 656 }
 657
 658 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 659                                             __u32 mask, unsigned int flags,
 660                                             __u32 umask, int *destroy)
 661 {
 662         __u32 oldmask = 0;
 663
 664         /* umask bits cannot be removed by user */
 665         mask &= ~umask;
 666         spin_lock(&fsn_mark->lock);
 667         if (!(flags & FAN_MARK_IGNORED_MASK)) {
 668                 oldmask = fsn_mark->mask;
 669                 fsn_mark->mask &= ~mask;
 670         } else {
 671                 fsn_mark->ignored_mask &= ~mask;
 672         }
 673         /*
 674          * We need to keep the mark around even if remaining mask cannot
 675          * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
 676          * changes to the mask.
 677          * Destroy mark when only umask bits remain.
 678          */
 679         *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
 680         spin_unlock(&fsn_mark->lock);
 681
 682         return mask & oldmask;
 683 }
 684
 685 static int fanotify_remove_mark(struct fsnotify_group *group,
 686                                 fsnotify_connp_t *connp, __u32 mask,
 687                                 unsigned int flags, __u32 umask)
 688 {
 689         struct fsnotify_mark *fsn_mark = NULL;
 690         __u32 removed;
 691         int destroy_mark;
 692
 693         mutex_lock(&group->mark_mutex);
 694         fsn_mark = fsnotify_find_mark(connp, group);
 695         if (!fsn_mark) {
 696                 mutex_unlock(&group->mark_mutex);
 697                 return -ENOENT;
 698         }
 699
 700         removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 701                                                  umask, &destroy_mark);
 702         if (removed & fsnotify_conn_mask(fsn_mark->connector))
 703                 fsnotify_recalc_mask(fsn_mark->connector);
 704         if (destroy_mark)
 705                 fsnotify_detach_mark(fsn_mark);
 706         mutex_unlock(&group->mark_mutex);
 707         if (destroy_mark)
 708                 fsnotify_free_mark(fsn_mark);
 709
 710         /* matches the fsnotify_find_mark() */
 711         fsnotify_put_mark(fsn_mark);
 712         return 0;
 713 }
 714
 715 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 716                                          struct vfsmount *mnt, __u32 mask,
 717                                          unsigned int flags, __u32 umask)
 718 {
 719         return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
 720                                     mask, flags, umask);
 721 }
 722
 723 static int fanotify_remove_sb_mark(struct fsnotify_group *group,
 724                                    struct super_block *sb, __u32 mask,
 725                                    unsigned int flags, __u32 umask)
 726 {
 727         return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
 728                                     flags, umask);
 729 }
 730
 731 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 732                                       struct inode *inode, __u32 mask,
 733                                       unsigned int flags, __u32 umask)
 734 {
 735         return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
 736                                     flags, umask);
 737 }
 738
 739 static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 740                                        __u32 mask,
 741                                        unsigned int flags)
 742 {
 743         __u32 oldmask = -1;
 744
 745         spin_lock(&fsn_mark->lock);
 746         if (!(flags & FAN_MARK_IGNORED_MASK)) {
 747                 oldmask = fsn_mark->mask;
 748                 fsn_mark->mask |= mask;
 749         } else {
 750                 fsn_mark->ignored_mask |= mask;
 751                 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 752                         fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 753         }
 754         spin_unlock(&fsn_mark->lock);
 755
 756         return mask & ~oldmask;
 757 }
 758
 759 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 760                                                    fsnotify_connp_t *connp,
 761                                                    unsigned int type,
 762                                                    __kernel_fsid_t *fsid)
 763 {
 764         struct fsnotify_mark *mark;
 765         int ret;
 766
 767         if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
 768                 return ERR_PTR(-ENOSPC);
 769
 770         mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
 771         if (!mark)
 772                 return ERR_PTR(-ENOMEM);
 773
 774         fsnotify_init_mark(mark, group);
 775         ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
 776         if (ret) {
 777                 fsnotify_put_mark(mark);
 778                 return ERR_PTR(ret);
 779         }
 780
 781         return mark;
 782 }
 783
 784
 785 static int fanotify_add_mark(struct fsnotify_group *group,
 786                              fsnotify_connp_t *connp, unsigned int type,
 787                              __u32 mask, unsigned int flags,
 788                              __kernel_fsid_t *fsid)
 789 {
 790         struct fsnotify_mark *fsn_mark;
 791         __u32 added;
 792
 793         mutex_lock(&group->mark_mutex);
 794         fsn_mark = fsnotify_find_mark(connp, group);
 795         if (!fsn_mark) {
 796                 fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
 797                 if (IS_ERR(fsn_mark)) {
 798                         mutex_unlock(&group->mark_mutex);
 799                         return PTR_ERR(fsn_mark);
 800                 }
 801         }
 802         added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 803         if (added & ~fsnotify_conn_mask(fsn_mark->connector))
 804                 fsnotify_recalc_mask(fsn_mark->connector);
 805         mutex_unlock(&group->mark_mutex);
 806
 807         fsnotify_put_mark(fsn_mark);
 808         return 0;
 809 }
 810
 811 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 812                                       struct vfsmount *mnt, __u32 mask,
 813                                       unsigned int flags, __kernel_fsid_t *fsid)
 814 {
 815         return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
 816                                  FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
 817 }
 818
 819 static int fanotify_add_sb_mark(struct fsnotify_group *group,
 820                                 struct super_block *sb, __u32 mask,
 821                                 unsigned int flags, __kernel_fsid_t *fsid)
 822 {
 823         return fanotify_add_mark(group, &sb->s_fsnotify_marks,
 824                                  FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
 825 }
 826
 827 static int fanotify_add_inode_mark(struct fsnotify_group *group,
 828                                    struct inode *inode, __u32 mask,
 829                                    unsigned int flags, __kernel_fsid_t *fsid)
 830 {
 831         pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 832
 833         /*
 834          * If some other task has this inode open for write we should not add
 835          * an ignored mark, unless that ignored mark is supposed to survive
 836          * modification changes anyway.
 837          */
 838         if ((flags & FAN_MARK_IGNORED_MASK) &&
 839             !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
 840             inode_is_open_for_write(inode))
 841                 return 0;
 842
 843         return fanotify_add_mark(group, &inode->i_fsnotify_marks,
 844                                  FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
 845 }
 846
 847 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
 848 {
 849         struct fanotify_event *oevent;
 850
 851         oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
 852         if (!oevent)
 853                 return NULL;
 854
 855         fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
 856         oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
 857
 858         return &oevent->fse;
 859 }
 860
 861 /* fanotify syscalls */
 862 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 863 {
 864         struct fsnotify_group *group;
 865         int f_flags, fd;
 866         struct user_struct *user;
 867
 868         pr_debug("%s: flags=%x event_f_flags=%x\n",
 869                  __func__, flags, event_f_flags);
 870
 871         if (!capable(CAP_SYS_ADMIN))
 872                 return -EPERM;
 873
 874 #ifdef CONFIG_AUDITSYSCALL
 875         if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
 876 #else
 877         if (flags & ~FANOTIFY_INIT_FLAGS)
 878 #endif
 879                 return -EINVAL;
 880
 881         if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
 882                 return -EINVAL;
 883
 884         switch (event_f_flags & O_ACCMODE) {
 885         case O_RDONLY:
 886         case O_RDWR:
 887         case O_WRONLY:
 888                 break;
 889         default:
 890                 return -EINVAL;
 891         }
 892
 893         if ((flags & FANOTIFY_FID_BITS) &&
 894             (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
 895                 return -EINVAL;
 896
 897         user = get_current_user();
 898         if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
 899                 free_uid(user);
 900                 return -EMFILE;
 901         }
 902
 903         f_flags = O_RDWR | FMODE_NONOTIFY;
 904         if (flags & FAN_CLOEXEC)
 905                 f_flags |= O_CLOEXEC;
 906         if (flags & FAN_NONBLOCK)
 907                 f_flags |= O_NONBLOCK;
 908
 909         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
 910         group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
 911         if (IS_ERR(group)) {
 912                 free_uid(user);
 913                 return PTR_ERR(group);
 914         }
 915
 916         group->fanotify_data.user = user;
 917         group->fanotify_data.flags = flags;
 918         atomic_inc(&user->fanotify_listeners);
 919         group->memcg = get_mem_cgroup_from_mm(current->mm);
 920
 921         group->overflow_event = fanotify_alloc_overflow_event();
 922         if (unlikely(!group->overflow_event)) {
 923                 fd = -ENOMEM;
 924                 goto out_destroy_group;
 925         }
 926
 927         if (force_o_largefile())
 928                 event_f_flags |= O_LARGEFILE;
 929         group->fanotify_data.f_flags = event_f_flags;
 930         init_waitqueue_head(&group->fanotify_data.access_waitq);
 931         INIT_LIST_HEAD(&group->fanotify_data.access_list);
 932         switch (flags & FANOTIFY_CLASS_BITS) {
 933         case FAN_CLASS_NOTIF:
 934                 group->priority = FS_PRIO_0;
 935                 break;
 936         case FAN_CLASS_CONTENT:
 937                 group->priority = FS_PRIO_1;
 938                 break;
 939         case FAN_CLASS_PRE_CONTENT:
 940                 group->priority = FS_PRIO_2;
 941                 break;
 942         default:
 943                 fd = -EINVAL;
 944                 goto out_destroy_group;
 945         }
 946
 947         if (flags & FAN_UNLIMITED_QUEUE) {
 948                 fd = -EPERM;
 949                 if (!capable(CAP_SYS_ADMIN))
 950                         goto out_destroy_group;
 951                 group->max_events = UINT_MAX;
 952         } else {
 953                 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
 954         }
 955
 956         if (flags & FAN_UNLIMITED_MARKS) {
 957                 fd = -EPERM;
 958                 if (!capable(CAP_SYS_ADMIN))
 959                         goto out_destroy_group;
 960                 group->fanotify_data.max_marks = UINT_MAX;
 961         } else {
 962                 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
 963         }
 964
 965         if (flags & FAN_ENABLE_AUDIT) {
 966                 fd = -EPERM;
 967                 if (!capable(CAP_AUDIT_WRITE))
 968                         goto out_destroy_group;
 969         }
 970
 971         fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 972         if (fd < 0)
 973                 goto out_destroy_group;
 974
 975         return fd;
 976
 977 out_destroy_group:
 978         fsnotify_destroy_group(group);
 979         return fd;
 980 }
 981
 982 /* Check if filesystem can encode a unique fid */
 983 static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
 984 {
 985         __kernel_fsid_t root_fsid;
 986         int err;
 987
 988         /*
 989          * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
 990          */
 991         err = vfs_get_fsid(path->dentry, fsid);
 992         if (err)
 993                 return err;
 994
 995         if (!fsid->val[0] && !fsid->val[1])
 996                 return -ENODEV;
 997
 998         /*
 999          * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1000          * which uses a different fsid than sb root.
1001          */
1002         err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
1003         if (err)
1004                 return err;
1005
1006         if (root_fsid.val[0] != fsid->val[0] ||
1007             root_fsid.val[1] != fsid->val[1])
1008                 return -EXDEV;
1009
1010         /*
1011          * We need to make sure that the file system supports at least
1012          * encoding a file handle so user can use name_to_handle_at() to
1013          * compare fid returned with event to the file handle of watched
1014          * objects. However, name_to_handle_at() requires that the
1015          * filesystem also supports decoding file handles.
1016          */
1017         if (!path->dentry->d_sb->s_export_op ||
1018             !path->dentry->d_sb->s_export_op->fh_to_dentry)
1019                 return -EOPNOTSUPP;
1020
1021         return 0;
1022 }
1023
1024 static int fanotify_events_supported(struct path *path, __u64 mask)
1025 {
1026         /*
1027          * Some filesystems such as 'proc' acquire unusual locks when opening
1028          * files. For them fanotify permission events have high chances of
1029          * deadlocking the system - open done when reporting fanotify event
1030          * blocks on this "unusual" lock while another process holding the lock
1031          * waits for fanotify permission event to be answered. Just disallow
1032          * permission events for such filesystems.
1033          */
1034         if (mask & FANOTIFY_PERM_EVENTS &&
1035             path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1036                 return -EINVAL;
1037         return 0;
1038 }
1039
1040 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1041                             int dfd, const char  __user *pathname)
1042 {
1043         struct inode *inode = NULL;
1044         struct vfsmount *mnt = NULL;
1045         struct fsnotify_group *group;
1046         struct fd f;
1047         struct path path;
1048         __kernel_fsid_t __fsid, *fsid = NULL;
1049         u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1050         unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1051         bool ignored = flags & FAN_MARK_IGNORED_MASK;
1052         unsigned int obj_type, fid_mode;
1053         int ret;
1054
1055         pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1056                  __func__, fanotify_fd, flags, dfd, pathname, mask);
1057
1058         /* we only use the lower 32 bits as of right now. */
1059         if (mask & ((__u64)0xffffffff << 32))
1060                 return -EINVAL;
1061
1062         if (flags & ~FANOTIFY_MARK_FLAGS)
1063                 return -EINVAL;
1064
1065         switch (mark_type) {
1066         case FAN_MARK_INODE:
1067                 obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1068                 break;
1069         case FAN_MARK_MOUNT:
1070                 obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1071                 break;
1072         case FAN_MARK_FILESYSTEM:
1073                 obj_type = FSNOTIFY_OBJ_TYPE_SB;
1074                 break;
1075         default:
1076                 return -EINVAL;
1077         }
1078
1079         switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
1080         case FAN_MARK_ADD:              /* fallthrough */
1081         case FAN_MARK_REMOVE:
1082                 if (!mask)
1083                         return -EINVAL;
1084                 break;
1085         case FAN_MARK_FLUSH:
1086                 if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1087                         return -EINVAL;
1088                 break;
1089         default:
1090                 return -EINVAL;
1091         }
1092
1093         if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1094                 valid_mask |= FANOTIFY_PERM_EVENTS;
1095
1096         if (mask & ~valid_mask)
1097                 return -EINVAL;
1098
1099         /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
1100         if (ignored)
1101                 mask &= ~FANOTIFY_EVENT_FLAGS;
1102
1103         f = fdget(fanotify_fd);
1104         if (unlikely(!f.file))
1105                 return -EBADF;
1106
1107         /* verify that this is indeed an fanotify instance */
1108         ret = -EINVAL;
1109         if (unlikely(f.file->f_op != &fanotify_fops))
1110                 goto fput_and_out;
1111         group = f.file->private_data;
1112
1113         /*
1114          * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
1115          * allowed to set permissions events.
1116          */
1117         ret = -EINVAL;
1118         if (mask & FANOTIFY_PERM_EVENTS &&
1119             group->priority == FS_PRIO_0)
1120                 goto fput_and_out;
1121
1122         /*
1123          * Events with data type inode do not carry enough information to report
1124          * event->fd, so we do not allow setting a mask for inode events unless
1125          * group supports reporting fid.
1126          * inode events are not supported on a mount mark, because they do not
1127          * carry enough information (i.e. path) to be filtered by mount point.
1128          */
1129         fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1130         if (mask & FANOTIFY_INODE_EVENTS &&
1131             (!fid_mode || mark_type == FAN_MARK_MOUNT))
1132                 goto fput_and_out;
1133
1134         if (flags & FAN_MARK_FLUSH) {
1135                 ret = 0;
1136                 if (mark_type == FAN_MARK_MOUNT)
1137                         fsnotify_clear_vfsmount_marks_by_group(group);
1138                 else if (mark_type == FAN_MARK_FILESYSTEM)
1139                         fsnotify_clear_sb_marks_by_group(group);
1140                 else
1141                         fsnotify_clear_inode_marks_by_group(group);
1142                 goto fput_and_out;
1143         }
1144
1145         ret = fanotify_find_path(dfd, pathname, &path, flags,
1146                         (mask & ALL_FSNOTIFY_EVENTS), obj_type);
1147         if (ret)
1148                 goto fput_and_out;
1149
1150         if (flags & FAN_MARK_ADD) {
1151                 ret = fanotify_events_supported(&path, mask);
1152                 if (ret)
1153                         goto path_put_and_out;
1154         }
1155
1156         if (fid_mode) {
1157                 ret = fanotify_test_fid(&path, &__fsid);
1158                 if (ret)
1159                         goto path_put_and_out;
1160
1161                 fsid = &__fsid;
1162         }
1163
1164         /* inode held in place by reference to path; group by fget on fd */
1165         if (mark_type == FAN_MARK_INODE)
1166                 inode = path.dentry->d_inode;
1167         else
1168                 mnt = path.mnt;
1169
1170         /* create/update an inode mark */
1171         switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
1172         case FAN_MARK_ADD:
1173                 if (mark_type == FAN_MARK_MOUNT)
1174                         ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1175                                                          flags, fsid);
1176                 else if (mark_type == FAN_MARK_FILESYSTEM)
1177                         ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1178                                                    flags, fsid);
1179                 else
1180                         ret = fanotify_add_inode_mark(group, inode, mask,
1181                                                       flags, fsid);
1182                 break;
1183         case FAN_MARK_REMOVE:
1184                 if (mark_type == FAN_MARK_MOUNT)
1185                         ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
1186                                                             flags, 0);
1187                 else if (mark_type == FAN_MARK_FILESYSTEM)
1188                         ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
1189                                                       flags, 0);
1190                 else
1191                         ret = fanotify_remove_inode_mark(group, inode, mask,
1192                                                          flags, 0);
1193                 break;
1194         default:
1195                 ret = -EINVAL;
1196         }
1197
1198 path_put_and_out:
1199         path_put(&path);
1200 fput_and_out:
1201         fdput(f);
1202         return ret;
1203 }
1204
1205 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1206                               __u64, mask, int, dfd,
1207                               const char  __user *, pathname)
1208 {
1209         return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1210 }
1211
#ifdef CONFIG_COMPAT
/*
 * fanotify_mark() - compat syscall entry point.
 *
 * The 64-bit event mask arrives as two 32-bit halves (@mask0, @mask1) whose
 * order depends on endianness; reassemble it and hand everything over to
 * do_fanotify_mark().
 */
COMPAT_SYSCALL_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				__u32, mask0, __u32, mask1, int, dfd,
				const char  __user *, pathname)
{
	__u64 mask;

#ifdef __BIG_ENDIAN
	mask = ((__u64)mask0 << 32) | mask1;
#else
	mask = ((__u64)mask1 << 32) | mask0;
#endif

	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif
1227
1228 /*
1229  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1230  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1231  * must result in panic().
1232  */
1233 static int __init fanotify_user_setup(void)
1234 {
1235         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
1236         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1237
1238         fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1239                                          SLAB_PANIC|SLAB_ACCOUNT);
1240         fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1241                                                SLAB_PANIC);
1242         fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1243                                                 SLAB_PANIC);
1244         if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1245                 fanotify_perm_event_cachep =
1246                         KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1247         }
1248
1249         return 0;
1250 }
1251 device_initcall(fanotify_user_setup);