mm/damon/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Data Access Monitor
   4  *
   5  * Author: SeongJae Park <sjpark@amazon.de>
   6  */
   7
   8 #define pr_fmt(fmt) "damon: " fmt
   9
  10 #include <linux/damon.h>
  11 #include <linux/delay.h>
  12 #include <linux/kthread.h>
  13 #include <linux/random.h>
  14 #include <linux/slab.h>
  15
  16 #define CREATE_TRACE_POINTS
  17 #include <trace/events/damon.h>
  18
  19 /* Get a random number in [l, r) */
  20 #define damon_rand(l, r) (l + prandom_u32_max(r - l))
  21
/* Held around every access to 'nr_running_ctxs' in this file */
static DEFINE_MUTEX(damon_lock);
/*
 * Incremented by damon_start() for each context it starts successfully and
 * decremented by each kdamond right before it exits.
 */
static int nr_running_ctxs;
  24
  25 /*
  26  * Construct a damon_region struct
  27  *
  28  * Returns the pointer to the new struct if success, or NULL otherwise
  29  */
  30 struct damon_region *damon_new_region(unsigned long start, unsigned long end)
  31 {
  32         struct damon_region *region;
  33
  34         region = kmalloc(sizeof(*region), GFP_KERNEL);
  35         if (!region)
  36                 return NULL;
  37
  38         region->ar.start = start;
  39         region->ar.end = end;
  40         region->nr_accesses = 0;
  41         INIT_LIST_HEAD(&region->list);
  42
  43         return region;
  44 }
  45
  46 /*
  47  * Add a region between two other regions
  48  */
  49 inline void damon_insert_region(struct damon_region *r,
  50                 struct damon_region *prev, struct damon_region *next,
  51                 struct damon_target *t)
  52 {
  53         __list_add(&r->list, &prev->list, &next->list);
  54         t->nr_regions++;
  55 }
  56
  57 void damon_add_region(struct damon_region *r, struct damon_target *t)
  58 {
  59         list_add_tail(&r->list, &t->regions_list);
  60         t->nr_regions++;
  61 }
  62
  63 static void damon_del_region(struct damon_region *r, struct damon_target *t)
  64 {
  65         list_del(&r->list);
  66         t->nr_regions--;
  67 }
  68
/*
 * Release the memory of region @r
 *
 * Does not unlink @r from any list; see damon_destroy_region() for that.
 */
static void damon_free_region(struct damon_region *r)
{
        kfree(r);
}
  73
/* Unlink region @r from target @t (updating the region count) and free it */
void damon_destroy_region(struct damon_region *r, struct damon_target *t)
{
        /* Unlink first: the list node lives inside the memory being freed */
        damon_del_region(r, t);
        damon_free_region(r);
}
  79
  80 /*
  81  * Construct a damon_target struct
  82  *
  83  * Returns the pointer to the new struct if success, or NULL otherwise
  84  */
  85 struct damon_target *damon_new_target(unsigned long id)
  86 {
  87         struct damon_target *t;
  88
  89         t = kmalloc(sizeof(*t), GFP_KERNEL);
  90         if (!t)
  91                 return NULL;
  92
  93         t->id = id;
  94         t->nr_regions = 0;
  95         INIT_LIST_HEAD(&t->regions_list);
  96
  97         return t;
  98 }
  99
/* Append target @t to the tail of @ctx's list of monitoring targets */
void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
{
        list_add_tail(&t->list, &ctx->adaptive_targets);
}
 104
/* Unlink target @t from the target list it is on; @t itself is not freed */
static void damon_del_target(struct damon_target *t)
{
        list_del(&t->list);
}
 109
 110 void damon_free_target(struct damon_target *t)
 111 {
 112         struct damon_region *r, *next;
 113
 114         damon_for_each_region_safe(r, next, t)
 115                 damon_free_region(r);
 116         kfree(t);
 117 }
 118
/* Unlink target @t from its target list and free it with all its regions */
void damon_destroy_target(struct damon_target *t)
{
        /* Unlink first: the list node lives inside the memory being freed */
        damon_del_target(t);
        damon_free_target(t);
}
 124
/* Return the number of regions currently accounted to target @t */
unsigned int damon_nr_regions(struct damon_target *t)
{
        return t->nr_regions;
}
 129
 130 struct damon_ctx *damon_new_ctx(void)
 131 {
 132         struct damon_ctx *ctx;
 133
 134         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 135         if (!ctx)
 136                 return NULL;
 137
 138         ctx->sample_interval = 5 * 1000;
 139         ctx->aggr_interval = 100 * 1000;
 140         ctx->primitive_update_interval = 60 * 1000 * 1000;
 141
 142         ktime_get_coarse_ts64(&ctx->last_aggregation);
 143         ctx->last_primitive_update = ctx->last_aggregation;
 144
 145         mutex_init(&ctx->kdamond_lock);
 146
 147         ctx->min_nr_regions = 10;
 148         ctx->max_nr_regions = 1000;
 149
 150         INIT_LIST_HEAD(&ctx->adaptive_targets);
 151
 152         return ctx;
 153 }
 154
 155 static void damon_destroy_targets(struct damon_ctx *ctx)
 156 {
 157         struct damon_target *t, *next_t;
 158
 159         if (ctx->primitive.cleanup) {
 160                 ctx->primitive.cleanup(ctx);
 161                 return;
 162         }
 163
 164         damon_for_each_target_safe(t, next_t, ctx)
 165                 damon_destroy_target(t);
 166 }
 167
/* Release the targets of @ctx (see damon_destroy_targets()) and free @ctx */
void damon_destroy_ctx(struct damon_ctx *ctx)
{
        damon_destroy_targets(ctx);
        kfree(ctx);
}
 173
 174 /**
 175  * damon_set_targets() - Set monitoring targets.
 176  * @ctx:        monitoring context
 177  * @ids:        array of target ids
 178  * @nr_ids:     number of entries in @ids
 179  *
 180  * This function should not be called while the kdamond is running.
 181  *
 182  * Return: 0 on success, negative error code otherwise.
 183  */
 184 int damon_set_targets(struct damon_ctx *ctx,
 185                       unsigned long *ids, ssize_t nr_ids)
 186 {
 187         ssize_t i;
 188         struct damon_target *t, *next;
 189
 190         damon_destroy_targets(ctx);
 191
 192         for (i = 0; i < nr_ids; i++) {
 193                 t = damon_new_target(ids[i]);
 194                 if (!t) {
 195                         pr_err("Failed to alloc damon_target\n");
 196                         /* The caller should do cleanup of the ids itself */
 197                         damon_for_each_target_safe(t, next, ctx)
 198                                 damon_destroy_target(t);
 199                         return -ENOMEM;
 200                 }
 201                 damon_add_target(ctx, t);
 202         }
 203
 204         return 0;
 205 }
 206
 207 /**
 208  * damon_set_attrs() - Set attributes for the monitoring.
 209  * @ctx:                monitoring context
 210  * @sample_int:         time interval between samplings
 211  * @aggr_int:           time interval between aggregations
 212  * @primitive_upd_int:  time interval between monitoring primitive updates
 213  * @min_nr_reg:         minimal number of regions
 214  * @max_nr_reg:         maximum number of regions
 215  *
 216  * This function should not be called while the kdamond is running.
 217  * Every time interval is in micro-seconds.
 218  *
 219  * Return: 0 on success, negative error code otherwise.
 220  */
 221 int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
 222                     unsigned long aggr_int, unsigned long primitive_upd_int,
 223                     unsigned long min_nr_reg, unsigned long max_nr_reg)
 224 {
 225         if (min_nr_reg < 3) {
 226                 pr_err("min_nr_regions (%lu) must be at least 3\n",
 227                                 min_nr_reg);
 228                 return -EINVAL;
 229         }
 230         if (min_nr_reg > max_nr_reg) {
 231                 pr_err("invalid nr_regions.  min (%lu) > max (%lu)\n",
 232                                 min_nr_reg, max_nr_reg);
 233                 return -EINVAL;
 234         }
 235
 236         ctx->sample_interval = sample_int;
 237         ctx->aggr_interval = aggr_int;
 238         ctx->primitive_update_interval = primitive_upd_int;
 239         ctx->min_nr_regions = min_nr_reg;
 240         ctx->max_nr_regions = max_nr_reg;
 241
 242         return 0;
 243 }
 244
 245 /**
 246  * damon_nr_running_ctxs() - Return number of currently running contexts.
 247  */
 248 int damon_nr_running_ctxs(void)
 249 {
 250         int nr_ctxs;
 251
 252         mutex_lock(&damon_lock);
 253         nr_ctxs = nr_running_ctxs;
 254         mutex_unlock(&damon_lock);
 255
 256         return nr_ctxs;
 257 }
 258
 259 /* Returns the size upper limit for each monitoring region */
 260 static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
 261 {
 262         struct damon_target *t;
 263         struct damon_region *r;
 264         unsigned long sz = 0;
 265
 266         damon_for_each_target(t, ctx) {
 267                 damon_for_each_region(r, t)
 268                         sz += r->ar.end - r->ar.start;
 269         }
 270
 271         if (ctx->min_nr_regions)
 272                 sz /= ctx->min_nr_regions;
 273         if (sz < DAMON_MIN_REGION)
 274                 sz = DAMON_MIN_REGION;
 275
 276         return sz;
 277 }
 278
 279 static bool damon_kdamond_running(struct damon_ctx *ctx)
 280 {
 281         bool running;
 282
 283         mutex_lock(&ctx->kdamond_lock);
 284         running = ctx->kdamond != NULL;
 285         mutex_unlock(&ctx->kdamond_lock);
 286
 287         return running;
 288 }
 289
/* Forward declaration: __damon_start() below runs this as the thread body */
static int kdamond_fn(void *data);
 291
 292 /*
 293  * __damon_start() - Starts monitoring with given context.
 294  * @ctx:        monitoring context
 295  *
 296  * This function should be called while damon_lock is hold.
 297  *
 298  * Return: 0 on success, negative error code otherwise.
 299  */
 300 static int __damon_start(struct damon_ctx *ctx)
 301 {
 302         int err = -EBUSY;
 303
 304         mutex_lock(&ctx->kdamond_lock);
 305         if (!ctx->kdamond) {
 306                 err = 0;
 307                 ctx->kdamond_stop = false;
 308                 ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
 309                                 nr_running_ctxs);
 310                 if (IS_ERR(ctx->kdamond)) {
 311                         err = PTR_ERR(ctx->kdamond);
 312                         ctx->kdamond = 0;
 313                 }
 314         }
 315         mutex_unlock(&ctx->kdamond_lock);
 316
 317         return err;
 318 }
 319
 320 /**
 321  * damon_start() - Starts the monitorings for a given group of contexts.
 322  * @ctxs:       an array of the pointers for contexts to start monitoring
 323  * @nr_ctxs:    size of @ctxs
 324  *
 325  * This function starts a group of monitoring threads for a group of monitoring
 326  * contexts.  One thread per each context is created and run in parallel.  The
 327  * caller should handle synchronization between the threads by itself.  If a
 328  * group of threads that created by other 'damon_start()' call is currently
 329  * running, this function does nothing but returns -EBUSY.
 330  *
 331  * Return: 0 on success, negative error code otherwise.
 332  */
 333 int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
 334 {
 335         int i;
 336         int err = 0;
 337
 338         mutex_lock(&damon_lock);
 339         if (nr_running_ctxs) {
 340                 mutex_unlock(&damon_lock);
 341                 return -EBUSY;
 342         }
 343
 344         for (i = 0; i < nr_ctxs; i++) {
 345                 err = __damon_start(ctxs[i]);
 346                 if (err)
 347                         break;
 348                 nr_running_ctxs++;
 349         }
 350         mutex_unlock(&damon_lock);
 351
 352         return err;
 353 }
 354
 355 /*
 356  * __damon_stop() - Stops monitoring of given context.
 357  * @ctx:        monitoring context
 358  *
 359  * Return: 0 on success, negative error code otherwise.
 360  */
 361 static int __damon_stop(struct damon_ctx *ctx)
 362 {
 363         mutex_lock(&ctx->kdamond_lock);
 364         if (ctx->kdamond) {
 365                 ctx->kdamond_stop = true;
 366                 mutex_unlock(&ctx->kdamond_lock);
 367                 while (damon_kdamond_running(ctx))
 368                         usleep_range(ctx->sample_interval,
 369                                         ctx->sample_interval * 2);
 370                 return 0;
 371         }
 372         mutex_unlock(&ctx->kdamond_lock);
 373
 374         return -EPERM;
 375 }
 376
 377 /**
 378  * damon_stop() - Stops the monitorings for a given group of contexts.
 379  * @ctxs:       an array of the pointers for contexts to stop monitoring
 380  * @nr_ctxs:    size of @ctxs
 381  *
 382  * Return: 0 on success, negative error code otherwise.
 383  */
 384 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
 385 {
 386         int i, err = 0;
 387
 388         for (i = 0; i < nr_ctxs; i++) {
 389                 /* nr_running_ctxs is decremented in kdamond_fn */
 390                 err = __damon_stop(ctxs[i]);
 391                 if (err)
 392                         return err;
 393         }
 394
 395         return err;
 396 }
 397
 398 /*
 399  * damon_check_reset_time_interval() - Check if a time interval is elapsed.
 400  * @baseline:   the time to check whether the interval has elapsed since
 401  * @interval:   the time interval (microseconds)
 402  *
 403  * See whether the given time interval has passed since the given baseline
 404  * time.  If so, it also updates the baseline to current time for next check.
 405  *
 406  * Return:      true if the time interval has passed, or false otherwise.
 407  */
 408 static bool damon_check_reset_time_interval(struct timespec64 *baseline,
 409                 unsigned long interval)
 410 {
 411         struct timespec64 now;
 412
 413         ktime_get_coarse_ts64(&now);
 414         if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
 415                         interval * 1000)
 416                 return false;
 417         *baseline = now;
 418         return true;
 419 }
 420
/*
 * Check whether it is time to flush the aggregated information
 *
 * Returns true once 'aggr_interval' has passed since 'last_aggregation'.
 * When it returns true, 'last_aggregation' has also been moved to the current
 * time, so this must be called only once per check.
 */
static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
{
        return damon_check_reset_time_interval(&ctx->last_aggregation,
                        ctx->aggr_interval);
}
 429
 430 /*
 431  * Reset the aggregated monitoring results ('nr_accesses' of each region).
 432  */
 433 static void kdamond_reset_aggregated(struct damon_ctx *c)
 434 {
 435         struct damon_target *t;
 436
 437         damon_for_each_target(t, c) {
 438                 struct damon_region *r;
 439
 440                 damon_for_each_region(r, t) {
 441                         trace_damon_aggregated(t, r, damon_nr_regions(t));
 442                         r->nr_accesses = 0;
 443                 }
 444         }
 445 }
 446
 447 #define sz_damon_region(r) (r->ar.end - r->ar.start)
 448
 449 /*
 450  * Merge two adjacent regions into one region
 451  */
 452 static void damon_merge_two_regions(struct damon_target *t,
 453                 struct damon_region *l, struct damon_region *r)
 454 {
 455         unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
 456
 457         l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
 458                         (sz_l + sz_r);
 459         l->ar.end = r->ar.end;
 460         damon_destroy_region(r, t);
 461 }
 462
 463 #define diff_of(a, b) (a > b ? a - b : b - a)
 464
 465 /*
 466  * Merge adjacent regions having similar access frequencies
 467  *
 468  * t            target affected by this merge operation
 469  * thres        '->nr_accesses' diff threshold for the merge
 470  * sz_limit     size upper limit of each region
 471  */
 472 static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
 473                                    unsigned long sz_limit)
 474 {
 475         struct damon_region *r, *prev = NULL, *next;
 476
 477         damon_for_each_region_safe(r, next, t) {
 478                 if (prev && prev->ar.end == r->ar.start &&
 479                     diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
 480                     sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
 481                         damon_merge_two_regions(t, prev, r);
 482                 else
 483                         prev = r;
 484         }
 485 }
 486
 487 /*
 488  * Merge adjacent regions having similar access frequencies
 489  *
 490  * threshold    '->nr_accesses' diff threshold for the merge
 491  * sz_limit     size upper limit of each region
 492  *
 493  * This function merges monitoring target regions which are adjacent and their
 494  * access frequencies are similar.  This is for minimizing the monitoring
 495  * overhead under the dynamically changeable access pattern.  If a merge was
 496  * unnecessarily made, later 'kdamond_split_regions()' will revert it.
 497  */
 498 static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
 499                                   unsigned long sz_limit)
 500 {
 501         struct damon_target *t;
 502
 503         damon_for_each_target(t, c)
 504                 damon_merge_regions_of(t, threshold, sz_limit);
 505 }
 506
 507 /*
 508  * Split a region in two
 509  *
 510  * r            the region to be split
 511  * sz_r         size of the first sub-region that will be made
 512  */
 513 static void damon_split_region_at(struct damon_ctx *ctx,
 514                 struct damon_target *t, struct damon_region *r,
 515                 unsigned long sz_r)
 516 {
 517         struct damon_region *new;
 518
 519         new = damon_new_region(r->ar.start + sz_r, r->ar.end);
 520         if (!new)
 521                 return;
 522
 523         r->ar.end = new->ar.start;
 524
 525         damon_insert_region(new, r, damon_next_region(r), t);
 526 }
 527
 528 /* Split every region in the given target into 'nr_subs' regions */
 529 static void damon_split_regions_of(struct damon_ctx *ctx,
 530                                      struct damon_target *t, int nr_subs)
 531 {
 532         struct damon_region *r, *next;
 533         unsigned long sz_region, sz_sub = 0;
 534         int i;
 535
 536         damon_for_each_region_safe(r, next, t) {
 537                 sz_region = r->ar.end - r->ar.start;
 538
 539                 for (i = 0; i < nr_subs - 1 &&
 540                                 sz_region > 2 * DAMON_MIN_REGION; i++) {
 541                         /*
 542                          * Randomly select size of left sub-region to be at
 543                          * least 10 percent and at most 90% of original region
 544                          */
 545                         sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
 546                                         sz_region / 10, DAMON_MIN_REGION);
 547                         /* Do not allow blank region */
 548                         if (sz_sub == 0 || sz_sub >= sz_region)
 549                                 continue;
 550
 551                         damon_split_region_at(ctx, t, r, sz_sub);
 552                         sz_region = sz_sub;
 553                 }
 554         }
 555 }
 556
 557 /*
 558  * Split every target region into randomly-sized small regions
 559  *
 560  * This function splits every target region into random-sized small regions if
 561  * current total number of the regions is equal or smaller than half of the
 562  * user-specified maximum number of regions.  This is for maximizing the
 563  * monitoring accuracy under the dynamically changeable access patterns.  If a
 564  * split was unnecessarily made, later 'kdamond_merge_regions()' will revert
 565  * it.
 566  */
 567 static void kdamond_split_regions(struct damon_ctx *ctx)
 568 {
 569         struct damon_target *t;
 570         unsigned int nr_regions = 0;
 571         static unsigned int last_nr_regions;
 572         int nr_subregions = 2;
 573
 574         damon_for_each_target(t, ctx)
 575                 nr_regions += damon_nr_regions(t);
 576
 577         if (nr_regions > ctx->max_nr_regions / 2)
 578                 return;
 579
 580         /* Maybe the middle of the region has different access frequency */
 581         if (last_nr_regions == nr_regions &&
 582                         nr_regions < ctx->max_nr_regions / 3)
 583                 nr_subregions = 3;
 584
 585         damon_for_each_target(t, ctx)
 586                 damon_split_regions_of(ctx, t, nr_subregions);
 587
 588         last_nr_regions = nr_regions;
 589 }
 590
/*
 * Check whether it is time to check and apply the target monitoring regions
 *
 * Returns true if it is, i.e. if 'primitive_update_interval' has passed since
 * 'last_primitive_update'.  When it returns true, 'last_primitive_update' has
 * also been moved to the current time, so this must be called only once per
 * check.
 */
static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
{
        return damon_check_reset_time_interval(&ctx->last_primitive_update,
                        ctx->primitive_update_interval);
}
 601
 602 /*
 603  * Check whether current monitoring should be stopped
 604  *
 605  * The monitoring is stopped when either the user requested to stop, or all
 606  * monitoring targets are invalid.
 607  *
 608  * Returns true if need to stop current monitoring.
 609  */
 610 static bool kdamond_need_stop(struct damon_ctx *ctx)
 611 {
 612         struct damon_target *t;
 613         bool stop;
 614
 615         mutex_lock(&ctx->kdamond_lock);
 616         stop = ctx->kdamond_stop;
 617         mutex_unlock(&ctx->kdamond_lock);
 618         if (stop)
 619                 return true;
 620
 621         if (!ctx->primitive.target_valid)
 622                 return false;
 623
 624         damon_for_each_target(t, ctx) {
 625                 if (ctx->primitive.target_valid(t))
 626                         return false;
 627         }
 628
 629         return true;
 630 }
 631
/*
 * Raise the stop flag of @ctx under 'kdamond_lock'
 *
 * The flag is read by kdamond_need_stop(), so the kdamond leaves its main
 * loop at the next check.
 */
static void set_kdamond_stop(struct damon_ctx *ctx)
{
        mutex_lock(&ctx->kdamond_lock);
        ctx->kdamond_stop = true;
        mutex_unlock(&ctx->kdamond_lock);
}
 638
/*
 * The monitoring daemon that runs as a kernel thread
 *
 * @data is the 'struct damon_ctx' to monitor, as passed by __damon_start().
 *
 * Each iteration prepares the access checks, sleeps for one sampling
 * interval, and checks the accesses.  Once per aggregation interval it also
 * merges, reports/resets and splits the regions; once per primitive update
 * interval it lets the primitive update itself and recomputes the region size
 * limit.  A non-zero return from a callback requests the thread to stop.
 *
 * On exit all regions of all targets are destroyed, 'ctx->kdamond' is
 * cleared and 'nr_running_ctxs' is decremented.  The function does not
 * return; the thread ends in do_exit().
 */
static int kdamond_fn(void *data)
{
        struct damon_ctx *ctx = (struct damon_ctx *)data;
        struct damon_target *t;
        struct damon_region *r, *next;
        unsigned int max_nr_accesses = 0;
        unsigned long sz_limit = 0;

        /*
         * __damon_start() holds kdamond_lock while it stores the result of
         * kthread_run() in ctx->kdamond, so taking the lock here ensures the
         * pointer has been set before it is dereferenced.
         */
        mutex_lock(&ctx->kdamond_lock);
        pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
        mutex_unlock(&ctx->kdamond_lock);

        if (ctx->primitive.init)
                ctx->primitive.init(ctx);
        if (ctx->callback.before_start && ctx->callback.before_start(ctx))
                set_kdamond_stop(ctx);

        sz_limit = damon_region_sz_limit(ctx);

        while (!kdamond_need_stop(ctx)) {
                if (ctx->primitive.prepare_access_checks)
                        ctx->primitive.prepare_access_checks(ctx);
                if (ctx->callback.after_sampling &&
                                ctx->callback.after_sampling(ctx))
                        set_kdamond_stop(ctx);

                usleep_range(ctx->sample_interval, ctx->sample_interval + 1);

                /*
                 * Without a check_accesses primitive, max_nr_accesses keeps
                 * its previous value (initially 0).
                 */
                if (ctx->primitive.check_accesses)
                        max_nr_accesses = ctx->primitive.check_accesses(ctx);

                if (kdamond_aggregate_interval_passed(ctx)) {
                        /* Merge threshold is 10% of the max access count */
                        kdamond_merge_regions(ctx,
                                        max_nr_accesses / 10,
                                        sz_limit);
                        /*
                         * The callback runs before the reset below, so it
                         * still sees the aggregated 'nr_accesses' values.
                         */
                        if (ctx->callback.after_aggregation &&
                                        ctx->callback.after_aggregation(ctx))
                                set_kdamond_stop(ctx);
                        kdamond_reset_aggregated(ctx);
                        kdamond_split_regions(ctx);
                        if (ctx->primitive.reset_aggregated)
                                ctx->primitive.reset_aggregated(ctx);
                }

                if (kdamond_need_update_primitive(ctx)) {
                        if (ctx->primitive.update)
                                ctx->primitive.update(ctx);
                        /* The update may have changed the regions */
                        sz_limit = damon_region_sz_limit(ctx);
                }
        }
        /* Monitoring is over: drop every region, but keep the targets */
        damon_for_each_target(t, ctx) {
                damon_for_each_region_safe(r, next, t)
                        damon_destroy_region(r, t);
        }

        /*
         * NOTE(review): the stop flag set here is not read again in this
         * function, so it appears to have no further effect on this thread.
         */
        if (ctx->callback.before_terminate &&
                        ctx->callback.before_terminate(ctx))
                set_kdamond_stop(ctx);
        if (ctx->primitive.cleanup)
                ctx->primitive.cleanup(ctx);

        /*
         * ctx->kdamond is read without kdamond_lock here, unlike at start.
         * In this file it is cleared only by this thread, just below.
         */
        pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
        /* __damon_stop() polls for this pointer becoming NULL */
        mutex_lock(&ctx->kdamond_lock);
        ctx->kdamond = NULL;
        mutex_unlock(&ctx->kdamond_lock);

        mutex_lock(&damon_lock);
        nr_running_ctxs--;
        mutex_unlock(&damon_lock);

        do_exit(0);
}