kernel/irq/timings.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
   3
   4 #include <linux/kernel.h>
   5 #include <linux/percpu.h>
   6 #include <linux/slab.h>
   7 #include <linux/static_key.h>
   8 #include <linux/interrupt.h>
   9 #include <linux/idr.h>
  10 #include <linux/irq.h>
  11 #include <linux/math64.h>
  12 #include <linux/log2.h>
  13
  14 #include <trace/events/irq.h>
  15
  16 #include "internals.h"
  17
/*
 * Static key, initially false, flipped by irq_timings_enable() and
 * irq_timings_disable().
 * NOTE(review): the readers of this key are not in this file; presumably
 * the timing recording hot path tests it - confirm.
 */
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

/*
 * Per-CPU circular buffer of encoded <irq, timestamp> samples. It is
 * drained by irq_timings_next_event().
 */
DEFINE_PER_CPU(struct irq_timings, irq_timings);

/*
 * Maps an interrupt number to its per-CPU struct irqt_stat. Entries are
 * inserted by irq_timings_alloc() and removed by irq_timings_free().
 */
static DEFINE_IDR(irqt_stats);
  23
/*
 * irq_timings_enable - switch the irq_timing_enabled static key on
 */
void irq_timings_enable(void)
{
        static_branch_enable(&irq_timing_enabled);
}
  28
/*
 * irq_timings_disable - switch the irq_timing_enabled static key off
 */
void irq_timings_disable(void)
{
        static_branch_disable(&irq_timing_enabled);
}
  33
  34 /*
  35  * The main goal of this algorithm is to predict the next interrupt
  36  * occurrence on the current CPU.
  37  *
  38  * Currently, the interrupt timings are stored in a circular array
  39  * buffer every time there is an interrupt, as a tuple: the interrupt
  40  * number and the associated timestamp when the event occurred <irq,
  41  * timestamp>.
  42  *
  43  * For every interrupt occurring in a short period of time, we can
  44  * measure the elapsed time between the occurrences for the same
  45  * interrupt and we end up with a suite of intervals. The experience
  46  * showed the interrupts are often coming following a periodic
  47  * pattern.
  48  *
 * The objective of the algorithm is to find out this periodic pattern
 * in the fastest way and use its period to predict the next irq event.
  51  *
  52  * When the next interrupt event is requested, we are in the situation
  53  * where the interrupts are disabled and the circular buffer
  54  * containing the timings is filled with the events which happened
  55  * after the previous next-interrupt-event request.
  56  *
  57  * At this point, we read the circular buffer and we fill the irq
  58  * related statistics structure. After this step, the circular array
  59  * containing the timings is empty because all the values are
  60  * dispatched in their corresponding buffers.
  61  *
  62  * Now for each interrupt, we can predict the next event by using the
  63  * suffix array, log interval and exponential moving average
  64  *
  65  * 1. Suffix array
  66  *
  67  * Suffix array is an array of all the suffixes of a string. It is
  68  * widely used as a data structure for compression, text search, ...
  69  * For instance for the word 'banana', the suffixes will be: 'banana'
  70  * 'anana' 'nana' 'ana' 'na' 'a'
  71  *
  72  * Usually, the suffix array is sorted but for our purpose it is
  73  * not necessary and won't provide any improvement in the context of
  74  * the solved problem where we clearly define the boundaries of the
  75  * search by a max period and min period.
  76  *
  77  * The suffix array will build a suite of intervals of different
  78  * length and will look for the repetition of each suite. If the suite
  79  * is repeating then we have the period because it is the length of
  80  * the suite whatever its position in the buffer.
  81  *
  82  * 2. Log interval
  83  *
 * We saw the irq timings allow us to compute the interval between the
 * occurrences of a specific interrupt. We can reasonably assume that
 * the longer the interval is, the higher the error for the next event
 * is, so we can consider storing those interval values into an array
 * where each slot in the array corresponds to an interval at the power
 * of 2 of the index. For example, index 12 will contain values
 * between 2^12 and 2^13 - 1.
  91  *
 * At the end we have an array where each index defines a
 * [2^index, 2^(index + 1) - 1] range of interval values, allowing to
 * store a large number of values inside a small array.
  95  *
  96  * For example, if we have the value 1123, then we store it at
  97  * ilog2(1123) = 10 index value.
  98  *
  99  * Storing those value at the specific index is done by computing an
 100  * exponential moving average for this specific slot. For instance,
 101  * for values 1800, 1123, 1453, ... fall under the same slot (10) and
 102  * the exponential moving average is computed every time a new value
 103  * is stored at this slot.
 104  *
 105  * 3. Exponential Moving Average
 106  *
 107  * The EMA is largely used to track a signal for stocks or as a low
 108  * pass filter. The magic of the formula, is it is very simple and the
 109  * reactivity of the average can be tuned with the factors called
 110  * alpha.
 111  *
 112  * The higher the alphas are, the faster the average respond to the
 113  * signal change. In our case, if a slot in the array is a big
 114  * interval, we can have numbers with a big difference between
 115  * them. The impact of those differences in the average computation
 116  * can be tuned by changing the alpha value.
 117  *
 118  *
 119  *  -- The algorithm --
 120  *
 121  * We saw the different processing above, now let's see how they are
 122  * used together.
 123  *
 124  * For each interrupt:
 125  *      For each interval:
 126  *              Compute the index = ilog2(interval)
 127  *              Compute a new_ema(buffer[index], interval)
 128  *              Store the index in a circular buffer
 129  *
 130  *      Compute the suffix array of the indexes
 131  *
 132  *      For each suffix:
 133  *              If the suffix is reverse-found 3 times
 134  *                      Return suffix
 135  *
 136  *      Return Not found
 137  *
 * However we cannot build an endless suffix array: it would not make
 * sense and it would add extra overhead, so we restrict this to a
 * maximum suffix length of 5 and a minimum suffix length of 2. Note
 * that PREDICTION_PERIOD_MIN is currently defined as 3, so the
 * shortest suffix the code actually searches has a length of 3.
 * Experience showed that 5 covers the majority of the maximum
 * pattern periods found for different devices.
 143  *
 144  * The result is a pattern finding less than 1us for an interrupt.
 145  *
 146  * Example based on real values:
 147  *
 148  * Example 1 : MMC write/read interrupt interval:
 149  *
 150  *      223947, 1240, 1384, 1386, 1386,
 151  *      217416, 1236, 1384, 1386, 1387,
 152  *      214719, 1241, 1386, 1387, 1384,
 153  *      213696, 1234, 1384, 1386, 1388,
 154  *      219904, 1240, 1385, 1389, 1385,
 155  *      212240, 1240, 1386, 1386, 1386,
 156  *      214415, 1236, 1384, 1386, 1387,
 157  *      214276, 1234, 1384, 1388, ?
 158  *
 159  * For each element, apply ilog2(value)
 160  *
 161  *      15, 8, 8, 8, 8,
 162  *      15, 8, 8, 8, 8,
 163  *      15, 8, 8, 8, 8,
 164  *      15, 8, 8, 8, 8,
 165  *      15, 8, 8, 8, 8,
 166  *      15, 8, 8, 8, 8,
 167  *      15, 8, 8, 8, 8,
 168  *      15, 8, 8, 8, ?
 169  *
 170  * Max period of 5, we take the last (max_period * 3) 15 elements as
 171  * we can be confident if the pattern repeats itself three times it is
 172  * a repeating pattern.
 173  *
 174  *                   8,
 175  *      15, 8, 8, 8, 8,
 176  *      15, 8, 8, 8, 8,
 177  *      15, 8, 8, 8, ?
 178  *
 179  * Suffixes are:
 180  *
 181  *  1) 8, 15, 8, 8, 8  <- max period
 182  *  2) 8, 15, 8, 8
 183  *  3) 8, 15, 8
 184  *  4) 8, 15           <- min period
 185  *
 186  * From there we search the repeating pattern for each suffix.
 187  *
 188  * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
 189  *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
 190  *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
 191  *                         8, 15, 8, 8, 8  |   |  |  |  |
 192  *                                         8, 15, 8, 8, 8
 193  *
 194  * When moving the suffix, we found exactly 3 matches.
 195  *
 196  * The first suffix with period 5 is repeating.
 197  *
 198  * The next event is (3 * max_period) % suffix_period
 199  *
 200  * In this example, the result 0, so the next event is suffix[0] => 8
 201  *
 202  * However, 8 is the index in the array of exponential moving average
 203  * which was calculated on the fly when storing the values, so the
 204  * interval is ema[8] = 1366
 205  *
 206  *
 207  * Example 2:
 208  *
 209  *      4, 3, 5, 100,
 210  *      3, 3, 5, 117,
 211  *      4, 4, 5, 112,
 212  *      4, 3, 4, 110,
 213  *      3, 5, 3, 117,
 214  *      4, 4, 5, 112,
 215  *      4, 3, 4, 110,
 216  *      3, 4, 5, 112,
 217  *      4, 3, 4, 110
 218  *
 219  * ilog2
 220  *
 221  *      0, 0, 0, 4,
 222  *      0, 0, 0, 4,
 223  *      0, 0, 0, 4,
 224  *      0, 0, 0, 4,
 225  *      0, 0, 0, 4,
 226  *      0, 0, 0, 4,
 227  *      0, 0, 0, 4,
 228  *      0, 0, 0, 4,
 229  *      0, 0, 0, 4
 230  *
 231  * Max period 5:
 232  *         0, 0, 4,
 233  *      0, 0, 0, 4,
 234  *      0, 0, 0, 4,
 235  *      0, 0, 0, 4
 236  *
 237  * Suffixes:
 238  *
 239  *  1) 0, 0, 4, 0, 0
 240  *  2) 0, 0, 4, 0
 241  *  3) 0, 0, 4
 242  *  4) 0, 0
 243  *
 244  * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
 245  *         |  |  |  |  |  |  X
 246  *         0, 0, 4, 0, 0, |  X
 247  *                        0, 0
 248  *
 249  * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
 250  *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
 251  *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
 252  *                     0, 0, 4, 0, |  |  |  |  |  |  |
 253  *                                 0, 0, 4, 0, |  |  |
 254  *                                             0  0  4
 255  *
 * Pattern is found 3 times, the remainder is 3 which results from
 * (max_period * 3) % suffix_period, here 15 % 4. This value is the
 * index in the suffix array. The suffix array for a period 4 has the
 * value 0 at index 3.
 260  */
/*
 * EMA smoothing factor, applied in irq_timings_ema_new() as a fixed-point
 * fraction: alpha = EMA_ALPHA_VAL / 2^EMA_ALPHA_SHIFT = 64 / 128 = 0.5.
 */
#define EMA_ALPHA_VAL           64
#define EMA_ALPHA_SHIFT         7

/*
 * PREDICTION_PERIOD_MIN / PREDICTION_PERIOD_MAX bound the length of the
 * repeating pattern searched by irq_timings_next_event_index().
 * PREDICTION_FACTOR divides the (interval >> 10) value before ilog2() is
 * applied, which widens the range of intervals sharing one EMA slot.
 * PREDICTION_BUFFER_SIZE is the number of EMA slots in struct irqt_stat.
 * NOTE(review): PREDICTION_MAX is not referenced in the code visible in
 * this file - confirm whether it is still needed.
 */
#define PREDICTION_PERIOD_MIN   3
#define PREDICTION_PERIOD_MAX   5
#define PREDICTION_FACTOR       4
#define PREDICTION_MAX          10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE  16 /* slots for EMAs, hardly more than 16 */
 269
 270 /*
 271  * Number of elements in the circular buffer: If it happens it was
 272  * flushed before, then the number of elements could be smaller than
 273  * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
 274  * used as we wrapped. The index begins from zero when we did not
 275  * wrap. That could be done in a nicer way with the proper circular
 276  * array structure type but with the cost of extra computation in the
 277  * interrupt handler hot path. We choose efficiency.
 278  */
 279 #define for_each_irqts(i, irqts)                                        \
 280         for (i = irqts->count < IRQ_TIMINGS_SIZE ?                      \
 281                      0 : irqts->count & IRQ_TIMINGS_MASK,               \
 282                      irqts->count = min(IRQ_TIMINGS_SIZE,               \
 283                                         irqts->count);                  \
 284              irqts->count > 0; irqts->count--,                          \
 285                      i = (i + 1) & IRQ_TIMINGS_MASK)
 286
/*
 * Prediction state kept for one interrupt. Instances are allocated
 * per-CPU by irq_timings_alloc().
 */
struct irqt_stat {
        /* Timestamp of the last event passed to irq_timings_store() */
        u64     last_ts;
        /* One exponential moving average of the intervals per ilog2 slot */
        u64     ema_time[PREDICTION_BUFFER_SIZE];
        /* Linearized copy of circ_timings, rebuilt for each prediction */
        int     timings[IRQ_TIMINGS_SIZE];
        /* Circular buffer of the slot indexes of the latest intervals */
        int     circ_timings[IRQ_TIMINGS_SIZE];
        /*
         * Number of intervals stored since the last reset; masked with
         * IRQ_TIMINGS_MASK it gives the next write position in
         * circ_timings.
         */
        int     count;
};
 294
 295 /*
 296  * Exponential moving average computation
 297  */
 298 static u64 irq_timings_ema_new(u64 value, u64 ema_old)
 299 {
 300         s64 diff;
 301
 302         if (unlikely(!ema_old))
 303                 return value;
 304
 305         diff = (value - ema_old) * EMA_ALPHA_VAL;
 306         /*
 307          * We can use a s64 type variable to be added with the u64
 308          * ema_old variable as this one will never have its topmost
 309          * bit set, it will be always smaller than 2^63 nanosec
 310          * interrupt interval (292 years).
 311          */
 312         return ema_old + (diff >> EMA_ALPHA_SHIFT);
 313 }
 314
 315 static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
 316 {
 317         int period;
 318
 319         /*
 320          * Move the beginning pointer to the end minus the max period x 3.
 321          * We are at the point we can begin searching the pattern
 322          */
 323         buffer = &buffer[len - (period_max * 3)];
 324
 325         /* Adjust the length to the maximum allowed period x 3 */
 326         len = period_max * 3;
 327
 328         /*
 329          * The buffer contains the suite of intervals, in a ilog2
 330          * basis, we are looking for a repetition. We point the
 331          * beginning of the search three times the length of the
 332          * period beginning at the end of the buffer. We do that for
 333          * each suffix.
 334          */
 335         for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
 336
 337                 /*
 338                  * The first comparison always succeed because the
 339                  * suffix is deduced from the first n-period bytes of
 340                  * the buffer and we compare the initial suffix with
 341                  * itself, so we can skip the first iteration.
 342                  */
 343                 int idx = period;
 344                 size_t size = period;
 345
 346                 /*
 347                  * We look if the suite with period 'i' repeat
 348                  * itself. If it is truncated at the end, as it
 349                  * repeats we can use the period to find out the next
 350                  * element with the modulo.
 351                  */
 352                 while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
 353
 354                         /*
 355                          * Move the index in a period basis
 356                          */
 357                         idx += size;
 358
 359                         /*
 360                          * If this condition is reached, all previous
 361                          * memcmp were successful, so the period is
 362                          * found.
 363                          */
 364                         if (idx == len)
 365                                 return buffer[len % period];
 366
 367                         /*
 368                          * If the remaining elements to compare are
 369                          * smaller than the period, readjust the size
 370                          * of the comparison for the last iteration.
 371                          */
 372                         if (len - idx < period)
 373                                 size = len - idx;
 374                 }
 375         }
 376
 377         return -1;
 378 }
 379
 380 static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
 381 {
 382         int index, i, period_max, count, start, min = INT_MAX;
 383
 384         if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
 385                 irqs->count = irqs->last_ts = 0;
 386                 return U64_MAX;
 387         }
 388
 389         /*
 390          * As we want to find three times the repetition, we need a
 391          * number of intervals greater or equal to three times the
 392          * maximum period, otherwise we truncate the max period.
 393          */
 394         period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
 395                 PREDICTION_PERIOD_MAX : irqs->count / 3;
 396
 397         /*
 398          * If we don't have enough irq timings for this prediction,
 399          * just bail out.
 400          */
 401         if (period_max <= PREDICTION_PERIOD_MIN)
 402                 return U64_MAX;
 403
 404         /*
 405          * 'count' will depends if the circular buffer wrapped or not
 406          */
 407         count = irqs->count < IRQ_TIMINGS_SIZE ?
 408                 irqs->count : IRQ_TIMINGS_SIZE;
 409
 410         start = irqs->count < IRQ_TIMINGS_SIZE ?
 411                 0 : (irqs->count & IRQ_TIMINGS_MASK);
 412
 413         /*
 414          * Copy the content of the circular buffer into another buffer
 415          * in order to linearize the buffer instead of dealing with
 416          * wrapping indexes and shifted array which will be prone to
 417          * error and extremelly difficult to debug.
 418          */
 419         for (i = 0; i < count; i++) {
 420                 int index = (start + i) & IRQ_TIMINGS_MASK;
 421
 422                 irqs->timings[i] = irqs->circ_timings[index];
 423                 min = min_t(int, irqs->timings[i], min);
 424         }
 425
 426         index = irq_timings_next_event_index(irqs->timings, count, period_max);
 427         if (index < 0)
 428                 return irqs->last_ts + irqs->ema_time[min];
 429
 430         return irqs->last_ts + irqs->ema_time[index];
 431 }
 432
 433 static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 434 {
 435         u64 old_ts = irqs->last_ts;
 436         u64 interval;
 437         int index;
 438
 439         /*
 440          * The timestamps are absolute time values, we need to compute
 441          * the timing interval between two interrupts.
 442          */
 443         irqs->last_ts = ts;
 444
 445         /*
 446          * The interval type is u64 in order to deal with the same
 447          * type in our computation, that prevent mindfuck issues with
 448          * overflow, sign and division.
 449          */
 450         interval = ts - old_ts;
 451
 452         /*
 453          * The interrupt triggered more than one second apart, that
 454          * ends the sequence as predictible for our purpose. In this
 455          * case, assume we have the beginning of a sequence and the
 456          * timestamp is the first value. As it is impossible to
 457          * predict anything at this point, return.
 458          *
 459          * Note the first timestamp of the sequence will always fall
 460          * in this test because the old_ts is zero. That is what we
 461          * want as we need another timestamp to compute an interval.
 462          */
 463         if (interval >= NSEC_PER_SEC) {
 464                 irqs->count = 0;
 465                 return;
 466         }
 467
 468         /*
 469          * Get the index in the ema table for this interrupt. The
 470          * PREDICTION_FACTOR increase the interval size for the array
 471          * of exponential average.
 472          */
 473         index = likely(interval) ?
 474                 ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
 475
 476         /*
 477          * Store the index as an element of the pattern in another
 478          * circular array.
 479          */
 480         irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
 481
 482         irqs->ema_time[index] = irq_timings_ema_new(interval,
 483                                                     irqs->ema_time[index]);
 484
 485         irqs->count++;
 486 }
 487
 488 /**
 489  * irq_timings_next_event - Return when the next event is supposed to arrive
 490  *
 491  * During the last busy cycle, the number of interrupts is incremented
 492  * and stored in the irq_timings structure. This information is
 493  * necessary to:
 494  *
 495  * - know if the index in the table wrapped up:
 496  *
 497  *      If more than the array size interrupts happened during the
 498  *      last busy/idle cycle, the index wrapped up and we have to
 499  *      begin with the next element in the array which is the last one
 500  *      in the sequence, otherwise it is a the index 0.
 501  *
 502  * - have an indication of the interrupts activity on this CPU
 503  *   (eg. irq/sec)
 504  *
 505  * The values are 'consumed' after inserting in the statistical model,
 506  * thus the count is reinitialized.
 507  *
 508  * The array of values **must** be browsed in the time direction, the
 509  * timestamp must increase between an element and the next one.
 510  *
 511  * Returns a nanosec time based estimation of the earliest interrupt,
 512  * U64_MAX otherwise.
 513  */
 514 u64 irq_timings_next_event(u64 now)
 515 {
 516         struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
 517         struct irqt_stat *irqs;
 518         struct irqt_stat __percpu *s;
 519         u64 ts, next_evt = U64_MAX;
 520         int i, irq = 0;
 521
 522         /*
 523          * This function must be called with the local irq disabled in
 524          * order to prevent the timings circular buffer to be updated
 525          * while we are reading it.
 526          */
 527         lockdep_assert_irqs_disabled();
 528
 529         if (!irqts->count)
 530                 return next_evt;
 531
 532         /*
 533          * Number of elements in the circular buffer: If it happens it
 534          * was flushed before, then the number of elements could be
 535          * smaller than IRQ_TIMINGS_SIZE, so the count is used,
 536          * otherwise the array size is used as we wrapped. The index
 537          * begins from zero when we did not wrap. That could be done
 538          * in a nicer way with the proper circular array structure
 539          * type but with the cost of extra computation in the
 540          * interrupt handler hot path. We choose efficiency.
 541          *
 542          * Inject measured irq/timestamp to the pattern prediction
 543          * model while decrementing the counter because we consume the
 544          * data from our circular buffer.
 545          */
 546         for_each_irqts(i, irqts) {
 547                 irq = irq_timing_decode(irqts->values[i], &ts);
 548                 s = idr_find(&irqt_stats, irq);
 549                 if (s)
 550                         irq_timings_store(irq, this_cpu_ptr(s), ts);
 551         }
 552
 553         /*
 554          * Look in the list of interrupts' statistics, the earliest
 555          * next event.
 556          */
 557         idr_for_each_entry(&irqt_stats, s, i) {
 558
 559                 irqs = this_cpu_ptr(s);
 560
 561                 ts = __irq_timings_next_event(irqs, i, now);
 562                 if (ts <= now)
 563                         return now;
 564
 565                 if (ts < next_evt)
 566                         next_evt = ts;
 567         }
 568
 569         return next_evt;
 570 }
 571
 572 void irq_timings_free(int irq)
 573 {
 574         struct irqt_stat __percpu *s;
 575
 576         s = idr_find(&irqt_stats, irq);
 577         if (s) {
 578                 free_percpu(s);
 579                 idr_remove(&irqt_stats, irq);
 580         }
 581 }
 582
 583 int irq_timings_alloc(int irq)
 584 {
 585         struct irqt_stat __percpu *s;
 586         int id;
 587
 588         /*
 589          * Some platforms can have the same private interrupt per cpu,
 590          * so this function may be be called several times with the
 591          * same interrupt number. Just bail out in case the per cpu
 592          * stat structure is already allocated.
 593          */
 594         s = idr_find(&irqt_stats, irq);
 595         if (s)
 596                 return 0;
 597
 598         s = alloc_percpu(*s);
 599         if (!s)
 600                 return -ENOMEM;
 601
 602         idr_preload(GFP_KERNEL);
 603         id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
 604         idr_preload_end();
 605
 606         if (id < 0) {
 607                 free_percpu(s);
 608                 return id;
 609         }
 610
 611         return 0;
 612 }