Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
[linux-2.6-microblaze.git] / fs / afs / rotate.c
1 /* Handle fileserver selection and rotation.
2  *
3  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
4  * Written by David Howells (dhowells@redhat.com)
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public Licence
8  * as published by the Free Software Foundation; either version
9  * 2 of the Licence, or (at your option) any later version.
10  */
11
12 #include <linux/kernel.h>
13 #include <linux/slab.h>
14 #include <linux/fs.h>
15 #include <linux/sched.h>
16 #include <linux/delay.h>
17 #include <linux/sched/signal.h>
18 #include "internal.h"
19 #include "afs_fs.h"
20
21 /*
22  * Begin an operation on the fileserver.
23  *
24  * Fileserver operations are serialised on the server by vnode, so we serialise
25  * them here also using the io_lock.
26  */
27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
28                                struct key *key, bool intr)
29 {
30         memset(fc, 0, sizeof(*fc));
31         fc->vnode = vnode;
32         fc->key = key;
33         fc->ac.error = SHRT_MAX;
34         fc->error = -EDESTADDRREQ;
35
36         if (intr) {
37                 fc->flags |= AFS_FS_CURSOR_INTR;
38                 if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
39                         fc->error = -EINTR;
40                         fc->flags |= AFS_FS_CURSOR_STOP;
41                         return false;
42                 }
43         } else {
44                 mutex_lock(&vnode->io_lock);
45         }
46
47         if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
48                 fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
49         return true;
50 }
51
52 /*
53  * Begin iteration through a server list, starting with the vnode's last used
54  * server if possible, or the last recorded good server if not.
55  */
56 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
57                                    struct afs_vnode *vnode)
58 {
59         struct afs_cb_interest *cbi;
60         int i;
61
62         read_lock(&vnode->volume->servers_lock);
63         fc->server_list = afs_get_serverlist(vnode->volume->servers);
64         read_unlock(&vnode->volume->servers_lock);
65
66         fc->untried = (1UL << fc->server_list->nr_servers) - 1;
67         fc->index = READ_ONCE(fc->server_list->preferred);
68
69         cbi = rcu_dereference_protected(vnode->cb_interest,
70                                         lockdep_is_held(&vnode->io_lock));
71         if (cbi) {
72                 /* See if the vnode's preferred record is still available */
73                 for (i = 0; i < fc->server_list->nr_servers; i++) {
74                         if (fc->server_list->servers[i].cb_interest == cbi) {
75                                 fc->index = i;
76                                 goto found_interest;
77                         }
78                 }
79
80                 /* If we have a lock outstanding on a server that's no longer
81                  * serving this vnode, then we can't switch to another server
82                  * and have to return an error.
83                  */
84                 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
85                         fc->error = -ESTALE;
86                         return false;
87                 }
88
89                 /* Note that the callback promise is effectively broken */
90                 write_seqlock(&vnode->cb_lock);
91                 ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
92                 rcu_assign_pointer(vnode->cb_interest, NULL);
93                 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
94                         vnode->cb_break++;
95                 write_sequnlock(&vnode->cb_lock);
96
97                 afs_put_cb_interest(afs_v2net(vnode), cbi);
98                 cbi = NULL;
99         }
100
101 found_interest:
102         return true;
103 }
104
105 /*
106  * Post volume busy note.
107  */
108 static void afs_busy(struct afs_volume *volume, u32 abort_code)
109 {
110         const char *m;
111
112         switch (abort_code) {
113         case VOFFLINE:          m = "offline";          break;
114         case VRESTARTING:       m = "restarting";       break;
115         case VSALVAGING:        m = "being salvaged";   break;
116         default:                m = "busy";             break;
117         }
118
119         pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
120 }
121
122 /*
123  * Sleep and retry the operation to the same fileserver.
124  */
125 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
126 {
127         if (fc->flags & AFS_FS_CURSOR_INTR) {
128                 msleep_interruptible(1000);
129                 if (signal_pending(current)) {
130                         fc->error = -ERESTARTSYS;
131                         return false;
132                 }
133         } else {
134                 msleep(1000);
135         }
136
137         return true;
138 }
139
140 /*
141  * Select the fileserver to use.  May be called multiple times to rotate
142  * through the fileservers.
143  */
144 bool afs_select_fileserver(struct afs_fs_cursor *fc)
145 {
146         struct afs_addr_list *alist;
147         struct afs_server *server;
148         struct afs_vnode *vnode = fc->vnode;
149         struct afs_error e;
150         u32 rtt;
151         int error = fc->ac.error, i;
152
153         _enter("%lx[%d],%lx[%d],%d,%d",
154                fc->untried, fc->index,
155                fc->ac.tried, fc->ac.index,
156                error, fc->ac.abort_code);
157
158         if (fc->flags & AFS_FS_CURSOR_STOP) {
159                 _leave(" = f [stopped]");
160                 return false;
161         }
162
163         fc->nr_iterations++;
164
165         /* Evaluate the result of the previous operation, if there was one. */
166         switch (error) {
167         case SHRT_MAX:
168                 goto start;
169
170         case 0:
171         default:
172                 /* Success or local failure.  Stop. */
173                 fc->error = error;
174                 fc->flags |= AFS_FS_CURSOR_STOP;
175                 _leave(" = f [okay/local %d]", error);
176                 return false;
177
178         case -ECONNABORTED:
179                 /* The far side rejected the operation on some grounds.  This
180                  * might involve the server being busy or the volume having been moved.
181                  */
182                 switch (fc->ac.abort_code) {
183                 case VNOVOL:
184                         /* This fileserver doesn't know about the volume.
185                          * - May indicate that the VL is wrong - retry once and compare
186                          *   the results.
187                          * - May indicate that the fileserver couldn't attach to the vol.
188                          */
189                         if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
190                                 fc->error = -EREMOTEIO;
191                                 goto next_server;
192                         }
193
194                         write_lock(&vnode->volume->servers_lock);
195                         fc->server_list->vnovol_mask |= 1 << fc->index;
196                         write_unlock(&vnode->volume->servers_lock);
197
198                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
199                         error = afs_check_volume_status(vnode->volume, fc->key);
200                         if (error < 0)
201                                 goto failed_set_error;
202
203                         if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
204                                 fc->error = -ENOMEDIUM;
205                                 goto failed;
206                         }
207
208                         /* If the server list didn't change, then assume that
209                          * it's the fileserver having trouble.
210                          */
211                         if (vnode->volume->servers == fc->server_list) {
212                                 fc->error = -EREMOTEIO;
213                                 goto next_server;
214                         }
215
216                         /* Try again */
217                         fc->flags |= AFS_FS_CURSOR_VNOVOL;
218                         _leave(" = t [vnovol]");
219                         return true;
220
221                 case VSALVAGE: /* TODO: Should this return an error or iterate? */
222                 case VVOLEXISTS:
223                 case VNOSERVICE:
224                 case VONLINE:
225                 case VDISKFULL:
226                 case VOVERQUOTA:
227                         fc->error = afs_abort_to_error(fc->ac.abort_code);
228                         goto next_server;
229
230                 case VOFFLINE:
231                         if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
232                                 afs_busy(vnode->volume, fc->ac.abort_code);
233                                 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
234                         }
235                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
236                                 fc->error = -EADV;
237                                 goto failed;
238                         }
239                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
240                                 fc->error = -ESTALE;
241                                 goto failed;
242                         }
243                         goto busy;
244
245                 case VSALVAGING:
246                 case VRESTARTING:
247                 case VBUSY:
248                         /* Retry after going round all the servers unless we
249                          * have a file lock we need to maintain.
250                          */
251                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
252                                 fc->error = -EBUSY;
253                                 goto failed;
254                         }
255                         if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
256                                 afs_busy(vnode->volume, fc->ac.abort_code);
257                                 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
258                         }
259                 busy:
260                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
261                                 if (!afs_sleep_and_retry(fc))
262                                         goto failed;
263
264                                  /* Retry with same server & address */
265                                 _leave(" = t [vbusy]");
266                                 return true;
267                         }
268
269                         fc->flags |= AFS_FS_CURSOR_VBUSY;
270                         goto next_server;
271
272                 case VMOVED:
273                         /* The volume migrated to another server.  We consider
274                          * consider all locks and callbacks broken and request
275                          * an update from the VLDB.
276                          *
277                          * We also limit the number of VMOVED hops we will
278                          * honour, just in case someone sets up a loop.
279                          */
280                         if (fc->flags & AFS_FS_CURSOR_VMOVED) {
281                                 fc->error = -EREMOTEIO;
282                                 goto failed;
283                         }
284                         fc->flags |= AFS_FS_CURSOR_VMOVED;
285
286                         set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
287                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
288                         error = afs_check_volume_status(vnode->volume, fc->key);
289                         if (error < 0)
290                                 goto failed_set_error;
291
292                         /* If the server list didn't change, then the VLDB is
293                          * out of sync with the fileservers.  This is hopefully
294                          * a temporary condition, however, so we don't want to
295                          * permanently block access to the file.
296                          *
297                          * TODO: Try other fileservers if we can.
298                          *
299                          * TODO: Retry a few times with sleeps.
300                          */
301                         if (vnode->volume->servers == fc->server_list) {
302                                 fc->error = -ENOMEDIUM;
303                                 goto failed;
304                         }
305
306                         goto restart_from_beginning;
307
308                 default:
309                         clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
310                         clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
311                         fc->error = afs_abort_to_error(fc->ac.abort_code);
312                         goto failed;
313                 }
314
315         case -ETIMEDOUT:
316         case -ETIME:
317                 if (fc->error != -EDESTADDRREQ)
318                         goto iterate_address;
319                 /* Fall through */
320         case -ERFKILL:
321         case -EADDRNOTAVAIL:
322         case -ENETUNREACH:
323         case -EHOSTUNREACH:
324         case -EHOSTDOWN:
325         case -ECONNREFUSED:
326                 _debug("no conn");
327                 fc->error = error;
328                 goto iterate_address;
329
330         case -ECONNRESET:
331                 _debug("call reset");
332                 fc->error = error;
333                 goto failed;
334         }
335
336 restart_from_beginning:
337         _debug("restart");
338         afs_end_cursor(&fc->ac);
339         afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
340         fc->cbi = NULL;
341         afs_put_serverlist(afs_v2net(vnode), fc->server_list);
342         fc->server_list = NULL;
343 start:
344         _debug("start");
345         /* See if we need to do an update of the volume record.  Note that the
346          * volume may have moved or even have been deleted.
347          */
348         error = afs_check_volume_status(vnode->volume, fc->key);
349         if (error < 0)
350                 goto failed_set_error;
351
352         if (!afs_start_fs_iteration(fc, vnode))
353                 goto failed;
354
355         _debug("__ VOL %llx __", vnode->volume->vid);
356         error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
357         if (error < 0)
358                 goto failed_set_error;
359
360 pick_server:
361         _debug("pick [%lx]", fc->untried);
362
363         error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
364         if (error < 0)
365                 goto failed_set_error;
366
367         /* Pick the untried server with the lowest RTT.  If we have outstanding
368          * callbacks, we stick with the server we're already using if we can.
369          */
370         if (fc->cbi) {
371                 _debug("cbi %u", fc->index);
372                 if (test_bit(fc->index, &fc->untried))
373                         goto selected_server;
374                 afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
375                 fc->cbi = NULL;
376                 _debug("nocbi");
377         }
378
379         fc->index = -1;
380         rtt = U32_MAX;
381         for (i = 0; i < fc->server_list->nr_servers; i++) {
382                 struct afs_server *s = fc->server_list->servers[i].server;
383
384                 if (!test_bit(i, &fc->untried) || !s->probe.responded)
385                         continue;
386                 if (s->probe.rtt < rtt) {
387                         fc->index = i;
388                         rtt = s->probe.rtt;
389                 }
390         }
391
392         if (fc->index == -1)
393                 goto no_more_servers;
394
395 selected_server:
396         _debug("use %d", fc->index);
397         __clear_bit(fc->index, &fc->untried);
398
399         /* We're starting on a different fileserver from the list.  We need to
400          * check it, create a callback intercept, find its address list and
401          * probe its capabilities before we use it.
402          */
403         ASSERTCMP(fc->ac.alist, ==, NULL);
404         server = fc->server_list->servers[fc->index].server;
405
406         if (!afs_check_server_record(fc, server))
407                 goto failed;
408
409         _debug("USING SERVER: %pU", &server->uuid);
410
411         /* Make sure we've got a callback interest record for this server.  We
412          * have to link it in before we send the request as we can be sent a
413          * break request before we've finished decoding the reply and
414          * installing the vnode.
415          */
416         error = afs_register_server_cb_interest(vnode, fc->server_list,
417                                                 fc->index);
418         if (error < 0)
419                 goto failed_set_error;
420
421         fc->cbi = afs_get_cb_interest(
422                 rcu_dereference_protected(vnode->cb_interest,
423                                           lockdep_is_held(&vnode->io_lock)));
424
425         read_lock(&server->fs_lock);
426         alist = rcu_dereference_protected(server->addresses,
427                                           lockdep_is_held(&server->fs_lock));
428         afs_get_addrlist(alist);
429         read_unlock(&server->fs_lock);
430
431         memset(&fc->ac, 0, sizeof(fc->ac));
432
433         if (!fc->ac.alist)
434                 fc->ac.alist = alist;
435         else
436                 afs_put_addrlist(alist);
437
438         fc->ac.index = -1;
439
440 iterate_address:
441         ASSERT(fc->ac.alist);
442         /* Iterate over the current server's address list to try and find an
443          * address on which it will respond to us.
444          */
445         if (!afs_iterate_addresses(&fc->ac))
446                 goto next_server;
447
448         _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
449
450         _leave(" = t");
451         return true;
452
453 next_server:
454         _debug("next");
455         afs_end_cursor(&fc->ac);
456         goto pick_server;
457
458 no_more_servers:
459         /* That's all the servers poked to no good effect.  Try again if some
460          * of them were busy.
461          */
462         if (fc->flags & AFS_FS_CURSOR_VBUSY)
463                 goto restart_from_beginning;
464
465         e.error = -EDESTADDRREQ;
466         e.responded = false;
467         for (i = 0; i < fc->server_list->nr_servers; i++) {
468                 struct afs_server *s = fc->server_list->servers[i].server;
469
470                 afs_prioritise_error(&e, READ_ONCE(s->probe.error),
471                                      s->probe.abort_code);
472         }
473
474         error = e.error;
475
476 failed_set_error:
477         fc->error = error;
478 failed:
479         fc->flags |= AFS_FS_CURSOR_STOP;
480         afs_end_cursor(&fc->ac);
481         _leave(" = f [failed %d]", fc->error);
482         return false;
483 }
484
/*
 * Select the fileserver already associated with the vnode, and only that
 * fileserver.  This is used when a lock is held on the file, since the lock
 * is backed only by the fileserver it was obtained from.
 *
 * The first call (fc->ac.error == SHRT_MAX) sets up an address cursor on the
 * server recorded in the vnode's callback interest; later calls step through
 * that server's addresses after connection-type failures.
 *
 * Returns true if the caller should issue the operation using fc->ac, false
 * if there is nothing (more) to try.
 */
bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
{
        struct afs_vnode *vnode = fc->vnode;
        struct afs_cb_interest *cbi;
        struct afs_addr_list *alist;
        int error = fc->ac.error;

        _enter("");

        cbi = rcu_dereference_protected(vnode->cb_interest,
                                        lockdep_is_held(&vnode->io_lock));

        switch (error) {
        case SHRT_MAX:
                /* Nothing tried yet: the vnode must already have a server. */
                if (!cbi)
                        goto stale;

                fc->cbi = afs_get_cb_interest(cbi);

                read_lock(&cbi->server->fs_lock);
                alist = rcu_dereference_protected(cbi->server->addresses,
                                                  lockdep_is_held(&cbi->server->fs_lock));
                afs_get_addrlist(alist);
                read_unlock(&cbi->server->fs_lock);
                if (!alist)
                        goto stale;

                memset(&fc->ac, 0, sizeof(fc->ac));
                fc->ac.alist = alist;
                fc->ac.index = -1;
                break;

        case -ECONNABORTED:
                /* The server rejected the call; translate and stop. */
                fc->error = afs_abort_to_error(fc->ac.abort_code);
                fc->flags |= AFS_FS_CURSOR_STOP;
                _leave(" = f [abort]");
                return false;

        case -ERFKILL:
        case -EADDRNOTAVAIL:
        case -ENETUNREACH:
        case -EHOSTUNREACH:
        case -EHOSTDOWN:
        case -ECONNREFUSED:
        case -ETIMEDOUT:
        case -ETIME:
                /* Couldn't reach that address; move on to the next one. */
                _debug("no conn");
                fc->error = error;
                break;

        case 0:
        default:
                /* Success or local failure.  Stop. */
                fc->error = error;
                fc->flags |= AFS_FS_CURSOR_STOP;
                _leave(" = f [okay/local %d]", error);
                return false;
        }

        /* Step to the next address of the current server, if there is one. */
        if (!afs_iterate_addresses(&fc->ac)) {
                afs_end_cursor(&fc->ac);
                return false;
        }

        _leave(" = t");
        return true;

stale:
        fc->error = -ESTALE;
        fc->flags |= AFS_FS_CURSOR_STOP;
        return false;
}
567
568 /*
569  * Dump cursor state in the case of the error being EDESTADDRREQ.
570  */
571 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
572 {
573         static int count;
574         int i;
575
576         if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
577                 return;
578         count++;
579
580         rcu_read_lock();
581
582         pr_notice("EDESTADDR occurred\n");
583         pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
584                   fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
585         pr_notice("FC: ut=%lx ix=%d ni=%u\n",
586                   fc->untried, fc->index, fc->nr_iterations);
587
588         if (fc->server_list) {
589                 const struct afs_server_list *sl = fc->server_list;
590                 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
591                           sl->nr_servers, sl->preferred, sl->vnovol_mask);
592                 for (i = 0; i < sl->nr_servers; i++) {
593                         const struct afs_server *s = sl->servers[i].server;
594                         pr_notice("FC: server fl=%lx av=%u %pU\n",
595                                   s->flags, s->addr_version, &s->uuid);
596                         if (s->addresses) {
597                                 const struct afs_addr_list *a =
598                                         rcu_dereference(s->addresses);
599                                 pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
600                                           a->version,
601                                           a->nr_ipv4, a->nr_addrs, a->max_addrs,
602                                           a->preferred);
603                                 pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
604                                           a->probed, a->responded, a->failed);
605                                 if (a == fc->ac.alist)
606                                         pr_notice("FC:  - current\n");
607                         }
608                 }
609         }
610
611         pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
612                   fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
613                   fc->ac.responded, fc->ac.nr_iterations);
614         rcu_read_unlock();
615 }
616
617 /*
618  * Tidy up a filesystem cursor and unlock the vnode.
619  */
620 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
621 {
622         struct afs_net *net = afs_v2net(fc->vnode);
623
624         if (fc->error == -EDESTADDRREQ ||
625             fc->error == -EADDRNOTAVAIL ||
626             fc->error == -ENETUNREACH ||
627             fc->error == -EHOSTUNREACH)
628                 afs_dump_edestaddrreq(fc);
629
630         mutex_unlock(&fc->vnode->io_lock);
631
632         afs_end_cursor(&fc->ac);
633         afs_put_cb_interest(net, fc->cbi);
634         afs_put_serverlist(net, fc->server_list);
635
636         if (fc->error == -ECONNABORTED)
637                 fc->error = afs_abort_to_error(fc->ac.abort_code);
638
639         return fc->error;
640 }