afs: Actively poll fileservers to maintain NAT or firewall openings
[linux-2.6-microblaze.git] / fs / afs / rotate.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Handle fileserver selection and rotation.
3  *
4  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7
8 #include <linux/kernel.h>
9 #include <linux/slab.h>
10 #include <linux/fs.h>
11 #include <linux/sched.h>
12 #include <linux/delay.h>
13 #include <linux/sched/signal.h>
14 #include "internal.h"
15 #include "afs_fs.h"
16
17 /*
18  * Begin an operation on the fileserver.
19  *
20  * Fileserver operations are serialised on the server by vnode, so we serialise
21  * them here also using the io_lock.
22  */
23 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
24                                struct key *key, bool intr)
25 {
26         memset(fc, 0, sizeof(*fc));
27         fc->vnode = vnode;
28         fc->key = key;
29         fc->ac.error = SHRT_MAX;
30         fc->error = -EDESTADDRREQ;
31
32         if (intr) {
33                 fc->flags |= AFS_FS_CURSOR_INTR;
34                 if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
35                         fc->error = -EINTR;
36                         fc->flags |= AFS_FS_CURSOR_STOP;
37                         return false;
38                 }
39         } else {
40                 mutex_lock(&vnode->io_lock);
41         }
42
43         if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
44                 fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
45         return true;
46 }
47
48 /*
49  * Begin iteration through a server list, starting with the vnode's last used
50  * server if possible, or the last recorded good server if not.
51  */
52 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
53                                    struct afs_vnode *vnode)
54 {
55         struct afs_cb_interest *cbi;
56         int i;
57
58         read_lock(&vnode->volume->servers_lock);
59         fc->server_list = afs_get_serverlist(vnode->volume->servers);
60         read_unlock(&vnode->volume->servers_lock);
61
62         fc->untried = (1UL << fc->server_list->nr_servers) - 1;
63         fc->index = READ_ONCE(fc->server_list->preferred);
64
65         cbi = rcu_dereference_protected(vnode->cb_interest,
66                                         lockdep_is_held(&vnode->io_lock));
67         if (cbi) {
68                 /* See if the vnode's preferred record is still available */
69                 for (i = 0; i < fc->server_list->nr_servers; i++) {
70                         if (fc->server_list->servers[i].cb_interest == cbi) {
71                                 fc->index = i;
72                                 goto found_interest;
73                         }
74                 }
75
76                 /* If we have a lock outstanding on a server that's no longer
77                  * serving this vnode, then we can't switch to another server
78                  * and have to return an error.
79                  */
80                 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
81                         fc->error = -ESTALE;
82                         return false;
83                 }
84
85                 /* Note that the callback promise is effectively broken */
86                 write_seqlock(&vnode->cb_lock);
87                 ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
88                 rcu_assign_pointer(vnode->cb_interest, NULL);
89                 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
90                         vnode->cb_break++;
91                 write_sequnlock(&vnode->cb_lock);
92
93                 afs_put_cb_interest(afs_v2net(vnode), cbi);
94                 cbi = NULL;
95         }
96
97 found_interest:
98         return true;
99 }
100
101 /*
102  * Post volume busy note.
103  */
104 static void afs_busy(struct afs_volume *volume, u32 abort_code)
105 {
106         const char *m;
107
108         switch (abort_code) {
109         case VOFFLINE:          m = "offline";          break;
110         case VRESTARTING:       m = "restarting";       break;
111         case VSALVAGING:        m = "being salvaged";   break;
112         default:                m = "busy";             break;
113         }
114
115         pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
116 }
117
118 /*
119  * Sleep and retry the operation to the same fileserver.
120  */
121 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
122 {
123         if (fc->flags & AFS_FS_CURSOR_INTR) {
124                 msleep_interruptible(1000);
125                 if (signal_pending(current)) {
126                         fc->error = -ERESTARTSYS;
127                         return false;
128                 }
129         } else {
130                 msleep(1000);
131         }
132
133         return true;
134 }
135
136 /*
137  * Select the fileserver to use.  May be called multiple times to rotate
138  * through the fileservers.
139  */
140 bool afs_select_fileserver(struct afs_fs_cursor *fc)
141 {
142         struct afs_addr_list *alist;
143         struct afs_server *server;
144         struct afs_vnode *vnode = fc->vnode;
145         struct afs_error e;
146         u32 rtt;
147         int error = fc->ac.error, i;
148
149         _enter("%lx[%d],%lx[%d],%d,%d",
150                fc->untried, fc->index,
151                fc->ac.tried, fc->ac.index,
152                error, fc->ac.abort_code);
153
154         if (fc->flags & AFS_FS_CURSOR_STOP) {
155                 _leave(" = f [stopped]");
156                 return false;
157         }
158
159         fc->nr_iterations++;
160
161         /* Evaluate the result of the previous operation, if there was one. */
162         switch (error) {
163         case SHRT_MAX:
164                 goto start;
165
166         case 0:
167         default:
168                 /* Success or local failure.  Stop. */
169                 fc->error = error;
170                 fc->flags |= AFS_FS_CURSOR_STOP;
171                 _leave(" = f [okay/local %d]", error);
172                 return false;
173
174         case -ECONNABORTED:
175                 /* The far side rejected the operation on some grounds.  This
176                  * might involve the server being busy or the volume having been moved.
177                  */
178                 switch (fc->ac.abort_code) {
179                 case VNOVOL:
180                         /* This fileserver doesn't know about the volume.
181                          * - May indicate that the VL is wrong - retry once and compare
182                          *   the results.
183                          * - May indicate that the fileserver couldn't attach to the vol.
184                          */
185                         if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
186                                 fc->error = -EREMOTEIO;
187                                 goto next_server;
188                         }
189
190                         write_lock(&vnode->volume->servers_lock);
191                         fc->server_list->vnovol_mask |= 1 << fc->index;
192                         write_unlock(&vnode->volume->servers_lock);
193
194                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
195                         error = afs_check_volume_status(vnode->volume, fc);
196                         if (error < 0)
197                                 goto failed_set_error;
198
199                         if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
200                                 fc->error = -ENOMEDIUM;
201                                 goto failed;
202                         }
203
204                         /* If the server list didn't change, then assume that
205                          * it's the fileserver having trouble.
206                          */
207                         if (vnode->volume->servers == fc->server_list) {
208                                 fc->error = -EREMOTEIO;
209                                 goto next_server;
210                         }
211
212                         /* Try again */
213                         fc->flags |= AFS_FS_CURSOR_VNOVOL;
214                         _leave(" = t [vnovol]");
215                         return true;
216
217                 case VSALVAGE: /* TODO: Should this return an error or iterate? */
218                 case VVOLEXISTS:
219                 case VNOSERVICE:
220                 case VONLINE:
221                 case VDISKFULL:
222                 case VOVERQUOTA:
223                         fc->error = afs_abort_to_error(fc->ac.abort_code);
224                         goto next_server;
225
226                 case VOFFLINE:
227                         if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
228                                 afs_busy(vnode->volume, fc->ac.abort_code);
229                                 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
230                         }
231                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
232                                 fc->error = -EADV;
233                                 goto failed;
234                         }
235                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
236                                 fc->error = -ESTALE;
237                                 goto failed;
238                         }
239                         goto busy;
240
241                 case VSALVAGING:
242                 case VRESTARTING:
243                 case VBUSY:
244                         /* Retry after going round all the servers unless we
245                          * have a file lock we need to maintain.
246                          */
247                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
248                                 fc->error = -EBUSY;
249                                 goto failed;
250                         }
251                         if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
252                                 afs_busy(vnode->volume, fc->ac.abort_code);
253                                 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
254                         }
255                 busy:
256                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
257                                 if (!afs_sleep_and_retry(fc))
258                                         goto failed;
259
260                                  /* Retry with same server & address */
261                                 _leave(" = t [vbusy]");
262                                 return true;
263                         }
264
265                         fc->flags |= AFS_FS_CURSOR_VBUSY;
266                         goto next_server;
267
268                 case VMOVED:
269                         /* The volume migrated to another server.  We consider
270                          * consider all locks and callbacks broken and request
271                          * an update from the VLDB.
272                          *
273                          * We also limit the number of VMOVED hops we will
274                          * honour, just in case someone sets up a loop.
275                          */
276                         if (fc->flags & AFS_FS_CURSOR_VMOVED) {
277                                 fc->error = -EREMOTEIO;
278                                 goto failed;
279                         }
280                         fc->flags |= AFS_FS_CURSOR_VMOVED;
281
282                         set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
283                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
284                         error = afs_check_volume_status(vnode->volume, fc);
285                         if (error < 0)
286                                 goto failed_set_error;
287
288                         /* If the server list didn't change, then the VLDB is
289                          * out of sync with the fileservers.  This is hopefully
290                          * a temporary condition, however, so we don't want to
291                          * permanently block access to the file.
292                          *
293                          * TODO: Try other fileservers if we can.
294                          *
295                          * TODO: Retry a few times with sleeps.
296                          */
297                         if (vnode->volume->servers == fc->server_list) {
298                                 fc->error = -ENOMEDIUM;
299                                 goto failed;
300                         }
301
302                         goto restart_from_beginning;
303
304                 default:
305                         clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
306                         clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
307                         fc->error = afs_abort_to_error(fc->ac.abort_code);
308                         goto failed;
309                 }
310
311         case -ETIMEDOUT:
312         case -ETIME:
313                 if (fc->error != -EDESTADDRREQ)
314                         goto iterate_address;
315                 /* Fall through */
316         case -ERFKILL:
317         case -EADDRNOTAVAIL:
318         case -ENETUNREACH:
319         case -EHOSTUNREACH:
320         case -EHOSTDOWN:
321         case -ECONNREFUSED:
322                 _debug("no conn");
323                 fc->error = error;
324                 goto iterate_address;
325
326         case -ECONNRESET:
327                 _debug("call reset");
328                 fc->error = error;
329                 goto failed;
330         }
331
332 restart_from_beginning:
333         _debug("restart");
334         afs_end_cursor(&fc->ac);
335         afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
336         fc->cbi = NULL;
337         afs_put_serverlist(afs_v2net(vnode), fc->server_list);
338         fc->server_list = NULL;
339 start:
340         _debug("start");
341         /* See if we need to do an update of the volume record.  Note that the
342          * volume may have moved or even have been deleted.
343          */
344         error = afs_check_volume_status(vnode->volume, fc);
345         if (error < 0)
346                 goto failed_set_error;
347
348         if (!afs_start_fs_iteration(fc, vnode))
349                 goto failed;
350
351         _debug("__ VOL %llx __", vnode->volume->vid);
352
353 pick_server:
354         _debug("pick [%lx]", fc->untried);
355
356         error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
357         if (error < 0)
358                 goto failed_set_error;
359
360         /* Pick the untried server with the lowest RTT.  If we have outstanding
361          * callbacks, we stick with the server we're already using if we can.
362          */
363         if (fc->cbi) {
364                 _debug("cbi %u", fc->index);
365                 if (test_bit(fc->index, &fc->untried))
366                         goto selected_server;
367                 afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
368                 fc->cbi = NULL;
369                 _debug("nocbi");
370         }
371
372         fc->index = -1;
373         rtt = U32_MAX;
374         for (i = 0; i < fc->server_list->nr_servers; i++) {
375                 struct afs_server *s = fc->server_list->servers[i].server;
376
377                 if (!test_bit(i, &fc->untried) || !s->probe.responded)
378                         continue;
379                 if (s->probe.rtt < rtt) {
380                         fc->index = i;
381                         rtt = s->probe.rtt;
382                 }
383         }
384
385         if (fc->index == -1)
386                 goto no_more_servers;
387
388 selected_server:
389         _debug("use %d", fc->index);
390         __clear_bit(fc->index, &fc->untried);
391
392         /* We're starting on a different fileserver from the list.  We need to
393          * check it, create a callback intercept, find its address list and
394          * probe its capabilities before we use it.
395          */
396         ASSERTCMP(fc->ac.alist, ==, NULL);
397         server = fc->server_list->servers[fc->index].server;
398
399         if (!afs_check_server_record(fc, server))
400                 goto failed;
401
402         _debug("USING SERVER: %pU", &server->uuid);
403
404         /* Make sure we've got a callback interest record for this server.  We
405          * have to link it in before we send the request as we can be sent a
406          * break request before we've finished decoding the reply and
407          * installing the vnode.
408          */
409         error = afs_register_server_cb_interest(vnode, fc->server_list,
410                                                 fc->index);
411         if (error < 0)
412                 goto failed_set_error;
413
414         fc->cbi = afs_get_cb_interest(
415                 rcu_dereference_protected(vnode->cb_interest,
416                                           lockdep_is_held(&vnode->io_lock)));
417
418         read_lock(&server->fs_lock);
419         alist = rcu_dereference_protected(server->addresses,
420                                           lockdep_is_held(&server->fs_lock));
421         afs_get_addrlist(alist);
422         read_unlock(&server->fs_lock);
423
424         memset(&fc->ac, 0, sizeof(fc->ac));
425
426         if (!fc->ac.alist)
427                 fc->ac.alist = alist;
428         else
429                 afs_put_addrlist(alist);
430
431         fc->ac.index = -1;
432
433 iterate_address:
434         ASSERT(fc->ac.alist);
435         /* Iterate over the current server's address list to try and find an
436          * address on which it will respond to us.
437          */
438         if (!afs_iterate_addresses(&fc->ac))
439                 goto next_server;
440
441         _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
442
443         _leave(" = t");
444         return true;
445
446 next_server:
447         _debug("next");
448         afs_end_cursor(&fc->ac);
449         goto pick_server;
450
451 no_more_servers:
452         /* That's all the servers poked to no good effect.  Try again if some
453          * of them were busy.
454          */
455         if (fc->flags & AFS_FS_CURSOR_VBUSY)
456                 goto restart_from_beginning;
457
458         e.error = -EDESTADDRREQ;
459         e.responded = false;
460         for (i = 0; i < fc->server_list->nr_servers; i++) {
461                 struct afs_server *s = fc->server_list->servers[i].server;
462
463                 afs_prioritise_error(&e, READ_ONCE(s->probe.error),
464                                      s->probe.abort_code);
465         }
466
467         error = e.error;
468
469 failed_set_error:
470         fc->error = error;
471 failed:
472         fc->flags |= AFS_FS_CURSOR_STOP;
473         afs_end_cursor(&fc->ac);
474         _leave(" = f [failed %d]", fc->error);
475         return false;
476 }
477
478 /*
479  * Select the same fileserver we used for a vnode before and only that
480  * fileserver.  We use this when we have a lock on that file, which is backed
481  * only by the fileserver we obtained it from.
482  */
483 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
484 {
485         struct afs_vnode *vnode = fc->vnode;
486         struct afs_cb_interest *cbi;
487         struct afs_addr_list *alist;
488         int error = fc->ac.error;
489
490         _enter("");
491
492         cbi = rcu_dereference_protected(vnode->cb_interest,
493                                         lockdep_is_held(&vnode->io_lock));
494
495         switch (error) {
496         case SHRT_MAX:
497                 if (!cbi) {
498                         fc->error = -ESTALE;
499                         fc->flags |= AFS_FS_CURSOR_STOP;
500                         return false;
501                 }
502
503                 fc->cbi = afs_get_cb_interest(cbi);
504
505                 read_lock(&cbi->server->fs_lock);
506                 alist = rcu_dereference_protected(cbi->server->addresses,
507                                                   lockdep_is_held(&cbi->server->fs_lock));
508                 afs_get_addrlist(alist);
509                 read_unlock(&cbi->server->fs_lock);
510                 if (!alist) {
511                         fc->error = -ESTALE;
512                         fc->flags |= AFS_FS_CURSOR_STOP;
513                         return false;
514                 }
515
516                 memset(&fc->ac, 0, sizeof(fc->ac));
517                 fc->ac.alist = alist;
518                 fc->ac.index = -1;
519                 goto iterate_address;
520
521         case 0:
522         default:
523                 /* Success or local failure.  Stop. */
524                 fc->error = error;
525                 fc->flags |= AFS_FS_CURSOR_STOP;
526                 _leave(" = f [okay/local %d]", error);
527                 return false;
528
529         case -ECONNABORTED:
530                 fc->error = afs_abort_to_error(fc->ac.abort_code);
531                 fc->flags |= AFS_FS_CURSOR_STOP;
532                 _leave(" = f [abort]");
533                 return false;
534
535         case -ERFKILL:
536         case -EADDRNOTAVAIL:
537         case -ENETUNREACH:
538         case -EHOSTUNREACH:
539         case -EHOSTDOWN:
540         case -ECONNREFUSED:
541         case -ETIMEDOUT:
542         case -ETIME:
543                 _debug("no conn");
544                 fc->error = error;
545                 goto iterate_address;
546         }
547
548 iterate_address:
549         /* Iterate over the current server's address list to try and find an
550          * address on which it will respond to us.
551          */
552         if (afs_iterate_addresses(&fc->ac)) {
553                 _leave(" = t");
554                 return true;
555         }
556
557         afs_end_cursor(&fc->ac);
558         return false;
559 }
560
561 /*
562  * Dump cursor state in the case of the error being EDESTADDRREQ.
563  */
564 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
565 {
566         static int count;
567         int i;
568
569         if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
570                 return;
571         count++;
572
573         rcu_read_lock();
574
575         pr_notice("EDESTADDR occurred\n");
576         pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
577                   fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
578         pr_notice("FC: ut=%lx ix=%d ni=%u\n",
579                   fc->untried, fc->index, fc->nr_iterations);
580
581         if (fc->server_list) {
582                 const struct afs_server_list *sl = fc->server_list;
583                 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
584                           sl->nr_servers, sl->preferred, sl->vnovol_mask);
585                 for (i = 0; i < sl->nr_servers; i++) {
586                         const struct afs_server *s = sl->servers[i].server;
587                         pr_notice("FC: server fl=%lx av=%u %pU\n",
588                                   s->flags, s->addr_version, &s->uuid);
589                         if (s->addresses) {
590                                 const struct afs_addr_list *a =
591                                         rcu_dereference(s->addresses);
592                                 pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
593                                           a->version,
594                                           a->nr_ipv4, a->nr_addrs, a->max_addrs,
595                                           a->preferred);
596                                 pr_notice("FC:  - R=%lx F=%lx\n",
597                                           a->responded, a->failed);
598                                 if (a == fc->ac.alist)
599                                         pr_notice("FC:  - current\n");
600                         }
601                 }
602         }
603
604         pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
605                   fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
606                   fc->ac.responded, fc->ac.nr_iterations);
607         rcu_read_unlock();
608 }
609
610 /*
611  * Tidy up a filesystem cursor and unlock the vnode.
612  */
613 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
614 {
615         struct afs_net *net = afs_v2net(fc->vnode);
616
617         if (fc->error == -EDESTADDRREQ ||
618             fc->error == -EADDRNOTAVAIL ||
619             fc->error == -ENETUNREACH ||
620             fc->error == -EHOSTUNREACH)
621                 afs_dump_edestaddrreq(fc);
622
623         mutex_unlock(&fc->vnode->io_lock);
624
625         afs_end_cursor(&fc->ac);
626         afs_put_cb_interest(net, fc->cbi);
627         afs_put_serverlist(net, fc->server_list);
628
629         if (fc->error == -ECONNABORTED)
630                 fc->error = afs_abort_to_error(fc->ac.abort_code);
631
632         return fc->error;
633 }