1 /* Handle fileserver selection and rotation.
3 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
12 #include <linux/kernel.h>
13 #include <linux/slab.h>
15 #include <linux/sched.h>
16 #include <linux/delay.h>
17 #include <linux/sched/signal.h>
22 * Begin an operation on the fileserver.
24 * Fileserver operations are serialised on the server by vnode, so we serialise
25 * them here also using the io_lock.
// Prepare the cursor fc for an operation on vnode and take vnode->io_lock.
// fc:    cursor to initialise; it is zeroed before anything else is stored.
// vnode: vnode whose io_lock serialises the operation.
// key, intr: not referenced on any line visible in this listing.
// NOTE(review): the braces, the branch that chooses between the two locking
// calls below and the return statements are not visible in this listing, so
// the returned bool cannot be confirmed from here.
27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
28 struct key *key, bool intr)
30 memset(fc, 0, sizeof(*fc));
// Seed both error fields with placeholder values before any server is tried.
33 fc->ac.error = SHRT_MAX;
34 fc->error = -EDESTADDRREQ;
// Interruptible variant: the cursor is flagged INTR and the lock attempt may
// fail with a negative value, in which case the cursor is marked STOP.
// Presumably this path is selected by intr; the test itself is not visible.
37 fc->flags |= AFS_FS_CURSOR_INTR;
38 if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
40 fc->flags |= AFS_FS_CURSOR_STOP;
// Non-interruptible variant of taking the same lock.
44 mutex_lock(&vnode->io_lock);
// A vnode whose lock_state is anything other than AFS_VNODE_LOCK_NONE
// restricts the cursor to the current server only (CUR_ONLY).
47 if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
48 fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
53 * Begin iteration through a server list, starting with the vnode's last used
54 * server if possible, or the last recorded good server if not.
// Set up fc for a pass over the server list of the volume that vnode is on.
// Fills in fc->server_list, fc->untried and fc->index, then compares the
// vnode callback interest against the entries of that list.
// NOTE(review): the loop bodies, several branch bodies and the return
// statements are not visible in this listing; the returned bool cannot be
// confirmed from here.
56 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
57 struct afs_vnode *vnode)
59 struct afs_cb_interest *cbi;
// The volume server list pointer is read and passed to afs_get_serverlist
// while holding the servers_lock read lock.
62 read_lock(&vnode->volume->servers_lock);
63 fc->server_list = afs_get_serverlist(vnode->volume->servers);
64 read_unlock(&vnode->volume->servers_lock);
// untried gets the low nr_servers bits set, i.e. one bit per list entry.
66 fc->untried = (1UL << fc->server_list->nr_servers) - 1;
// Start from the list preferred index; READ_ONCE takes a single snapshot.
67 fc->index = READ_ONCE(fc->server_list->preferred);
// The lockdep expression documents that io_lock is expected to be held here.
69 cbi = rcu_dereference_protected(vnode->cb_interest,
70 lockdep_is_held(&vnode->io_lock));
72 /* See if the vnode's preferred record is still available */
// Linear scan for the list entry whose cb_interest is the one on the vnode.
73 for (i = 0; i < fc->server_list->nr_servers; i++) {
74 if (fc->server_list->servers[i].cb_interest == cbi) {
80 /* If we have a lock outstanding on a server that's no longer
81 * serving this vnode, then we can't switch to another server
82 * and have to return an error.
84 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
89 /* Note that the callback promise is effectively broken */
// Under the cb_lock write seqlock: assert the vnode still points at cbi,
// clear the vnode cb_interest pointer and test-and-clear CB_PROMISED.
90 write_seqlock(&vnode->cb_lock);
91 ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
92 rcu_assign_pointer(vnode->cb_interest, NULL);
93 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
95 write_sequnlock(&vnode->cb_lock);
// Hand cbi to afs_put_cb_interest now that the vnode no longer points at it.
97 afs_put_cb_interest(afs_v2net(vnode), cbi);
106 * Post volume busy note.
// Log a notice describing why a volume is unavailable.
// volume:     supplies the vid and name printed in the message.
// abort_code: selects the wording; VOFFLINE, VRESTARTING and VSALVAGING have
//             their own text and every other value is reported as busy.
// NOTE(review): the declaration of m is not visible in this listing.
108 static void afs_busy(struct afs_volume *volume, u32 abort_code)
112 switch (abort_code) {
113 case VOFFLINE: m = "offline"; break;
114 case VRESTARTING: m = "restarting"; break;
115 case VSALVAGING: m = "being salvaged"; break;
116 default: m = "busy"; break;
119 pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
123 * Sleep and retry the operation to the same fileserver.
// Pause before the caller retries against the same fileserver.
// NOTE(review): only the interruptible path is visible in this listing; the
// other path and the return statements are missing, so the returned bool
// cannot be confirmed from here.
125 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
// Cursors flagged INTR sleep interruptibly; the argument 1000 is passed to
// msleep_interruptible, which takes milliseconds.
127 if (fc->flags & AFS_FS_CURSOR_INTR) {
128 msleep_interruptible(1000);
// A pending signal after the sleep is recorded as -ERESTARTSYS in fc->error.
129 if (signal_pending(current)) {
130 fc->error = -ERESTARTSYS;
141 * Select the fileserver to use. May be called multiple times to rotate
142 * through the fileservers.
// Choose the fileserver and address for the next attempt of the operation
// described by fc, using the outcome of the previous attempt that is recorded
// in fc->ac.error and fc->ac.abort_code.
// NOTE(review): many lines of this function are not visible in this listing
// (braces, most case labels, most goto labels, return statements and the
// declarations of rtt and e). The notes added below describe only statements
// that are visible; the returned bool cannot be confirmed from here, although
// the _leave strings suggest f for stop and t for retry.
144 bool afs_select_fileserver(struct afs_fs_cursor *fc)
146 struct afs_addr_list *alist;
147 struct afs_server *server;
148 struct afs_vnode *vnode = fc->vnode;
// error starts as the result left in the address cursor by the last attempt.
151 int error = fc->ac.error, i;
153 _enter("%lx[%d],%lx[%d],%d,%d",
154 fc->untried, fc->index,
155 fc->ac.tried, fc->ac.index,
156 error, fc->ac.abort_code);
// A cursor already flagged STOP is not advanced any further.
158 if (fc->flags & AFS_FS_CURSOR_STOP) {
159 _leave(" = f [stopped]");
165 /* Evaluate the result of the previous operation, if there was one. */
172 /* Success or local failure. Stop. */
174 fc->flags |= AFS_FS_CURSOR_STOP;
175 _leave(" = f [okay/local %d]", error);
179 /* The far side rejected the operation on some grounds. This
180 * might involve the server being busy or the volume having been moved.
182 switch (fc->ac.abort_code) {
184 /* This fileserver doesn't know about the volume.
185 * - May indicate that the VL is wrong - retry once and compare
187 * - May indicate that the fileserver couldn't attach to the vol.
// The VNOVOL flag is set further down on the first pass through this path,
// so finding it already set means the condition has repeated.
189 if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
190 fc->error = -EREMOTEIO;
// Record the current server index in the list vnovol_mask under the
// servers_lock write lock.
194 write_lock(&vnode->volume->servers_lock);
195 fc->server_list->vnovol_mask |= 1 << fc->index;
196 write_unlock(&vnode->volume->servers_lock);
// Flag the volume record as needing an update and recheck the volume status.
198 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
199 error = afs_check_volume_status(vnode->volume, fc->key);
201 goto failed_set_error;
203 if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
204 fc->error = -ENOMEDIUM;
208 /* If the server list didn't change, then assume that
209 * it's the fileserver having trouble.
// Pointer comparison: the volume still uses the list this cursor holds.
211 if (vnode->volume->servers == fc->server_list) {
212 fc->error = -EREMOTEIO;
217 fc->flags |= AFS_FS_CURSOR_VNOVOL;
218 _leave(" = t [vnovol]");
221 case VSALVAGE: /* TODO: Should this return an error or iterate? */
227 fc->error = afs_abort_to_error(fc->ac.abort_code);
// afs_busy is called only on the transition into OFFLINE, i.e. when
// test_and_set_bit reports the bit was previously clear; BUSY is then cleared.
231 if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
232 afs_busy(vnode->volume, fc->ac.abort_code);
233 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
235 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
239 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
248 /* Retry after going round all the servers unless we
249 * have a file lock we need to maintain.
251 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
// Mirror of the OFFLINE handling above with the two volume bits swapped.
255 if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
256 afs_busy(vnode->volume, fc->ac.abort_code);
257 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
// A CUR_ONLY cursor goes through afs_sleep_and_retry at this point.
260 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
261 if (!afs_sleep_and_retry(fc))
264 /* Retry with same server & address */
265 _leave(" = t [vbusy]");
// VBUSY is tested again once every server has been tried (see the test just
// before e.error is set, near the end of this function).
269 fc->flags |= AFS_FS_CURSOR_VBUSY;
273 /* The volume migrated to another server. We consider
274 * consider all locks and callbacks broken and request
275 * an update from the VLDB.
277 * We also limit the number of VMOVED hops we will
278 * honour, just in case someone sets up a loop.
// The VMOVED flag is set just below, so a second pass through this path on
// the same cursor finds it set and records -EREMOTEIO.
280 if (fc->flags & AFS_FS_CURSOR_VMOVED) {
281 fc->error = -EREMOTEIO;
284 fc->flags |= AFS_FS_CURSOR_VMOVED;
// Set the WAIT and NEEDS_UPDATE volume bits, then recheck the volume status.
286 set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
287 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
288 error = afs_check_volume_status(vnode->volume, fc->key);
290 goto failed_set_error;
292 /* If the server list didn't change, then the VLDB is
293 * out of sync with the fileservers. This is hopefully
294 * a temporary condition, however, so we don't want to
295 * permanently block access to the file.
297 * TODO: Try other fileservers if we can.
299 * TODO: Retry a few times with sleeps.
301 if (vnode->volume->servers == fc->server_list) {
302 fc->error = -ENOMEDIUM;
306 goto restart_from_beginning;
// Clear both volume state bits and translate the abort code into an errno.
309 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
310 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
311 fc->error = afs_abort_to_error(fc->ac.abort_code);
317 if (fc->error != -EDESTADDRREQ)
318 goto iterate_address;
328 goto iterate_address;
331 _debug("call reset");
336 restart_from_beginning:
// Hand back what the cursor holds from the previous pass (address cursor,
// callback interest, server list) and forget the list pointer.
338 afs_end_cursor(&fc->ac);
339 afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
341 afs_put_serverlist(afs_v2net(vnode), fc->server_list);
342 fc->server_list = NULL;
345 /* See if we need to do an update of the volume record. Note that the
346 * volume may have moved or even have been deleted.
348 error = afs_check_volume_status(vnode->volume, fc->key);
350 goto failed_set_error;
352 if (!afs_start_fs_iteration(fc, vnode))
355 _debug("__ VOL %llx __", vnode->volume->vid);
356 error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
358 goto failed_set_error;
361 _debug("pick [%lx]", fc->untried);
// Wait on the probes of the servers still marked untried.
363 error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
365 goto failed_set_error;
367 /* Pick the untried server with the lowest RTT. If we have outstanding
368 * callbacks, we stick with the server we're already using if we can.
371 _debug("cbi %u", fc->index);
// If the server at the current index is still untried it is used directly.
372 if (test_bit(fc->index, &fc->untried))
373 goto selected_server;
374 afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
// Scan the list; entries that were already tried or whose probe did not
// respond fail the first test, and probe.rtt is compared with rtt (declared
// on a line not visible here).
381 for (i = 0; i < fc->server_list->nr_servers; i++) {
382 struct afs_server *s = fc->server_list->servers[i].server;
384 if (!test_bit(i, &fc->untried) || !s->probe.responded)
386 if (s->probe.rtt < rtt) {
393 goto no_more_servers;
396 _debug("use %d", fc->index);
// Mark the chosen server as tried by clearing its bit in untried.
397 __clear_bit(fc->index, &fc->untried);
399 /* We're starting on a different fileserver from the list. We need to
400 * check it, create a callback intercept, find its address list and
401 * probe its capabilities before we use it.
// No address list may be attached to the address cursor at this point.
403 ASSERTCMP(fc->ac.alist, ==, NULL);
404 server = fc->server_list->servers[fc->index].server;
406 if (!afs_check_server_record(fc, server))
409 _debug("USING SERVER: %pU", &server->uuid);
411 /* Make sure we've got a callback interest record for this server. We
412 * have to link it in before we send the request as we can be sent a
413 * break request before we've finished decoding the reply and
414 * installing the vnode.
416 error = afs_register_server_cb_interest(vnode, fc->server_list,
419 goto failed_set_error;
421 fc->cbi = afs_get_cb_interest(
422 rcu_dereference_protected(vnode->cb_interest,
423 lockdep_is_held(&vnode->io_lock)));
// The server address list pointer is read and passed to afs_get_addrlist
// while holding the fs_lock read lock.
425 read_lock(&server->fs_lock);
426 alist = rcu_dereference_protected(server->addresses,
427 lockdep_is_held(&server->fs_lock));
428 afs_get_addrlist(alist);
429 read_unlock(&server->fs_lock);
// Reset the address cursor to all zeroes before attaching the new list.
431 memset(&fc->ac, 0, sizeof(fc->ac));
434 fc->ac.alist = alist;
436 afs_put_addrlist(alist);
441 ASSERT(fc->ac.alist);
442 /* Iterate over the current server's address list to try and find an
443 * address on which it will respond to us.
445 if (!afs_iterate_addresses(&fc->ac))
448 _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
455 afs_end_cursor(&fc->ac);
459 /* That's all the servers poked to no good effect. Try again if some
// VBUSY was set earlier in this function when a busy reply was handled.
462 if (fc->flags & AFS_FS_CURSOR_VBUSY)
463 goto restart_from_beginning;
// Starting from -EDESTADDRREQ, fold the probe error and abort code of every
// server in the list into e through afs_prioritise_error.
465 e.error = -EDESTADDRREQ;
467 for (i = 0; i < fc->server_list->nr_servers; i++) {
468 struct afs_server *s = fc->server_list->servers[i].server;
470 afs_prioritise_error(&e, READ_ONCE(s->probe.error),
471 s->probe.abort_code);
// Failure exit: flag the cursor STOP and end the address cursor.
479 fc->flags |= AFS_FS_CURSOR_STOP;
480 afs_end_cursor(&fc->ac);
481 _leave(" = f [failed %d]", fc->error);
486 * Select the same fileserver we used for a vnode before and only that
487 * fileserver. We use this when we have a lock on that file, which is backed
488 * only by the fileserver we obtained it from.
// Address selection restricted to the server behind the callback interest
// currently recorded on the vnode; no other server is looked at.
// NOTE(review): braces, the switch on error, the goto labels and the return
// statements are not visible in this listing, so the conditions guarding the
// statements below and the returned bool cannot be confirmed from here.
490 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
492 struct afs_vnode *vnode = fc->vnode;
493 struct afs_cb_interest *cbi;
494 struct afs_addr_list *alist;
// error is the result left in the address cursor by the previous attempt.
495 int error = fc->ac.error;
// The lockdep expression documents that io_lock is expected to be held here.
499 cbi = rcu_dereference_protected(vnode->cb_interest,
500 lockdep_is_held(&vnode->io_lock));
506 fc->flags |= AFS_FS_CURSOR_STOP;
510 fc->cbi = afs_get_cb_interest(cbi);
// The address list pointer of the server behind cbi is read and passed to
// afs_get_addrlist while holding that server fs_lock read lock.
512 read_lock(&cbi->server->fs_lock);
513 alist = rcu_dereference_protected(cbi->server->addresses,
514 lockdep_is_held(&cbi->server->fs_lock));
515 afs_get_addrlist(alist);
516 read_unlock(&cbi->server->fs_lock);
519 fc->flags |= AFS_FS_CURSOR_STOP;
// Reset the address cursor to all zeroes and attach the list just obtained.
523 memset(&fc->ac, 0, sizeof(fc->ac));
524 fc->ac.alist = alist;
526 goto iterate_address;
530 /* Success or local failure. Stop. */
532 fc->flags |= AFS_FS_CURSOR_STOP;
533 _leave(" = f [okay/local %d]", error);
// Abort path: the abort code is translated to an errno and the cursor is
// stopped, so no other server is attempted.
537 fc->error = afs_abort_to_error(fc->ac.abort_code);
538 fc->flags |= AFS_FS_CURSOR_STOP;
539 _leave(" = f [abort]");
552 goto iterate_address;
556 /* Iterate over the current server's address list to try and find an
557 * address on which it will respond to us.
559 if (afs_iterate_addresses(&fc->ac)) {
564 afs_end_cursor(&fc->ac);
569 * Dump cursor state in the case of the error being EDESTADDRREQ.
// Print the state of the cursor fc through pr_notice: the cursor fields, then
// each server of its server list with that server address list, then the
// address cursor fc->ac. The cursor is const and is only read.
// NOTE(review): the declarations of count and i, any locking around the dump
// and the statement controlled by the first test are not visible here.
571 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
// Guard on the CONFIG_AFS_DEBUG_CURSOR option and on count exceeding 3;
// presumably the function returns early when this is true (confirm).
576 if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
582 pr_notice("EDESTADDR occurred\n");
583 pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
584 fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
585 pr_notice("FC: ut=%lx ix=%d ni=%u\n",
586 fc->untried, fc->index, fc->nr_iterations);
// The server list section is skipped when the cursor holds no list.
588 if (fc->server_list) {
589 const struct afs_server_list *sl = fc->server_list;
590 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
591 sl->nr_servers, sl->preferred, sl->vnovol_mask);
592 for (i = 0; i < sl->nr_servers; i++) {
593 const struct afs_server *s = sl->servers[i].server;
594 pr_notice("FC: server fl=%lx av=%u %pU\n",
595 s->flags, s->addr_version, &s->uuid);
// rcu_dereference is used for the address list pointer; presumably an RCU
// read-side section is entered on a line not visible here (confirm).
597 const struct afs_addr_list *a =
598 rcu_dereference(s->addresses);
599 pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
601 a->nr_ipv4, a->nr_addrs, a->max_addrs,
603 pr_notice("FC: - pr=%lx R=%lx F=%lx\n",
604 a->probed, a->responded, a->failed);
// Mark the address list that the address cursor is currently using.
605 if (a == fc->ac.alist)
606 pr_notice("FC: - current\n");
611 pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
612 fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
613 fc->ac.responded, fc->ac.nr_iterations);
618 * Tidy up a filesystem cursor and unlock the vnode.
// Finish an operation started on fc: optionally dump the cursor, unlock the
// vnode io_lock and hand back what the cursor holds.
// NOTE(review): the return statement is not visible in this listing and the
// function may continue past the last visible line; the returned int is
// presumably fc->error (confirm).
620 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
622 struct afs_net *net = afs_v2net(fc->vnode);
// The cursor state is dumped for these four addressing or reachability
// errors only.
624 if (fc->error == -EDESTADDRREQ ||
625 fc->error == -EADDRNOTAVAIL ||
626 fc->error == -ENETUNREACH ||
627 fc->error == -EHOSTUNREACH)
628 afs_dump_edestaddrreq(fc);
// Unlock the io_lock of the vnode recorded in the cursor.
630 mutex_unlock(&fc->vnode->io_lock);
// End the address cursor, then hand the callback interest and the server
// list to their put helpers.
632 afs_end_cursor(&fc->ac);
633 afs_put_cb_interest(net, fc->cbi);
634 afs_put_serverlist(net, fc->server_list);
// -ECONNABORTED is replaced by the errno derived from the abort code.
636 if (fc->error == -ECONNABORTED)
637 fc->error = afs_abort_to_error(fc->ac.abort_code);