diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 81815724db6c..b488072aee87 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -165,7 +165,8 @@ struct afs_status_cb { * AFS volume synchronisation information */ struct afs_volsync { - time64_t creation; /* volume creation time */ + time64_t creation; /* Volume creation time (or TIME64_MIN) */ + time64_t update; /* Volume update time (or TIME64_MIN) */ }; /* diff --git a/fs/afs/callback.c b/fs/afs/callback.c index f67e88076761..8ddc99c9c16b 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -81,7 +81,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) @@ -159,12 +159,13 @@ static void afs_break_one_callback(struct afs_volume *volume, struct super_block *sb; struct afs_vnode *vnode; struct inode *inode; + unsigned int cb_v_break; if (fid->vnode == 0 && fid->unique == 0) { /* The callback break applies to an entire volume. 
*/ write_lock(&volume->cb_v_break_lock); - volume->cb_v_break++; - trace_afs_cb_break(fid, volume->cb_v_break, + cb_v_break = atomic_inc_return(&volume->cb_v_break); + trace_afs_cb_break(fid, cb_v_break, afs_cb_break_for_volume_callback, false); write_unlock(&volume->cb_v_break_lock); return; diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 10137681aa7d..99d1e649e929 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -35,11 +35,13 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo key_get(key); } - op->key = key; - op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); - op->net = volume->cell->net; - op->cb_v_break = volume->cb_v_break; - op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->key = key; + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); + op->net = volume->cell->net; + op->cb_v_break = atomic_read(&volume->cb_v_break); + op->pre_volsync.creation = volume->creation_time; + op->pre_volsync.update = volume->update_time; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); op->nr_iterations = -1; afs_op_set_error(op, -EDESTADDRREQ); @@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) afs_prepare_vnode(op, &op->file[0], 0); afs_prepare_vnode(op, &op->file[1], 1); - op->cb_v_break = op->volume->cb_v_break; + op->cb_v_break = atomic_read(&op->volume->cb_v_break); _leave(" = true"); return true; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index f1f879ba9cf7..80f7d9e796e3 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1870,7 +1870,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSVolSync(&bp, &op->volsync); + /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled + * with rubbish. 
+ */ + xdr_decode_AFSVolSync(&bp, NULL); call->unmarshall++; fallthrough; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 102e7c37d33c..df3d37577b5b 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -542,7 +542,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_break = as->volume->cb_v_break, + vnode->cb_v_break = atomic_read(&as->volume->cb_v_break), afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 3d90415c2527..4b730cbcf63e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -662,7 +662,15 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ - unsigned cb_v_break; /* Break-everything counter. */ + /* RO release tracking */ + struct mutex volsync_lock; /* Time/state evaluation lock */ + time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ + time64_t update_time; /* Volume update time (or TIME64_MIN) */ + + /* Callback management */ + atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ + atomic_t cb_v_break; /* Volume-break event counter. */ + atomic_t cb_scrub; /* Scrub-all-data event counter. 
*/ rwlock_t cb_v_break_lock; afs_voltype_t type; /* type of volume */ @@ -856,7 +864,8 @@ struct afs_operation { struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; - struct afs_volsync volsync; + struct afs_volsync pre_volsync; /* Volsync before op */ + struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ @@ -1063,7 +1072,7 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { - return cb_break != (vnode->cb_break + vnode->volume->cb_v_break); + return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_v_break)); } /* @@ -1555,6 +1564,7 @@ extern void afs_fs_exit(void); /* * validation.c */ +int afs_update_volume_state(struct afs_operation *op); bool afs_check_validity(struct afs_vnode *vnode); bool afs_pagecache_valid(struct afs_vnode *vnode); int afs_validate(struct afs_vnode *vnode, struct key *key); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 3ab85a907a1d..5c50c9aa1f87 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -486,7 +486,7 @@ selected_server: vnode->cb_server = server; vnode->cb_s_break = server->cb_s_break; vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break); - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } @@ -519,6 +519,8 @@ iterate_address: op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, diff --git a/fs/afs/validation.c b/fs/afs/validation.c index 
18ba2c5e8ead..6aadd5e075e4 100644 --- a/fs/afs/validation.c +++ b/fs/afs/validation.c @@ -10,6 +10,201 @@ #include <linux/sched.h> #include "internal.h" +/* + * See if the server we've just talked to is currently excluded. + */ +static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + const struct afs_server_entry *se; + const struct afs_server_list *slist; + bool is_excluded = true; + int i; + + rcu_read_lock(); + + slist = rcu_dereference(volume->servers); + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + if (op->server == se->server) { + is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); + break; + } + } + + rcu_read_unlock(); + return is_excluded; +} + +/* + * Update the volume's server list when the creation time changes and see if + * the server we've just talked to is currently excluded. + */ +static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + int ret; + + if (__afs_is_server_excluded(op, volume)) + return 1; + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_check_volume_status(op->volume, op); + if (ret < 0) + return ret; + + return __afs_is_server_excluded(op, volume); +} + +/* + * Handle a change to the volume creation time in the VolSync record. + */ +static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) +{ + unsigned int snap; + time64_t cur = volume->creation_time; + time64_t old = op->pre_volsync.creation; + time64_t new = op->volsync.creation; + int ret; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->creation_time = new; + return 0; + } + + if (new == cur) + return 0; + + /* Try to advance the creation timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. 
+ */ + if (cur != old) + return 0; + + /* If the creation time changes in an unexpected way, we need to scrub + * our caches. For a RW vol, this will only change if the volume is + * restored from a backup; for a RO/Backup vol, this will advance when + * the volume is updated to a new snapshot (eg. "vos release"). + */ + if (volume->type == AFSVL_RWVOL) + goto regressed; + if (volume->type == AFSVL_BACKVOL) { + if (new < old) + goto regressed; + goto advance; + } + + /* We have an RO volume, we need to query the VL server and look at the + * server flags to see if RW->RO replication is in progress. + */ + ret = afs_is_server_excluded(op, volume); + if (ret < 0) + return ret; + if (ret > 0) { + snap = atomic_read(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); + return ret; + } + +advance: + snap = atomic_inc_return(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); + volume->creation_time = new; + return 0; + +regressed: + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); + volume->creation_time = new; + return 0; +} + +/* + * Handle a change to the volume update time in the VolSync record. + */ +static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) +{ + enum afs_cb_break_reason reason = afs_cb_break_no_break; + time64_t cur = volume->update_time; + time64_t old = op->pre_volsync.update; + time64_t new = op->volsync.update; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->update_time = new; + return; + } + + if (new == cur) + return; + + /* If the volume update time changes in an unexpected way, we need to + * scrub our caches. For a RW vol, this will advance on every + * modification op; for a RO/Backup vol, this will advance when the + * volume is updated to a new snapshot (eg. "vos release"). 
+ */ + if (new < old) + reason = afs_cb_break_for_update_regress; + + /* Try to advance the update timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur == old) { + if (reason == afs_cb_break_for_update_regress) { + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, reason); + } + volume->update_time = new; + } +} + +static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) +{ + int ret = 0; + + if (likely(op->volsync.creation == volume->creation_time && + op->volsync.update == volume->update_time)) + return 0; + + mutex_lock(&volume->volsync_lock); + if (op->volsync.creation != volume->creation_time) { + ret = afs_update_volume_creation_time(op, volume); + if (ret < 0) + goto out; + } + if (op->volsync.update != volume->update_time) + afs_update_volume_update_time(op, volume); +out: + mutex_unlock(&volume->volsync_lock); + return ret; +} + +/* + * Update the state of a volume. Returns 1 to redo the operation from the start. 
+ */ +int afs_update_volume_state(struct afs_operation *op) +{ + struct afs_volume *volume = op->volume; + int ret; + + _enter("%llx", op->volume->vid); + + if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { + ret = afs_update_volume_times(op, volume); + if (ret != 0) { + _leave(" = %d", ret); + return ret; + } + } + + return 0; +} + /* * mark the data attached to an inode as obsolete due to a write on the server * - might also want to ditch all the outstanding writes and dirty pages @@ -74,7 +269,7 @@ bool afs_check_validity(struct afs_vnode *vnode) cb_break = vnode->cb_break; if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_v_break != vnode->volume->cb_v_break) + if (vnode->cb_v_break != atomic_read(&vnode->volume->cb_v_break)) need_clear = afs_cb_break_for_v_break; else if (!afs_check_server_good(vnode)) need_clear = afs_cb_break_for_s_reinit; @@ -95,7 +290,7 @@ bool afs_check_validity(struct afs_vnode *vnode) write_seqlock(&vnode->cb_lock); if (need_clear == afs_cb_break_no_promise) - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break); else if (cb_break == vnode->cb_break) __afs_break_callback(vnode, need_clear); else diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 4982fce25057..41ab1d3ff3ea 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -90,11 +90,14 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; + volume->creation_time = TIME64_MIN; + volume->update_time = TIME64_MIN; refcount_set(&volume->ref, 1); INIT_HLIST_NODE(&volume->proc_link); INIT_WORK(&volume->destructor, afs_destroy_volume); rwlock_init(&volume->servers_lock); + mutex_init(&volume->volsync_lock); rwlock_init(&volume->cb_v_break_lock); memcpy(volume->name, vldb->name, vldb->name_len + 1); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 
11571cca86c1..2d6943f05ea5 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { struct yfs_xdr_YFSVolSync *x = (void *)*_bp; - u64 creation; + u64 creation, update; if (volsync) { creation = xdr_to_u64(x->vol_creation_date); do_div(creation, 10 * 1000 * 1000); volsync->creation = creation; + update = xdr_to_u64(x->vol_update_date); + do_div(update, 10 * 1000 * 1000); + volsync->update = update; } *_bp += xdr_size(x); diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 63ab23876be8..bbe8dcab4b32 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -440,13 +440,17 @@ enum yfs_cm_operation { EM(afs_cb_break_no_break, "no-break") \ EM(afs_cb_break_no_promise, "no-promise") \ EM(afs_cb_break_for_callback, "break-cb") \ + EM(afs_cb_break_for_creation_regress, "creation-regress") \ EM(afs_cb_break_for_deleted, "break-del") \ EM(afs_cb_break_for_lapsed, "break-lapsed") \ EM(afs_cb_break_for_s_reinit, "s-reinit") \ EM(afs_cb_break_for_unlink, "break-unlink") \ + EM(afs_cb_break_for_update_regress, "update-regress") \ EM(afs_cb_break_for_v_break, "break-v") \ EM(afs_cb_break_for_volume_callback, "break-v-cb") \ - E_(afs_cb_break_for_zap, "break-zap") + EM(afs_cb_break_for_vos_release, "break-vos-release") \ + EM(afs_cb_break_for_zap, "break-zap") \ + E_(afs_cb_break_volume_excluded, "vol-excluded") /* * Generate enums for tracing information. 
@@ -1249,6 +1253,30 @@ TRACE_EVENT(afs_get_tree, __entry->cell, __entry->volume, __entry->vid) ); +TRACE_EVENT(afs_cb_v_break, + TP_PROTO(afs_volid_t vid, unsigned int cb_v_break, + enum afs_cb_break_reason reason), + + TP_ARGS(vid, cb_v_break, reason), + + TP_STRUCT__entry( + __field(afs_volid_t, vid) + __field(unsigned int, cb_v_break) + __field(enum afs_cb_break_reason, reason) + ), + + TP_fast_assign( + __entry->vid = vid; + __entry->cb_v_break = cb_v_break; + __entry->reason = reason; + ), + + TP_printk("%llx vb=%x %s", + __entry->vid, + __entry->cb_v_break, + __print_symbolic(__entry->reason, afs_cb_break_reasons)) + ); + TRACE_EVENT(afs_cb_break, TP_PROTO(struct afs_fid *fid, unsigned int cb_break, enum afs_cb_break_reason reason, bool skipped),