linux-stable/fs/gfs2/glock.c

2961 lines
75 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/buffer_head.h>
#include <linux/delay.h>
#include <linux/sort.h>
#include <linux/hash.h>
#include <linux/jhash.h>
#include <linux/kallsyms.h>
#include <linux/gfs2_ondisk.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <linux/bit_spinlock.h>
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
#include <linux/percpu.h>
#include <linux/list_sort.h>
#include <linux/lockref.h>
#include <linux/rhashtable.h>
#include <linux/pid_namespace.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "super.h"
#include "util.h"
#include "bmap.h"
#define CREATE_TRACE_POINTS
#include "trace_gfs2.h"
struct gfs2_glock_iter {
struct gfs2_sbd *sdp; /* incore superblock */
struct rhashtable_iter hti; /* rhashtable iterator */
struct gfs2_glock *gl; /* current glock struct */
loff_t last_pos; /* last position */
};
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static void __gfs2_glock_dq(struct gfs2_holder *gh);
static void handle_callback(struct gfs2_glock *gl, unsigned int state,
unsigned long delay, bool remote);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT)
static const struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
.key_len = offsetofend(struct lm_lockname, ln_type),
.key_offset = offsetof(struct gfs2_glock, gl_name),
.head_offset = offsetof(struct gfs2_glock, gl_node),
};
static struct rhashtable gl_hash_table;
#define GLOCK_WAIT_TABLE_BITS 12
#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS)
static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned;
struct wait_glock_queue {
struct lm_lockname *name;
wait_queue_entry_t wait;
};
static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *key)
{
struct wait_glock_queue *wait_glock =
container_of(wait, struct wait_glock_queue, wait);
struct lm_lockname *wait_name = wait_glock->name;
struct lm_lockname *wake_name = key;
if (wake_name->ln_sbd != wait_name->ln_sbd ||
wake_name->ln_number != wait_name->ln_number ||
wake_name->ln_type != wait_name->ln_type)
return 0;
return autoremove_wake_function(wait, mode, sync, key);
}
static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
{
u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
}
/**
* wake_up_glock - Wake up waiters on a glock
* @gl: the glock
*/
static void wake_up_glock(struct gfs2_glock *gl)
{
wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name);
if (waitqueue_active(wq))
__wake_up(wq, TASK_NORMAL, 1, &gl->gl_name);
}
static void gfs2_glock_dealloc(struct rcu_head *rcu)
{
struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
kfree(gl->gl_lksb.sb_lvbptr);
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
struct gfs2_glock_aspace *gla =
container_of(gl, struct gfs2_glock_aspace, glock);
kmem_cache_free(gfs2_glock_aspace_cachep, gla);
} else
kmem_cache_free(gfs2_glock_cachep, gl);
}
/**
* glock_blocked_by_withdraw - determine if we can still use a glock
* @gl: the glock
*
* We need to allow some glocks to be enqueued, dequeued, promoted, and demoted
* when we're withdrawn. For example, to maintain metadata integrity, we should
* disallow the use of inode and rgrp glocks when withdrawn. Other glocks like
* the iopen or freeze glock may be safely used because none of their
* metadata goes through the journal. So in general, we should disallow all
* glocks that are journaled, and allow all the others. One exception is:
* we need to allow our active journal to be promoted and demoted so others
* may recover it and we can reacquire it when they're done.
*/
static bool glock_blocked_by_withdraw(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!gfs2_withdrawing_or_withdrawn(sdp))
return false;
if (gl->gl_ops->go_flags & GLOF_NONDISK)
return false;
if (!sdp->sd_jdesc ||
gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr)
return false;
return true;
}
static void __gfs2_glock_free(struct gfs2_glock *gl)
{
rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
smp_mb();
wake_up_glock(gl);
call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
}
void gfs2_glock_free(struct gfs2_glock *gl) {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
__gfs2_glock_free(gl);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_kill_wait);
}
void gfs2_glock_free_later(struct gfs2_glock *gl) {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
spin_lock(&lru_lock);
list_add(&gl->gl_lru, &sdp->sd_dead_glocks);
spin_unlock(&lru_lock);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_kill_wait);
}
static void gfs2_free_dead_glocks(struct gfs2_sbd *sdp)
{
struct list_head *list = &sdp->sd_dead_glocks;
while(!list_empty(list)) {
struct gfs2_glock *gl;
gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
__gfs2_glock_free(gl);
}
}
/**
* gfs2_glock_hold() - increment reference count on glock
* @gl: The glock to hold
*
*/
struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
lockref_get(&gl->gl_lockref);
return gl;
}
/**
* demote_ok - Check to see if it's ok to unlock a glock
* @gl: the glock
*
* Returns: 1 if it's ok
*/
static int demote_ok(const struct gfs2_glock *gl)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
return glops->go_demote_ok(gl);
return 1;
}
void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
{
if (!(gl->gl_ops->go_flags & GLOF_LRU))
return;
spin_lock(&lru_lock);
list_move_tail(&gl->gl_lru, &lru_list);
if (!test_bit(GLF_LRU, &gl->gl_flags)) {
set_bit(GLF_LRU, &gl->gl_flags);
atomic_inc(&lru_count);
}
spin_unlock(&lru_lock);
}
static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
if (!(gl->gl_ops->go_flags & GLOF_LRU))
return;
spin_lock(&lru_lock);
if (test_bit(GLF_LRU, &gl->gl_flags)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
}
spin_unlock(&lru_lock);
}
/*
* Enqueue the glock on the work queue. Passes one glock reference on to the
* work queue.
*/
static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) {
if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) {
/*
* We are holding the lockref spinlock, and the work was still
* queued above. The queued work (glock_work_func) takes that
* spinlock before dropping its glock reference(s), so it
* cannot have dropped them in the meantime.
*/
GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2);
gl->gl_lockref.count--;
}
}
static void __gfs2_glock_put(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
lockref_mark_dead(&gl->gl_lockref);
spin_unlock(&gl->gl_lockref.lock);
gfs2_glock_remove_from_lru(gl);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
if (mapping) {
truncate_inode_pages_final(mapping);
if (!gfs2_withdrawing_or_withdrawn(sdp))
GLOCK_BUG_ON(gl, !mapping_empty(mapping));
}
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
*
*/
void gfs2_glock_put(struct gfs2_glock *gl)
{
if (lockref_put_or_lock(&gl->gl_lockref))
return;
__gfs2_glock_put(gl);
}
/*
* gfs2_glock_put_async - Decrement reference count without sleeping
* @gl: The glock to put
*
* Decrement the reference count on glock immediately unless it is the last
* reference. Defer putting the last reference to work queue context.
*/
void gfs2_glock_put_async(struct gfs2_glock *gl)
{
if (lockref_put_or_lock(&gl->gl_lockref))
return;
GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
/**
* may_grant - check if it's ok to grant a new lock
* @gl: The glock
* @current_gh: One of the current holders of @gl
* @gh: The lock request which we wish to grant
*
* With our current compatibility rules, if a glock has one or more active
* holders (HIF_HOLDER flag set), any of those holders can be passed in as
* @current_gh; they are all the same as far as compatibility with the new @gh
* goes.
*
* Returns true if it's ok to grant the lock.
*/
static inline bool may_grant(struct gfs2_glock *gl,
struct gfs2_holder *current_gh,
struct gfs2_holder *gh)
{
if (current_gh) {
GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
switch(current_gh->gh_state) {
case LM_ST_EXCLUSIVE:
/*
* Here we make a special exception to grant holders
* who agree to share the EX lock with other holders
* who also have the bit set. If the original holder
* has the LM_FLAG_NODE_SCOPE bit set, we grant more
* holders with the bit set.
*/
return gh->gh_state == LM_ST_EXCLUSIVE &&
(current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
(gh->gh_flags & LM_FLAG_NODE_SCOPE);
case LM_ST_SHARED:
case LM_ST_DEFERRED:
return gh->gh_state == current_gh->gh_state;
default:
return false;
}
}
if (gl->gl_state == gh->gh_state)
return true;
if (gh->gh_flags & GL_EXACT)
return false;
if (gl->gl_state == LM_ST_EXCLUSIVE) {
return gh->gh_state == LM_ST_SHARED ||
gh->gh_state == LM_ST_DEFERRED;
}
if (gh->gh_flags & LM_FLAG_ANY)
return gl->gl_state != LM_ST_UNLOCKED;
return false;
}
static void gfs2_holder_wake(struct gfs2_holder *gh)
{
clear_bit(HIF_WAIT, &gh->gh_iflags);
smp_mb__after_atomic();
wake_up_bit(&gh->gh_iflags, HIF_WAIT);
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
if (gh->gh_flags & GL_ASYNC) {
struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd;
wake_up(&sdp->sd_async_glock_wait);
}
}
/**
* do_error - Something unexpected has happened during a lock request
* @gl: The glock
* @ret: The status from the DLM
*/
static void do_error(struct gfs2_glock *gl, const int ret)
{
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
gh->gh_error = GLR_TRYFAILED;
else
continue;
list_del_init(&gh->gh_list);
trace_gfs2_glock_queue(gh, 0);
gfs2_holder_wake(gh);
}
}
/**
* find_first_holder - find the first "holder" gh
* @gl: the glock
*/
static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
if (!list_empty(&gl->gl_holders)) {
gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
gh_list);
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
return gh;
}
return NULL;
}
/*
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
*
* Returns: 0 if instantiate was successful, or error.
*/
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
int gfs2_instantiate(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
const struct gfs2_glock_operations *glops = gl->gl_ops;
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
int ret;
again:
if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
goto done;
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
/*
* Since we unlock the lockref lock, we set a flag to indicate
* instantiate is in progress.
*/
if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) {
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG,
TASK_UNINTERRUPTIBLE);
/*
* Here we just waited for a different instantiate to finish.
* But that may not have been successful, as when a process
* locks an inode glock _before_ it has an actual inode to
* instantiate into. So we check again. This process might
* have an inode to instantiate, so might be successful.
*/
goto again;
}
ret = glops->go_instantiate(gl);
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
if (!ret)
clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
if (ret)
return ret;
done:
if (glops->go_held)
return glops->go_held(gh);
return 0;
}
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
* Returns true on success (i.e., progress was made or there are no waiters).
*/
static bool do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
current_gh = find_first_holder(gl);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (!may_grant(gl, current_gh, gh)) {
/*
* If we get here, it means we may not grant this
* holder for some reason. If this holder is at the
* head of the list, it means we have a blocked holder
* at the head, so return false.
*/
if (list_is_first(&gh->gh_list, &gl->gl_holders))
return false;
do_error(gl, 0);
break;
}
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
if (!current_gh)
current_gh = gh;
}
return true;
}
/**
* find_first_waiter - find the first gh that's waiting for the glock
* @gl: the glock
*/
static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
return gh;
}
return NULL;
}
/**
* find_last_waiter - find the last gh that's waiting for the glock
* @gl: the glock
*
* This also is a fast way of finding out if there are any waiters.
*/
static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
if (list_empty(&gl->gl_holders))
return NULL;
gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh;
}
/**
* state_change - record that the glock is now in a different state
* @gl: the glock
* @new_state: the new state
*/
static void state_change(struct gfs2_glock *gl, unsigned int new_state)
{
int held1, held2;
held1 = (gl->gl_state != LM_ST_UNLOCKED);
held2 = (new_state != LM_ST_UNLOCKED);
if (held1 != held2) {
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
if (held2)
gl->gl_lockref.count++;
else
gl->gl_lockref.count--;
}
if (new_state != gl->gl_target)
/* shorten our minimum hold time */
gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR,
GL_GLOCK_MIN_HOLD);
gl->gl_state = new_state;
gl->gl_tchange = jiffies;
}
static void gfs2_set_demote(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
set_bit(GLF_DEMOTE, &gl->gl_flags);
smp_mb();
wake_up(&sdp->sd_async_glock_wait);
}
static void gfs2_demote_wake(struct gfs2_glock *gl)
{
gl->gl_demote_state = LM_ST_EXCLUSIVE;
clear_bit(GLF_DEMOTE, &gl->gl_flags);
smp_mb__after_atomic();
wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
}
/**
* finish_xmote - The DLM has replied to one of our lock requests
* @gl: The glock
* @ret: The status from the DLM
*
*/
static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh;
unsigned state = ret & LM_OUT_ST_MASK;
trace_gfs2_glock_state_change(gl, state);
state_change(gl, state);
gh = find_first_waiter(gl);
/* Demote to UN request arrived during demote to SH or DF */
if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
gl->gl_target = LM_ST_UNLOCKED;
/* Check for state != intended state */
if (unlikely(state != gl->gl_target)) {
if (gh && (ret & LM_OUT_CANCELED))
gfs2_holder_wake(gh);
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
/* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
list_move_tail(&gh->gh_list, &gl->gl_holders);
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
2023-07-26 17:01:08 +00:00
if (do_promote(gl))
goto out;
goto retry;
}
/* Some error or failed "try lock" - report it */
if ((ret & LM_OUT_ERROR) ||
(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
gl->gl_target = gl->gl_state;
do_error(gl, ret);
goto out;
}
}
switch(state) {
/* Unlocked due to conversion deadlock, try again */
case LM_ST_UNLOCKED:
retry:
do_xmote(gl, gh, gl->gl_target);
break;
/* Conversion fails, unlock and try again */
case LM_ST_SHARED:
case LM_ST_DEFERRED:
do_xmote(gl, gh, LM_ST_UNLOCKED);
break;
default: /* Everything else */
fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n",
gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
return;
}
/* Fast path - we got what we asked for */
if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
int rv;
spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl);
spin_lock(&gl->gl_lockref.lock);
if (rv) {
do_error(gl, rv);
goto out;
}
}
do_promote(gl);
}
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
}
static bool is_system_glock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
if (gl == m_ip->i_gl)
return true;
return false;
}
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
* @gh: The holder (only for promotes)
* @target: The target lock state
*
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh,
unsigned int target)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
gfs2: Force withdraw to replay journals and wait for it to finish When a node withdraws from a file system, it often leaves its journal in an incomplete state. This is especially true when the withdraw is caused by io errors writing to the journal. Before this patch, a withdraw would try to write a "shutdown" record to the journal, tell dlm it's done with the file system, and none of the other nodes know about the problem. Later, when the problem is fixed and the withdrawn node is rebooted, it would then discover that its own journal was incomplete, and replay it. However, replaying it at this point is almost guaranteed to introduce corruption because the other nodes are likely to have used affected resource groups that appeared in the journal since the time of the withdraw. Replaying the journal later will overwrite any changes made, and not through any fault of dlm, which was instructed during the withdraw to release those resources. This patch makes file system withdraws seen by the entire cluster. Withdrawing nodes dequeue their journal glock to allow recovery. The remaining nodes check all the journals to see if they are clean or in need of replay. They try to replay dirty journals, but only the journals of withdrawn nodes will be "not busy" and therefore available for replay. Until the journal replay is complete, no i/o related glocks may be given out, to ensure that the replay does not cause the aforementioned corruption: We cannot allow any journal replay to overwrite blocks associated with a glock once it is held. The "live" glock which is now used to signal when a withdraw occurs. When a withdraw occurs, the node signals its withdraw by dequeueing the "live" glock and trying to enqueue it in EX mode, thus forcing the other nodes to all see a demote request, by way of a "1CB" (one callback) try lock. The "live" glock is not granted in EX; the callback is only just used to indicate a withdraw has occurred. Note that all nodes in the cluster must wait for the recovering node to finish replaying the withdrawing node's journal before continuing. To this end, it checks that the journals are clean multiple times in a retry loop. Also note that the withdraw function may be called from a wide variety of situations, and therefore, we need to take extra precautions to make sure pointers are valid before using them in many circumstances. We also need to take care when glocks decide to withdraw, since the withdraw code now uses glocks. Also, before this patch, if a process encountered an error and decided to withdraw, if another process was already withdrawing, the second withdraw would be silently ignored, which set it free to unlock its glocks. That's correct behavior if the original withdrawer encounters further errors down the road. But if secondary waiters don't wait for the journal replay, unlocking glocks will allow other nodes to use them, despite the fact that the journal containing those blocks is being replayed. The replay needs to finish before our glocks are released to other nodes. IOW, secondary withdraws need to wait for the first withdraw to finish. For example, if an rgrp glock is unlocked by a process that didn't wait for the first withdraw, a journal replay could introduce file system corruption by replaying a rgrp block that has already been granted to a different cluster node. Signed-off-by: Bob Peterson <rpeterso@redhat.com>
2020-01-28 19:23:45 +00:00
if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
goto skip_inval;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP);
GLOCK_BUG_ON(gl, gl->gl_state == target);
GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
glops->go_inval) {
/*
* If another process is already doing the invalidate, let that
* finish first. The glock state machine will get back to this
* holder again later.
*/
if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS,
&gl->gl_flags))
return;
do_error(gl, 0); /* Fail queued try locks */
}
gl->gl_req = target;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
set_bit(GLF_BLOCKING, &gl->gl_flags);
if ((gl->gl_req == LM_ST_UNLOCKED) ||
(gl->gl_state == LM_ST_EXCLUSIVE) ||
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
if (!glops->go_inval && !glops->go_sync)
goto skip_inval;
spin_unlock(&gl->gl_lockref.lock);
if (glops->go_sync) {
ret = glops->go_sync(gl);
/* If we had a problem syncing (due to io errors or whatever,
* we should not invalidate the metadata or tell dlm to
* release the glock to other nodes.
*/
if (ret) {
if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
fs_err(sdp, "Error %d syncing glock \n", ret);
gfs2_dump_glock(NULL, gl, true);
}
spin_lock(&gl->gl_lockref.lock);
goto skip_inval;
}
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
/*
* The call to go_sync should have cleared out the ail list.
* If there are still items, we have a problem. We ought to
* withdraw, but we can't because the withdraw code also uses
* glocks. Warn about the error, dump the glock, then fall
* through and wait for logd to do the withdraw for us.
*/
if ((atomic_read(&gl->gl_ail_count) != 0) &&
(!cmpxchg(&sdp->sd_log_error, 0, -EIO))) {
gfs2_glock_assert_warn(gl,
!atomic_read(&gl->gl_ail_count));
gfs2_dump_glock(NULL, gl, true);
}
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
spin_lock(&gl->gl_lockref.lock);
skip_inval:
gl->gl_lockref.count++;
/*
* Check for an error encountered since we called go_sync and go_inval.
* If so, we can't withdraw from the glock code because the withdraw
* code itself uses glocks (see function signal_our_withdraw) to
* change the mount to read-only. Most importantly, we must not call
* dlm to unlock the glock until the journal is in a known good state
* (after journal replay) otherwise other nodes may use the object
* (rgrp or dinode) and then later, journal replay will corrupt the
* file system. The best we can do here is wait for the logd daemon
* to see sd_log_error and withdraw, and in the meantime, requeue the
* work for later.
*
* We make a special exception for some system glocks, such as the
* system statfs inode glock, which needs to be granted before the
* gfs2_quotad daemon can exit, and that exit needs to finish before
* we can unmount the withdrawn file system.
*
* However, if we're just unlocking the lock (say, for unmount, when
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp))
gfs2_withdraw_delayed(sdp);
if (glock_blocked_by_withdraw(gl) &&
(target != LM_ST_UNLOCKED ||
test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
if (!is_system_glock(gl)) {
handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */
/*
* Ordinarily, we would call dlm and its callback would call
* finish_xmote, which would call state_change() to the new state.
* Since we withdrew, we won't call dlm, so call state_change
* manually, but to the UNLOCKED state we desire.
*/
state_change(gl, LM_ST_UNLOCKED);
/*
* We skip telling dlm to do the locking, so we won't get a
* reply that would otherwise clear GLF_LOCK. So we clear it here.
*/
clear_bit(GLF_LOCK, &gl->gl_flags);
clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
return;
} else {
clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
if (ls->ls_ops->lm_lock) {
spin_unlock(&gl->gl_lockref.lock);
ret = ls->ls_ops->lm_lock(gl, target, lck_flags);
spin_lock(&gl->gl_lockref.lock);
if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
target == LM_ST_UNLOCKED &&
test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
/*
* The lockspace has been released and the lock has
* been unlocked implicitly.
*/
} else if (ret) {
fs_err(sdp, "lm_lock ret %d\n", ret);
target = gl->gl_state | LM_OUT_ERROR;
} else {
/* The operation will be completed asynchronously. */
return;
}
}
/* Complete the operation now. */
finish_xmote(gl, target);
gfs2_glock_queue_work(gl, 0);
}
/**
* run_queue - do all outstanding tasks related to a glock
* @gl: The glock in question
* @nonblock: True if we must not block in run_queue
*
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
if (test_bit(GLF_LOCK, &gl->gl_flags))
return;
set_bit(GLF_LOCK, &gl->gl_flags);
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
gl->gl_demote_state != gl->gl_state) {
if (find_first_holder(gl))
goto out_unlock;
if (nonblock)
goto out_sched;
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
if (do_promote(gl))
goto out_unlock;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
}
do_xmote(gl, gh, gl->gl_target);
return;
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
smp_mb__after_atomic();
gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
return;
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
smp_mb__after_atomic();
return;
}
/**
* glock_set_object - set the gl_object field of a glock
* @gl: the glock
* @object: the object
*/
void glock_set_object(struct gfs2_glock *gl, void *object)
{
void *prev_object;
spin_lock(&gl->gl_lockref.lock);
prev_object = gl->gl_object;
gl->gl_object = object;
spin_unlock(&gl->gl_lockref.lock);
if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
pr_warn("glock=%u/%llx\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number);
gfs2_dump_glock(NULL, gl, true);
}
}
/**
* glock_clear_object - clear the gl_object field of a glock
* @gl: the glock
* @object: object the glock currently points at
*/
void glock_clear_object(struct gfs2_glock *gl, void *object)
{
void *prev_object;
spin_lock(&gl->gl_lockref.lock);
prev_object = gl->gl_object;
gl->gl_object = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) {
pr_warn("glock=%u/%llx\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number);
gfs2_dump_glock(NULL, gl, true);
}
}
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
{
struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
if (ri->ri_magic == 0)
ri->ri_magic = cpu_to_be32(GFS2_MAGIC);
if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC))
ri->ri_generation_deleted = cpu_to_be64(generation);
}
bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation)
{
struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC))
return false;
return generation <= be64_to_cpu(ri->ri_generation_deleted);
}
static void gfs2_glock_poke(struct gfs2_glock *gl)
{
int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP;
struct gfs2_holder gh;
int error;
__gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_);
error = gfs2_glock_nq(&gh);
if (!error)
gfs2_glock_dq(&gh);
gfs2_holder_uninit(&gh);
}
static bool gfs2_try_evict(struct gfs2_glock *gl)
{
struct gfs2_inode *ip;
bool evicted = false;
/*
* If there is contention on the iopen glock and we have an inode, try
* to grab and release the inode so that it can be evicted. This will
* allow the remote node to go ahead and delete the inode without us
* having to do it, which will avoid rgrp glock thrashing.
*
* The remote node is likely still holding the corresponding inode
* glock, so it will run before we get to verify that the delete has
* happened below.
*/
spin_lock(&gl->gl_lockref.lock);
ip = gl->gl_object;
if (ip && !igrab(&ip->i_inode))
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
gl->gl_no_formal_ino = ip->i_no_formal_ino;
set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
d_prune_aliases(&ip->i_inode);
iput(&ip->i_inode);
/* If the inode was evicted, gl->gl_object will now be NULL. */
spin_lock(&gl->gl_lockref.lock);
ip = gl->gl_object;
if (ip) {
clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
if (!igrab(&ip->i_inode))
ip = NULL;
}
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
gfs2_glock_poke(ip->i_gl);
iput(&ip->i_inode);
}
evicted = !ip;
}
return evicted;
}
bool gfs2_queue_try_to_evict(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags))
return false;
return queue_delayed_work(sdp->sd_delete_wq,
&gl->gl_delete, 0);
}
static bool gfs2_queue_verify_evict(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags))
return false;
return queue_delayed_work(sdp->sd_delete_wq,
&gl->gl_delete, 5 * HZ);
}
static void delete_work_func(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) {
/*
* If we can evict the inode, give the remote node trying to
* delete the inode some time before verifying that the delete
* has happened. Otherwise, if we cause contention on the inode glock
* immediately, the remote node will think that we still have
* the inode in use, and so it will give up waiting.
*
* If we can't evict the inode, signal to the remote node that
* the inode is still in use. We'll later try to delete the
* inode locally in gfs2_evict_inode.
*
* FIXME: We only need to verify that the remote node has
* deleted the inode because nodes before this remote delete
* rework won't cooperate. At a later time, when we no longer
* care about compatibility with such nodes, we can skip this
* step entirely.
*/
if (gfs2_try_evict(gl)) {
if (test_bit(SDF_KILL, &sdp->sd_flags))
goto out;
if (gfs2_queue_verify_evict(gl))
return;
}
goto out;
}
if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) {
inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
GFS2_BLKST_UNLINKED);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -EAGAIN &&
!test_bit(SDF_KILL, &sdp->sd_flags) &&
gfs2_queue_verify_evict(gl))
return;
} else {
d_prune_aliases(inode);
iput(inode);
}
}
out:
gfs2_glock_put(gl);
}
static void glock_work_func(struct work_struct *work)
{
unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
unsigned int drop_refs = 1;
spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
clear_bit(GLF_REPLY_PENDING, &gl->gl_flags);
finish_xmote(gl, gl->gl_reply);
drop_refs++;
}
GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2011-05-24 14:44:42 +00:00
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
unsigned long holdtime, now = jiffies;
GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2011-05-24 14:44:42 +00:00
holdtime = gl->gl_tchange + gl->gl_hold_time;
if (time_before(now, holdtime))
delay = holdtime - now;
GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2011-05-24 14:44:42 +00:00
if (!delay) {
clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
gfs2_set_demote(gl);
GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2011-05-24 14:44:42 +00:00
}
}
run_queue(gl, 0);
if (delay) {
/* Keep one glock reference for the work we requeue. */
drop_refs--;
if (gl->gl_name.ln_type != LM_TYPE_INODE)
delay = 0;
gfs2_glock_queue_work(gl, delay);
}
/*
* Drop the remaining glock references manually here. (Mind that
* gfs2_glock_queue_work depends on the lockref spinlock begin held
* here as well.)
*/
gl->gl_lockref.count -= drop_refs;
if (!gl->gl_lockref.count) {
__gfs2_glock_put(gl);
return;
}
spin_unlock(&gl->gl_lockref.lock);
}
static struct gfs2_glock *find_insert_glock(struct lm_lockname *name,
struct gfs2_glock *new)
{
struct wait_glock_queue wait;
wait_queue_head_t *wq = glock_waitqueue(name);
struct gfs2_glock *gl;
wait.name = name;
init_wait(&wait.wait);
wait.wait.func = glock_wake_function;
again:
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
rcu_read_lock();
if (new) {
gl = rhashtable_lookup_get_insert_fast(&gl_hash_table,
&new->gl_node, ht_parms);
if (IS_ERR(gl))
goto out;
} else {
gl = rhashtable_lookup_fast(&gl_hash_table,
name, ht_parms);
}
if (gl && !lockref_get_not_dead(&gl->gl_lockref)) {
rcu_read_unlock();
schedule();
goto again;
}
out:
rcu_read_unlock();
finish_wait(wq, &wait.wait);
return gl;
}
/**
* gfs2_glock_get() - Get a glock, or create one if one doesn't exist
* @sdp: The GFS2 superblock
* @number: the lock number
* @glops: The glock_operations to use
* @create: If 0, don't create the glock if it doesn't exist
* @glp: the glock is returned here
*
* This does not lock a glock, just finds/creates structures for one.
*
* Returns: errno
*/
int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops, int create,
struct gfs2_glock **glp)
{
struct super_block *s = sdp->sd_vfs;
struct lm_lockname name = { .ln_number = number,
.ln_type = glops->go_type,
.ln_sbd = sdp };
struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
int ret = 0;
gl = find_insert_glock(&name, NULL);
if (gl) {
*glp = gl;
return 0;
}
if (!create)
return -ENOENT;
if (glops->go_flags & GLOF_ASPACE) {
struct gfs2_glock_aspace *gla =
kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS);
if (!gla)
return -ENOMEM;
gl = &gla->glock;
} else {
gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS);
if (!gl)
return -ENOMEM;
}
memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
gl->gl_ops = glops;
if (glops->go_flags & GLOF_LVB) {
gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
if (!gl->gl_lksb.sb_lvbptr) {
gfs2_glock_dealloc(&gl->gl_rcu);
return -ENOMEM;
}
}
atomic_inc(&sdp->sd_glock_disposal);
gl->gl_node.next = NULL;
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0;
gl->gl_name = name;
gfs2: set lockdep subclass for iopen glocks This patch introduce a new globs attribute to define the subclass of the glock lockref spinlock. This avoid the following lockdep warning, which occurs when we lock an inode lock while an iopen lock is held: ============================================ WARNING: possible recursive locking detected 5.10.0-rc3+ #4990 Not tainted -------------------------------------------- kworker/0:1/12 is trying to acquire lock: ffff9067d45672d8 (&gl->gl_lockref.lock){+.+.}-{3:3}, at: lockref_get+0x9/0x20 but task is already holding lock: ffff9067da308588 (&gl->gl_lockref.lock){+.+.}-{3:3}, at: delete_work_func+0x164/0x260 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&gl->gl_lockref.lock); lock(&gl->gl_lockref.lock); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by kworker/0:1/12: #0: ffff9067c1bfdd38 ((wq_completion)delete_workqueue){+.+.}-{0:0}, at: process_one_work+0x1b7/0x540 #1: ffffac594006be70 ((work_completion)(&(&gl->gl_delete)->work)){+.+.}-{0:0}, at: process_one_work+0x1b7/0x540 #2: ffff9067da308588 (&gl->gl_lockref.lock){+.+.}-{3:3}, at: delete_work_func+0x164/0x260 stack backtrace: CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.10.0-rc3+ #4990 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Workqueue: delete_workqueue delete_work_func Call Trace: dump_stack+0x8b/0xb0 __lock_acquire.cold+0x19e/0x2e3 lock_acquire+0x150/0x410 ? lockref_get+0x9/0x20 _raw_spin_lock+0x27/0x40 ? lockref_get+0x9/0x20 lockref_get+0x9/0x20 delete_work_func+0x188/0x260 process_one_work+0x237/0x540 worker_thread+0x4d/0x3b0 ? process_one_work+0x540/0x540 kthread+0x127/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Suggested-by: Andreas Gruenbacher <agruenba@redhat.com> Signed-off-by: Alexander Aring <aahringo@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2020-11-23 15:53:35 +00:00
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_dstamp = 0;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
preempt_disable();
/* We use the global stats to estimate the initial per-glock stats */
gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type];
preempt_enable();
gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
if (gl->gl_name.ln_type == LM_TYPE_IOPEN)
INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func);
mapping = gfs2_glock2aspace(gl);
if (mapping) {
mapping->a_ops = &gfs2_meta_aops;
mapping->host = s->s_bdev->bd_mapping->host;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
mapping->writeback_index = 0;
}
tmp = find_insert_glock(&name, gl);
if (!tmp) {
*glp = gl;
goto out;
}
if (IS_ERR(tmp)) {
ret = PTR_ERR(tmp);
goto out_free;
}
*glp = tmp;
out_free:
gfs2_glock_dealloc(&gl->gl_rcu);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_kill_wait);
out:
return ret;
}
/**
* __gfs2_holder_init - initialize a struct gfs2_holder in the default way
* @gl: the glock
* @state: the state we're requesting
* @flags: the modifier flags
* @gh: the holder structure
*
*/
void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
gh->gh_gl = gfs2_glock_hold(gl);
gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
}
/**
* gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
* @state: the state we're requesting
* @flags: the modifier flags
* @gh: the holder structure
*
* Don't mess with the glock.
*
*/
void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh)
{
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
gh->gh_ip = _RET_IP_;
put_pid(gh->gh_owner_pid);
gh->gh_owner_pid = get_pid(task_pid(current));
}
/**
* gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
* @gh: the holder structure
*
*/
void gfs2_holder_uninit(struct gfs2_holder *gh)
{
put_pid(gh->gh_owner_pid);
gfs2_glock_put(gh->gh_gl);
gfs2_holder_mark_uninitialized(gh);
gh->gh_ip = 0;
}
static void gfs2_glock_update_hold_time(struct gfs2_glock *gl,
unsigned long start_time)
{
/* Have we waited longer that a second? */
if (time_after(jiffies, start_time + HZ)) {
/* Lengthen the minimum hold time. */
gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR,
GL_GLOCK_MAX_HOLD);
}
}
/**
* gfs2_glock_holder_ready - holder is ready and its error code can be collected
* @gh: the glock holder
*
* Called when a glock holder no longer needs to be waited for because it is
* now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has
* failed (gh_error != 0).
*/
int gfs2_glock_holder_ready(struct gfs2_holder *gh)
{
if (gh->gh_error || (gh->gh_flags & GL_SKIP))
return gh->gh_error;
gh->gh_error = gfs2_instantiate(gh);
if (gh->gh_error)
gfs2_glock_dq(gh);
return gh->gh_error;
}
/**
* gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
* Returns: 0 on success
*/
int gfs2_glock_wait(struct gfs2_holder *gh)
{
unsigned long start_time = jiffies;
might_sleep();
sched: Remove proliferation of wait_on_bit() action functions The current "wait_on_bit" interface requires an 'action' function to be provided which does the actual waiting. There are over 20 such functions, many of them identical. Most cases can be satisfied by one of just two functions, one which uses io_schedule() and one which just uses schedule(). So: Rename wait_on_bit and wait_on_bit_lock to wait_on_bit_action and wait_on_bit_lock_action to make it explicit that they need an action function. Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io which are *not* given an action function but implicitly use a standard one. The decision to error-out if a signal is pending is now made based on the 'mode' argument rather than being encoded in the action function. All instances of the old wait_on_bit and wait_on_bit_lock which can use the new version have been changed accordingly and their action functions have been discarded. wait_on_bit{_lock} does not return any specific error code in the event of a signal so the caller must check for non-zero and interpolate their own error code as appropriate. The wait_on_bit() call in __fscache_wait_on_invalidate() was ambiguous as it specified TASK_UNINTERRUPTIBLE but used fscache_wait_bit_interruptible as an action function. David Howells confirms this should be uniformly "uninterruptible" The main remaining user of wait_on_bit{,_lock}_action is NFS which needs to use a freezer-aware schedule() call. A comment in fs/gfs2/glock.c notes that having multiple 'action' functions is useful as they display differently in the 'wchan' field of 'ps'. (and /proc/$PID/wchan). As the new bit_wait{,_io} functions are tagged "__sched", they will not show up at all, but something higher in the stack. So the distinction will still be visible, only with different function names (gds2_glock_wait versus gfs2_glock_dq_wait in the gfs2/glock.c case). Since first version of this patch (against 3.15) two new action functions appeared, on in NFS and one in CIFS. CIFS also now uses an action function that makes the same freezer aware schedule call as NFS. Signed-off-by: NeilBrown <neilb@suse.de> Acked-by: David Howells <dhowells@redhat.com> (fscache, keys) Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2) Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Steve French <sfrench@samba.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 05:16:04 +00:00
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
gfs2_glock_update_hold_time(gh->gh_gl, start_time);
return gfs2_glock_holder_ready(gh);
}
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs)
{
int i;
for (i = 0; i < num_gh; i++)
if (test_bit(HIF_WAIT, &ghs[i].gh_iflags))
return 1;
return 0;
}
/**
* gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions
* @num_gh: the number of holders in the array
* @ghs: the glock holder array
*
* Returns: 0 on success, meaning all glocks have been granted and are held.
* -ESTALE if the request timed out, meaning all glocks were released,
* and the caller should retry the operation.
*/
int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
{
struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd;
int i, ret = 0, timeout = 0;
unsigned long start_time = jiffies;
might_sleep();
/*
* Total up the (minimum hold time * 2) of all glocks and use that to
* determine the max amount of time we should wait.
*/
for (i = 0; i < num_gh; i++)
timeout += ghs[i].gh_gl->gl_hold_time << 1;
if (!wait_event_timeout(sdp->sd_async_glock_wait,
!glocks_pending(num_gh, ghs), timeout)) {
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
ret = -ESTALE; /* request timed out. */
goto out;
}
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
for (i = 0; i < num_gh; i++) {
struct gfs2_holder *gh = &ghs[i];
int ret2;
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
if (test_bit(HIF_HOLDER, &gh->gh_iflags)) {
gfs2_glock_update_hold_time(gh->gh_gl,
start_time);
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
}
ret2 = gfs2_glock_holder_ready(gh);
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
if (!ret)
ret = ret2;
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
}
out:
if (ret) {
for (i = 0; i < num_gh; i++) {
struct gfs2_holder *gh = &ghs[i];
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
gfs2_glock_dq(gh);
}
}
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
return ret;
}
/**
* handle_callback - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
* @delay: zero to demote immediately; otherwise pending demote
* @remote: true if this came from a different cluster node
*
* There are only two requests that we are going to see in actual
* practise: LM_ST_SHARED and LM_ST_UNLOCKED
*/
static void handle_callback(struct gfs2_glock *gl, unsigned int state,
unsigned long delay, bool remote)
{
if (delay)
set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
else
gfs2_set_demote(gl);
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
gl->gl_demote_state = LM_ST_UNLOCKED;
}
if (gl->gl_ops->go_callback)
gl->gl_ops->go_callback(gl, remote);
trace_gfs2_demote_rq(gl, remote);
}
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
if (seq) {
seq_vprintf(seq, fmt, args);
} else {
vaf.fmt = fmt;
vaf.va = &args;
pr_err("%pV", &vaf);
}
va_end(args);
}
static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
{
if (!(gh->gh_flags & GL_NOPID))
return true;
if (gh->gh_state == LM_ST_UNLOCKED)
return true;
return false;
}
/**
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
*
* Eventually we should move the recursive locking trap to a
* debugging option or something like that. This is the fast
* path and needs to have the minimum number of distractions.
*
*/
static inline void add_to_queue(struct gfs2_holder *gh)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-08-30 17:31:02 +00:00
GLOCK_BUG_ON(gl, true);
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
struct gfs2_holder *current_gh;
current_gh = find_first_holder(gl);
try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (likely(gh2->gh_owner_pid != gh->gh_owner_pid))
continue;
if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK)
continue;
if (!pid_is_meaningful(gh2))
continue;
goto trap_recursive;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
fail:
gh->gh_error = GLR_TRYFAILED;
gfs2_holder_wake(gh);
return;
}
if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
continue;
}
trace_gfs2_glock_queue(gh, 1);
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
if (likely(insert_pt == NULL)) {
list_add_tail(&gh->gh_list, &gl->gl_holders);
return;
}
list_add_tail(&gh->gh_list, insert_pt);
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
spin_lock(&gl->gl_lockref.lock);
return;
trap_recursive:
fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip);
fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid));
fs_err(sdp, "lock type: %d req lock state : %d\n",
gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip);
fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid));
fs_err(sdp, "lock type: %d req lock state : %d\n",
gh->gh_gl->gl_name.ln_type, gh->gh_state);
gfs2_dump_glock(NULL, gl, true);
BUG();
}
/**
* gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
* @gh: the holder structure
*
* if (gh->gh_flags & GL_ASYNC), this never returns an error
*
* Returns: 0, GLR_TRYFAILED, or errno on failure
*/
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
int error;
gfs2: Force withdraw to replay journals and wait for it to finish When a node withdraws from a file system, it often leaves its journal in an incomplete state. This is especially true when the withdraw is caused by io errors writing to the journal. Before this patch, a withdraw would try to write a "shutdown" record to the journal, tell dlm it's done with the file system, and none of the other nodes know about the problem. Later, when the problem is fixed and the withdrawn node is rebooted, it would then discover that its own journal was incomplete, and replay it. However, replaying it at this point is almost guaranteed to introduce corruption because the other nodes are likely to have used affected resource groups that appeared in the journal since the time of the withdraw. Replaying the journal later will overwrite any changes made, and not through any fault of dlm, which was instructed during the withdraw to release those resources. This patch makes file system withdraws seen by the entire cluster. Withdrawing nodes dequeue their journal glock to allow recovery. The remaining nodes check all the journals to see if they are clean or in need of replay. They try to replay dirty journals, but only the journals of withdrawn nodes will be "not busy" and therefore available for replay. Until the journal replay is complete, no i/o related glocks may be given out, to ensure that the replay does not cause the aforementioned corruption: We cannot allow any journal replay to overwrite blocks associated with a glock once it is held. The "live" glock which is now used to signal when a withdraw occurs. When a withdraw occurs, the node signals its withdraw by dequeueing the "live" glock and trying to enqueue it in EX mode, thus forcing the other nodes to all see a demote request, by way of a "1CB" (one callback) try lock. The "live" glock is not granted in EX; the callback is only just used to indicate a withdraw has occurred. Note that all nodes in the cluster must wait for the recovering node to finish replaying the withdrawing node's journal before continuing. To this end, it checks that the journals are clean multiple times in a retry loop. Also note that the withdraw function may be called from a wide variety of situations, and therefore, we need to take extra precautions to make sure pointers are valid before using them in many circumstances. We also need to take care when glocks decide to withdraw, since the withdraw code now uses glocks. Also, before this patch, if a process encountered an error and decided to withdraw, if another process was already withdrawing, the second withdraw would be silently ignored, which set it free to unlock its glocks. That's correct behavior if the original withdrawer encounters further errors down the road. But if secondary waiters don't wait for the journal replay, unlocking glocks will allow other nodes to use them, despite the fact that the journal containing those blocks is being replayed. The replay needs to finish before our glocks are released to other nodes. IOW, secondary withdraws need to wait for the first withdraw to finish. For example, if an rgrp glock is unlocked by a process that didn't wait for the first withdraw, a journal replay could introduce file system corruption by replaying a rgrp block that has already been granted to a different cluster node. Signed-off-by: Bob Peterson <rpeterso@redhat.com>
2020-01-28 19:23:45 +00:00
if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP))
return -EIO;
if (gh->gh_flags & GL_NOBLOCK) {
struct gfs2_holder *current_gh;
error = -ECHILD;
spin_lock(&gl->gl_lockref.lock);
if (find_last_waiter(gl))
goto unlock;
current_gh = find_first_holder(gl);
if (!may_grant(gl, current_gh, gh))
goto unlock;
set_bit(HIF_HOLDER, &gh->gh_iflags);
list_add_tail(&gh->gh_list, &gl->gl_holders);
trace_gfs2_promote(gh);
error = 0;
unlock:
spin_unlock(&gl->gl_lockref.lock);
return error;
}
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
gh->gh_error = 0;
spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
}
run_queue(gl, 1);
spin_unlock(&gl->gl_lockref.lock);
error = 0;
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
return error;
}
/**
* gfs2_glock_poll - poll to see if an async request has been completed
* @gh: the holder
*
* Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
*/
int gfs2_glock_poll(struct gfs2_holder *gh)
{
return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
static inline bool needs_demote(struct gfs2_glock *gl)
{
return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
}
static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
unsigned delay = 0;
int fast_path = 0;
/*
* This holder should not be cached, so mark it for demote.
* Note: this should be done before the check for needs_demote
* below.
*/
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
clear_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_glock_queue(gh, 0);
/*
* If there hasn't been a demote request we are done.
* (Let the remaining holders, if any, keep holding it.)
*/
if (!needs_demote(gl)) {
if (list_empty(&gl->gl_holders))
fast_path = 1;
}
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
gfs2_glock_add_to_lru(gl);
if (unlikely(!fast_path)) {
gl->gl_lockref.count++;
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
!test_bit(GLF_DEMOTE, &gl->gl_flags) &&
gl->gl_name.ln_type == LM_TYPE_INODE)
delay = gl->gl_hold_time;
gfs2_glock_queue_work(gl, delay);
}
}
/**
* gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
* @gh: the glock holder
*
*/
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
spin_lock(&gl->gl_lockref.lock);
if (!gfs2_holder_queued(gh)) {
/*
* May have already been dequeued because the locking request
* was GL_ASYNC and it has failed in the meantime.
*/
goto out;
}
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
spin_unlock(&gl->gl_lockref.lock);
gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl);
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
spin_lock(&gl->gl_lockref.lock);
}
/*
* If we're in the process of file system withdraw, we cannot just
* dequeue any glocks until our journal is recovered, lest we introduce
* file system corruption. We need two exceptions to this rule: We need
* to allow unlocking of nondisk glocks and the glock for our own
* journal that needs recovery.
*/
if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
glock_blocked_by_withdraw(gl) &&
gh->gh_gl != sdp->sd_jinode_gl) {
sdp->sd_glock_dqs_held++;
spin_unlock(&gl->gl_lockref.lock);
might_sleep();
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
TASK_UNINTERRUPTIBLE);
spin_lock(&gl->gl_lockref.lock);
}
__gfs2_glock_dq(gh);
out:
spin_unlock(&gl->gl_lockref.lock);
}
void gfs2_glock_dq_wait(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
gfs2_glock_dq(gh);
might_sleep();
sched: Remove proliferation of wait_on_bit() action functions The current "wait_on_bit" interface requires an 'action' function to be provided which does the actual waiting. There are over 20 such functions, many of them identical. Most cases can be satisfied by one of just two functions, one which uses io_schedule() and one which just uses schedule(). So: Rename wait_on_bit and wait_on_bit_lock to wait_on_bit_action and wait_on_bit_lock_action to make it explicit that they need an action function. Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io which are *not* given an action function but implicitly use a standard one. The decision to error-out if a signal is pending is now made based on the 'mode' argument rather than being encoded in the action function. All instances of the old wait_on_bit and wait_on_bit_lock which can use the new version have been changed accordingly and their action functions have been discarded. wait_on_bit{_lock} does not return any specific error code in the event of a signal so the caller must check for non-zero and interpolate their own error code as appropriate. The wait_on_bit() call in __fscache_wait_on_invalidate() was ambiguous as it specified TASK_UNINTERRUPTIBLE but used fscache_wait_bit_interruptible as an action function. David Howells confirms this should be uniformly "uninterruptible" The main remaining user of wait_on_bit{,_lock}_action is NFS which needs to use a freezer-aware schedule() call. A comment in fs/gfs2/glock.c notes that having multiple 'action' functions is useful as they display differently in the 'wchan' field of 'ps'. (and /proc/$PID/wchan). As the new bit_wait{,_io} functions are tagged "__sched", they will not show up at all, but something higher in the stack. So the distinction will still be visible, only with different function names (gds2_glock_wait versus gfs2_glock_dq_wait in the gfs2/glock.c case). Since first version of this patch (against 3.15) two new action functions appeared, on in NFS and one in CIFS. CIFS also now uses an action function that makes the same freezer aware schedule call as NFS. Signed-off-by: NeilBrown <neilb@suse.de> Acked-by: David Howells <dhowells@redhat.com> (fscache, keys) Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2) Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Steve French <sfrench@samba.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 05:16:04 +00:00
wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
}
/**
* gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
* @gh: the holder structure
*
*/
void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
{
gfs2_glock_dq(gh);
gfs2_holder_uninit(gh);
}
/**
* gfs2_glock_nq_num - acquire a glock based on lock number
* @sdp: the filesystem
* @number: the lock number
* @glops: the glock operations for the type of glock
* @state: the state to acquire the glock in
* @flags: modifier flags for the acquisition
* @gh: the struct gfs2_holder
*
* Returns: errno
*/
int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
unsigned int state, u16 flags, struct gfs2_holder *gh)
{
struct gfs2_glock *gl;
int error;
error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
if (!error) {
error = gfs2_glock_nq_init(gl, state, flags, gh);
gfs2_glock_put(gl);
}
return error;
}
/**
* glock_compare - Compare two struct gfs2_glock structures for sorting
* @arg_a: the first structure
* @arg_b: the second structure
*
*/
static int glock_compare(const void *arg_a, const void *arg_b)
{
const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
if (a->ln_number > b->ln_number)
return 1;
if (a->ln_number < b->ln_number)
return -1;
BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type);
return 0;
}
/**
* nq_m_sync - synchronously acquire more than one glock in deadlock free order
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
* @p: placeholder for the holder structure to pass back
*
* Returns: 0 on success (all glocks acquired),
* errno on failure (no glocks acquired)
*/
static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
struct gfs2_holder **p)
{
unsigned int x;
int error = 0;
for (x = 0; x < num_gh; x++)
p[x] = &ghs[x];
sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
for (x = 0; x < num_gh; x++) {
error = gfs2_glock_nq(p[x]);
if (error) {
while (x--)
gfs2_glock_dq(p[x]);
break;
}
}
return error;
}
/**
* gfs2_glock_nq_m - acquire multiple glocks
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
*
* Returns: 0 on success (all glocks acquired),
* errno on failure (no glocks acquired)
*/
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
struct gfs2_holder *tmp[4];
struct gfs2_holder **pph = tmp;
int error = 0;
switch(num_gh) {
case 0:
return 0;
case 1:
return gfs2_glock_nq(ghs);
default:
if (num_gh <= 4)
break;
treewide: kmalloc() -> kmalloc_array() The kmalloc() function has a 2-factor argument form, kmalloc_array(). This patch replaces cases of: kmalloc(a * b, gfp) with: kmalloc_array(a * b, gfp) as well as handling cases of: kmalloc(a * b * c, gfp) with: kmalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kmalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kmalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The tools/ directory was manually excluded, since it has its own implementation of kmalloc(). The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(char) * COUNT + COUNT , ...) | kmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kmalloc + kmalloc_array ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kmalloc(C1 * C2 * C3, ...) | kmalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kmalloc(sizeof(THING) * C2, ...) | kmalloc(sizeof(TYPE) * C2, ...) | kmalloc(C1 * C2 * C3, ...) | kmalloc(C1 * C2, ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - (E1) * E2 + E1, E2 , ...) | - kmalloc + kmalloc_array ( - (E1) * (E2) + E1, E2 , ...) | - kmalloc + kmalloc_array ( - E1 * E2 + E1, E2 , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 20:55:00 +00:00
pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *),
GFP_NOFS);
if (!pph)
return -ENOMEM;
}
error = nq_m_sync(num_gh, ghs, pph);
if (pph != tmp)
kfree(pph);
return error;
}
/**
* gfs2_glock_dq_m - release multiple glocks
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
*
*/
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
while (num_gh--)
gfs2_glock_dq(&ghs[num_gh]);
}
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
{
unsigned long delay = 0;
unsigned long holdtime;
unsigned long now = jiffies;
gfs2_glock_hold(gl);
spin_lock(&gl->gl_lockref.lock);
holdtime = gl->gl_tchange + gl->gl_hold_time;
if (!list_empty(&gl->gl_holders) &&
gl->gl_name.ln_type == LM_TYPE_INODE) {
if (time_before(now, holdtime))
delay = holdtime - now;
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
handle_callback(gl, state, delay, true);
gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
}
/**
* gfs2_should_freeze - Figure out if glock should be frozen
* @gl: The glock in question
*
* Glocks are not frozen if (a) the result of the dlm operation is
* an error, (b) the locking operation was an unlock operation or
* (c) if there is a "noexp" flagged request anywhere in the queue
*
* Returns: 1 if freezing should occur, 0 otherwise
*/
static int gfs2_should_freeze(const struct gfs2_glock *gl)
{
const struct gfs2_holder *gh;
if (gl->gl_reply & ~LM_OUT_ST_MASK)
return 0;
if (gl->gl_target == LM_ST_UNLOCKED)
return 0;
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (LM_FLAG_NOEXP & gh->gh_flags)
return 0;
}
return 1;
}
/**
* gfs2_glock_complete - Callback used by locking
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
* The gl_reply field is under the gl_lockref.lock lock so that it is ok
* to use a bitfield shared with other glock state fields.
*/
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
spin_lock(&gl->gl_lockref.lock);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
static int glock_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct gfs2_glock *gla, *glb;
gla = list_entry(a, struct gfs2_glock, gl_lru);
glb = list_entry(b, struct gfs2_glock, gl_lru);
if (gla->gl_name.ln_number > glb->gl_name.ln_number)
return 1;
if (gla->gl_name.ln_number < glb->gl_name.ln_number)
return -1;
return 0;
}
static bool can_free_glock(struct gfs2_glock *gl)
{
bool held = gl->gl_state != LM_ST_UNLOCKED;
return !test_bit(GLF_LOCK, &gl->gl_flags) &&
gl->gl_lockref.count == held;
}
/**
* gfs2_dispose_glock_lru - Demote a list of glocks
* @list: The list to dispose of
*
* Disposing of glocks may involve disk accesses, so that here we sort
* the glocks by number (i.e. disk location of the inodes) so that if
* there are any such accesses, they'll be sent in order (mostly).
*
* Must be called under the lru_lock, but may drop and retake this
* lock. While the lru_lock is dropped, entries may vanish from the
* list, but no new entries will appear on the list (since it is
* private)
*/
static unsigned long gfs2_dispose_glock_lru(struct list_head *list)
__releases(&lru_lock)
__acquires(&lru_lock)
{
struct gfs2_glock *gl;
unsigned long freed = 0;
list_sort(NULL, list, glock_cmp);
while(!list_empty(list)) {
gl = list_first_entry(list, struct gfs2_glock, gl_lru);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_move(&gl->gl_lru, &lru_list);
continue;
}
if (!can_free_glock(gl)) {
spin_unlock(&gl->gl_lockref.lock);
goto add_back_to_lru;
}
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
gl->gl_lockref.count++;
if (demote_ok(gl))
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
return freed;
}
/**
* gfs2_scan_glock_lru - Scan the LRU looking for locks to demote
* @nr: The number of entries to scan
*
* This function selects the entries on the LRU which are able to
* be demoted, and then kicks off the process by calling
* gfs2_dispose_glock_lru() above.
*/
static unsigned long gfs2_scan_glock_lru(unsigned long nr)
{
struct gfs2_glock *gl, *next;
LIST_HEAD(dispose);
unsigned long freed = 0;
spin_lock(&lru_lock);
list_for_each_entry_safe(gl, next, &lru_list, gl_lru) {
if (!nr--)
break;
if (can_free_glock(gl))
list_move(&gl->gl_lru, &dispose);
}
if (!list_empty(&dispose))
freed = gfs2_dispose_glock_lru(&dispose);
spin_unlock(&lru_lock);
fs: convert fs shrinkers to new scan/count API Convert the filesystem shrinkers to use the new API, and standardise some of the behaviours of the shrinkers at the same time. For example, nr_to_scan means the number of objects to scan, not the number of objects to free. I refactored the CIFS idmap shrinker a little - it really needs to be broken up into a shrinker per tree and keep an item count with the tree root so that we don't need to walk the tree every time the shrinker needs to count the number of objects in the tree (i.e. all the time under memory pressure). [glommer@openvz.org: fixes for ext4, ubifs, nfs, cifs and glock. Fixes are needed mainly due to new code merged in the tree] [assorted fixes folded in] Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Glauber Costa <glommer@openvz.org> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Acked-by: Jan Kara <jack@suse.cz> Acked-by: Steven Whitehouse <swhiteho@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 00:18:09 +00:00
return freed;
}
fs: convert fs shrinkers to new scan/count API Convert the filesystem shrinkers to use the new API, and standardise some of the behaviours of the shrinkers at the same time. For example, nr_to_scan means the number of objects to scan, not the number of objects to free. I refactored the CIFS idmap shrinker a little - it really needs to be broken up into a shrinker per tree and keep an item count with the tree root so that we don't need to walk the tree every time the shrinker needs to count the number of objects in the tree (i.e. all the time under memory pressure). [glommer@openvz.org: fixes for ext4, ubifs, nfs, cifs and glock. Fixes are needed mainly due to new code merged in the tree] [assorted fixes folded in] Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Glauber Costa <glommer@openvz.org> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Acked-by: Jan Kara <jack@suse.cz> Acked-by: Steven Whitehouse <swhiteho@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 00:18:09 +00:00
static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
fs: convert fs shrinkers to new scan/count API Convert the filesystem shrinkers to use the new API, and standardise some of the behaviours of the shrinkers at the same time. For example, nr_to_scan means the number of objects to scan, not the number of objects to free. I refactored the CIFS idmap shrinker a little - it really needs to be broken up into a shrinker per tree and keep an item count with the tree root so that we don't need to walk the tree every time the shrinker needs to count the number of objects in the tree (i.e. all the time under memory pressure). [glommer@openvz.org: fixes for ext4, ubifs, nfs, cifs and glock. Fixes are needed mainly due to new code merged in the tree] [assorted fixes folded in] Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Glauber Costa <glommer@openvz.org> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Acked-by: Jan Kara <jack@suse.cz> Acked-by: Steven Whitehouse <swhiteho@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 00:18:09 +00:00
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
return gfs2_scan_glock_lru(sc->nr_to_scan);
}
fs: convert fs shrinkers to new scan/count API Convert the filesystem shrinkers to use the new API, and standardise some of the behaviours of the shrinkers at the same time. For example, nr_to_scan means the number of objects to scan, not the number of objects to free. I refactored the CIFS idmap shrinker a little - it really needs to be broken up into a shrinker per tree and keep an item count with the tree root so that we don't need to walk the tree every time the shrinker needs to count the number of objects in the tree (i.e. all the time under memory pressure). [glommer@openvz.org: fixes for ext4, ubifs, nfs, cifs and glock. Fixes are needed mainly due to new code merged in the tree] [assorted fixes folded in] Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Glauber Costa <glommer@openvz.org> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Acked-by: Jan Kara <jack@suse.cz> Acked-by: Steven Whitehouse <swhiteho@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 00:18:09 +00:00
static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
super: fix calculation of shrinkable objects for small numbers The sysctl knob sysctl_vfs_cache_pressure is used to determine which percentage of the shrinkable objects in our cache we should actively try to shrink. It works great in situations in which we have many objects (at least more than 100), because the aproximation errors will be negligible. But if this is not the case, specially when total_objects < 100, we may end up concluding that we have no objects at all (total / 100 = 0, if total < 100). This is certainly not the biggest killer in the world, but may matter in very low kernel memory situations. Signed-off-by: Glauber Costa <glommer@openvz.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Mel Gorman <mgorman@suse.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 00:17:53 +00:00
return vfs_pressure_ratio(atomic_read(&lru_count));
}
gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Abhinav Kumar <quic_abhinavk@quicinc.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Anna Schumaker <anna@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Carlos Llamas <cmllamas@google.com> Cc: Chandan Babu R <chandan.babu@oracle.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Mason <clm@fb.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christian Koenig <christian.koenig@amd.com> Cc: Chuck Lever <cel@kernel.org> Cc: Coly Li <colyli@suse.de> Cc: Dai Ngo <Dai.Ngo@oracle.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Airlie <airlied@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Sterba <dsterba@suse.com> Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Cc: Gao Xiang <hsiangkao@linux.alibaba.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Rui <ray.huang@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Wang <jasowang@redhat.com> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jeffle Xu <jefflexu@linux.alibaba.com> Cc: Joel Fernandes (Google) <joel@joelfernandes.org> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Juergen Gross <jgross@suse.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Kirill Tkhai <tkhai@ya.ru> Cc: Marijn Suijten <marijn.suijten@somainline.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mike Snitzer <snitzer@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nadav Amit <namit@vmware.com> Cc: Neil Brown <neilb@suse.de> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Olga Kornievskaia <kolga@netapp.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rob Clark <robdclark@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Sean Paul <sean@poorly.run> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Song Liu <song@kernel.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Tom Talpey <tom@talpey.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Cc: Yue Hu <huyue2@coolpad.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:07 +00:00
static struct shrinker *glock_shrinker;
/**
* glock_hash_walk - Call a function for glock in a hash bucket
* @examiner: the function
* @sdp: the filesystem
*
* Note that the function can be called multiple times on the same
* object. So the user must ensure that the function can cope with
* that.
*/
static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
struct rhashtable_iter iter;
rhashtable_walk_enter(&gl_hash_table, &iter);
do {
rhashtable_walk_start(&iter);
while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) {
if (gl->gl_name.ln_sbd == sdp)
examiner(gl);
}
rhashtable_walk_stop(&iter);
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
rhashtable_walk_exit(&iter);
}
void gfs2_cancel_delete_work(struct gfs2_glock *gl)
{
clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags);
clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags);
if (cancel_delayed_work(&gl->gl_delete))
gfs2_glock_put(gl);
}
static void flush_delete_work(struct gfs2_glock *gl)
{
if (gl->gl_name.ln_type == LM_TYPE_IOPEN) {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (cancel_delayed_work(&gl->gl_delete)) {
queue_delayed_work(sdp->sd_delete_wq,
&gl->gl_delete, 0);
}
}
}
void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
{
glock_hash_walk(flush_delete_work, sdp);
flush_workqueue(sdp->sd_delete_wq);
}
/**
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @gl: The glock to thaw
*
*/
static void thaw_glock(struct gfs2_glock *gl)
{
if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
return;
if (!lockref_get_not_dead(&gl->gl_lockref))
return;
spin_lock(&gl->gl_lockref.lock);
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
/**
* clear_glock - look at a glock and see if we can free it from glock cache
* @gl: the glock to look at
*
*/
static void clear_glock(struct gfs2_glock *gl)
{
gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
if (!__lockref_is_dead(&gl->gl_lockref)) {
gl->gl_lockref.count++;
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
gfs2_glock_queue_work(gl, 0);
}
spin_unlock(&gl->gl_lockref.lock);
}
/**
* gfs2_glock_thaw - Thaw any frozen glocks
* @sdp: The super block
*
*/
void gfs2_glock_thaw(struct gfs2_sbd *sdp)
{
glock_hash_walk(thaw_glock, sdp);
}
static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid)
{
spin_lock(&gl->gl_lockref.lock);
gfs2_dump_glock(seq, gl, fsid);
spin_unlock(&gl->gl_lockref.lock);
}
static void dump_glock_func(struct gfs2_glock *gl)
{
dump_glock(NULL, gl, true);
}
static void withdraw_dq(struct gfs2_glock *gl)
{
spin_lock(&gl->gl_lockref.lock);
if (!__lockref_is_dead(&gl->gl_lockref) &&
glock_blocked_by_withdraw(gl))
do_error(gl, LM_OUT_ERROR); /* remove pending waiters */
spin_unlock(&gl->gl_lockref.lock);
}
void gfs2_gl_dq_holders(struct gfs2_sbd *sdp)
{
glock_hash_walk(withdraw_dq, sdp);
}
/**
* gfs2_gl_hash_clear - Empty out the glock hash table
* @sdp: the filesystem
*
* Called when unmounting the filesystem.
*/
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
wait_event_timeout(sdp->sd_kill_wait,
atomic_read(&sdp->sd_glock_disposal) == 0,
HZ * 600);
gfs2_lm_unmount(sdp);
gfs2_free_dead_glocks(sdp);
glock_hash_walk(dump_glock_func, sdp);
}
static const char *state2str(unsigned state)
{
switch(state) {
case LM_ST_UNLOCKED:
return "UN";
case LM_ST_SHARED:
return "SH";
case LM_ST_DEFERRED:
return "DF";
case LM_ST_EXCLUSIVE:
return "EX";
}
return "??";
}
static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
{
char *p = buf;
if (flags & LM_FLAG_TRY)
*p++ = 't';
if (flags & LM_FLAG_TRY_1CB)
*p++ = 'T';
if (flags & LM_FLAG_NOEXP)
*p++ = 'e';
if (flags & LM_FLAG_ANY)
*p++ = 'A';
if (flags & LM_FLAG_NODE_SCOPE)
*p++ = 'n';
if (flags & GL_ASYNC)
*p++ = 'a';
if (flags & GL_EXACT)
*p++ = 'E';
if (flags & GL_NOCACHE)
*p++ = 'c';
if (test_bit(HIF_HOLDER, &iflags))
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
if (flags & GL_SKIP)
*p++ = 's';
*p = 0;
return buf;
}
/**
* dump_holder - print information about a glock holder
* @seq: the seq_file struct
* @gh: the glock holder
* @fs_id_buf: pointer to file system id (if requested)
*
*/
static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh,
const char *fs_id_buf)
{
const char *comm = "(none)";
pid_t owner_pid = 0;
char flags_buf[32];
rcu_read_lock();
if (pid_is_meaningful(gh)) {
struct task_struct *gh_owner;
comm = "(ended)";
owner_pid = pid_nr(gh->gh_owner_pid);
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
if (gh_owner)
comm = gh_owner->comm;
}
gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
fs_id_buf, state2str(gh->gh_state),
hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip);
rcu_read_unlock();
}
static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
{
const unsigned long *gflags = &gl->gl_flags;
char *p = buf;
if (test_bit(GLF_LOCK, gflags))
*p++ = 'l';
if (test_bit(GLF_DEMOTE, gflags))
*p++ = 'D';
if (test_bit(GLF_PENDING_DEMOTE, gflags))
*p++ = 'd';
if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
*p++ = 'p';
if (test_bit(GLF_DIRTY, gflags))
*p++ = 'y';
if (test_bit(GLF_LFLUSH, gflags))
*p++ = 'f';
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
*p++ = 'i';
if (test_bit(GLF_REPLY_PENDING, gflags))
*p++ = 'r';
if (test_bit(GLF_INITIAL, gflags))
*p++ = 'I';
if (test_bit(GLF_FROZEN, gflags))
*p++ = 'F';
if (!list_empty(&gl->gl_holders))
*p++ = 'q';
if (test_bit(GLF_LRU, gflags))
*p++ = 'L';
if (gl->gl_object)
*p++ = 'o';
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
if (test_bit(GLF_BLOCKING, gflags))
*p++ = 'b';
if (test_bit(GLF_FREEING, gflags))
*p++ = 'x';
gfs2: fix GL_SKIP node_scope problems Before this patch, when a glock was locked, the very first holder on the queue would unlock the lockref and call the go_instantiate glops function (if one existed), unless GL_SKIP was specified. When we introduced the new node-scope concept, we allowed multiple holders to lock glocks in EX mode and share the lock. But node-scope introduced a new problem: if the first holder has GL_SKIP and the next one does NOT, since it is not the first holder on the queue, the go_instantiate op was not called. Eventually the GL_SKIP holder may call the instantiate sub-function (e.g. gfs2_rgrp_bh_get) but there was still a window of time in which another non-GL_SKIP holder assumes the instantiate function had been called by the first holder. In the case of rgrp glocks, this led to a NULL pointer dereference on the buffer_heads. This patch tries to fix the problem by introducing two new glock flags: GLF_INSTANTIATE_NEEDED, which keeps track of when the instantiate function needs to be called to "fill in" or "read in" the object before it is referenced. GLF_INSTANTIATE_IN_PROG which is used to determine when a process is in the process of reading in the object. Whenever a function needs to reference the object, it checks the GLF_INSTANTIATE_NEEDED flag, and if set, it sets GLF_INSTANTIATE_IN_PROG and calls the glops "go_instantiate" function. As before, the gl_lockref spin_lock is unlocked during the IO operation, which may take a relatively long amount of time to complete. While unlocked, if another process determines go_instantiate is still needed, it sees GLF_INSTANTIATE_IN_PROG is set, and waits for the go_instantiate glop operation to be completed. Once GLF_INSTANTIATE_IN_PROG is cleared, it needs to check GLF_INSTANTIATE_NEEDED again because the other process's go_instantiate operation may not have been successful. Functions that previously called the instantiate sub-functions now call directly into gfs2_instantiate so the new bits are managed properly. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2021-10-06 14:29:18 +00:00
if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
*p++ = 'n';
if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
*p++ = 'N';
if (test_bit(GLF_TRY_TO_EVICT, gflags))
*p++ = 'e';
if (test_bit(GLF_VERIFY_EVICT, gflags))
*p++ = 'E';
*p = 0;
return buf;
}
/**
* gfs2_dump_glock - print information about a glock
* @seq: The seq_file struct
* @gl: the glock
* @fsid: If true, also dump the file system id
*
* The file format is as follows:
* One line per object, capital letters are used to indicate objects
* G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
* other objects are indented by a single space and follow the glock to
* which they are related. Fields are indicated by lower case letters
* followed by a colon and the field value, except for strings which are in
* [] so that its possible to see if they are composed of spaces for
* example. The field's are n = number (id of the object), f = flags,
* t = type, s = state, r = refcount, e = error, p = pid.
*
*/
void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned long long dtime;
const struct gfs2_holder *gh;
char gflags_buf[32];
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
unsigned long nrpages = 0;
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
struct address_space *mapping = gfs2_glock2aspace(gl);
nrpages = mapping->nrpages;
}
memset(fs_id_buf, 0, sizeof(fs_id_buf));
if (fsid && sdp) /* safety precaution */
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
dtime = jiffies - gl->gl_demote_time;
dtime *= 1000000/HZ; /* demote time in uSec */
if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
dtime = 0;
gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d "
"v:%d r:%d m:%ld p:%lu\n",
fs_id_buf, state2str(gl->gl_state),
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
gflags2str(gflags_buf, gl),
state2str(gl->gl_target),
state2str(gl->gl_demote_state), dtime,
atomic_read(&gl->gl_ail_count),
atomic_read(&gl->gl_revokes),
(int)gl->gl_lockref.count, gl->gl_hold_time, nrpages);
list_for_each_entry(gh, &gl->gl_holders, gh_list)
dump_holder(seq, gh, fs_id_buf);
if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
glops->go_dump(seq, gl, fs_id_buf);
}
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock *gl = iter_ptr;
seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
(unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
return 0;
}
static const char *gfs2_gltype[] = {
"type",
"reserved",
"nondisk",
"inode",
"rgrp",
"meta",
"iopen",
"flock",
"plock",
"quota",
"journal",
};
static const char *gfs2_stype[] = {
[GFS2_LKS_SRTT] = "srtt",
[GFS2_LKS_SRTTVAR] = "srttvar",
[GFS2_LKS_SRTTB] = "srttb",
[GFS2_LKS_SRTTVARB] = "srttvarb",
[GFS2_LKS_SIRT] = "sirt",
[GFS2_LKS_SIRTVAR] = "sirtvar",
[GFS2_LKS_DCOUNT] = "dlm",
[GFS2_LKS_QCOUNT] = "queue",
};
#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype))
static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_sbd *sdp = seq->private;
loff_t pos = *(loff_t *)iter_ptr;
unsigned index = pos >> 3;
unsigned subindex = pos & 0x07;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
int i;
if (index == 0 && subindex != 0)
return 0;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
seq_printf(seq, "%-10s %8s:", gfs2_gltype[index],
(index == 0) ? "cpu": gfs2_stype[subindex]);
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
for_each_possible_cpu(i) {
const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
if (index == 0)
seq_printf(seq, " %15u", i);
else
seq_printf(seq, " %15llu", (unsigned long long)lkstats->
lkstats[index - 1].stats[subindex]);
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
}
seq_putc(seq, '\n');
return 0;
}
int __init gfs2_glock_init(void)
{
int i, ret;
ret = rhashtable_init(&gl_hash_table, &ht_parms);
if (ret < 0)
return ret;
glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZABLE, 0);
if (!glock_workqueue) {
rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Abhinav Kumar <quic_abhinavk@quicinc.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Anna Schumaker <anna@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Carlos Llamas <cmllamas@google.com> Cc: Chandan Babu R <chandan.babu@oracle.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Mason <clm@fb.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christian Koenig <christian.koenig@amd.com> Cc: Chuck Lever <cel@kernel.org> Cc: Coly Li <colyli@suse.de> Cc: Dai Ngo <Dai.Ngo@oracle.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Airlie <airlied@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Sterba <dsterba@suse.com> Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Cc: Gao Xiang <hsiangkao@linux.alibaba.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Rui <ray.huang@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Wang <jasowang@redhat.com> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jeffle Xu <jefflexu@linux.alibaba.com> Cc: Joel Fernandes (Google) <joel@joelfernandes.org> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Juergen Gross <jgross@suse.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Kirill Tkhai <tkhai@ya.ru> Cc: Marijn Suijten <marijn.suijten@somainline.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mike Snitzer <snitzer@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nadav Amit <namit@vmware.com> Cc: Neil Brown <neilb@suse.de> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Olga Kornievskaia <kolga@netapp.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rob Clark <robdclark@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Sean Paul <sean@poorly.run> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Song Liu <song@kernel.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Tom Talpey <tom@talpey.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Cc: Yue Hu <huyue2@coolpad.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:07 +00:00
glock_shrinker = shrinker_alloc(0, "gfs2-glock");
if (!glock_shrinker) {
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Abhinav Kumar <quic_abhinavk@quicinc.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Anna Schumaker <anna@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Carlos Llamas <cmllamas@google.com> Cc: Chandan Babu R <chandan.babu@oracle.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Mason <clm@fb.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christian Koenig <christian.koenig@amd.com> Cc: Chuck Lever <cel@kernel.org> Cc: Coly Li <colyli@suse.de> Cc: Dai Ngo <Dai.Ngo@oracle.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Airlie <airlied@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Sterba <dsterba@suse.com> Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Cc: Gao Xiang <hsiangkao@linux.alibaba.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Rui <ray.huang@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Wang <jasowang@redhat.com> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jeffle Xu <jefflexu@linux.alibaba.com> Cc: Joel Fernandes (Google) <joel@joelfernandes.org> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Juergen Gross <jgross@suse.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Kirill Tkhai <tkhai@ya.ru> Cc: Marijn Suijten <marijn.suijten@somainline.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mike Snitzer <snitzer@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nadav Amit <namit@vmware.com> Cc: Neil Brown <neilb@suse.de> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Olga Kornievskaia <kolga@netapp.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rob Clark <robdclark@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Sean Paul <sean@poorly.run> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Song Liu <song@kernel.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Tom Talpey <tom@talpey.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Cc: Yue Hu <huyue2@coolpad.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:07 +00:00
return -ENOMEM;
}
gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Abhinav Kumar <quic_abhinavk@quicinc.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Anna Schumaker <anna@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Carlos Llamas <cmllamas@google.com> Cc: Chandan Babu R <chandan.babu@oracle.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Mason <clm@fb.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christian Koenig <christian.koenig@amd.com> Cc: Chuck Lever <cel@kernel.org> Cc: Coly Li <colyli@suse.de> Cc: Dai Ngo <Dai.Ngo@oracle.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Airlie <airlied@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Sterba <dsterba@suse.com> Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Cc: Gao Xiang <hsiangkao@linux.alibaba.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Rui <ray.huang@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Wang <jasowang@redhat.com> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jeffle Xu <jefflexu@linux.alibaba.com> Cc: Joel Fernandes (Google) <joel@joelfernandes.org> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Juergen Gross <jgross@suse.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Kirill Tkhai <tkhai@ya.ru> Cc: Marijn Suijten <marijn.suijten@somainline.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mike Snitzer <snitzer@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nadav Amit <namit@vmware.com> Cc: Neil Brown <neilb@suse.de> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Olga Kornievskaia <kolga@netapp.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rob Clark <robdclark@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Sean Paul <sean@poorly.run> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Song Liu <song@kernel.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Tom Talpey <tom@talpey.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Cc: Yue Hu <huyue2@coolpad.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:07 +00:00
glock_shrinker->count_objects = gfs2_glock_shrink_count;
glock_shrinker->scan_objects = gfs2_glock_shrink_scan;
shrinker_register(glock_shrinker);
for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(glock_wait_table + i);
return 0;
}
void gfs2_glock_exit(void)
{
gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Abhinav Kumar <quic_abhinavk@quicinc.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Anna Schumaker <anna@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Carlos Llamas <cmllamas@google.com> Cc: Chandan Babu R <chandan.babu@oracle.com> Cc: Chao Yu <chao@kernel.org> Cc: Chris Mason <clm@fb.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christian Koenig <christian.koenig@amd.com> Cc: Chuck Lever <cel@kernel.org> Cc: Coly Li <colyli@suse.de> Cc: Dai Ngo <Dai.Ngo@oracle.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Airlie <airlied@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Sterba <dsterba@suse.com> Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Cc: Gao Xiang <hsiangkao@linux.alibaba.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Rui <ray.huang@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Wang <jasowang@redhat.com> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jeffle Xu <jefflexu@linux.alibaba.com> Cc: Joel Fernandes (Google) <joel@joelfernandes.org> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Juergen Gross <jgross@suse.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Kirill Tkhai <tkhai@ya.ru> Cc: Marijn Suijten <marijn.suijten@somainline.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mike Snitzer <snitzer@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nadav Amit <namit@vmware.com> Cc: Neil Brown <neilb@suse.de> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Olga Kornievskaia <kolga@netapp.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rob Clark <robdclark@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Sean Paul <sean@poorly.run> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Song Liu <song@kernel.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Tom Talpey <tom@talpey.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com> Cc: Yue Hu <huyue2@coolpad.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:07 +00:00
shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
}
static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n)
{
struct gfs2_glock *gl = gi->gl;
if (gl) {
if (n == 0)
return;
gfs2_glock_put_async(gl);
}
for (;;) {
gl = rhashtable_walk_next(&gi->hti);
if (IS_ERR_OR_NULL(gl)) {
if (gl == ERR_PTR(-EAGAIN)) {
n = 1;
continue;
}
gl = NULL;
break;
}
if (gl->gl_name.ln_sbd != gi->sdp)
continue;
if (n <= 1) {
if (!lockref_get_not_dead(&gl->gl_lockref))
continue;
break;
} else {
if (__lockref_is_dead(&gl->gl_lockref))
continue;
n--;
}
}
gi->gl = gl;
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n;
/*
* We can either stay where we are, skip to the next hash table
* entry, or start from the beginning.
*/
if (*pos < gi->last_pos) {
rhashtable_walk_exit(&gi->hti);
rhashtable_walk_enter(&gl_hash_table, &gi->hti);
n = *pos + 1;
} else {
n = *pos - gi->last_pos;
}
rhashtable_walk_start(&gi->hti);
gfs2_glock_iter_next(gi, n);
gi->last_pos = *pos;
return gi->gl;
}
static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
gi->last_pos = *pos;
gfs2_glock_iter_next(gi, 1);
return gi->gl;
}
static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
__releases(RCU)
{
struct gfs2_glock_iter *gi = seq->private;
rhashtable_walk_stop(&gi->hti);
}
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
dump_glock(seq, iter_ptr, false);
return 0;
}
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
{
preempt_disable();
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
if (*pos >= GFS2_NR_SBSTATS)
return NULL;
return pos;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
}
static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
(*pos)++;
if (*pos >= GFS2_NR_SBSTATS)
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
return NULL;
return pos;
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
}
static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
{
preempt_enable();
}
static const struct seq_operations gfs2_glock_seq_ops = {
.start = gfs2_glock_seq_start,
.next = gfs2_glock_seq_next,
.stop = gfs2_glock_seq_stop,
.show = gfs2_glock_seq_show,
};
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
static const struct seq_operations gfs2_glstats_seq_ops = {
.start = gfs2_glock_seq_start,
.next = gfs2_glock_seq_next,
.stop = gfs2_glock_seq_stop,
.show = gfs2_glstats_seq_show,
};
static const struct seq_operations gfs2_sbstats_sops = {
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
.start = gfs2_sbstats_seq_start,
.next = gfs2_sbstats_seq_next,
.stop = gfs2_sbstats_seq_stop,
.show = gfs2_sbstats_seq_show,
};
#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
static int __gfs2_glocks_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
gi->sdp = inode->i_private;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
/*
* Initially, we are "before" the first hash table entry; the
* first call to rhashtable_walk_next gets us the first entry.
*/
gi->last_pos = -1;
gi->gl = NULL;
rhashtable_walk_enter(&gl_hash_table, &gi->hti);
}
return ret;
}
static int gfs2_glocks_open(struct inode *inode, struct file *file)
{
return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
}
static int gfs2_glocks_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
if (gi->gl)
gfs2_glock_put(gi->gl);
rhashtable_walk_exit(&gi->hti);
return seq_release_private(inode, file);
}
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
}
static const struct file_operations gfs2_glocks_fops = {
.owner = THIS_MODULE,
.open = gfs2_glocks_open,
.read = seq_read,
.llseek = seq_lseek,
.release = gfs2_glocks_release,
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
};
static const struct file_operations gfs2_glstats_fops = {
.owner = THIS_MODULE,
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
.open = gfs2_glstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = gfs2_glocks_release,
GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2012-01-20 10:38:36 +00:00
};
struct gfs2_glockfd_iter {
struct super_block *sb;
unsigned int tgid;
struct task_struct *task;
unsigned int fd;
struct file *file;
};
static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i)
{
struct pid_namespace *ns = task_active_pid_ns(current);
struct pid *pid;
if (i->task)
put_task_struct(i->task);
rcu_read_lock();
retry:
i->task = NULL;
pid = find_ge_pid(i->tgid, ns);
if (pid) {
i->tgid = pid_nr_ns(pid, ns);
i->task = pid_task(pid, PIDTYPE_TGID);
if (!i->task) {
i->tgid++;
goto retry;
}
get_task_struct(i->task);
}
rcu_read_unlock();
return i->task;
}
static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
{
if (i->file) {
fput(i->file);
i->file = NULL;
}
rcu_read_lock();
for(;; i->fd++) {
struct inode *inode;
file: convert to SLAB_TYPESAFE_BY_RCU In recent discussions around some performance improvements in the file handling area we discussed switching the file cache to rely on SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based freeing for files completely. This is a pretty sensitive change overall but it might actually be worth doing. The main downside is the subtlety. The other one is that we should really wait for Jann's patch to land that enables KASAN to handle SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this exists. With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times which requires a few changes. So it isn't sufficient anymore to just acquire a reference to the file in question under rcu using atomic_long_inc_not_zero() since the file might have already been recycled and someone else might have bumped the reference. In other words, callers might see reference count bumps from newer users. For this reason it is necessary to verify that the pointer is the same before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file without first aqcuiring a reference on it. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a reference under rcu or they must hold the files_lock of the fdtable. Failing to do either one of this is a bug. Thanks to Jann for pointing out that we need to ensure memory ordering between reallocations and pointer check by ensuring that all subsequent loads have a dependency on the second load in get_file_rcu() and providing a fixup that was folded into this patch. Cc: Jann Horn <jannh@google.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-09-29 06:45:59 +00:00
i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
file: convert to SLAB_TYPESAFE_BY_RCU In recent discussions around some performance improvements in the file handling area we discussed switching the file cache to rely on SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based freeing for files completely. This is a pretty sensitive change overall but it might actually be worth doing. The main downside is the subtlety. The other one is that we should really wait for Jann's patch to land that enables KASAN to handle SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this exists. With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times which requires a few changes. So it isn't sufficient anymore to just acquire a reference to the file in question under rcu using atomic_long_inc_not_zero() since the file might have already been recycled and someone else might have bumped the reference. In other words, callers might see reference count bumps from newer users. For this reason it is necessary to verify that the pointer is the same before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file without first aqcuiring a reference on it. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a reference under rcu or they must hold the files_lock of the fdtable. Failing to do either one of this is a bug. Thanks to Jann for pointing out that we need to ensure memory ordering between reallocations and pointer check by ensuring that all subsequent loads have a dependency on the second load in get_file_rcu() and providing a fixup that was folded into this patch. Cc: Jann Horn <jannh@google.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-09-29 06:45:59 +00:00
inode = file_inode(i->file);
file: convert to SLAB_TYPESAFE_BY_RCU In recent discussions around some performance improvements in the file handling area we discussed switching the file cache to rely on SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based freeing for files completely. This is a pretty sensitive change overall but it might actually be worth doing. The main downside is the subtlety. The other one is that we should really wait for Jann's patch to land that enables KASAN to handle SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this exists. With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times which requires a few changes. So it isn't sufficient anymore to just acquire a reference to the file in question under rcu using atomic_long_inc_not_zero() since the file might have already been recycled and someone else might have bumped the reference. In other words, callers might see reference count bumps from newer users. For this reason it is necessary to verify that the pointer is the same before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file without first aqcuiring a reference on it. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a reference under rcu or they must hold the files_lock of the fdtable. Failing to do either one of this is a bug. Thanks to Jann for pointing out that we need to ensure memory ordering between reallocations and pointer check by ensuring that all subsequent loads have a dependency on the second load in get_file_rcu() and providing a fixup that was folded into this patch. Cc: Jann Horn <jannh@google.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-09-29 06:45:59 +00:00
if (inode->i_sb == i->sb)
break;
file: convert to SLAB_TYPESAFE_BY_RCU In recent discussions around some performance improvements in the file handling area we discussed switching the file cache to rely on SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based freeing for files completely. This is a pretty sensitive change overall but it might actually be worth doing. The main downside is the subtlety. The other one is that we should really wait for Jann's patch to land that enables KASAN to handle SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this exists. With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times which requires a few changes. So it isn't sufficient anymore to just acquire a reference to the file in question under rcu using atomic_long_inc_not_zero() since the file might have already been recycled and someone else might have bumped the reference. In other words, callers might see reference count bumps from newer users. For this reason it is necessary to verify that the pointer is the same before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file without first aqcuiring a reference on it. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a reference under rcu or they must hold the files_lock of the fdtable. Failing to do either one of this is a bug. Thanks to Jann for pointing out that we need to ensure memory ordering between reallocations and pointer check by ensuring that all subsequent loads have a dependency on the second load in get_file_rcu() and providing a fixup that was folded into this patch. Cc: Jann Horn <jannh@google.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-09-29 06:45:59 +00:00
rcu_read_unlock();
fput(i->file);
rcu_read_lock();
}
rcu_read_unlock();
return i->file;
}
static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos)
{
struct gfs2_glockfd_iter *i = seq->private;
if (*pos)
return NULL;
while (gfs2_glockfd_next_task(i)) {
if (gfs2_glockfd_next_file(i))
return i;
i->tgid++;
}
return NULL;
}
static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
struct gfs2_glockfd_iter *i = seq->private;
(*pos)++;
i->fd++;
do {
if (gfs2_glockfd_next_file(i))
return i;
i->tgid++;
} while (gfs2_glockfd_next_task(i));
return NULL;
}
static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glockfd_iter *i = seq->private;
if (i->file)
fput(i->file);
if (i->task)
put_task_struct(i->task);
}
static void gfs2_glockfd_seq_show_flock(struct seq_file *seq,
struct gfs2_glockfd_iter *i)
{
struct gfs2_file *fp = i->file->private_data;
struct gfs2_holder *fl_gh = &fp->f_fl_gh;
struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED };
if (!READ_ONCE(fl_gh->gh_gl))
return;
spin_lock(&i->file->f_lock);
if (gfs2_holder_initialized(fl_gh))
gl_name = fl_gh->gh_gl->gl_name;
spin_unlock(&i->file->f_lock);
if (gl_name.ln_type != LM_TYPE_RESERVED) {
seq_printf(seq, "%d %u %u/%llx\n",
i->tgid, i->fd, gl_name.ln_type,
(unsigned long long)gl_name.ln_number);
}
}
static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glockfd_iter *i = seq->private;
struct inode *inode = file_inode(i->file);
struct gfs2_glock *gl;
inode_lock_shared(inode);
gl = GFS2_I(inode)->i_iopen_gh.gh_gl;
if (gl) {
seq_printf(seq, "%d %u %u/%llx\n",
i->tgid, i->fd, gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number);
}
gfs2_glockfd_seq_show_flock(seq, i);
inode_unlock_shared(inode);
return 0;
}
static const struct seq_operations gfs2_glockfd_seq_ops = {
.start = gfs2_glockfd_seq_start,
.next = gfs2_glockfd_seq_next,
.stop = gfs2_glockfd_seq_stop,
.show = gfs2_glockfd_seq_show,
};
static int gfs2_glockfd_open(struct inode *inode, struct file *file)
{
struct gfs2_glockfd_iter *i;
struct gfs2_sbd *sdp = inode->i_private;
i = __seq_open_private(file, &gfs2_glockfd_seq_ops,
sizeof(struct gfs2_glockfd_iter));
if (!i)
return -ENOMEM;
i->sb = sdp->sd_vfs;
return 0;
}
static const struct file_operations gfs2_glockfd_fops = {
.owner = THIS_MODULE,
.open = gfs2_glockfd_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats);
void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
{
sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glocks_fops);
debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glockfd_fops);
debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glstats_fops);
debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_sbstats_fops);
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
debugfs_remove_recursive(sdp->debugfs_dir);
sdp->debugfs_dir = NULL;
}
void gfs2_register_debugfs(void)
{
gfs2_root = debugfs_create_dir("gfs2", NULL);
}
void gfs2_unregister_debugfs(void)
{
debugfs_remove(gfs2_root);
gfs2_root = NULL;
}