linux-stable/fs/ocfs2/slot_map.c

521 lines
12 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* slot_map.c
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
struct ocfs2_slot {
int sl_valid;
unsigned int sl_node_num;
};
struct ocfs2_slot_info {
int si_extended;
int si_slots_per_block;
struct inode *si_inode;
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
struct ocfs2_slot si_slots[] __counted_by(si_num_slots);
};
static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
unsigned int node_num);
static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
int slot_num)
{
BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
si->si_slots[slot_num].sl_valid = 0;
}
static void ocfs2_set_slot(struct ocfs2_slot_info *si,
int slot_num, unsigned int node_num)
{
BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
si->si_slots[slot_num].sl_valid = 1;
si->si_slots[slot_num].sl_node_num = node_num;
}
/* This version is for the extended slot map */
static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
{
int b, i, slotno;
struct ocfs2_slot_map_extended *se;
slotno = 0;
for (b = 0; b < si->si_blocks; b++) {
se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
for (i = 0;
(i < si->si_slots_per_block) &&
(slotno < si->si_num_slots);
i++, slotno++) {
if (se->se_slots[i].es_valid)
ocfs2_set_slot(si, slotno,
le32_to_cpu(se->se_slots[i].es_node_num));
else
ocfs2_invalidate_slot(si, slotno);
}
}
}
/*
* Post the slot information on disk into our slot_info struct.
* Must be protected by osb_lock.
*/
static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
{
int i;
struct ocfs2_slot_map *sm;
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
ocfs2_invalidate_slot(si, i);
else
ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
}
}
static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
/*
* The slot data will have been refreshed when ocfs2_super_lock
* was taken.
*/
if (si->si_extended)
ocfs2_update_slot_info_extended(si);
else
ocfs2_update_slot_info_old(si);
}
int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
int ret;
struct ocfs2_slot_info *si = osb->slot_info;
if (si == NULL)
return 0;
BUG_ON(si->si_blocks == 0);
BUG_ON(si->si_bh == NULL);
trace_ocfs2_refresh_slot_info(si->si_blocks);
/*
* We pass -1 as blocknr because we expect all of si->si_bh to
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
spin_unlock(&osb->osb_lock);
}
return ret;
}
/* post the our slot info stuff into it's destination bh and write it
* out. */
static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
int slot_num,
struct buffer_head **bh)
{
int blkind = slot_num / si->si_slots_per_block;
int slotno = slot_num % si->si_slots_per_block;
struct ocfs2_slot_map_extended *se;
BUG_ON(blkind >= si->si_blocks);
se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
if (si->si_slots[slot_num].sl_valid)
se->se_slots[slotno].es_node_num =
cpu_to_le32(si->si_slots[slot_num].sl_node_num);
*bh = si->si_bh[blkind];
}
static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
int slot_num,
struct buffer_head **bh)
{
int i;
struct ocfs2_slot_map *sm;
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid)
sm->sm_slots[i] =
cpu_to_le16(si->si_slots[i].sl_node_num);
else
sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
}
*bh = si->si_bh[0];
}
static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
struct ocfs2_slot_info *si,
int slot_num)
{
int status;
struct buffer_head *bh;
spin_lock(&osb->osb_lock);
if (si->si_extended)
ocfs2_update_disk_slot_extended(si, slot_num, &bh);
else
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
mlog_errno(status);
return status;
}
/*
* Calculate how many bytes are needed by the slot map. Returns
* an error if the slot map file is too small.
*/
static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
struct inode *inode,
unsigned long long *bytes)
{
unsigned long long bytes_needed;
if (ocfs2_uses_extended_slot_map(osb)) {
bytes_needed = osb->max_slots *
sizeof(struct ocfs2_extended_slot);
} else {
bytes_needed = osb->max_slots * sizeof(__le16);
}
if (bytes_needed > i_size_read(inode)) {
mlog(ML_ERROR,
"Slot map file is too small! (size %llu, needed %llu)\n",
i_size_read(inode), bytes_needed);
return -ENOSPC;
}
*bytes = bytes_needed;
return 0;
}
/* try to find global node in the slot info. Returns -ENOENT
* if nothing is found. */
static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
unsigned int node_num)
{
int i, ret = -ENOENT;
for(i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid &&
(node_num == si->si_slots[i].sl_node_num)) {
ret = i;
break;
}
}
return ret;
}
static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int preferred)
{
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
Revert "ocfs2: mount shared volume without ha stack" This reverts commit 912f655d78c5d4ad05eac287f23a435924df7144. This commit introduced a regression that can cause mount hung. The changes in __ocfs2_find_empty_slot causes that any node with none-zero node number can grab the slot that was already taken by node 0, so node 1 will access the same journal with node 0, when it try to grab journal cluster lock, it will hung because it was already acquired by node 0. It's very easy to reproduce this, in one cluster, mount node 0 first, then node 1, you will see the following call trace from node 1. [13148.735424] INFO: task mount.ocfs2:53045 blocked for more than 122 seconds. [13148.739691] Not tainted 5.15.0-2148.0.4.el8uek.mountracev2.x86_64 #2 [13148.742560] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13148.745846] task:mount.ocfs2 state:D stack: 0 pid:53045 ppid: 53044 flags:0x00004000 [13148.749354] Call Trace: [13148.750718] <TASK> [13148.752019] ? usleep_range+0x90/0x89 [13148.753882] __schedule+0x210/0x567 [13148.755684] schedule+0x44/0xa8 [13148.757270] schedule_timeout+0x106/0x13c [13148.759273] ? __prepare_to_swait+0x53/0x78 [13148.761218] __wait_for_common+0xae/0x163 [13148.763144] __ocfs2_cluster_lock.constprop.0+0x1d6/0x870 [ocfs2] [13148.765780] ? ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.768312] ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.770968] ocfs2_journal_init+0x91/0x340 [ocfs2] [13148.773202] ocfs2_check_volume+0x39/0x461 [ocfs2] [13148.775401] ? iput+0x69/0xba [13148.777047] ocfs2_mount_volume.isra.0.cold+0x40/0x1f5 [ocfs2] [13148.779646] ocfs2_fill_super+0x54b/0x853 [ocfs2] [13148.781756] mount_bdev+0x190/0x1b7 [13148.783443] ? ocfs2_remount+0x440/0x440 [ocfs2] [13148.785634] legacy_get_tree+0x27/0x48 [13148.787466] vfs_get_tree+0x25/0xd0 [13148.789270] do_new_mount+0x18c/0x2d9 [13148.791046] __x64_sys_mount+0x10e/0x142 [13148.792911] do_syscall_64+0x3b/0x89 [13148.794667] entry_SYSCALL_64_after_hwframe+0x170/0x0 [13148.797051] RIP: 0033:0x7f2309f6e26e [13148.798784] RSP: 002b:00007ffdcee7d408 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [13148.801974] RAX: ffffffffffffffda RBX: 00007ffdcee7d4a0 RCX: 00007f2309f6e26e [13148.804815] RDX: 0000559aa762a8ae RSI: 0000559aa939d340 RDI: 0000559aa93a22b0 [13148.807719] RBP: 00007ffdcee7d5b0 R08: 0000559aa93a2290 R09: 00007f230a0b4820 [13148.810659] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffdcee7d420 [13148.813609] R13: 0000000000000000 R14: 0000559aa939f000 R15: 0000000000000000 [13148.816564] </TASK> To fix it, we can just fix __ocfs2_find_empty_slot. But original commit introduced the feature to mount ocfs2 locally even it is cluster based, that is a very dangerous, it can easily cause serious data corruption, there is no way to stop other nodes mounting the fs and corrupting it. Setup ha or other cluster-aware stack is just the cost that we have to take for avoiding corruption, otherwise we have to do it in kernel. Link: https://lkml.kernel.org/r/20220603222801.42488-1-junxiao.bi@oracle.com Fixes: 912f655d78c5("ocfs2: mount shared volume without ha stack") Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <heming.zhao@suse.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-03 22:28:01 +00:00
if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
Revert "ocfs2: mount shared volume without ha stack" This reverts commit 912f655d78c5d4ad05eac287f23a435924df7144. This commit introduced a regression that can cause mount hung. The changes in __ocfs2_find_empty_slot causes that any node with none-zero node number can grab the slot that was already taken by node 0, so node 1 will access the same journal with node 0, when it try to grab journal cluster lock, it will hung because it was already acquired by node 0. It's very easy to reproduce this, in one cluster, mount node 0 first, then node 1, you will see the following call trace from node 1. [13148.735424] INFO: task mount.ocfs2:53045 blocked for more than 122 seconds. [13148.739691] Not tainted 5.15.0-2148.0.4.el8uek.mountracev2.x86_64 #2 [13148.742560] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13148.745846] task:mount.ocfs2 state:D stack: 0 pid:53045 ppid: 53044 flags:0x00004000 [13148.749354] Call Trace: [13148.750718] <TASK> [13148.752019] ? usleep_range+0x90/0x89 [13148.753882] __schedule+0x210/0x567 [13148.755684] schedule+0x44/0xa8 [13148.757270] schedule_timeout+0x106/0x13c [13148.759273] ? __prepare_to_swait+0x53/0x78 [13148.761218] __wait_for_common+0xae/0x163 [13148.763144] __ocfs2_cluster_lock.constprop.0+0x1d6/0x870 [ocfs2] [13148.765780] ? ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.768312] ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.770968] ocfs2_journal_init+0x91/0x340 [ocfs2] [13148.773202] ocfs2_check_volume+0x39/0x461 [ocfs2] [13148.775401] ? iput+0x69/0xba [13148.777047] ocfs2_mount_volume.isra.0.cold+0x40/0x1f5 [ocfs2] [13148.779646] ocfs2_fill_super+0x54b/0x853 [ocfs2] [13148.781756] mount_bdev+0x190/0x1b7 [13148.783443] ? ocfs2_remount+0x440/0x440 [ocfs2] [13148.785634] legacy_get_tree+0x27/0x48 [13148.787466] vfs_get_tree+0x25/0xd0 [13148.789270] do_new_mount+0x18c/0x2d9 [13148.791046] __x64_sys_mount+0x10e/0x142 [13148.792911] do_syscall_64+0x3b/0x89 [13148.794667] entry_SYSCALL_64_after_hwframe+0x170/0x0 [13148.797051] RIP: 0033:0x7f2309f6e26e [13148.798784] RSP: 002b:00007ffdcee7d408 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [13148.801974] RAX: ffffffffffffffda RBX: 00007ffdcee7d4a0 RCX: 00007f2309f6e26e [13148.804815] RDX: 0000559aa762a8ae RSI: 0000559aa939d340 RDI: 0000559aa93a22b0 [13148.807719] RBP: 00007ffdcee7d5b0 R08: 0000559aa93a2290 R09: 00007f230a0b4820 [13148.810659] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffdcee7d420 [13148.813609] R13: 0000000000000000 R14: 0000559aa939f000 R15: 0000000000000000 [13148.816564] </TASK> To fix it, we can just fix __ocfs2_find_empty_slot. But original commit introduced the feature to mount ocfs2 locally even it is cluster based, that is a very dangerous, it can easily cause serious data corruption, there is no way to stop other nodes mounting the fs and corrupting it. Setup ha or other cluster-aware stack is just the cost that we have to take for avoiding corruption, otherwise we have to do it in kernel. Link: https://lkml.kernel.org/r/20220603222801.42488-1-junxiao.bi@oracle.com Fixes: 912f655d78c5("ocfs2: mount shared volume without ha stack") Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <heming.zhao@suse.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-03 22:28:01 +00:00
if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
}
out:
return ret;
}
int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
int slot;
struct ocfs2_slot_info *si = osb->slot_info;
spin_lock(&osb->osb_lock);
slot = __ocfs2_node_num_to_slot(si, node_num);
spin_unlock(&osb->osb_lock);
return slot;
}
int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
unsigned int *node_num)
{
struct ocfs2_slot_info *si = osb->slot_info;
assert_spin_locked(&osb->osb_lock);
BUG_ON(slot_num < 0);
BUG_ON(slot_num >= osb->max_slots);
if (!si->si_slots[slot_num].sl_valid)
return -ENOENT;
*node_num = si->si_slots[slot_num].sl_node_num;
return 0;
}
static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
unsigned int i;
if (si == NULL)
return;
iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
brelse(si->si_bh[i]);
si->si_bh[i] = NULL;
}
}
kfree(si->si_bh);
}
kfree(si);
}
int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
{
struct ocfs2_slot_info *si = osb->slot_info;
if (si == NULL)
return 0;
spin_lock(&osb->osb_lock);
ocfs2_invalidate_slot(si, slot_num);
spin_unlock(&osb->osb_lock);
return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}
static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
int status = 0;
u64 blkno;
unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
if (status)
goto bail;
blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
BUG_ON(blocks > UINT_MAX);
si->si_blocks = blocks;
if (!si->si_blocks)
goto bail;
if (si->si_extended)
si->si_slots_per_block =
(osb->sb->s_blocksize /
sizeof(struct ocfs2_extended_slot));
else
si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
/* The size checks above should ensure this */
BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!si->si_bh) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
for (i = 0; i < si->si_blocks; i++) {
status = ocfs2_extent_map_get_blocks(si->si_inode, i,
&blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
}
si->si_bh[i] = bh;
}
bail:
return status;
}
int ocfs2_init_slot_info(struct ocfs2_super *osb)
{
int status;
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
return status;
}
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!inode) {
status = -EINVAL;
mlog_errno(status);
goto bail;
}
si->si_inode = inode;
status = ocfs2_map_slot_buffers(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0)
__ocfs2_free_slot_info(si);
return status;
}
void ocfs2_free_slot_info(struct ocfs2_super *osb)
{
struct ocfs2_slot_info *si = osb->slot_info;
osb->slot_info = NULL;
__ocfs2_free_slot_info(si);
}
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
int slot;
struct ocfs2_slot_info *si;
si = osb->slot_info;
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
Revert "ocfs2: mount shared volume without ha stack" This reverts commit 912f655d78c5d4ad05eac287f23a435924df7144. This commit introduced a regression that can cause mount hung. The changes in __ocfs2_find_empty_slot causes that any node with none-zero node number can grab the slot that was already taken by node 0, so node 1 will access the same journal with node 0, when it try to grab journal cluster lock, it will hung because it was already acquired by node 0. It's very easy to reproduce this, in one cluster, mount node 0 first, then node 1, you will see the following call trace from node 1. [13148.735424] INFO: task mount.ocfs2:53045 blocked for more than 122 seconds. [13148.739691] Not tainted 5.15.0-2148.0.4.el8uek.mountracev2.x86_64 #2 [13148.742560] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13148.745846] task:mount.ocfs2 state:D stack: 0 pid:53045 ppid: 53044 flags:0x00004000 [13148.749354] Call Trace: [13148.750718] <TASK> [13148.752019] ? usleep_range+0x90/0x89 [13148.753882] __schedule+0x210/0x567 [13148.755684] schedule+0x44/0xa8 [13148.757270] schedule_timeout+0x106/0x13c [13148.759273] ? __prepare_to_swait+0x53/0x78 [13148.761218] __wait_for_common+0xae/0x163 [13148.763144] __ocfs2_cluster_lock.constprop.0+0x1d6/0x870 [ocfs2] [13148.765780] ? ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.768312] ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.770968] ocfs2_journal_init+0x91/0x340 [ocfs2] [13148.773202] ocfs2_check_volume+0x39/0x461 [ocfs2] [13148.775401] ? iput+0x69/0xba [13148.777047] ocfs2_mount_volume.isra.0.cold+0x40/0x1f5 [ocfs2] [13148.779646] ocfs2_fill_super+0x54b/0x853 [ocfs2] [13148.781756] mount_bdev+0x190/0x1b7 [13148.783443] ? ocfs2_remount+0x440/0x440 [ocfs2] [13148.785634] legacy_get_tree+0x27/0x48 [13148.787466] vfs_get_tree+0x25/0xd0 [13148.789270] do_new_mount+0x18c/0x2d9 [13148.791046] __x64_sys_mount+0x10e/0x142 [13148.792911] do_syscall_64+0x3b/0x89 [13148.794667] entry_SYSCALL_64_after_hwframe+0x170/0x0 [13148.797051] RIP: 0033:0x7f2309f6e26e [13148.798784] RSP: 002b:00007ffdcee7d408 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [13148.801974] RAX: ffffffffffffffda RBX: 00007ffdcee7d4a0 RCX: 00007f2309f6e26e [13148.804815] RDX: 0000559aa762a8ae RSI: 0000559aa939d340 RDI: 0000559aa93a22b0 [13148.807719] RBP: 00007ffdcee7d5b0 R08: 0000559aa93a2290 R09: 00007f230a0b4820 [13148.810659] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffdcee7d420 [13148.813609] R13: 0000000000000000 R14: 0000559aa939f000 R15: 0000000000000000 [13148.816564] </TASK> To fix it, we can just fix __ocfs2_find_empty_slot. But original commit introduced the feature to mount ocfs2 locally even it is cluster based, that is a very dangerous, it can easily cause serious data corruption, there is no way to stop other nodes mounting the fs and corrupting it. Setup ha or other cluster-aware stack is just the cost that we have to take for avoiding corruption, otherwise we have to do it in kernel. Link: https://lkml.kernel.org/r/20220603222801.42488-1-junxiao.bi@oracle.com Fixes: 912f655d78c5("ocfs2: mount shared volume without ha stack") Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <heming.zhao@suse.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-03 22:28:01 +00:00
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
if (slot < 0) {
/* if no slot yet, then just take 1st available
* one. */
slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
Revert "ocfs2: mount shared volume without ha stack" This reverts commit 912f655d78c5d4ad05eac287f23a435924df7144. This commit introduced a regression that can cause mount hung. The changes in __ocfs2_find_empty_slot causes that any node with none-zero node number can grab the slot that was already taken by node 0, so node 1 will access the same journal with node 0, when it try to grab journal cluster lock, it will hung because it was already acquired by node 0. It's very easy to reproduce this, in one cluster, mount node 0 first, then node 1, you will see the following call trace from node 1. [13148.735424] INFO: task mount.ocfs2:53045 blocked for more than 122 seconds. [13148.739691] Not tainted 5.15.0-2148.0.4.el8uek.mountracev2.x86_64 #2 [13148.742560] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13148.745846] task:mount.ocfs2 state:D stack: 0 pid:53045 ppid: 53044 flags:0x00004000 [13148.749354] Call Trace: [13148.750718] <TASK> [13148.752019] ? usleep_range+0x90/0x89 [13148.753882] __schedule+0x210/0x567 [13148.755684] schedule+0x44/0xa8 [13148.757270] schedule_timeout+0x106/0x13c [13148.759273] ? __prepare_to_swait+0x53/0x78 [13148.761218] __wait_for_common+0xae/0x163 [13148.763144] __ocfs2_cluster_lock.constprop.0+0x1d6/0x870 [ocfs2] [13148.765780] ? ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.768312] ocfs2_inode_lock_full_nested+0x18d/0x398 [ocfs2] [13148.770968] ocfs2_journal_init+0x91/0x340 [ocfs2] [13148.773202] ocfs2_check_volume+0x39/0x461 [ocfs2] [13148.775401] ? iput+0x69/0xba [13148.777047] ocfs2_mount_volume.isra.0.cold+0x40/0x1f5 [ocfs2] [13148.779646] ocfs2_fill_super+0x54b/0x853 [ocfs2] [13148.781756] mount_bdev+0x190/0x1b7 [13148.783443] ? ocfs2_remount+0x440/0x440 [ocfs2] [13148.785634] legacy_get_tree+0x27/0x48 [13148.787466] vfs_get_tree+0x25/0xd0 [13148.789270] do_new_mount+0x18c/0x2d9 [13148.791046] __x64_sys_mount+0x10e/0x142 [13148.792911] do_syscall_64+0x3b/0x89 [13148.794667] entry_SYSCALL_64_after_hwframe+0x170/0x0 [13148.797051] RIP: 0033:0x7f2309f6e26e [13148.798784] RSP: 002b:00007ffdcee7d408 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [13148.801974] RAX: ffffffffffffffda RBX: 00007ffdcee7d4a0 RCX: 00007f2309f6e26e [13148.804815] RDX: 0000559aa762a8ae RSI: 0000559aa939d340 RDI: 0000559aa93a22b0 [13148.807719] RBP: 00007ffdcee7d5b0 R08: 0000559aa93a2290 R09: 00007f230a0b4820 [13148.810659] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffdcee7d420 [13148.813609] R13: 0000000000000000 R14: 0000559aa939f000 R15: 0000000000000000 [13148.816564] </TASK> To fix it, we can just fix __ocfs2_find_empty_slot. But original commit introduced the feature to mount ocfs2 locally even it is cluster based, that is a very dangerous, it can easily cause serious data corruption, there is no way to stop other nodes mounting the fs and corrupting it. Setup ha or other cluster-aware stack is just the cost that we have to take for avoiding corruption, otherwise we have to do it in kernel. Link: https://lkml.kernel.org/r/20220603222801.42488-1-junxiao.bi@oracle.com Fixes: 912f655d78c5("ocfs2: mount shared volume without ha stack") Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <heming.zhao@suse.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-03 22:28:01 +00:00
spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
}
} else
printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
"allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&osb->osb_lock);
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0) {
mlog_errno(status);
/*
* if write block failed, invalidate slot to avoid overwrite
* slot during dismount in case another node rightly has mounted
*/
spin_lock(&osb->osb_lock);
ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&osb->osb_lock);
}
bail:
return status;
}
void ocfs2_put_slot(struct ocfs2_super *osb)
{
int status, slot_num;
struct ocfs2_slot_info *si = osb->slot_info;
if (!si)
return;
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
slot_num = osb->slot_num;
ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
if (status < 0)
mlog_errno(status);
ocfs2_free_slot_info(osb);
}