linux-stable/kernel/ksysfs.c
Eric DeVolder a72bbec70d crash: hotplug support for kexec_load()
The hotplug support for kexec_load() requires changes to the userspace
kexec-tools and a little extra help from the kernel.

Given a kdump capture kernel loaded via kexec_load(), and a subsequent
hotplug event, the crash hotplug handler finds the elfcorehdr and rewrites
it to reflect the hotplug change.  That is the desired outcome, however,
at kernel panic time, the purgatory integrity check fails (because the
elfcorehdr changed), and the capture kernel does not boot and no vmcore is
generated.

Therefore, the userspace kexec-tools/kexec must indicate to the kernel
that the elfcorehdr can be modified (because the kexec excluded the
elfcorehdr from the digest, and sized the elfcorehdr memory buffer
appropriately).

To facilitate hotplug support with kexec_load():
 - a new kexec flag KEXEC_UPATE_ELFCOREHDR indicates that it is
   safe for the kernel to modify the kexec_load()'d elfcorehdr
 - the /sys/kernel/crash_elfcorehdr_size node communicates the
   preferred size of the elfcorehdr memory buffer
 - The sysfs crash_hotplug nodes (ie.
   /sys/devices/system/[cpu|memory]/crash_hotplug) dynamically
   take into account kexec_file_load() vs kexec_load() and
   KEXEC_UPDATE_ELFCOREHDR.
   This is critical so that the udev rule processing of crash_hotplug
   is all that is needed to determine if the userspace unload-then-load
   of the kdump image is to be skipped, or not. The proposed udev
   rule change looks like:
   # The kernel updates the crash elfcorehdr for CPU and memory changes
   SUBSYSTEM=="cpu", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"
   SUBSYSTEM=="memory", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"

The table below indicates the behavior of kexec_load()'d kdump image
updates (with the new udev crash_hotplug rule in place):

 Kernel |Kexec
 -------+-----+----
 Old    |Old  |New
        |  a  | a
 -------+-----+----
 New    |  a  | b
 -------+-----+----

where kexec 'old' and 'new' delineate kexec-tools has the needed
modifications for the crash hotplug feature, and kernel 'old' and 'new'
delineate the kernel supports this crash hotplug feature.

Behavior 'a' indicates the unload-then-reload of the entire kdump image. 
For the kexec 'old' column, the unload-then-reload occurs due to the
missing flag KEXEC_UPDATE_ELFCOREHDR.  An 'old' kernel (with 'new' kexec)
does not present the crash_hotplug sysfs node, which leads to the
unload-then-reload of the kdump image.

Behavior 'b' indicates the desired optimized behavior of the kernel
directly modifying the elfcorehdr and avoiding the unload-then-reload of
the kdump image.

If the udev rule is not updated with crash_hotplug node check, then no
matter any combination of kernel or kexec is new or old, the kdump image
continues to be unload-then-reload on hotplug changes.

To fully support crash hotplug feature, there needs to be a rollout of
kernel, kexec-tools and udev rule changes.  However, the order of the
rollout of these pieces does not matter; kexec_load()'d kdump images still
function for hotplug as-is.

Link: https://lkml.kernel.org/r/20230814214446.6659-7-eric.devolder@oracle.com
Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Suggested-by: Hari Bathini <hbathini@linux.ibm.com>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Akhil Raj <lf32.dev@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-24 16:25:14 -07:00

315 lines
7.4 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/compiler.h>
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
#if defined(__LITTLE_ENDIAN)
#define CPU_BYTEORDER_STRING "little"
#elif defined(__BIG_ENDIAN)
#define CPU_BYTEORDER_STRING "big"
#else
#error Unknown byteorder
#endif
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
/* cpu byteorder */
static ssize_t cpu_byteorder_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
}
KERNEL_ATTR_RO(cpu_byteorder);
/* address bits */
static ssize_t address_bits_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
}
KERNEL_ATTR_RO(address_bits);
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
memcpy(uevent_helper, buf, count);
uevent_helper[count] = '\0';
if (count && uevent_helper[count-1] == '\n')
uevent_helper[count-1] = '\0';
return count;
}
KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int ret;
if (prof_on)
return -EEXIST;
/*
* This eventually calls into get_option() which
* has a ton of callers and is not const. It is
* easiest to cast it away here.
*/
profile_setup((char *)buf);
ret = profile_init();
if (ret)
return ret;
ret = create_proc_profile();
if (ret)
return ret;
return count;
}
KERNEL_ATTR_RW(profiling);
#endif
#ifdef CONFIG_KEXEC_CORE
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
ssize_t size = crash_get_memory_size();
if (size < 0)
return size;
return sysfs_emit(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long cnt;
int ret;
if (kstrtoul(buf, 0, &cnt))
return -EINVAL;
ret = crash_shrink_memory(cnt);
return ret < 0 ? ret : count;
}
KERNEL_ATTR_RW(kexec_crash_size);
#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_CRASH_CORE
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
(unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
#ifdef CONFIG_CRASH_HOTPLUG
static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
unsigned int sz = crash_get_elfcorehdr_size();
return sysfs_emit(buf, "%u\n", sz);
}
KERNEL_ATTR_RO(crash_elfcorehdr_size);
#endif
#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
#ifndef CONFIG_TINY_RCU
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (kstrtoint(buf, 0, &rcu_expedited))
return -EINVAL;
return count;
}
KERNEL_ATTR_RW(rcu_expedited);
int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
static ssize_t rcu_normal_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (kstrtoint(buf, 0, &rcu_normal))
return -EINVAL;
return count;
}
KERNEL_ATTR_RW(rcu_normal);
#endif /* #ifndef CONFIG_TINY_RCU */
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
extern const void __start_notes __weak;
extern const void __stop_notes __weak;
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, &__start_notes + off, count);
return count;
}
static struct bin_attribute notes_attr __ro_after_init = {
.attr = {
.name = "notes",
.mode = S_IRUGO,
},
.read = &notes_read,
};
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
&cpu_byteorder_attr.attr,
&address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
#endif
#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
#ifdef CONFIG_CRASH_HOTPLUG
&crash_elfcorehdr_size_attr.attr,
#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
&rcu_normal_attr.attr,
#endif
NULL
};
static const struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
static int __init ksysfs_init(void)
{
int error;
kernel_kobj = kobject_create_and_add("kernel", NULL);
if (!kernel_kobj) {
error = -ENOMEM;
goto exit;
}
error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
if (error)
goto kset_exit;
if (notes_size > 0) {
notes_attr.size = notes_size;
error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
if (error)
goto group_exit;
}
return 0;
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
kobject_put(kernel_kobj);
exit:
return error;
}
core_initcall(ksysfs_init);