diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..d35fd3ced0db 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4038,6 +4038,14 @@ latencies, which will choose a value aligned with the appropriate hardware boundaries. + rcutree.rcu_min_cached_objs= [KNL] + Minimum number of objects which are cached and + maintained per one CPU. Object size is equal + to PAGE_SIZE. The cache allows to reduce the + pressure to page allocator, also it makes the + whole algorithm to behave better in low memory + condition. + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. @@ -4258,6 +4266,20 @@ Set time (jiffies) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. + rcutorture.read_exit= [KNL] + Set the number of read-then-exit kthreads used + to test the interaction of RCU updaters and + task-exit processing. + + rcutorture.read_exit_burst= [KNL] + The number of times in a given read-then-exit + episode that a set of read-then-exit kthreads + is spawned. + + rcutorture.read_exit_delay= [KNL] + The delay, in seconds, between successive + read-then-exit testing episodes. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode @@ -4407,6 +4429,45 @@ reboot_cpu is s[mp]#### with #### being the processor to be used for rebooting. + refscale.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + refscale.loops= [KNL] + Set the number of loops over the synchronization + primitive under test. Increasing this number + reduces noise due to loop start/end overhead, + but the default has already reduced the per-pass + noise to a handful of picoseconds on ca. 2020 + x86 laptops. + + refscale.nreaders= [KNL] + Set number of readers. The default value of -1 + selects N, where N is roughly 75% of the number + of CPUs. A value of zero is an interesting choice. + + refscale.nruns= [KNL] + Set number of runs, each of which is dumped onto + the console log. + + refscale.readdelay= [KNL] + Set the read-side critical-section duration, + measured in microseconds. + + refscale.scale_type= [KNL] + Specify the read-protection implementation to test. + + refscale.shutdown= [KNL] + Shut down the system at the end of the performance + test. This defaults to 1 (shut it down) when + rcuperf is built into the kernel and to 0 (leave + it running) when rcuperf is built as a module. + + refscale.verbose= [KNL] + Enable additional printk() statements. + relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/admin-guide/cgroup-v1/cpusets.rst. @@ -5082,6 +5143,13 @@ Prevent the CPU-hotplug component of torturing until after init has spawned. + torture.ftrace_dump_at_shutdown= [KNL] + Dump the ftrace buffer at torture-test shutdown, + even if there were no errors. This can be a + very costly operation when many torture tests + are running concurrently, especially on systems + with rotating-rust storage. 
+ tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 68c96057ad2d..704239546093 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4515,6 +4515,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(tree, page, mask); diff --git a/include/linux/rculist.h b/include/linux/rculist.h index df587d181844..7eed65b5f713 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -512,7 +512,7 @@ static inline void hlist_replace_rcu(struct hlist_node *old, * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and - [@right ][node2 ... ] + * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 659cbfa7581a..d15d46db61f7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /* * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kfree_rcu()? + * structure can be handled by kvfree_rcu()? */ -#define __is_kfree_rcu_offset(offset) ((offset) < 4096) +#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ -#define __kfree_rcu(head, offset) \ +#define __kvfree_rcu(head, offset) \ do { \ - BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ + kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** @@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will - * be generated in __kfree_rcu(). If this error is triggered, you can + * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * @@ -872,7 +872,46 @@ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ - __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ + __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ +} while (0) + +/** + * kvfree_rcu() - kvfree an object after a grace period. + * + * This macro consists of one or two arguments and it is + * based on whether an object is head-less or not. If it + * has a head then a semantic stays the same as it used + * to be before: + * + * kvfree_rcu(ptr, rhf); + * + * where @ptr is a pointer to kvfree(), @rhf is the name + * of the rcu_head structure within the type of @ptr. + * + * When it comes to head-less variant, only one argument + * is passed and that is just a pointer which has to be + * freed after a grace period. Therefore the semantic is + * + * kvfree_rcu(ptr); + * + * where @ptr is a pointer to kvfree(). + * + * Please note, head-less way of freeing is permitted to + * use from a context that has to follow might_sleep() + * annotation. Otherwise, please switch and embed the + * rcu_head structure within the type of @ptr. + */ +#define kvfree_rcu(...) 
KVFREE_GET_MACRO(__VA_ARGS__, \ + kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) + +#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME +#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) +#define kvfree_rcu_arg_1(ptr) \ +do { \ + typeof(ptr) ___p = (ptr); \ + \ + if (___p) \ + kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ } while (0) /* diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 4c25a41f8b27..d9015aac78c6 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section * - * When synchronize_rcu_trace() is invoked by one task, then that task - * is guaranteed to block until all other tasks exit their read-side + * When synchronize_rcu_tasks_trace() is invoked by one task, then that + * task is guaranteed to block until all other tasks exit their read-side * critical sections. Similarly, if call_rcu_trace() is invoked on one * task while other tasks are within RCU read-side critical sections, * invocation of the corresponding RCU callback is deferred until after diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 8512caeb7682..5cc9637cac16 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +/* + * Add one more declaration of kvfree() here. It is + * not so straight forward to just include + * where it is defined due to getting many compile + * errors caused by that include. + */ +extern void kvfree(const void *addr); + +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - call_rcu(head, func); + if (head) { + call_rcu(head, func); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree((void *) func); } void rcu_qs(void); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d5cc9d675987..d2f4064ebd1d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu) } void synchronize_rcu_expedited(void); -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/include/linux/torture.h b/include/linux/torture.h index 629b66e6c161..7f65bd1dd307 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -55,6 +55,11 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM_PERCPU(name) \ DEFINE_PER_CPU(struct torture_random_state, name) unsigned long torture_random(struct torture_random_state *trsp); +static inline void torture_random_init(struct torture_random_state *trsp) +{ + trsp->trs_state = 0; + trsp->trs_count = 0; +} /* Task shuffler, which causes CPUs to occasionally go idle. */ void torture_shuffle_task_register(struct task_struct *tp); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f9a7811148e2..ced71237b7e4 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -435,11 +435,12 @@ TRACE_EVENT_RCU(rcu_fqs, #endif /* #if defined(CONFIG_TREE_RCU) */ /* - * Tracepoint for dyntick-idle entry/exit events. 
These take a string - * as argument: "Start" for entering dyntick-idle mode, "Startirq" for - * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it - * to irq/NMI, "--=" for events moving towards idle, and "++=" for events - * moving away from idle. + * Tracepoint for dyntick-idle entry/exit events. These take 2 strings + * as argument: + * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not + * being in dyntick-idle mode. + * context: "USER" or "IDLE" or "IRQ". + * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context. * * These events also take a pair of numbers, which indicate the nesting * depth before and after the event of interest, and a third number that is @@ -506,13 +507,13 @@ TRACE_EVENT_RCU(rcu_callback, /* * Tracepoint for the registration of a single RCU callback of the special - * kfree() form. The first argument is the RCU type, the second argument + * kvfree() form. The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, * the fourth argument is the number of lazy callbacks queued, and the * fifth argument is the total number of callbacks queued. */ -TRACE_EVENT_RCU(rcu_kfree_callback, +TRACE_EVENT_RCU(rcu_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen), @@ -596,12 +597,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback, /* * Tracepoint for the invocation of a single RCU callback of the special - * kfree() form. The first argument is the RCU flavor, the second + * kvfree() form. The first argument is the RCU flavor, the second * argument is a pointer to the RCU callback, and the third argument * is the offset of the callback within the enclosing RCU-protected * data structure. */ -TRACE_EVENT_RCU(rcu_invoke_kfree_callback, +TRACE_EVENT_RCU(rcu_invoke_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..0a7549d159ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..8ff6f50e06a0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... 
*/ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +904,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +935,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 452feae8de20..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..d906ca987936 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney "); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. 
However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. */ static int rcu_perf_reader(void *arg) @@ -576,11 +583,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -693,11 +697,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8205295fc33e..d0d265304d14 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -109,6 +109,10 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -146,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -177,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -1166,6 +1172,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. 
*/ if (expediting > 0) expediting = -expediting; @@ -1370,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); @@ -1539,10 +1547,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1634,7 +1643,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1643,7 +1653,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2175,7 +2186,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; @@ -2338,6 +2349,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. 
+ tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. + wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2359,6 +2463,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2370,7 +2475,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { @@ -2680,6 +2784,9 @@ rcu_torture_init(void) if (firsterr) goto unwind; firsterr = rcu_torture_barrier_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_read_exit_init(); if (firsterr) goto unwind; if (object_debug) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..d9291f883b54 --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 
10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(long, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_scale_ops { + void (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + const char *name; +}; + +static struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. 
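+// Neither of these flavors has explicit read-side markers: an RCU
+// Tasks read-side critical section is delimited by voluntary context
+// switches, and an RCU Tasks Rude one by preemption being enabled, so
+// the functions below measure only the bare loop (and optional delay)
+// overhead. This flavor is selected with refscale.scale_type=rcu-tasks.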
+static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +// Definitions for RCU Tasks Trace ref scale testing. +static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. 
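+// Each reader follows the same sequence for every experiment: wait for
+// the start signal from main_func(), do a cache-warming pass of
+// rcu_scale_one_reader(), time a single pass with interrupts disabled,
+// record that duration, and then keep issuing maintain-load passes
+// until all other readers have finished, which limits skew across
+// readers.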
+static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char *buf; + u64 sum = 0; + + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + SCALEOUT("%s\n", buf); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. 
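+// The figure of merit stored for each experiment is the average time
+// per reader per loop iteration:
+//
+//   result_avg[exp] = (1000 * sum of per-reader durations in ns) /
+//                     (nreaders * loops)
+//
+// that is, thousandths of a nanosecond per loop, which the end-of-test
+// summary later divides back down and prints as nanoseconds with three
+// decimal places.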
+static int main_func(void *arg) +{ + bool errexit = false; + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + errexit = true; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. + atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (errexit) + break; + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + + // Print the average of all experiments + SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + if (errexit) + break; + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + } + + if (!errexit) + SCALEOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. 
+ ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d3ef700fb0e..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ static bool srcu_might_be_idle(struct srcu_struct *ssp) @@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. 
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..835e2df8590a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644); #define RTGS_WAIT_READERS 9 #define RTGS_INVOKE_CBS 10 #define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INIT", "RTGS_WAIT_WAIT_CBS", @@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INVOKE_CBS", "RTGS_WAIT_CBS", }; +#endif /* #ifndef CONFIG_TINY_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) rtp->gp_jiffies = jiffies; } +#ifndef CONFIG_TINY_RCU /* Return state name. */ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) { @@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) return "???"; return rcu_tasks_gp_state_names[j]; } +#endif /* #ifndef CONFIG_TINY_RCU */ // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, @@ -205,7 +209,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_interruptible(HZ/10); + schedule_timeout_idle(HZ/10); } continue; } @@ -227,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); + schedule_timeout_idle(HZ/10); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } +#endif /* #ifndef CONFIG_TINY_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -336,7 +342,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_interruptible(HZ/fract); + schedule_timeout_idle(HZ/fract); if (fract > 1) fract--; @@ -402,7 +408,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(struct list_head *hop) +static void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. 
This @@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RUDE_RCU */ static void show_rcu_tasks_rude_gp_kthread(void) {} @@ -727,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -atomic_t trc_n_readers_need_end; // Number of waited-for readers. -DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -835,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { - WARN_ON_ONCE(ofl & !is_idle_task(t)); + WARN_ON_ONCE(ofl && !is_idle_task(t)); // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -1118,11 +1128,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). 
* * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles @@ -1164,6 +1173,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1174,18 +1184,21 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); show_rcu_tasks_trace_gp_kthread(); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu.h" @@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ac7198ed3197 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -175,6 +177,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -954,7 +965,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know @@ -990,8 +1000,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. 
- if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() @@ -1638,7 +1651,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1674,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2443,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2726,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -2894,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -2957,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 + +/** + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @nr_records: Number of active pointers in the array + * @next: Next bulk object in the block chain + * @records: Array of the kvfree_rcu() pointers + */ +struct kvfree_rcu_bulk_data { + unsigned long nr_records; + struct kvfree_rcu_bulk_data *next; + void *records[]; +}; /* * This macro defines how many entries the "records" array * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. + * kvfree_rcu_bulk_data structure becomes exactly one page. 
*/ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) - -/** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers - * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers - * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set - */ -struct kfree_rcu_bulk_data { - unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; - struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; -}; +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3012,28 +3026,84 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. 
+ */ + struct llist_head bkvcache; + int nr_bkv_objs; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3041,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head) static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); + + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } + + // Channel 3. head = krwp->head_free; krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). 
+ for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) - free_page((unsigned long) bhead); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - cond_resched_tasks_rcu_qs(); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); + + cond_resched_tasks_rcu_qs(); + } } /* @@ -3082,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3105,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; - int i; + bool repeat = false; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3114,21 +3210,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3137,17 +3237,21 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + // Repeat if any "free" corresponding channel is still busy. 
+ if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -3157,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; - bnode = (struct kfree_rcu_bulk_data *) + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3212,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. 
Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; + void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - spin_lock(&krcp->lock); + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + + krcp = krc_this_cpu_lock(&flags); // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3266,10 +3397,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3282,11 +3419,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. 
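As an illustrative aside: for the two-argument form, the "callback" stored in the rcu_head is really the offset of that rcu_head inside the enclosing object, so the code above can recover the object's base address with a single subtraction (ptr = (void *)head - (unsigned long)func). A minimal user-space sketch of that arithmetic follows; my_obj and its fields are hypothetical, and the kernel performs the subtraction directly on void *, a GCC extension.

/* Hypothetical illustration of recovering an object from its embedded head. */
#include <stddef.h>
#include <stdio.h>

struct my_obj {
	int payload[8];
	struct { void *next; void *func; } rh;	/* stand-in for struct rcu_head */
};

int main(void)
{
	struct my_obj obj;
	void *head = &obj.rh;			/* what the grace-period machinery sees */
	unsigned long off = offsetof(struct my_obj, rh); /* encoded in place of a callback */

	/* Same idea as kvfree_call_rcu(): walk back from the head to the object. */
	void *ptr = (char *)head - off;

	printf("recovered %p, object at %p\n", ptr, (void *)&obj);
	return 0;
}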
+ */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) @@ -3315,11 +3461,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3328,7 +3474,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { @@ -3346,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3842,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +4002,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4249,13 +4392,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kvfree_rcu_bulk_data *bnode; - spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. 
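Stepping back to the kfree_rcu_batch_init() hunk above, which prefills each CPU's cache with rcu_min_cached_objs pages at boot: as an illustrative aside, here is a minimal user-space sketch of a bounded prefill of that kind. All names are hypothetical (malloc() stands in for __get_free_page(), CACHE_LIMIT for rcu_min_cached_objs), and the real cache is an llist guarded by the per-CPU lock.

/* Hypothetical sketch of a small, bounded page cache filled up front. */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_LIMIT 5			/* stand-in for rcu_min_cached_objs */

struct cache {
	void *pages[CACHE_LIMIT];
	int nr;
};

static int cache_put(struct cache *c, void *page)
{
	if (c->nr >= CACHE_LIMIT)
		return 0;		/* over the limit: caller frees the page */
	c->pages[c->nr++] = page;
	return 1;
}

static void *cache_get(struct cache *c)
{
	return c->nr ? c->pages[--c->nr] : NULL;
}

int main(void)
{
	struct cache c = { .nr = 0 };
	void *page;
	int i;

	/* Boot-time prefill, in the spirit of kfree_rcu_batch_init(). */
	for (i = 0; i < CACHE_LIMIT; i++) {
		page = malloc(4096);		/* stand-in for __get_free_page() */
		if (!page || !cache_put(&c, page)) {
			free(page);
			fprintf(stderr, "prefill failed at %d\n", i);
			break;
		}
	}
	printf("cached %d page(s)\n", c.nr);

	while ((page = cache_get(&c)))		/* drain on teardown */
		free(page);
	return 0;
}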
*/ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -73,9 +73,9 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -301,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -346,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72952edad1e4..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -403,7 +403,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 352223664ebd..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } @@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. */ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b04256cd7e12..b5d3b4794db4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr { */ static bool check_slow_task(struct task_struct *t, void *arg) { - struct rcu_node *rnp; struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) return false; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; - rnp = t->rcu_blocked_node; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); return true; } @@ -649,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +689,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..2de49b5d8dd2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -207,7 +208,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* @@ -279,6 +280,7 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); @@ -390,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. 
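As an illustrative aside: the __wait_rcu_gp() hunk above now initializes and queues an rcu_head only for the first occurrence of each call_rcu flavor, skipping later duplicates by scanning the preceding array entries. A stand-alone sketch of that duplicate-skipping pattern, with hypothetical names:

/* Hypothetical illustration of queueing one waiter per distinct flavor. */
#include <stdio.h>

typedef void (*flavor_fn_t)(void);

static void flavor_a(void) { }
static void flavor_b(void) { }

int main(void)
{
	/* The caller may pass the same flavor more than once. */
	flavor_fn_t flavors[] = { flavor_a, flavor_b, flavor_a };
	int n = 3, i, j;

	for (i = 0; i < n; i++) {
		for (j = 0; j < i; j++)
			if (flavors[j] == flavors[i])
				break;
		if (j == i)			/* first occurrence only */
			printf("init and queue entry %d\n", i);
		/* duplicates fall through untouched, so each flavor is waited on once */
	}
	return 0;
}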
*/ @@ -407,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney "); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. 
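As an illustrative aside: the tick_nohz_dep_set_task() change above uses the return value of atomic_fetch_or() so that the tick is kicked only when the dependency mask goes from empty to non-empty; later setters see a non-zero old mask and skip the kick. A minimal user-space sketch of that pattern using C11 atomics, with hypothetical names:

/* Hypothetical illustration of kicking only on the first dependency bit. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint dep_mask;

static void kick(void)
{
	printf("kick the tick\n");
}

static void dep_set(unsigned int bit)
{
	/* fetch_or returns the old mask: only the setter that finds it empty kicks */
	if (!atomic_fetch_or(&dep_mask, 1u << bit))
		kick();
}

int main(void)
{
	dep_set(0);	/* mask was empty: kicks */
	dep_set(1);	/* mask already non-empty: no kick */
	return 0;
}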
*/ return 0; } diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index ddc9685702b1..5cf2fe9aab9e 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #define __param(type, name, init, msg) \ static type name = init; \ @@ -35,14 +37,18 @@ __param(int, test_loop_count, 1000000, __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" - "\t\tid: 1, name: fix_size_alloc_test\n" - "\t\tid: 2, name: full_fit_alloc_test\n" - "\t\tid: 4, name: long_busy_list_alloc_test\n" - "\t\tid: 8, name: random_size_alloc_test\n" - "\t\tid: 16, name: fix_align_alloc_test\n" - "\t\tid: 32, name: random_size_align_alloc_test\n" - "\t\tid: 64, name: align_shift_alloc_test\n" - "\t\tid: 128, name: pcpu_alloc_test\n" + "\t\tid: 1, name: fix_size_alloc_test\n" + "\t\tid: 2, name: full_fit_alloc_test\n" + "\t\tid: 4, name: long_busy_list_alloc_test\n" + "\t\tid: 8, name: random_size_alloc_test\n" + "\t\tid: 16, name: fix_align_alloc_test\n" + "\t\tid: 32, name: random_size_align_alloc_test\n" + "\t\tid: 64, name: align_shift_alloc_test\n" + "\t\tid: 128, name: pcpu_alloc_test\n" + "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n" + "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" + "\t\tid: 1024, name: kvfree_rcu_1_arg_slab_test\n" + "\t\tid: 2048, name: kvfree_rcu_2_arg_slab_test\n" /* Add a new test case description here. */ ); @@ -316,6 +322,83 @@ pcpu_alloc_test(void) return rv; } +struct test_kvfree_rcu { + struct rcu_head rcu; + unsigned char array[20]; +}; + +static int +kvfree_rcu_1_arg_vmalloc_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = vmalloc(1 * PAGE_SIZE); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p); + } + + return 0; +} + +static int +kvfree_rcu_2_arg_vmalloc_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = vmalloc(1 * PAGE_SIZE); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p, rcu); + } + + return 0; +} + +static int +kvfree_rcu_1_arg_slab_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p); + } + + return 0; +} + +static int +kvfree_rcu_2_arg_slab_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p, rcu); + } + + return 0; +} + struct test_case_desc { const char *test_name; int (*test_func)(void); @@ -330,6 +413,10 @@ static struct test_case_desc test_case_array[] = { { "random_size_align_alloc_test", random_size_align_alloc_test }, { "align_shift_alloc_test", align_shift_alloc_test }, { "pcpu_alloc_test", pcpu_alloc_test }, + { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test }, + { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test }, + { "kvfree_rcu_1_arg_slab_test", kvfree_rcu_1_arg_slab_test }, + { "kvfree_rcu_2_arg_slab_test", kvfree_rcu_2_arg_slab_test }, /* Add a new test case here. */ }; diff --git a/mm/list_lru.c b/mm/list_lru.c index 9222910ab1cb..e825804b3928 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; /* * This is called when shrinker has already been unregistered, - * and nobody can use it. 
So, there is no need to use kvfree_rcu(). + * and nobody can use it. So, there is no need to use kvfree_rcu_local(). */ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); kvfree(memcg_lrus); } -static void kvfree_rcu(struct rcu_head *head) +static void kvfree_rcu_local(struct rcu_head *head) { struct list_lru_memcg *mlru; @@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - call_rcu(&old->rcu, kvfree_rcu); + call_rcu(&old->rcu, kvfree_rcu_local); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 59a4682ebf3f..972f839c6ec8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3159,6 +3159,7 @@ void exit_mmap(struct mm_struct *mm) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); + cond_resched(); } vm_unacct_memory(nr_accounted); } diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index 93e80a42249a..d6e5ce084b1c 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -32,11 +32,11 @@ if test -z "$TORTURE_TRUST_MAKE" then make clean > $resdir/Make.clean 2>&1 fi -make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 +make $TORTURE_KMAKE_ARG $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 mv .config .config.sav sh $T/upd.sh < .config.sav > .config cp .config .config.new -yes '' | make oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err +yes '' | make $TORTURE_KMAKE_ARG oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err # verify new config matches specification. configcheck.sh .config $c diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh new file mode 100755 index 000000000000..0e4c0b2eb7f0 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Scan standard input for error messages, dumping any found to standard +# output. +# +# Usage: console-badness.sh +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. McKenney + +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | +grep -v 'ODEBUG: ' | +grep -v 'This means that this is a DEBUG kernel and it is' | +grep -v 'Warning: unable to open an initial console' diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 12810229fddc..51f3464b96d3 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -215,9 +215,6 @@ identify_qemu_args () { then echo -device spapr-vlan,netdev=net0,mac=$TORTURE_QEMU_MAC echo -netdev bridge,br=br0,id=net0 - elif test -n "$TORTURE_QEMU_INTERACTIVE" - then - echo -net nic -net user fi ;; esac @@ -234,7 +231,7 @@ identify_qemu_args () { # Returns the number of virtual CPUs available to the aggregate of the # guest OSes. 
identify_qemu_vcpus () { - lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' + lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ ]*//g' } # print_bug @@ -275,3 +272,21 @@ specify_qemu_cpus () { esac fi } + +# specify_qemu_net qemu-args +# +# Appends a string containing "-net none" to qemu-args, unless the incoming +# qemu-args already contains "-smp" or unless the TORTURE_QEMU_INTERACTIVE +# environment variable is set, in which case the string that is be added is +# instead "-net nic -net user". +specify_qemu_net () { + if echo $1 | grep -q -e -net + then + echo $1 + elif test -n "$TORTURE_QEMU_INTERACTIVE" + then + echo $1 -net nic -net user + else + echo $1 -net none + fi +} diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 30cb5b27d32e..188b864bc4bf 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -46,6 +46,12 @@ do exit 0; fi + # Check for stop request. + if test -f "$TORTURE_STOPFILE" + then + exit 1; + fi + # Set affinity to randomly selected online CPU if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 | sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'` diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 18d6518504ee..115e1822b26f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -9,6 +9,12 @@ # # Authors: Paul E. McKenney +if test -f "$TORTURE_STOPFILE" +then + echo "kvm-build.sh early exit due to run STOP request" + exit 1 +fi + config_template=${1} if test -z "$config_template" -o ! -f "$config_template" -o ! -r "$config_template" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh new file mode 100755 index 000000000000..6e65c134e5f1 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -0,0 +1,108 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Run a group of kvm.sh tests on the specified commits. This currently +# unconditionally does three-minute runs on each scenario in CFLIST, +# taking advantage of all available CPUs and trusting the "make" utility. +# In the short term, adjustments can be made by editing this script and +# CFLIST. If some adjustments appear to have ongoing value, this script +# might grow some command-line arguments. +# +# Usage: kvm-check-branches.sh commit1 commit2..commit3 commit4 ... +# +# This script considers its arguments one at a time. If more elaborate +# specification of commits is needed, please use "git rev-list" to +# produce something that this simple script can understand. The reason +# for retaining the simplicity is that it allows the user to more easily +# see which commit came from which branch. +# +# This script creates a yyyy.mm.dd-hh.mm.ss-group entry in the "res" +# directory. The calls to kvm.sh create the usual entries, but this script +# moves them under the yyyy.mm.dd-hh.mm.ss-group entry, each in its own +# directory numbered in run order, that is, "0001", "0002", and so on. +# For successful runs, the large build artifacts are removed. Doing this +# reduces the disk space required by about two orders of magnitude for +# successful runs. +# +# Copyright (C) Facebook, 2020 +# +# Authors: Paul E. McKenney + +if ! git status > /dev/null 2>&1 +then + echo '!!!' This script needs to run in a git archive. 1>&2 + echo '!!!' 
Giving up. 1>&2 + exit 1 +fi + +# Remember where we started so that we can get back and the end. +curcommit="`git status | head -1 | awk '{ print $NF }'`" + +nfail=0 +ntry=0 +resdir="tools/testing/selftests/rcutorture/res" +ds="`date +%Y.%m.%d-%H.%M.%S`-group" +if ! test -e $resdir +then + mkdir $resdir || : +fi +mkdir $resdir/$ds +echo Results directory: $resdir/$ds + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh +cpus="`identify_qemu_vcpus`" +echo Using up to $cpus CPUs. + +# Each pass through this loop does one command-line argument. +for gitbr in $@ +do + echo ' --- git branch ' $gitbr + + # Each pass through this loop tests one commit. + for i in `git rev-list "$gitbr"` + do + ntry=`expr $ntry + 1` + idir=`awk -v ntry="$ntry" 'END { printf "%04d", ntry; }' < /dev/null` + echo ' --- commit ' $i from branch $gitbr + date + mkdir $resdir/$ds/$idir + echo $gitbr > $resdir/$ds/$idir/gitbr + echo $i >> $resdir/$ds/$idir/gitbr + + # Test the specified commit. + git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 + echo git checkout return code: $? "(Commit $ntry: $i)" + kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + ret=$? + echo kvm.sh return code $ret for commit $i from branch $gitbr + + # Move the build products to their resting place. + runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`" + mv $runresdir $resdir/$ds/$idir + rrd="`echo $runresdir | sed -e 's,^.*/,,'`" + echo Run results: $resdir/$ds/$idir/$rrd + if test "$ret" -ne 0 + then + # Failure, so leave all evidence intact. + nfail=`expr $nfail + 1` + else + # Success, so remove large files to save about 1GB. + ( cd $resdir/$ds/$idir/$rrd; rm -f */vmlinux */bzImage */System.map */Module.symvers ) + fi + done +done +date + +# Go back to the original commit. +git checkout "$curcommit" + +if test $nfail -ne 0 +then + echo '!!! ' $nfail failures in $ntry 'runs!!!' + exit 1 +else + echo No failures in $ntry runs. + exit 0 +fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh new file mode 100755 index 000000000000..35a463dddffe --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Analyze a given results directory for refscale performance measurements. +# +# Usage: kvm-recheck-refscale.sh resdir +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d "$i" -a -r "$i" +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. functions.sh + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' | +awk -v configfile="$configfile" ' +/^[ ]*Runs Time\(ns\) *$/ { + if (dataphase + 0 == 0) { + dataphase = 1; + # print configfile, $0; + } + next; +} + +/[^ ]*[0-9][0-9]* [0-9][0-9]*\.[0-9][0-9]*$/ { + if (dataphase == 1) { + # print $0; + readertimes[++n] = $2; + sum += $2; + } + next; +} + +{ + if (dataphase == 1) + dataphase == 2; + next; +} + +END { + print configfile " results:"; + newNR = asort(readertimes); + if (newNR <= 0) { + print "No refscale records found???" 
+ exit; + } + medianidx = int(newNR / 2); + if (newNR == medianidx * 2) + medianvalue = (readertimes[medianidx - 1] + readertimes[medianidx]) / 2; + else + medianvalue = readertimes[medianidx]; + points = "Points:"; + for (i = 1; i <= newNR; i++) + points = points " " readertimes[i]; + print points; + print "Average reader duration: " sum / newNR " nanoseconds"; + print "Minimum reader duration: " readertimes[1]; + print "Median reader duration: " medianvalue; + print "Maximum reader duration: " readertimes[newNR]; + print "Computed from refscale printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 736f04749b90..840a4679a0d7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -31,6 +31,7 @@ do head -1 $resdir/log fi TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" + configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137 @@ -43,7 +44,8 @@ do then echo QEMU killed fi - configcheck.sh $i/.config $i/ConfigFragment + configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1 + cat $T if test -r $i/Make.oldconfig.err then cat $i/Make.oldconfig.err @@ -55,15 +57,15 @@ do cat $i/Warnings fi else - if test -f "$i/qemu-cmd" - then - print_bug qemu failed - echo " $i" - elif test -f "$i/buildonly" + if test -f "$i/buildonly" then echo Build-only run, no boot/test configcheck.sh $i/.config $i/ConfigFragment parse-build.sh $i/Make.out $configfile + elif test -f "$i/qemu-cmd" + then + print_bug qemu failed + echo " $i" else print_bug Build failed echo " $i" @@ -72,7 +74,11 @@ do done if test -f "$rd/kcsan.sum" then - if test -s "$rd/kcsan.sum" + if grep -q CONFIG_KCSAN=y $T + then + echo "Compiler or architecture does not support KCSAN!" + echo Did you forget to switch your compiler with '--kmake-arg CC='? + elif test -s "$rd/kcsan.sum" then echo KCSAN summary in $rd/kcsan.sum else diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 6ff611c630d1..e07779a62634 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -124,7 +124,6 @@ seconds=$4 qemu_args=$5 boot_args=$6 -cd $KVM kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` if test -z "$TORTURE_BUILDONLY" then @@ -141,6 +140,7 @@ then cpu_count=$TORTURE_ALLOTED_CPUS fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" +qemu_args="`specify_qemu_net "$qemu_args"`" # Generate architecture-specific and interaction-specific qemu arguments qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" @@ -152,6 +152,7 @@ qemu_append="`identify_qemu_append "$QEMU"`" boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" +echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then @@ -159,9 +160,16 @@ then touch $resdir/buildonly exit 0 fi + +# Decorate qemu-cmd with redirection, backgrounding, and PID capture +sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'echo $! 
> $resdir/qemu_pid' >> $T/qemu-cmd + +# In case qemu refuses to run... echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & + +# Attempt to run qemu +( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file if test -s "$resdir/qemu_pid" @@ -181,7 +189,7 @@ do kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then - if test $kruntime -ge $seconds + if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE" then break; fi @@ -210,10 +218,19 @@ then fi if test $commandcompleted -eq 0 -a -n "$qemu_pid" then - echo Grace period for qemu job at pid $qemu_pid + if ! test -f "$TORTURE_STOPFILE" + then + echo Grace period for qemu job at pid $qemu_pid + fi oldline="`tail $resdir/console.log`" while : do + if test -f "$TORTURE_STOPFILE" + then + echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if kill -0 $qemu_pid > /dev/null 2>&1 then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh new file mode 100755 index 000000000000..c45a953ef393 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Transform a qemu-cmd file to allow reuse. +# +# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out +# +# bzImage: Kernel and initrd from the same prior kvm.sh run. +# console.log: File into which to place console output. +# +# The original qemu-cmd file is provided on standard input. +# The transformed qemu-cmd file is on standard output. +# The transformation assumes that the qemu command is confined to a +# single line. It also assumes no whitespace in filenames. +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. McKenney + +image="$1" +if test -z "$image" +then + echo Need kernel image file. + exit 1 +fi +consolelog="$2" +if test -z "$consolelog" +then + echo "Need console log file name." 
+ exit 1 +fi + +awk -v image="$image" -v consolelog="$consolelog" ' +{ + line = ""; + for (i = 1; i <= NF; i++) { + if (line == "") + line = $i; + else + line = line " " $i; + if ($i == "-serial") { + i++; + line = line " file:" consolelog; + } + if ($i == "-kernel") { + i++; + line = line " " image; + } + } + print line; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index c279cf9cb010..e655983b7429 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -73,6 +73,10 @@ usage () { while test $# -gt 0 do case "$1" in + --allcpus) + cpus=$TORTURE_ALLOTED_CPUS + max_cpus=$TORTURE_ALLOTED_CPUS + ;; --bootargs|--bootarg) checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--' TORTURE_BOOTARGS="$2" @@ -180,13 +184,14 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\)$' '^--' TORTURE_SUITE=$2 shift - if test "$TORTURE_SUITE" = rcuperf + if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale then - # If you really want jitter for rcuperf, specify - # it after specifying rcuperf. (But why?) + # If you really want jitter for refscale or + # rcuperf, specify it after specifying the rcuperf + # or the refscale. (But why jitter in these cases?) jitter=0 fi ;; @@ -333,6 +338,8 @@ then mkdir -p "$resdir" || : fi mkdir $resdir/$ds +TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR +TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE echo Results directory: $resdir/$ds echo $scriptname $args touch $resdir/$ds/log @@ -497,3 +504,7 @@ fi # Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier # Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop # Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y" +# Control buffer size: --bootargs trace_buf_size=3k +# Get trace-buffer dumps on all oopses: --bootargs ftrace_dump_on_oops +# Ditto, but dump only the oopsing CPU: --bootargs ftrace_dump_on_oops=orig_cpu +# Heavy-handed way to also dump on warnings: --bootargs panic_on_warn diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4bf62d7b1cbc..71a9f43a3918 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,8 +33,8 @@ then fi cat /dev/null > $file.diags -# Check for proper termination, except that rcuperf runs don't indicate this. -if test "$TORTURE_SUITE" != rcuperf +# Check for proper termination, except for rcuperf and refscale. 
+if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale then # check for abject failure @@ -44,11 +44,23 @@ then tail -1 | awk ' { - for (i=NF-8;i<=NF;i++) + normalexit = 1; + for (i=NF-8;i<=NF;i++) { + if (i <= 0 || i !~ /^[0-9]*$/) { + bangstring = $0; + gsub(/^\[[^]]*] /, "", bangstring); + print bangstring; + normalexit = 0; + exit 0; + } sum+=$i; + } } - END { print sum }'` - print_bug $title FAILURE, $nerrs instances + END { + if (normalexit) + print sum " instances" + }'` + print_bug $title FAILURE, $nerrs exit fi @@ -104,10 +116,7 @@ then fi fi | tee -a $file.diags -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file | -grep -v 'ODEBUG: ' | -grep -v 'This means that this is a DEBUG kernel and it is' | -grep -v 'Warning: unable to open an initial console' > $T.diags +console-badness.sh < $file > $T.diags if test -s $T.diags then print_warning "Assertion failure in $file $title" diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFLIST b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST new file mode 100644 index 000000000000..4d62eb4a39f9 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST @@ -0,0 +1,2 @@ +NOPREEMPT +PREEMPT diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon new file mode 100644 index 000000000000..a98b58b54bb1 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_REF_SCALE_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT new file mode 100644 index 000000000000..1cd25b7314e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT new file mode 100644 index 000000000000..d10bc694f42c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh new file mode 100644 index 000000000000..321e82641287 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Torture-suite-dependent shell functions for the rest of the 
scripts. + # + # Copyright (C) IBM Corporation, 2015 + # + # Authors: Paul E. McKenney + + # per_version_boot_params bootparam-string config-file seconds + # + # Adds per-version torture-module parameters to kernels supporting them. + per_version_boot_params () { + echo $1 refscale.shutdown=1 \ + refscale.verbose=1 +}