Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "14 fixes and one selftest to verify the ipc fixes herein"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: limit boost_watermark on small zones
  ubsan: disable UBSAN_ALIGNMENT under COMPILE_TEST
  mm/vmscan: remove unnecessary argument description of isolate_lru_pages()
  epoll: atomically remove wait entry on wake up
  kselftests: introduce new epoll60 testcase for catching lost wakeups
  percpu: make pcpu_alloc() aware of current gfp context
  mm/slub: fix incorrect interpretation of s->offset
  scripts/gdb: repair rb_first() and rb_last()
  eventpoll: fix missing wakeup for ovflist in ep_poll_callback
  arch/x86/kvm/svm/sev.c: change flag passed to GUP fast in sev_pin_memory()
  scripts/decodecode: fix trapping instruction formatting
  kernel/kcov.c: fix typos in kcov_remote_start documentation
  mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous()
  mm, memcg: fix error return value of mem_cgroup_css_alloc()
  ipc/mqueue.c: change __do_notify() to bypass check_kill_permission()
This commit is contained in:
Linus Torvalds 2020-05-08 08:41:09 -07:00
commit af38553c66
14 changed files with 276 additions and 79 deletions

View File

@ -345,7 +345,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;

View File

@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi)
{
struct eventpoll *ep = epi->ep;
/* Fast preliminary check */
if (epi->next != EP_UNACTIVE_PTR)
return false;
/* Check that the same epi has not been just chained from another CPU */
if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
return false;
@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (epi->next == EP_UNACTIVE_PTR &&
chain_epi_lockless(epi))
if (chain_epi_lockless(epi))
ep_pm_stay_awake_rcu(epi);
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(epi) &&
list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
/*
@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@ -1867,21 +1866,23 @@ fetch_events:
*/
ep_reset_busy_poll_napi_id(ep);
/*
* We don't have any available event to return to the caller. We need
* to sleep here, and we will be woken by ep_poll_callback() when events
* become available.
*/
if (!waiter) {
waiter = true;
init_waitqueue_entry(&wait, current);
do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* wakeup. Why it is important? In case of several waiters
* each new wakeup will hit the next waiter, giving it the
* chance to harvest new event. Otherwise wakeup can be
* lost. This is also good performance-wise, because on
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
*/
init_wait(&wait);
write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
@ -1911,10 +1912,20 @@ fetch_events:
timed_out = 1;
break;
}
}
/* We were woken up, thus go and try to harvest some events */
eavail = 1;
} while (0);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
@ -1925,12 +1936,6 @@ send_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
if (waiter) {
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
return res;
}

View File

@ -142,6 +142,7 @@ struct mqueue_inode_info {
struct sigevent notify;
struct pid *notify_owner;
u32 notify_self_exec_id;
struct user_namespace *notify_user_ns;
struct user_struct *user; /* user who created, for accounting */
struct sock *notify_sock;
@ -773,28 +774,44 @@ static void __do_notify(struct mqueue_inode_info *info)
* synchronously. */
if (info->notify_owner &&
info->attr.mq_curmsgs == 1) {
struct kernel_siginfo sig_i;
switch (info->notify.sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
/* sends signal */
case SIGEV_SIGNAL: {
struct kernel_siginfo sig_i;
struct task_struct *task;
/* do_mq_notify() accepts sigev_signo == 0, why?? */
if (!info->notify.sigev_signo)
break;
clear_siginfo(&sig_i);
sig_i.si_signo = info->notify.sigev_signo;
sig_i.si_errno = 0;
sig_i.si_code = SI_MESGQ;
sig_i.si_value = info->notify.sigev_value;
/* map current pid/uid into info->owner's namespaces */
rcu_read_lock();
/* map current pid/uid into info->owner's namespaces */
sig_i.si_pid = task_tgid_nr_ns(current,
ns_of_pid(info->notify_owner));
sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
current_uid());
/*
* We can't use kill_pid_info(), this signal should
* bypass check_kill_permission(). It is from kernel
* but si_fromuser() can't know this.
* We do check the self_exec_id, to avoid sending
* signals to programs that don't expect them.
*/
task = pid_task(info->notify_owner, PIDTYPE_TGID);
if (task && task->self_exec_id ==
info->notify_self_exec_id) {
do_send_sig_info(info->notify.sigev_signo,
&sig_i, task, PIDTYPE_TGID);
}
rcu_read_unlock();
kill_pid_info(info->notify.sigev_signo,
&sig_i, info->notify_owner);
break;
}
case SIGEV_THREAD:
set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
netlink_sendskb(info->notify_sock, info->notify_cookie);
@ -1383,6 +1400,7 @@ retry:
info->notify.sigev_signo = notification->sigev_signo;
info->notify.sigev_value = notification->sigev_value;
info->notify.sigev_notify = SIGEV_SIGNAL;
info->notify_self_exec_id = current->self_exec_id;
break;
}

View File

@ -740,8 +740,8 @@ static const struct file_operations kcov_fops = {
* kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an
* arbitrary 4-byte non-zero number as the instance id). This common handle
* then gets saved into the task_struct of the process that issued the
* KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn
* kernel threads, the common handle must be retrived via kcov_common_handle()
* KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn
* kernel threads, the common handle must be retrieved via kcov_common_handle()
* and passed to the spawned threads via custom annotations. Those kernel
* threads must in turn be annotated with kcov_remote_start(common_handle) and
* kcov_remote_stop(). All of the threads that are spawned by the same process

View File

@ -60,17 +60,14 @@ config UBSAN_SANITIZE_ALL
Enabling this option will get kernel image size increased
significantly.
config UBSAN_NO_ALIGNMENT
bool "Disable checking of pointers alignment"
default y if HAVE_EFFICIENT_UNALIGNED_ACCESS
help
This option disables the check of unaligned memory accesses.
This option should be used when building allmodconfig.
Disabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
config UBSAN_ALIGNMENT
def_bool !UBSAN_NO_ALIGNMENT
bool "Enable checks for pointers alignment"
default !HAVE_EFFICIENT_UNALIGNED_ACCESS
depends on !X86 || !COMPILE_TEST
help
This option enables the check of unaligned memory accesses.
Enabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
config TEST_UBSAN
tristate "Module for testing for undefined behavior detection"

View File

@ -4990,19 +4990,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
if (memcg->id.id < 0)
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
}
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
@ -5046,7 +5049,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
@ -5057,8 +5060,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
long error = -ENOMEM;
memcg = mem_cgroup_alloc();
if (!memcg)
return ERR_PTR(error);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
@ -5108,7 +5111,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)

View File

@ -1607,6 +1607,7 @@ void set_zone_contiguous(struct zone *zone)
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
cond_resched();
}
/* We confirm that there is no hole */
@ -2400,6 +2401,14 @@ static inline void boost_watermark(struct zone *zone)
if (!watermark_boost_factor)
return;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
* in a small area, boosting the watermark can cause an out of
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
return;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);

View File

@ -80,6 +80,7 @@
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
/* whitelisted flags that can be passed to the backing allocators */
gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
void __percpu *ptr;
size_t bits, bit_align;
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
do_warn = !(gfp & __GFP_NOWARN);
/*
* There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
* therefore alignment must be a minimum of that many bytes.

View File

@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr,
metadata_access_disable();
}
/*
* See comment in calculate_sizes().
*/
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
return s->offset >= s->inuse;
}
/*
* Return offset of the end of info block which is inuse + free pointer if
* not overlapping with object.
*/
static inline unsigned int get_info_end(struct kmem_cache *s)
{
if (freeptr_outside_object(s))
return s->inuse + sizeof(void *);
else
return s->inuse;
}
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
struct track *p;
if (s->offset)
p = object + s->offset + sizeof(void *);
else
p = object + s->inuse;
p = object + get_info_end(s);
return p + alloc;
}
@ -686,10 +703,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
if (s->offset)
off = s->offset + sizeof(void *);
else
off = s->inuse;
off = get_info_end(s);
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
@ -782,7 +796,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* object address
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
* pointer is the first word of the object.
* pointer is at the middle of the object.
*
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
@ -816,11 +830,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned long off = s->inuse; /* The end of info */
if (s->offset)
/* Freepointer is placed after the object. */
off += sizeof(void *);
unsigned long off = get_info_end(s); /* The end of info */
if (s->flags & SLAB_STORE_USER)
/* We also have user information there */
@ -907,7 +917,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
if (!s->offset && val == SLUB_RED_ACTIVE)
if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@ -3587,6 +3597,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*
* This is the case if we do RCU, have a constructor or
* destructor or are poisoning the objects.
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
* freeptr_outside_object() function. If that is no
* longer true, the function needs to be modified.
*/
s->offset = size;
size += sizeof(void *);

View File

@ -1625,7 +1625,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.

View File

@ -126,7 +126,7 @@ faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \
faultline=`cat $T.dis | head -1 | cut -d":" -f2-`
faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'`
cat $T.oo | sed -e "${faultlinenum}s/^\(.*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/"
cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/"
echo
cat $T.aa
cleanup

View File

@ -12,7 +12,7 @@ rb_node_type = utils.CachedType("struct rb_node")
def rb_first(root):
if root.type == rb_root_type.get_type():
node = node.address.cast(rb_root_type.get_type().pointer())
node = root.address.cast(rb_root_type.get_type().pointer())
elif root.type != rb_root_type.get_type().pointer():
raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
@ -28,7 +28,7 @@ def rb_first(root):
def rb_last(root):
if root.type == rb_root_type.get_type():
node = node.address.cast(rb_root_type.get_type().pointer())
node = root.address.cast(rb_root_type.get_type().pointer())
elif root.type != rb_root_type.get_type().pointer():
raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))

View File

@ -3,6 +3,7 @@
#define _GNU_SOURCE
#include <poll.h>
#include <unistd.h>
#include <assert.h>
#include <signal.h>
#include <pthread.h>
#include <sys/epoll.h>
@ -3136,4 +3137,149 @@ TEST(epoll59)
close(ctx.sfd[0]);
}
enum {
EPOLL60_EVENTS_NR = 10,
};
struct epoll60_ctx {
volatile int stopped;
int ready;
int waiters;
int epfd;
int evfd[EPOLL60_EVENTS_NR];
};
static void *epoll60_wait_thread(void *ctx_)
{
struct epoll60_ctx *ctx = ctx_;
struct epoll_event e;
sigset_t sigmask;
uint64_t v;
int ret;
/* Block SIGUSR1 */
sigemptyset(&sigmask);
sigaddset(&sigmask, SIGUSR1);
sigprocmask(SIG_SETMASK, &sigmask, NULL);
/* Prepare empty mask for epoll_pwait() */
sigemptyset(&sigmask);
while (!ctx->stopped) {
/* Mark we are ready */
__atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE);
/* Start when all are ready */
while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) &&
!ctx->stopped);
/* Account this waiter */
__atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE);
ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask);
if (ret != 1) {
/* We expect only signal delivery on stop */
assert(ret < 0 && errno == EINTR && "Lost wakeup!\n");
assert(ctx->stopped);
break;
}
ret = read(e.data.fd, &v, sizeof(v));
/* Since we are on ET mode, thus each thread gets its own fd. */
assert(ret == sizeof(v));
__atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE);
}
return NULL;
}
static inline unsigned long long msecs(void)
{
struct timespec ts;
unsigned long long msecs;
clock_gettime(CLOCK_REALTIME, &ts);
msecs = ts.tv_sec * 1000ull;
msecs += ts.tv_nsec / 1000000ull;
return msecs;
}
static inline int count_waiters(struct epoll60_ctx *ctx)
{
return __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE);
}
TEST(epoll60)
{
struct epoll60_ctx ctx = { 0 };
pthread_t waiters[ARRAY_SIZE(ctx.evfd)];
struct epoll_event e;
int i, n, ret;
signal(SIGUSR1, signal_handler);
ctx.epfd = epoll_create1(0);
ASSERT_GE(ctx.epfd, 0);
/* Create event fds */
for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) {
ctx.evfd[i] = eventfd(0, EFD_NONBLOCK);
ASSERT_GE(ctx.evfd[i], 0);
e.events = EPOLLIN | EPOLLET;
e.data.fd = ctx.evfd[i];
ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0);
}
/* Create waiter threads */
for (i = 0; i < ARRAY_SIZE(waiters); i++)
ASSERT_EQ(pthread_create(&waiters[i], NULL,
epoll60_wait_thread, &ctx), 0);
for (i = 0; i < 300; i++) {
uint64_t v = 1, ms;
/* Wait for all to be ready */
while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) !=
ARRAY_SIZE(ctx.evfd))
;
/* Steady, go */
__atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd),
__ATOMIC_ACQUIRE);
/* Wait all have gone to kernel */
while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd))
;
/* 1ms should be enough to schedule away */
usleep(1000);
/* Quickly signal all handles at once */
for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) {
ret = write(ctx.evfd[n], &v, sizeof(v));
ASSERT_EQ(ret, sizeof(v));
}
/* Busy loop for 1s and wait for all waiters to wake up */
ms = msecs();
while (count_waiters(&ctx) && msecs() < ms + 1000)
;
ASSERT_EQ(count_waiters(&ctx), 0);
}
ctx.stopped = 1;
/* Stop waiters */
for (i = 0; i < ARRAY_SIZE(waiters); i++)
ret = pthread_kill(waiters[i], SIGUSR1);
for (i = 0; i < ARRAY_SIZE(waiters); i++)
pthread_join(waiters[i], NULL);
for (i = 0; i < ARRAY_SIZE(waiters); i++)
close(ctx.evfd[i]);
close(ctx.epfd);
}
TEST_HARNESS_MAIN

View File

@ -25,7 +25,6 @@ CONFIG_KASAN=y
CONFIG_KASAN_INLINE=y
CONFIG_UBSAN=y
CONFIG_UBSAN_SANITIZE_ALL=y
CONFIG_UBSAN_NO_ALIGNMENT=y
CONFIG_UBSAN_NULL=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192