From f8f72ad5396987e05a42cf7eff826fb2a15ff148 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 26 Oct 2010 14:21:10 -0700
Subject: [PATCH 001/176] mm: fix return value of scan_lru_pages in memory unplug

scan_lru_pages() returns a pfn, so its type should be "unsigned long", not "int".

Note: I guess this has worked until now because the memory hotplug tester's machine does not have very much memory (physical address < 32bit << PAGE_SHIFT).

Reported-by: KOSAKI Motohiro
Signed-off-by: KAMEZAWA Hiroyuki
Reviewed-by: KOSAKI Motohiro
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4e940a26945..06662c5a3e86 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
  * Scanning pfn is much easier than scanning lru list.
  * Scan pfn from start to end and Find LRU page.
  */
-int scan_lru_pages(unsigned long start, unsigned long end)
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 	struct page *page;

From a75d377686037982cbec320bb770b19fe7be6a5d Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 26 Oct 2010 14:21:10 -0700
Subject: [PATCH 002/176] types.h: move misplaced comment

This comment landed in the wrong place.

Cc: Andi Kleen
Cc: Arnd Bergmann
Cc: David Miller
Cc: Eric Paris
Cc: Jan Engelhardt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/types.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/types.h b/include/linux/types.h
index 357dbc19606f..c2a9eb44f2fa 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -121,15 +121,7 @@ typedef __u64 u_int64_t;
 typedef __s64 int64_t;
 #endif
 
-/*
- * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid
- * common 32/64-bit compat problems.
- * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other
- * architectures) and to 8-byte boundaries on 64-bit architetures. The new
- * aligned_64 type enforces 8-byte alignment so that structs containing
- * aligned_64 values have the same alignment on 32-bit and 64-bit architectures.
- * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
- */
+/* this is a special 64bit data type that is 8-byte aligned */
 #define aligned_u64 __u64 __attribute__((aligned(8)))
 #define aligned_be64 __be64 __attribute__((aligned(8)))
 #define aligned_le64 __le64 __attribute__((aligned(8)))
@@ -186,7 +178,15 @@ typedef __u64 __bitwise __be64;
 typedef __u16 __bitwise __sum16;
 typedef __u32 __bitwise __wsum;
 
-/* this is a special 64bit data type that is 8-byte aligned */
+/*
+ * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid
+ * common 32/64-bit compat problems.
+ * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other
+ * architectures) and to 8-byte boundaries on 64-bit architetures. The new
+ * aligned_64 type enforces 8-byte alignment so that structs containing
+ * aligned_64 values have the same alignment on 32-bit and 64-bit architectures.
+ * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
+ */
 #define __aligned_u64 __u64 __attribute__((aligned(8)))
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))

From b7f50cfa3630b6e079929ffccfd442d65064ee1f Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 26 Oct 2010 14:21:11 -0700
Subject: [PATCH 003/176] mm, page-allocator: do not check the state of a non-existent buddy during free

There is a bug in commit 6dda9d55 ("page allocator: reduce fragmentation in buddy allocator by adding buddies that are merging to the tail of the free lists") that means a buddy at order MAX_ORDER is checked for merging. A page of this order never exists, so at times an effectively random piece of memory is being checked.

Alan Curry has reported that this is causing memory corruption in userspace data on a PPC32 platform (http://lkml.org/lkml/2010/10/9/32). It is not clear why this is happening. It could be a cache coherency problem where pages mapped in both user and kernel space are getting different cache lines due to the bad read from kernel space (http://lkml.org/lkml/2010/10/13/179). It could also be that there are some special registers being io-remapped at the end of the memmap array and that a read has special meaning on them. Compiler bugs have been ruled out because the assembly before and after the patch looks relatively harmless.

This patch fixes the problem by ensuring we are not reading a possibly invalid location of memory. It's not clear why the read causes corruption, but one way or the other it is a buggy read.

Signed-off-by: Mel Gorman
Cc: Corrado Zoccolo
Reported-by: Alan Curry
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: Rik van Riel
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a362c52fdf4..5faf876cfc3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -531,7 +531,7 @@ static inline void __free_one_page(struct page *page,
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
 		combined_idx = __find_combined_index(page_idx, order);
 		higher_page = page + combined_idx - page_idx;

From 482db6df1746c4fa7d64a2441d4cb2610249c679 Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Tue, 26 Oct 2010 14:21:13 -0700
Subject: [PATCH 004/176] um: fix global timer issue when using CONFIG_NO_HZ

This fixes an issue which was introduced by fe2cc53e ("uml: track and make up lost ticks").

timeval_to_ns() returns long long, not int. Because of that, UML's timer did not work properly and caused timer freezes.
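As a rough illustration of the bug class, the standalone userspace sketch below reproduces the truncation; the helper body is only a stand-in for UML's real timeval_to_ns(), what matters is its long long return type:

  #include <stdio.h>
  #include <sys/time.h>

  /* Stand-in with the same shape as UML's helper: a nanosecond count
   * does not fit in a 32-bit int (one second is already 1e9 ns). */
  static long long timeval_to_ns(const struct timeval *tv)
  {
          return (long long)tv->tv_sec * 1000000000LL + tv->tv_usec * 1000LL;
  }

  int main(void)
  {
          struct timeval tv = { .tv_sec = 3, .tv_usec = 0 };
          int truncated = timeval_to_ns(&tv);      /* silently loses the upper bits */
          long long correct = timeval_to_ns(&tv);

          printf("as int:       %d\n", truncated); /* overflowed garbage */
          printf("as long long: %lld\n", correct); /* 3000000000 */
          return 0;
  }

With "remain" and "max" declared as int, disable_timer() is exposed to exactly this kind of silent truncation, which is what the one-line change below avoids.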
Signed-off-by: Richard Weinberger Acked-by: Pekka Enberg Cc: Jeff Dike Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index dec5678fc17f..6e3359d6a839 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -60,7 +60,7 @@ static inline long long timeval_to_ns(const struct timeval *tv) long long disable_timer(void) { struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } }); - int remain, max = UM_NSEC_PER_SEC / UM_HZ; + long long remain, max = UM_NSEC_PER_SEC / UM_HZ; if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0) printk(UM_KERN_ERR "disable_timer - setitimer failed, " From 09358972bff5ce99de496bbba97c85d417b3c054 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 26 Oct 2010 14:21:15 -0700 Subject: [PATCH 005/176] sgi-xp: incoming XPC channel messages can come in after the channel's partition structures have been torn down Under some workloads, some channel messages have been observed being delayed on the sending side past the point where the receiving side has been able to tear down its partition structures. This condition is already detected in xpc_handle_activate_IRQ_uv(), but that information is not given to xpc_handle_activate_mq_msg_uv(). As a result, xpc_handle_activate_mq_msg_uv() assumes the structures still exist and references them, causing a NULL-pointer deref. Signed-off-by: Robin Holt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-xp/xpc_uv.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 1f59ee2226ca..17bbacb1b4b1 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -417,6 +417,7 @@ xpc_process_activate_IRQ_rcvd_uv(void) static void xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, struct xpc_activate_mq_msghdr_uv *msg_hdr, + int part_setup, int *wakeup_hb_checker) { unsigned long irq_flags; @@ -481,6 +482,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: { struct xpc_activate_mq_msg_chctl_closerequest_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_closerequest_uv, hdr); @@ -497,6 +501,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: { struct xpc_activate_mq_msg_chctl_closereply_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_closereply_uv, hdr); @@ -511,6 +518,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: { struct xpc_activate_mq_msg_chctl_openrequest_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_openrequest_uv, hdr); @@ -528,6 +538,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: { struct xpc_activate_mq_msg_chctl_openreply_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_openreply_uv, hdr); args = &part->remote_openclose_args[msg->ch_number]; @@ -545,6 +558,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV: { struct xpc_activate_mq_msg_chctl_opencomplete_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, 
struct xpc_activate_mq_msg_chctl_opencomplete_uv, hdr); spin_lock_irqsave(&part->chctl_lock, irq_flags); @@ -621,6 +637,7 @@ xpc_handle_activate_IRQ_uv(int irq, void *dev_id) part_referenced = xpc_part_ref(part); xpc_handle_activate_mq_msg_uv(part, msg_hdr, + part_referenced, &wakeup_hb_checker); if (part_referenced) xpc_part_deref(part); From 6915e04f8847bea16d0890f559694ad8eedd026c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:16 -0700 Subject: [PATCH 006/176] um: remove PAGE_SIZE alignment in linker script causing kernel segfault. The linker script cleanup that I did in commit 5d150a97f93 ("um: Clean up linker script using standard macros.") (2.6.32) accidentally introduced an ALIGN(PAGE_SIZE) when converting to use INIT_TEXT_SECTION; Richard Weinberger reported that this causes the kernel to segfault with CONFIG_STATIC_LINK=y. I'm not certain why this extra alignment is a problem, but it seems likely it is because previously __init_begin = _stext = _text = _sinittext and with the extra ALIGN(PAGE_SIZE), _sinittext becomes different from the rest. So there is likely a bug here where something is assuming that _sinittext is the same as one of those other symbols. But reverting the accidental change fixes the regression, so it seems worth committing that now. Signed-off-by: Tim Abbott Reported-by: Richard Weinberger Cc: Jeff Dike Tested by: Antoine Martin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/uml.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index ec6378550671..9a873d765615 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -22,7 +22,7 @@ SECTIONS _text = .; _stext = .; __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) + INIT_TEXT_SECTION(0) . = ALIGN(PAGE_SIZE); .text : From 1f9fa5216eacf4fdf9d3e4ab57feb8b642f0e78b Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:21:16 -0700 Subject: [PATCH 007/176] drivers/misc/ad525x_dpot.c: fix typo in spi write16 and write24 transfer counts This is a bug fix. Some SPI connected devices using 16/24 bit accesses, previously failed, now work. This typo slipped in after testing, during some restructuring. Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot-spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ad525x_dpot-spi.c b/drivers/misc/ad525x_dpot-spi.c index b8c6df9c8437..6cfcb636577a 100644 --- a/drivers/misc/ad525x_dpot-spi.c +++ b/drivers/misc/ad525x_dpot-spi.c @@ -53,13 +53,13 @@ static int write8(void *client, u8 val) static int write16(void *client, u8 reg, u8 val) { u8 data[2] = {reg, val}; - return spi_write(client, data, 1); + return spi_write(client, data, 2); } static int write24(void *client, u8 reg, u16 val) { u8 data[3] = {reg, val >> 8, val}; - return spi_write(client, data, 1); + return spi_write(client, data, 3); } static int read8(void *client) From de5e2ddf9bb3ce7b643223b9b0718062254f302f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:21:17 -0700 Subject: [PATCH 008/176] ipmi: proper spinlock initialization Unloading ipmi module can trigger following error. 
(if CONFIG_DEBUG_SPINLOCK=y) [ 9633.779590] BUG: spinlock bad magic on CPU#1, rmmod/7170 [ 9633.779606] lock: f41f5414, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 9633.779626] Pid: 7170, comm: rmmod Not tainted 2.6.36-rc7-11474-gb71eb1e-dirty #328 [ 9633.779644] Call Trace: [ 9633.779657] [] ? printk+0x18/0x1c [ 9633.779672] [] spin_bug+0xa3/0xf0 [ 9633.779685] [] do_raw_spin_lock+0x7d/0x160 [ 9633.779702] [] ? release_sysfs_dirent+0x47/0xb0 [ 9633.779718] [] ? sysfs_addrm_finish+0xa8/0xd0 [ 9633.779734] [] _raw_spin_lock_irqsave+0xc/0x20 [ 9633.779752] [] cleanup_one_si+0x6a/0x200 [ipmi_si] [ 9633.779768] [] ? sysfs_hash_and_remove+0x72/0x80 [ 9633.779786] [] ipmi_pnp_remove+0xd/0xf [ipmi_si] [ 9633.779802] [] pnp_device_remove+0x1b/0x40 Fix this by initializing spinlocks in a smi_info_alloc() helper function, right after memory allocation and clearing. Signed-off-by: Eric Dumazet Acked-by: David Miller Cc: Yinghai Lu Acked-by: Corey Minyard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_si_intf.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index e537610d2f09..b293d57d30a7 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1665,6 +1665,17 @@ static int check_hotmod_int_op(const char *curr, const char *option, return 0; } +static struct smi_info *smi_info_alloc(void) +{ + struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (info) { + spin_lock_init(&info->si_lock); + spin_lock_init(&info->msg_lock); + } + return info; +} + static int hotmod_handler(const char *val, struct kernel_param *kp) { char *str = kstrdup(val, GFP_KERNEL); @@ -1779,7 +1790,7 @@ static int hotmod_handler(const char *val, struct kernel_param *kp) } if (op == HM_ADD) { - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { rv = -ENOMEM; goto out; @@ -1844,7 +1855,7 @@ static __devinit void hardcode_find_bmc(void) if (!ports[i] && !addrs[i]) continue; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return; @@ -2027,7 +2038,7 @@ static __devinit int try_init_spmi(struct SPMITable *spmi) return -ENODEV; } - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { printk(KERN_ERR PFX "Could not allocate SI data (3)\n"); return -ENOMEM; @@ -2137,7 +2148,7 @@ static int __devinit ipmi_pnp_probe(struct pnp_dev *dev, if (!acpi_dev) return -ENODEV; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return -ENOMEM; @@ -2318,7 +2329,7 @@ static __devinit void try_init_dmi(struct dmi_ipmi_data *ipmi_data) { struct smi_info *info; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { printk(KERN_ERR PFX "Could not allocate SI data\n"); return; @@ -2425,7 +2436,7 @@ static int __devinit ipmi_pci_probe(struct pci_dev *pdev, int class_type = pdev->class & PCI_ERMC_CLASSCODE_TYPE_MASK; struct smi_info *info; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return -ENOMEM; @@ -2566,7 +2577,7 @@ static int __devinit ipmi_of_probe(struct platform_device *dev, return -EINVAL; } - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { dev_err(&dev->dev, @@ -3013,7 +3024,7 @@ static __devinit void default_find_bmc(void) if (check_legacy_ioport(ipmi_defaults[i].port)) continue; #endif - info = 
kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return; @@ -3138,9 +3149,6 @@ static int try_smi_init(struct smi_info *new_smi) goto out_err; } - spin_lock_init(&(new_smi->si_lock)); - spin_lock_init(&(new_smi->msg_lock)); - /* Do low-level detection first. */ if (new_smi->handlers->detect(new_smi->si_sm)) { if (new_smi->addr_source) From 1b627d5771312c92404b66f0a0b16f66036dd2e1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:18 -0700 Subject: [PATCH 009/176] hostfs: fix UML crash: remove f_spare from hostfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 365b1818 ("add f_flags to struct statfs(64)") resized f_spare within struct statfs which caused a UML crash. There is no need to copy f_spare. Signed-off-by: Richard Weinberger Reported-by: Toralf Förster Tested-by: Toralf Förster Cc: Christoph Hellwig Cc: Al Viro Cc: Jeff Dike Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 3 +-- fs/hostfs/hostfs_kern.c | 2 +- fs/hostfs/hostfs_user.c | 9 ++------- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1487ee..bf15a43016b9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5f9ef8..cd7c93917cc7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa06ce2c..8d02683585e0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -364,8 +364,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } From 52c5171214ff3327961d0ce0db7e8d2ce55004fd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:21:19 -0700 Subject: [PATCH 010/176] kfifo: disable __kfifo_must_check_helper() This helper is wrong: it coerces signed values into unsigned ones, so code such as if (kfifo_alloc(...) < 0) { error } will fail to detect the error. So let's disable __kfifo_must_check_helper() for 2.6.36. 
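A standalone userspace sketch of the failure mode described above (the names below are stand-ins, not the kfifo API): once a negative error code passes through a helper that returns unsigned int, a "< 0" check can never fire.

  #include <stdio.h>

  /* Same shape as the disabled helper: the unsigned return type coerces
   * negative error codes into large positive values. */
  static inline unsigned int must_check_helper(unsigned int val)
  {
          return val;
  }

  /* Stand-in for an allocation that fails with -ENOMEM. */
  static int fake_fifo_alloc(void)
  {
          return -12;
  }

  int main(void)
  {
          /* An unsigned value is never < 0, so the error goes unnoticed;
           * many compilers even warn that this test is always false. */
          if (must_check_helper(fake_fifo_alloc()) < 0)
                  printf("error detected\n");
          else
                  printf("error silently missed: got %u\n",
                         must_check_helper(fake_fifo_alloc()));
          return 0;
  }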
Cc: Randy Dunlap Cc: Stefani Seibold Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 62dbee554f60..c238ad2f82ea 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -171,11 +171,8 @@ struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void); } -static inline unsigned int __must_check -__kfifo_must_check_helper(unsigned int val) -{ - return val; -} +/* __kfifo_must_check_helper() is temporarily disabled because it was faulty */ +#define __kfifo_must_check_helper(x) (x) /** * kfifo_initialized - Check if the fifo is initialized From 8474b591faf3bb0a1e08a60d21d6baac498f15e4 Mon Sep 17 00:00:00 2001 From: Masanori ITOH Date: Tue, 26 Oct 2010 14:21:20 -0700 Subject: [PATCH 011/176] percpu: fix list_head init bug in __percpu_counter_init() WARNING: at lib/list_debug.c:26 __list_add+0x3f/0x81() Hardware name: Express5800/B120a [N8400-085] list_add corruption. next->prev should be prev (ffffffff81a7ea00), but was dead000000200200. (next=ffff88080b872d58). Modules linked in: aoe ipt_MASQUERADE iptable_nat nf_nat autofs4 sunrpc bridge 8021q garp stp llc ipv6 cpufreq_ondemand acpi_cpufreq freq_table dm_round_robin dm_multipath kvm_intel kvm uinput lpfc scsi_transport_fc igb ioatdma scsi_tgt i2c_i801 i2c_core dca iTCO_wdt iTCO_vendor_support pcspkr shpchp megaraid_sas [last unloaded: aoe] Pid: 54, comm: events/3 Tainted: G W 2.6.34-vanilla1 #1 Call Trace: [] warn_slowpath_common+0x7c/0x94 [] warn_slowpath_fmt+0x41/0x43 [] __list_add+0x3f/0x81 [] __percpu_counter_init+0x59/0x6b [] bdi_init+0x118/0x17e [] blk_alloc_queue_node+0x79/0x143 [] blk_alloc_queue+0x11/0x13 [] aoeblk_gdalloc+0x8e/0x1c9 [aoe] [] aoecmd_sleepwork+0x25/0xa8 [aoe] [] worker_thread+0x1a9/0x237 [] ? aoecmd_sleepwork+0x0/0xa8 [aoe] [] ? autoremove_wake_function+0x0/0x39 [] ? worker_thread+0x0/0x237 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 It's because there is no initialization code for a list_head contained in the struct backing_dev_info under CONFIG_HOTPLUG_CPU, and the bug comes up when block device drivers calling blk_alloc_queue() are used. In case of me, I got them by using aoe. 
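For background, the warning above comes from the list debugging checks. The userspace miniature below (simplified, not the kernel's list.h) shows the kind of consistency test that trips when a list_head still holds stale or poisoned pointers, and why putting it into a known state with INIT_LIST_HEAD() first, as the one-line fix below does for fbc->list, keeps the check happy.

  #include <stdio.h>

  struct list_head { struct list_head *next, *prev; };

  /* Mirrors the idea of the kernel's LIST_POISON2: a marker address left
   * behind in a deleted node. */
  #define LIST_POISON2 ((struct list_head *)0x00200200UL)

  static void INIT_LIST_HEAD(struct list_head *list)
  {
          list->next = list;
          list->prev = list;
  }

  /* The CONFIG_DEBUG_LIST-style sanity check from lib/list_debug.c,
   * reduced to its core. */
  static int debug_list_add(struct list_head *new, struct list_head *head)
  {
          if (head->next->prev != head) {
                  printf("list_add corruption: next->prev should be %p, was %p\n",
                         (void *)head, (void *)head->next->prev);
                  return -1;
          }
          new->next = head->next;
          new->prev = head;
          head->next->prev = new;
          head->next = new;
          return 0;
  }

  int main(void)
  {
          struct list_head node, entry, head;

          /* Simulate a list left in an inconsistent state: the head points
           * at a node whose ->prev still carries a poison value. */
          node.prev = LIST_POISON2;
          node.next = &head;
          head.next = &node;
          head.prev = &node;
          debug_list_add(&entry, &head);          /* reports corruption */

          INIT_LIST_HEAD(&head);                  /* known-good state */
          if (!debug_list_add(&entry, &head))
                  printf("list_add fine after INIT_LIST_HEAD\n");
          return 0;
  }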
Signed-off-by: Masanori Itoh Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index ec9048e74f44..209448e1d2b9 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -76,6 +76,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, if (!fbc->counters) return -ENOMEM; #ifdef CONFIG_HOTPLUG_CPU + INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); list_add(&fbc->list, &percpu_counters); mutex_unlock(&percpu_counters_lock); From dbec921370ea049eccd38e34e067373fb3bd9b01 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:21 -0700 Subject: [PATCH 012/176] um: fix IRQ flag handling naming Commit df9ee292 ("Fix IRQ flag handling naming") changed the IRQ flag handling naming scheme and broke UML: In file included from arch/um/include/asm/fixmap.h:5, from arch/um/include/shared/um_uaccess.h:10, from arch/um/include/asm/uaccess.h:41, from arch/um/include/asm/thread_info.h:13, from include/linux/thread_info.h:56, from include/linux/preempt.h:9, from include/linux/spinlock.h:50, from include/linux/seqlock.h:29, from include/linux/time.h:8, from include/linux/stat.h:60, from include/linux/module.h:10, from init/main.c:13: arch/um/include/asm/system.h:11:1: warning: "local_save_flags" redefined This patch brings the new scheme to UML and makes it work again. Signed-off-by: Richard Weinberger Acked-by: David Howells Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/asm/system.h | 43 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h index 93af1cf0907d..68a90ecd1450 100644 --- a/arch/um/include/asm/system.h +++ b/arch/um/include/asm/system.h @@ -8,23 +8,38 @@ extern int set_signals(int enable); extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ - (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ - set_signals(flags); } while(0) +static inline unsigned long arch_local_save_flags(void) +{ + return get_signals(); +} -#define local_irq_save(flags) do { local_save_flags(flags); \ - local_irq_disable(); } while(0) +static inline void arch_local_irq_restore(unsigned long flags) +{ + set_signals(flags); +} -#define local_irq_enable() unblock_signals() -#define local_irq_disable() block_signals() +static inline void arch_local_irq_enable(void) +{ + unblock_signals(); +} -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - (flags == 0); \ -}) +static inline void arch_local_irq_disable(void) +{ + block_signals(); +} + +static inline unsigned long arch_local_irq_save(void) +{ + unsigned long flags; + flags = arch_local_save_flags(); + arch_local_irq_disable(); + return flags; +} + +static inline bool arch_irqs_disabled(void) +{ + return arch_local_save_flags() == 0; +} extern void *_switch_to(void *prev, void *next, void *last); #define switch_to(prev, next, last) prev = _switch_to(prev, next, last) From a4f7326da2bfe788b85509ec0c261e64596cdde4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 26 Oct 2010 14:21:21 -0700 Subject: [PATCH 013/176] vmcore: it is not experimental any more We use vmcore in our production kernel for a long time, it is pretty stable now. 
So I don't think we need to mark it as experimental any more. Signed-off-by: WANG Cong Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 50f8f0600f06..6a0068841d96 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -33,8 +33,8 @@ config PROC_KCORE depends on PROC_FS && MMU config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. From 0f4d208f1975f16f269134cee5f44c1f048581da Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Tue, 26 Oct 2010 14:21:22 -0700 Subject: [PATCH 014/176] Documentation/filesystems/proc.txt: improve smaps field documentation Signed-off-by: Matt Mackall Cc: Nikanth Karthikesan Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a6aca8740883..a563b74c7aef 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -374,13 +374,13 @@ Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB -The first of these lines shows the same information as is displayed for the -mapping in /proc/PID/maps. The remaining lines show the size of the mapping, -the amount of the mapping that is currently resident in RAM, the "proportional -set size” (divide each shared page by the number of processes sharing it), the -number of clean and dirty shared pages in the mapping, and the number of clean -and dirty private pages in the mapping. The "Referenced" indicates the amount -of memory currently marked as referenced or accessed. +The first of these lines shows the same information as is displayed for the +mapping in /proc/PID/maps. The remaining lines show the size of the mapping +(size), the amount of the mapping that is currently resident in RAM (RSS), the +process' proportional share of this mapping (PSS), the number of clean and +dirty shared pages in the mapping, and the number of clean and dirty private +pages in the mapping. The "Referenced" indicates the amount of memory +currently marked as referenced or accessed. This file is only present if the CONFIG_MMU kernel configuration option is enabled. From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: [PATCH 015/176] oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. 
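A rough userspace model of the bookkeeping (illustrative only; the real code manipulates mm->oom_disable_count under task_lock(), as the hunks below show): the counter turns "does any thread sharing this mm have OOM_SCORE_ADJ_MIN?" into a single atomic read instead of a tasklist walk.

  #include <stdio.h>
  #include <stdatomic.h>
  #include <stdbool.h>

  /* One counter per shared address space, as in struct mm_struct. */
  struct mm_model {
          atomic_int oom_disable_count;   /* sharers with OOM_SCORE_ADJ_MIN */
  };

  /* Called when a sharer's oom_score_adj moves into or out of OOM_SCORE_ADJ_MIN. */
  static void mark_unkillable(struct mm_model *mm) { atomic_fetch_add(&mm->oom_disable_count, 1); }
  static void mark_killable(struct mm_model *mm)   { atomic_fetch_sub(&mm->oom_disable_count, 1); }

  /* The O(1) question the OOM killer wants answered. */
  static bool mm_is_killable(struct mm_model *mm)
  {
          return atomic_load(&mm->oom_disable_count) == 0;
  }

  int main(void)
  {
          struct mm_model mm;

          atomic_init(&mm.oom_disable_count, 0);
          printf("killable: %d\n", mm_is_killable(&mm));   /* 1 */
          mark_unkillable(&mm);   /* one sharer becomes OOM_SCORE_ADJ_MIN */
          printf("killable: %d\n", mm_is_killable(&mm));   /* 0 */
          mark_killable(&mm);
          printf("killable: %d\n", mm_is_killable(&mm));   /* 1 */
          return 0;
  }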
[rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ fs/proc/base.c | 30 ++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f936858..3aa75b8888a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3fe..6e50c8e65513 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } task->signal->oom_score_adj = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is @@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); return count; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cb57d657ce4d..bb7288a782fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,8 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif + /* How many tasks sharing this mm are OOM_DISABLE */ + atomic_t oom_disable_count; }; /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fde..894179a32ec1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..e87aaaaf5131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } From e18641e19a9204f241f04a5ac700168dcd18de4f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: [PATCH 016/176] oom: avoid killing a task if a thread sharing its mm cannot be killed The oom killer's goal is to kill a memory-hogging task so that it may exit, free its memory, and allow the current context to allocate the memory that triggered it in the first place. Thus, killing a task is pointless if other threads sharing its mm cannot be killed because of its /proc/pid/oom_adj or /proc/pid/oom_score_adj value. This patch checks whether any other thread sharing p->mm has an oom_score_adj of OOM_SCORE_ADJ_MIN. If so, the thread cannot be killed and oom_badness(p) returns 0, meaning it's unkillable. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4029583a1024..4395f371bc7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, return 0; /* - * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't - * need to be executed for something that cannot be killed. 
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } @@ -680,7 +681,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - (current->signal->oom_adj != OOM_DISABLE)) { + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to From 1e99bad0d9c12a4aaa60cd812c84ef152564bcf5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:24 -0700 Subject: [PATCH 017/176] oom: kill all threads sharing oom killed task's mm It's necessary to kill all threads that share an oom killed task's mm if the goal is to lead to future memory freeing. This patch reintroduces the code removed in 8c5cd6f3 (oom: oom_kill doesn't kill vfork parent (or child)) since it is obsoleted. It's now guaranteed that any task passed to oom_kill_task() does not share an mm with any thread that is unkillable. Thus, we're safe to issue a SIGKILL to any thread sharing the same mm. This is especially necessary to solve an mm->mmap_sem livelock issue whereas an oom killed thread must acquire the lock in the exit path while another thread is holding it in the page allocator while trying to allocate memory itself (and will preempt the oom killer since a task was already killed). Since tasks with pending fatal signals are now granted access to memory reserves, the thread holding the lock may quickly allocate and release the lock so that the oom killed task may exit. This mainly is for threads that are cloned with CLONE_VM but not CLONE_THREAD, so they are in a different thread group. Non-NPTL threads exist in the wild and this change is necessary to prevent the livelock in such cases. We care more about preventing the livelock than incurring the additional tasklist in the oom killer when a task has been killed. Systems that are sufficiently large to not want the tasklist scan in the oom killer in the first place already have the option of enabling /proc/sys/vm/oom_kill_allocating_task, which was designed specifically for that purpose. This code had existed in the oom killer for over eight years dating back to the 2.4 kernel. 
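The "CLONE_VM but not CLONE_THREAD" case is easy to reproduce from userspace; a minimal sketch (illustrative only) of how such a sibling task, sharing the mm but living in its own thread group, is created:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static int shared_value;                 /* lives in the shared mm */
  static char child_stack[64 * 1024];

  static int child_fn(void *arg)
  {
          shared_value = 42;               /* parent sees this: same mm */
          printf("child : getpid() = %d (its own thread group)\n", getpid());
          return 0;
  }

  int main(void)
  {
          /* CLONE_VM without CLONE_THREAD: shares the address space, but is
           * not part of the caller's thread group, i.e. the non-NPTL thread
           * case the changelog describes. */
          pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                            CLONE_VM | SIGCHLD, NULL);
          if (pid < 0) {
                  perror("clone");
                  return 1;
          }
          waitpid(pid, NULL, 0);
          printf("parent: getpid() = %d, shared_value = %d\n",
                 getpid(), shared_value);
          return 0;
  }

Killing only one of these two tasks frees no memory, which is why the hunk below signals every task that shares the victim's mm but sits in another thread group.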
[akpm@linux-foundation.org: add nice comment] Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4395f371bc7c..7dcca55ede7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, #define K(x) ((x) << (PAGE_SHIFT-10)) static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { + struct task_struct *q; + struct mm_struct *mm; + p = find_lock_task_mm(p); if (!p) return 1; + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), K(get_mm_counter(p->mm, MM_ANONPAGES)), K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); + /* + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. + */ + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); + force_sig(SIGKILL, q); + } set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); From 723548bff1dde9ab6bdb23f4bb92277c4da49473 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:25 -0700 Subject: [PATCH 018/176] oom: rewrite error handling for oom_adj and oom_score_adj tunables It's better to use proper error handling in oom_adjust_write() and oom_score_adj_write() instead of duplicating the locking order on various exit paths. 
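For reference, a minimal standalone sketch of the pattern being introduced (stand-in helpers, not the procfs code): each failure jumps to the label that undoes exactly what has already been taken, so the unlock/put sequence exists once instead of being repeated on every exit path.

  #include <stdio.h>
  #include <errno.h>

  static int  take_ref(void)   { puts("get ref");  return 0; }
  static void drop_ref(void)   { puts("put ref"); }
  static int  take_lock(void)  { puts("lock");     return 0; }
  static void drop_lock(void)  { puts("unlock"); }

  static int update_value(int new_value)
  {
          int err = 0;

          if (take_ref()) {
                  err = -ESRCH;
                  goto out;
          }
          if (take_lock()) {
                  err = -ESRCH;
                  goto err_ref;
          }
          if (new_value < 0) {
                  err = -EINVAL;
                  goto err_lock;
          }
          printf("value set to %d\n", new_value);

  err_lock:
          drop_lock();           /* success path falls through here too */
  err_ref:
          drop_ref();
  out:
          return err;
  }

  int main(void)
  {
          printf("ok : %d\n", update_value(1));
          printf("bad: %d\n", update_value(-1));
          return 0;
  }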
Suggested-by: Andrew Morton Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 83 +++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 6e50c8e65513..34d11ac31f2e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1023,36 +1023,39 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_adjust); if (err) - return -EINVAL; + goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) - return -EINVAL; + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_adjust != task->signal->oom_adj) { @@ -1080,11 +1083,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - - return count; +out: + return err < 0 ? 
err : count; } static const struct file_operations proc_oom_adjust_operations = { @@ -1125,36 +1131,39 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); if (err) - return -EINVAL; + goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || - oom_score_adj > OOM_SCORE_ADJ_MAX) - return -EINVAL; + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_score_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) @@ -1172,10 +1181,14 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - return count; +out: + return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { From d19d5476f4b9f91d2de92b91588bb118beba6c0d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: [PATCH 019/176] oom: fix locking for oom_adj and oom_score_adj The locking order in oom_adjust_write() and oom_score_adj_write() for task->alloc_lock and task->sighand->siglock is reversed, and lockdep notices that irqs could encounter an ABBA scenario. This fixes the locking order so that we always take task_lock(task) prior to lock_task_sighand(task). 
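A minimal sketch of the rule being enforced (pthread mutexes standing in for task_lock() and lock_task_sighand(); not the kernel code): as long as every path acquires the two locks in the same order, contending paths can block but can never deadlock, because no ABBA cycle is possible.

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; /* task_lock */
  static pthread_mutex_t siglock    = PTHREAD_MUTEX_INITIALIZER; /* siglock   */

  static void oom_adj_write_path(void)
  {
          pthread_mutex_lock(&alloc_lock);        /* always first  */
          pthread_mutex_lock(&siglock);           /* always second */
          puts("oom_adj path: both locks held");
          pthread_mutex_unlock(&siglock);
          pthread_mutex_unlock(&alloc_lock);
  }

  static void oom_score_adj_write_path(void)
  {
          pthread_mutex_lock(&alloc_lock);        /* same order here */
          pthread_mutex_lock(&siglock);
          puts("oom_score_adj path: both locks held");
          pthread_mutex_unlock(&siglock);
          pthread_mutex_unlock(&alloc_lock);
  }

  int main(void)
  {
          oom_adj_write_path();
          oom_score_adj_write_path();
          return 0;
  }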
Signed-off-by: David Rientjes Reported-by: Andrew Morton Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 34d11ac31f2e..53dc8ad40ae6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1042,15 +1042,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_struct; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } task_lock(task); if (!task->mm) { @@ -1058,6 +1049,16 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_task_lock; } + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + if (oom_adjust != task->signal->oom_adj) { if (oom_adjust == OOM_DISABLE) atomic_inc(&task->mm->oom_disable_count); @@ -1083,11 +1084,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; @@ -1150,21 +1150,24 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_struct; - } - if (oom_score_adj < task->signal->oom_score_adj && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } task_lock(task); if (!task->mm) { err = -EINVAL; goto err_task_lock; } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_inc(&task->mm->oom_disable_count); @@ -1181,11 +1184,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: [PATCH 020/176] writeback: remove nonblocking/encountered_congestion references This removes more dead code that was somehow missed by commit 0d99519efef (writeback: remove unused nonblocking and congestion checks). There are no behavior change except for the removal of two entries from one of the ext4 tracing interface. The nonblocking checks in ->writepages are no longer used because the flusher now prefer to block on get_request_wait() than to skip inodes on IO congestion. The latter will lead to more seeky IO. The nonblocking checks in ->writepage are no longer used because it's redundant with the WB_SYNC_NONE check. 
We no long set ->nonblocking in VM page out and page migration, because a) it's effectively redundant with WB_SYNC_NONE in current code b) it's old semantic of "Don't get stuck on request queues" is mis-behavior: that would skip some dirty inodes on congestion and page out others, which is unfair in terms of LRU age. Inspired by Christoph Hellwig. Thanks! Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: David Howells Cc: Sage Weil Cc: Steve French Cc: Chris Mason Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 19 +------------------ fs/buffer.c | 2 +- fs/ceph/addr.c | 9 --------- fs/cifs/file.c | 10 ---------- fs/gfs2/meta_io.c | 2 +- fs/nfs/write.c | 4 +--- fs/reiserfs/inode.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/trace/events/ext4.h | 8 +++++--- include/trace/events/writeback.h | 2 -- mm/migrate.c | 1 - mm/vmscan.c | 1 - 12 files changed, 11 insertions(+), 52 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 722743b152d8..15690bb1d3b5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -438,7 +438,6 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; @@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) - wbc->encountered_congestion = 1; _leave(" = 0"); return 0; @@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; @@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping, wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - break; - } - cond_resched(); } while (index < end && wbc->nr_to_write > 0); @@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - _leave(" = 0 [congest]"); - return 0; - } - if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && - !(wbc->nonblocking && wbc->encountered_congestion)) + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); mapping->writeback_index = next; diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f77..ec21a92e08b4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1706,7 +1706,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * and kswapd activity, but those code paths have their own * higher-level throttling. 
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce3230..e9c874abc9e1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc; pgoff_t index, start, end; @@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping, pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -885,7 +877,6 @@ static int ceph_writepages_start(struct address_space *mapping, rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c81e7b14d53..45af003865d2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned int bytes_to_write; unsigned int bytes_written; struct cifs_sb_info *cifs_sb; @@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping, int scanned = 0; int xid, long_op; - /* - * BB: Is this meaningful for a non-block-device file system? - * If it is, we should test it again after we do I/O - */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - cifs_sb = CIFS_SB(mapping->host->i_sb); /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f3b071f921aa..939739c7b3f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb * activity, but those code paths have their own higher-level * throttling. 
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 605e292501f4..4c14c17a5276 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -290,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - ret = nfs_page_async_flush(pgio, page, - wbc->sync_mode == WB_SYNC_NONE || - wbc->nonblocking != 0); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d66..c1f93896cb53 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2438,7 +2438,7 @@ static int reiserfs_write_full_page(struct page *page, /* from this point on, we know the buffer is mapped to a * real block and not a direct item */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (!trylock_buffer(bh)) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b552f816de15..c9af48fffcd7 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1139,8 +1139,7 @@ xfs_vm_writepage( type = IO_DELAY; flags = BMAPI_ALLOCATE; - if (wbc->sync_mode == WB_SYNC_NONE && - wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) flags |= BMAPI_TRYLOCK; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6bcb00645de4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld " + "range_start %llu range_end %llu " + "for_kupdate %d for_reclaim %d " + "range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, __entry->nonblocking, + __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long) __entry->writeback_index) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index f345f66ae9d1..0bb01ab2e984 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) - __field(int, nonblocking) - __field(int, encountered_congestion) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) diff --git a/mm/migrate.c b/mm/migrate.c index f8c9bccf2520..d917ac3207f5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) 
.nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; diff --git a/mm/vmscan.c b/mm/vmscan.c index b94c9464f262..6cbc1aac23ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,7 +376,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; From f19e77a3dc884510dba740caa6dee126b7d40156 Mon Sep 17 00:00:00 2001 From: zeal Date: Tue, 26 Oct 2010 14:21:27 -0700 Subject: [PATCH 021/176] include/linux/pageblock-flags.h: fix set_pageblock_flags() macro definiton The presently-unused macro was missing one parameter. Signed-off-by: zeal Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e8c06122be36..19ef95d293ae 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -67,7 +67,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, #define get_pageblock_flags(page) \ get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) -#define set_pageblock_flags(page) \ - set_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) +#define set_pageblock_flags(page, flags) \ + set_pageblock_flags_group(page, flags, \ + 0, NR_PAGEBLOCK_BITS-1) #endif /* PAGEBLOCK_FLAGS_H */ From e4455abb50a19562dbfdc51a8424fda9b588bd6d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 26 Oct 2010 14:21:28 -0700 Subject: [PATCH 022/176] mm: only build per-node scan_unevictable functions when NUMA is enabled Non-NUMA systems do never create these files anyway, since they are only created by driver subsystem when NUMA is configured. [akpm@linux-foundation.org: cleanup] Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: KOSAKI Motohiro Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 10 ++++++++++ mm/vmscan.c | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7cdd63366f88..eba53e71d2cc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -271,8 +271,18 @@ extern void scan_mapping_unevictable_pages(struct address_space *); extern unsigned long scan_unevictable_pages; extern int scan_unevictable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#ifdef CONFIG_NUMA extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); +#else +static inline int scan_unevictable_register_node(struct node *node) +{ + return 0; +} +static inline void scan_unevictable_unregister_node(struct node *node) +{ +} +#endif extern int kswapd_run(int nid); extern void kswapd_stop(int nid); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6cbc1aac23ae..f5871ee50000 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2986,6 +2986,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. 
@@ -3032,4 +3033,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif From cf608ac19c95804dc2df43b1f4f9e068aa9034ab Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Oct 2010 14:21:29 -0700 Subject: [PATCH 023/176] mm: compaction: fix COMPACTPAGEFAILED counting Presently update_nr_listpages() doesn't have a role. That's because lists passed is always empty just after calling migrate_pages. The migrate_pages cleans up page list which have failed to migrate before returning by aaa994b3. [PATCH] page migration: handle freeing of pages in migrate_pages() Do not leave pages on the lists passed to migrate_pages(). Seems that we will not need any postprocessing of pages. This will simplify the handling of pages by the callers of migrate_pages(). At that time, we thought we don't need any postprocessing of pages. But the situation is changed. The compaction need to know the number of failed to migrate for COMPACTPAGEFAILED stat This patch makes new rule for caller of migrate_pages to call putback_lru_pages. So caller need to clean up the lists so it has a chance to postprocess the pages. [suggested by Christoph Lameter] Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Andi Kleen Reviewed-by: Mel Gorman Reviewed-by: Wu Fengguang Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + mm/memory_hotplug.c | 2 ++ mm/mempolicy.c | 10 ++++++++-- mm/migrate.c | 12 +++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44a8cefeae6e..124324134ff6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1292,6 +1292,7 @@ static int soft_offline_huge_page(struct page *page, int flags) list_add(&hpage->lru, &pagelist); ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 06662c5a3e86..4821338b4e4b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -731,6 +731,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) + putback_lru_pages(&source); out: return ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5dd8a2..21243b2b7b07 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -931,8 +931,11 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -1147,9 +1150,12 @@ static long do_mbind(unsigned long start, unsigned long len, err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, 0); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index d917ac3207f5..35e454189966 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -883,8 +883,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * 
* The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list. * * Return: Number of pages not migrated or error code. */ @@ -931,8 +932,6 @@ int migrate_pages(struct list_head *from, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); - if (rc) return rc; @@ -1087,9 +1086,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm, } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; From 4b20477f588055fbe87e69435d3c2344d250f0d7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:21:29 -0700 Subject: [PATCH 024/176] memory hotplug: fix notifier's return value check Even if notifier cannot find any pages, it doesn't mean no pages are available...And, if there are no notifiers registered, this condition will be always true and memory hotplug will show -EBUSY. This is a bug but not critical. In most case, a pageblock which will be offlined is MIGRATE_MOVABLE This "notifier" is called only when the pageblock is _not_ MIGRATE_MOVABLE. But if not MIGRATE_MOVABLE, it's common case that memory hotplug will fail. So, no one notice this bug. Signed-off-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Wu Fengguang Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5faf876cfc3a..099790052bfe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5336,7 +5336,7 @@ int set_migratetype_isolate(struct page *page) */ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret || !arg.pages_found) + if (notifier_ret) goto out; for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { From 49ac825587f33afec8841b7fab2eb4db775014e6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:21:30 -0700 Subject: [PATCH 025/176] memory hotplug: unify is_removable and offline detection code Now, sysfs interface of memory hotplug shows whether the section is removable or not. But it checks only migrateype of pages and doesn't check details of cluster of pages. Next, memory hotplug's set_migratetype_isolate() has the same kind of check, too. This patch adds the function __count_unmovable_pages() and makes above 2 checks to use the same logic. Then, is_removable and hotremove code uses the same logic. No changes in the hotremove logic itself. TODO: need to find a way to check RECLAMABLE. But, considering bit, calling shrink_slab() against a range before starting memory hotremove sounds better. If so, this patch's logic doesn't need to be changed. 
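Schematically, the unification amounts to both call sites funnelling into one
predicate, as in the sketch below (simplified; the helper and caller names
follow the diff later in this patch, and isolate_allows() is only an
illustrative stand-in for the check done inside set_migratetype_isolate()):

	/*
	 * Sketch of the shared logic, not the literal diff: the predicate walks
	 * one pageblock and succeeds when at most 'count' pages are neither
	 * free, buddy, nor on the LRU.
	 */
	bool is_pageblock_removable_nolock(struct page *page)
	{
		/* sysfs "removable" check: tolerate no immobile pages at all */
		return __count_immobile_pages(page_zone(page), page, 0);
	}

	static bool isolate_allows(struct zone *zone, struct page *page,
				   struct memory_isolate_notify *arg)
	{
		/* hot-remove path: pages a driver reported via the isolation
		 * notifier may legitimately be immobile, so allow that many */
		return __count_immobile_pages(zone, page, arg->pages_found);
	}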
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Michal Hocko Cc: Wu Fengguang Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++ mm/memory_hotplug.c | 17 +------ mm/page_alloc.c | 87 ++++++++++++++++++++++++++-------- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 864035fb8f8a..4307231bd22f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -70,6 +70,10 @@ extern void online_page(struct page *page); extern int online_pages(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern bool is_pageblock_removable_nolock(struct page *page); +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4821338b4e4b..b0dc65452973 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 099790052bfe..6a683f819439 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5297,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. */ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. 
PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - struct page *curr_page; - unsigned long flags, pfn, iter; - unsigned long immobile = 0; + unsigned long flags, pfn; struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; @@ -5312,11 +5365,6 @@ int set_migratetype_isolate(struct page *page) zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || - zone_idx == ZONE_MOVABLE) { - ret = 0; - goto out; - } pfn = page_to_pfn(page); arg.start_pfn = pfn; @@ -5338,21 +5386,18 @@ int set_migratetype_isolate(struct page *page) notifier_ret = notifier_to_errno(notifier_ret); if (notifier_ret) goto out; - - for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { - if (!pfn_valid_within(pfn)) - continue; - - curr_page = pfn_to_page(iter); - if (!page_count(curr_page) || PageLRU(curr_page)) - continue; - - immobile++; - } - - if (arg.pages_found == immobile) + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) ret = 0; + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + out: if (!ret) { set_pageblock_migratetype(page, MIGRATE_ISOLATE); From 74e3f3c3391d81a959f58a1191a560703a4415b4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Oct 2010 14:21:31 -0700 Subject: [PATCH 026/176] vmscan: prevent background aging of anon page in no swap system Ying Han reported that backing aging of anon pages in no swap system causes unnecessary TLB flush. When I sent a patch(69c8548175), I wanted this patch but Rik pointed out and allowed aging of anon pages to give a chance to promote from inactive to active LRU. It has a two problem. 1) non-swap system Never make sense to age anon pages. 2) swap configured but still doesn't swapon It doesn't make sense to age anon pages until swap-on time. But it's arguable. If we have aged anon pages by swapon, VM have moved anon pages from active to inactive. And in the time swapon by admin, the VM can't reclaim hot pages so we can protect hot pages swapout. But let's think about it. When does swap-on happen? It depends on admin. we can't expect it. Nonetheless, we have done aging of anon pages to protect hot pages swapout. It means we lost run time overhead when below high watermark but gain hot page swap-[in/out] overhead when VM decide swapout. Is it true? Let's think more detail. We don't promote anon pages in case of non-swap system. So even though VM does aging of anon pages, the pages would be in inactive LRU for a long time. It means many of pages in there would mark access bit again. So access bit hot/code separation would be pointless. This patch prevents unnecessary anon pages demotion in not-yet-swapon and non-configured swap system. Even, in non-configuared swap system inactive_anon_is_low can be compiled out. It could make side effect that hot anon pages could swap out when admin does swap on. But I think sooner or later it would be steady state. So it's not a big problem. 
We could lose someting but gain more thing(TLB flush and unnecessary function call to demote anon pages). Signed-off-by: Ying Han Signed-off-by: Minchan Kim Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f5871ee50000..0c33a0997907 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1505,6 +1505,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1530,12 +1531,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1781,7 +1796,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); From 0def08e3acc2c9c934e4671487029aed52202d42 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Oct 2010 14:21:32 -0700 Subject: [PATCH 027/176] mm/mempolicy.c: check return code of check_range Function check_range may return ERR_PTR(...). Check for it. Signed-off-by: Vasiliy Kulikov Acked-by: David Rientjes Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 21243b2b7b07..81a127643aea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -924,12 +924,15 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); From f629d1c9bd0dbc44a6c4f9a4a67d1646c42bfc6f Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:33 -0700 Subject: [PATCH 028/176] mm: add account_page_writeback() To help developers and applications gain visibility into writeback behaviour this patch adds two counters to /proc/vmstat. # grep nr_dirtied /proc/vmstat nr_dirtied 3747 # grep nr_written /proc/vmstat nr_written 3618 These entries allow user apps to understand writeback behaviour over time and learn how it is impacting their performance. Currently there is no way to inspect dirty and writeback speed over time. It's not possible for nr_dirty/nr_writeback. 
These entries are necessary to give visibility into writeback behaviour. We have /proc/diskstats which lets us understand the io in the block layer. We have blktrace for more in depth understanding. We have e2fsprogs and debugsfs to give insight into the file systems behaviour, but we don't offer our users the ability understand what writeback is doing. There is no way to know how active it is over the whole system, if it's falling behind or to quantify it's efforts. With these values exported users can easily see how much data applications are sending through writeback and also at what rates writeback is processing this data. Comparing the rates of change between the two allow developers to see when writeback is not able to keep up with incoming traffic and the rate of dirty memory being sent to the IO back end. This allows folks to understand their io workloads and track kernel issues. Non kernel engineers at Google often use these counters to solve puzzling performance problems. Patch #4 adds a pernode vmstat file with nr_dirtied and nr_written Patch #5 add writeback thresholds to /proc/vmstat Currently these values are in debugfs. But they should be promoted to /proc since they are useful for developers who are writing databases and file servers and are not debugging the kernel. The output is as below: # grep threshold /proc/vmstat nr_pages_dirty_threshold 409111 nr_pages_dirty_background_threshold 818223 This patch: This allows code outside of the mm core to safely manipulate page writeback state and not worry about the other accounting. Not using these routines means that some code will lose track of the accounting and we get bugs. Modify nilfs2 to use interface. Signed-off-by: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Wu Fengguang Cc: KONISHI Ryusuke Cc: Jiro SEKIBA Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 2 +- include/linux/mm.h | 1 + mm/page-writeback.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index d926af626177..687d090cea34 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) kunmap_atomic(kaddr, KM_USER0); if (!TestSetPageWriteback(clone_page)) - inc_zone_page_state(clone_page, NR_WRITEBACK); + account_page_writeback(clone_page); unlock_page(clone_page); return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index a4c66846fb8f..c36297faf7cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -868,6 +868,7 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e3bccac1f025..94159819a651 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1128,6 +1128,17 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. 
+ */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); +} +EXPORT_SYMBOL(account_page_writeback); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1366,7 +1377,7 @@ int test_set_page_writeback(struct page *page) ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } From ea941f0e2a8c02ae876cd73deb4e1557248f258c Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:35 -0700 Subject: [PATCH 029/176] writeback: add nr_dirtied and nr_written to /proc/vmstat To help developers and applications gain visibility into writeback behaviour adding two entries to vm_stat_items and /proc/vmstat. This will allow us to track the "written" and "dirtied" counts. # grep nr_dirtied /proc/vmstat nr_dirtied 3747 # grep nr_written /proc/vmstat nr_written 3618 Signed-off-by: Michael Rubin Reviewed-by: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/page-writeback.c | 2 ++ mm/vmstat.c | 3 +++ 3 files changed, 7 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3984c4eb41fd..c3c17fb675ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -104,6 +104,8 @@ enum zone_stat_item { NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ + NR_DIRTIED, /* page dirtyings since bootup */ + NR_WRITTEN, /* page writings since bootup */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94159819a651..4dd91f7fd39f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1121,6 +1121,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); @@ -1136,6 +1137,7 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e669aaa..44e7ac0fdb66 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -745,6 +745,9 @@ static const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "nr_shmem", + "nr_dirtied", + "nr_written", + #ifdef CONFIG_NUMA "numa_hit", "numa_miss", From 2ac390370aac4aaa49cab17f328b478cbd5b3d8d Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:35 -0700 Subject: [PATCH 030/176] writeback: add /sys/devices/system/node//vmstat For NUMA node systems it is important to have visibility in memory characteristics. Two of the /proc/vmstat values "nr_written" and "nr_dirtied" are added here. 
# cat /sys/devices/system/node/node20/vmstat nr_written 0 nr_dirtied 0 Signed-off-by: Michael Rubin Reviewed-by: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee53558b452f..ce012a9c6201 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,6 +160,18 @@ static ssize_t node_read_numastat(struct sys_device * dev, } static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); +static ssize_t node_read_vmstat(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + int nid = dev->id; + return sprintf(buf, + "nr_written %lu\n" + "nr_dirtied %lu\n", + node_page_state(nid, NR_WRITTEN), + node_page_state(nid, NR_DIRTIED)); +} +static SYSDEV_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); + static ssize_t node_read_distance(struct sys_device * dev, struct sysdev_attribute *attr, char * buf) { @@ -243,6 +255,7 @@ int register_node(struct node *node, int num, struct node *parent) sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); + sysdev_create_file(&node->sysdev, &attr_vmstat); scan_unevictable_register_node(node); @@ -267,6 +280,7 @@ void unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_meminfo); sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); + sysdev_remove_file(&node->sysdev, &attr_vmstat); scan_unevictable_unregister_node(node); hugetlb_unregister_node(node); /* no-op, if memoryless node */ From 79da826aee6a10902ef411bc65864bd02102fa83 Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:36 -0700 Subject: [PATCH 031/176] writeback: report dirty thresholds in /proc/vmstat The kernel already exposes the user desired thresholds in /proc/sys/vm with dirty_background_ratio and background_ratio. But the kernel may alter the number requested without giving the user any indication that is the case. Knowing the actual ratios the kernel is honoring can help app developers understand how their buffered IO will be sent to the disk. 
$ grep threshold /proc/vmstat nr_dirty_threshold 409111 nr_dirty_background_threshold 818223 Signed-off-by: Michael Rubin Cc: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 44e7ac0fdb66..baa4ab387db7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -747,6 +748,8 @@ static const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_NUMA "numa_hit", @@ -907,36 +910,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) From bce54bbfde07e8b300f39dae14756c12a6ceca65 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 Oct 2010 14:21:37 -0700 Subject: [PATCH 032/176] mm: fix typo in mm.h when NODE_NOT_IN_PAGE_FLAGS NODE_NOT_IN_PAGE_FLAGS is defined in mm.h when the node information is not stored in the page flags bitmap. Unfortunately, there's a typo in one of the checks for it. This patch fixes it (s/NODE_NOT_IN_PAGEFLAGS/NODE_NOT_IN_PAGE_FLAGS/). Since this has been around for ages, I doubt it's been causing any serious problems. 
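For context, a minimal illustration of why such a misspelling compiles
silently (illustrative only; the branch bodies are simplified from what mm.h
actually defines): the preprocessor treats an unknown identifier in #ifdef as
simply undefined, so the #else branch is always taken and no diagnostic is
emitted.

	#define NODE_NOT_IN_PAGE_FLAGS		/* the name that is really defined */

	#ifdef NODE_NOT_IN_PAGEFLAGS		/* misspelled test: never true, no warning */
	#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)	/* intended branch, dead code */
	#else
	#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)	/* silently used instead */
	#endif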
Signed-off-by: Will Deacon Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c36297faf7cb..2862009f9573 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -497,8 +497,8 @@ static inline void set_compound_order(struct page *page, unsigned long order) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */ -#ifdef NODE_NOT_IN_PAGEFLAGS +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ SECTIONS_PGOFF : ZONES_PGOFF) From 66d9a986cddbbc2ea5db013e7999c621a956cc47 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Oct 2010 14:21:37 -0700 Subject: [PATCH 033/176] vmscan: delete dead code `priority' cannot be negative here. And the comment is obsolete. Signed-off-by: Shaohua Li Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c33a0997907..28b0521e4bfd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1956,16 +1956,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; - delayacct_freepages_end(); put_mems_allowed(); From e11da5b4fdf01d71d73c21cb92b00595b917d7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:40 -0700 Subject: [PATCH 034/176] tracing, vmscan: add trace events for LRU list shrinking There have been numerous reports of stalls that pointed at the problem being somewhere in the VM. There are multiple roots to the problems which means dealing with any of the root problems in isolation is tricky to justify on their own and they would still need integration testing. This patch series puts together two different patch sets which in combination should tackle some of the root causes of latency problems being reported. Patch 1 adds a tracepoint for shrink_inactive_list. For this series, the most important results is being able to calculate the scanning/reclaim ratio as a measure of the amount of work being done by page reclaim. Patch 2 accounts for time spent in congestion_wait. Patches 3-6 were originally developed by Kosaki Motohiro but reworked for this series. It has been noted that lumpy reclaim is far too aggressive and trashes the system somewhat. As SLUB uses high-order allocations, a large cost incurred by lumpy reclaim will be noticeable. It was also reported during transparent hugepage support testing that lumpy reclaim was trashing the system and these patches should mitigate that problem without disabling lumpy reclaim. Patch 7 adds wait_iff_congested() and replaces some callers of congestion_wait(). wait_iff_congested() only sleeps if there is a BDI that is currently congested. 
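In rough outline, the wait_iff_congested() change just described degenerates
to a cheap reschedule when nothing is congested and only falls back to the
full sleep otherwise. The sketch below is the idea only, not the series'
implementation; nr_congested_bdis is an assumed stand-in for whatever
bookkeeping the real function uses to know a BDI is congested.

	/* Sketch: sleep only if some backing device is actually congested. */
	long wait_iff_congested_sketch(int sync, long timeout)
	{
		/* nr_congested_bdis: assumed counter of currently congested BDIs */
		if (atomic_read(&nr_congested_bdis[sync]) == 0) {
			cond_resched();			/* nothing congested: don't stall */
			return 0;
		}
		return congestion_wait(sync, timeout);	/* genuine congestion: back off */
	}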
Patch 8 notes that any BDI being congested is not necessarily a problem because there could be multiple BDIs of varying speeds and numberous zones. It attempts to track when a zone being reclaimed contains many pages backed by a congested BDI and if so, reclaimers wait on the congestion queue. I ran a number of tests with monitoring on X86, X86-64 and PPC64. Each machine had 3G of RAM and the CPUs were X86: Intel P4 2-core X86-64: AMD Phenom 4-core PPC64: PPC970MP Each used a single disk and the onboard IO controller. Dirty ratio was left at 20. I'm just going to report for X86-64 and PPC64 in a vague attempt to keep this report short. Four kernels were tested each based on v2.6.36-rc4 traceonly-v2r2: Patches 1 and 2 to instrument vmscan reclaims and congestion_wait lowlumpy-v2r3: Patches 1-6 to test if lumpy reclaim is better waitcongest-v2r3: Patches 1-7 to only wait on congestion waitwriteback-v2r4: Patches 1-8 to detect when a zone is congested nocongest-v1r5: Patches 1-3 for testing wait_iff_congestion nodirect-v1r5: Patches 1-10 to disable filesystem writeback for better IO The tests run were as follows kernbench compile-based benchmark. Smoke test performance sysbench OLTP read-only benchmark. Will be re-run in the future as read-write micro-mapped-file-stream This is a micro-benchmark from Johannes Weiner that accesses a large sparse-file through mmap(). It was configured to run in only single-CPU mode but can be indicative of how well page reclaim identifies suitable pages. stress-highalloc Tries to allocate huge pages under heavy load. kernbench, iozone and sysbench did not report any performance regression on any machine. sysbench did pressure the system lightly and there was reclaim activity but there were no difference of major interest between the kernels. X86-64 micro-mapped-file-stream traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 pgalloc_dma 1639.00 ( 0.00%) 667.00 (-145.73%) 1167.00 ( -40.45%) 578.00 (-183.56%) pgalloc_dma32 2842410.00 ( 0.00%) 2842626.00 ( 0.01%) 2843043.00 ( 0.02%) 2843014.00 ( 0.02%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 729.00 ( 0.00%) 85.00 (-757.65%) 609.00 ( -19.70%) 125.00 (-483.20%) pgsteal_dma32 2338721.00 ( 0.00%) 2447354.00 ( 4.44%) 2429536.00 ( 3.74%) 2436772.00 ( 4.02%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 1469.00 ( 0.00%) 532.00 (-176.13%) 1078.00 ( -36.27%) 220.00 (-567.73%) pgscan_kswapd_dma32 4597713.00 ( 0.00%) 4503597.00 ( -2.09%) 4295673.00 ( -7.03%) 3891686.00 ( -18.14%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 71.00 ( 0.00%) 134.00 ( 47.01%) 243.00 ( 70.78%) 352.00 ( 79.83%) pgscan_direct_dma32 305820.00 ( 0.00%) 280204.00 ( -9.14%) 600518.00 ( 49.07%) 957485.00 ( 68.06%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 16296.00 ( 0.00%) 21254.00 ( 23.33%) 18447.00 ( 11.66%) 20067.00 ( 18.79%) allocstall 443.00 ( 0.00%) 273.00 ( -62.27%) 513.00 ( 13.65%) 1568.00 ( 71.75%) These are based on the raw figures taken from /proc/vmstat. It's a rough measure of reclaim activity. Note that allocstall counts are higher because we are entering direct reclaim more often as a result of not sleeping in congestion. In itself, it's not necessarily a bad thing. It's easier to get a view of what happened from the vmscan tracepoint report. 
FTrace Reclaim Statistics: vmscan traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 Direct reclaims 443 273 513 1568 Direct reclaim pages scanned 305968 280402 600825 957933 Direct reclaim pages reclaimed 43503 19005 30327 117191 Direct reclaim write file async I/O 0 0 0 0 Direct reclaim write anon async I/O 0 3 4 12 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 187649 132338 191695 267701 Kswapd wakeups 3 1 4 1 Kswapd pages scanned 4599269 4454162 4296815 3891906 Kswapd pages reclaimed 2295947 2428434 2399818 2319706 Kswapd reclaim write file async I/O 1 0 1 1 Kswapd reclaim write anon async I/O 59 187 41 222 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4.34 2.52 6.63 2.96 Time kswapd awake (seconds) 11.15 10.25 11.01 10.19 Total pages scanned 4905237 4734564 4897640 4849839 Total pages reclaimed 2339450 2447439 2430145 2436897 %age total pages scanned/reclaimed 47.69% 51.69% 49.62% 50.25% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 29.23% 19.02% 38.48% 20.25% Percentage Time kswapd Awake 78.58% 78.85% 76.83% 79.86% What is interesting here for nocongest in particular is that while direct reclaim scans more pages, the overall number of pages scanned remains the same and the ratio of pages scanned to pages reclaimed is more or less the same. In other words, while we are sleeping less, reclaim is not doing more work and as direct reclaim and kswapd is awake for less time, it would appear to be doing less work. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 87 196 64 0 Direct time congest waited 4604ms 4732ms 5420ms 0ms Direct full congest waited 72 145 53 0 Direct number conditional waited 0 0 324 1315 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 20 10 15 7 KSwapd time congest waited 1264ms 536ms 884ms 284ms KSwapd full congest waited 10 4 6 2 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 8 seconds asleep in direct reclaim and no time at all asleep with the patches. MMTests Statistics: duration User/Sys Time Running Test (seconds) 10.51 10.73 10.6 11.66 Total Elapsed Time (seconds) 14.19 13.00 14.33 12.76 Overall, the tests completed faster. It is interesting to note that backing off further when a zone is congested and not just a BDI was more efficient overall. 
PPC64 micro-mapped-file-stream pgalloc_dma 3024660.00 ( 0.00%) 3027185.00 ( 0.08%) 3025845.00 ( 0.04%) 3026281.00 ( 0.05%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 2508073.00 ( 0.00%) 2565351.00 ( 2.23%) 2463577.00 ( -1.81%) 2532263.00 ( 0.96%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 4601307.00 ( 0.00%) 4128076.00 ( -11.46%) 3912317.00 ( -17.61%) 3377165.00 ( -36.25%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 629825.00 ( 0.00%) 971622.00 ( 35.18%) 1063938.00 ( 40.80%) 1711935.00 ( 63.21%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 27776.00 ( 0.00%) 20458.00 ( -35.77%) 18763.00 ( -48.04%) 18157.00 ( -52.98%) allocstall 977.00 ( 0.00%) 2751.00 ( 64.49%) 2098.00 ( 53.43%) 5136.00 ( 80.98%) Similar trends to x86-64. allocstalls are up but it's not necessarily bad. FTrace Reclaim Statistics: vmscan Direct reclaims 977 2709 2098 5136 Direct reclaim pages scanned 629825 963814 1063938 1711935 Direct reclaim pages reclaimed 75550 242538 150904 387647 Direct reclaim write file async I/O 0 0 0 2 Direct reclaim write anon async I/O 0 10 0 4 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 392119 1201712 571935 571921 Kswapd wakeups 3 2 3 3 Kswapd pages scanned 4601307 4128076 3912317 3377165 Kswapd pages reclaimed 2432523 2318797 2312673 2144616 Kswapd reclaim write file async I/O 20 1 1 1 Kswapd reclaim write anon async I/O 57 132 11 121 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 6.19 7.30 13.04 10.88 Time kswapd awake (seconds) 21.73 26.51 25.55 23.90 Total pages scanned 5231132 5091890 4976255 5089100 Total pages reclaimed 2508073 2561335 2463577 2532263 %age total pages scanned/reclaimed 47.95% 50.30% 49.51% 49.76% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 18.89% 20.65% 32.65% 27.65% Percentage Time kswapd Awake 72.39% 80.68% 78.21% 77.40% Again, a similar trend that the congestion_wait changes mean that direct reclaim scans more pages but the overall number of pages scanned while slightly reduced, are very similar. The ratio of scanning/reclaimed remains roughly similar. The downside is that kswapd and direct reclaim was awake longer and for a larger percentage of the overall workload. It's possible there were big differences in the amount of time spent reclaiming slab pages between the different kernels which is plausible considering that the micro tests runs after fsmark and sysbench. Trace Reclaim Statistics: congestion_wait Direct number congest waited 845 1312 104 0 Direct time congest waited 19416ms 26560ms 7544ms 0ms Direct full congest waited 745 1105 72 0 Direct number conditional waited 0 0 1322 2935 Direct time conditional waited 0ms 0ms 12ms 312ms Direct full conditional waited 0 0 0 3 KSwapd number congest waited 39 102 75 63 KSwapd time congest waited 2484ms 6760ms 5756ms 3716ms KSwapd full congest waited 20 48 46 25 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 20 seconds asleep in direct reclaim and only 312ms asleep with the patches. The time kswapd spent congest waited was also reduced by a large factor. 
MMTests Statistics: duration ser/Sys Time Running Test (seconds) 26.58 28.05 26.9 28.47 Total Elapsed Time (seconds) 30.02 32.86 32.67 30.88 With all patches applies, the completion times are very similar. X86-64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 82.00 ( 0.00%) 84.00 ( 2.00%) 85.00 ( 3.00%) 85.00 ( 3.00%) Pass 2 90.00 ( 0.00%) 87.00 (-3.00%) 88.00 (-2.00%) 89.00 (-1.00%) At Rest 92.00 ( 0.00%) 90.00 (-2.00%) 90.00 (-2.00%) 91.00 (-1.00%) Success figures across the board are broadly similar. traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 1045 944 886 887 Direct reclaim pages scanned 135091 119604 109382 101019 Direct reclaim pages reclaimed 88599 47535 47863 46671 Direct reclaim write file async I/O 494 283 465 280 Direct reclaim write anon async I/O 29357 13710 16656 13462 Direct reclaim write file sync I/O 154 2 2 3 Direct reclaim write anon sync I/O 14594 571 509 561 Wake kswapd requests 7491 933 872 892 Kswapd wakeups 814 778 731 780 Kswapd pages scanned 7290822 15341158 11916436 13703442 Kswapd pages reclaimed 3587336 3142496 3094392 3187151 Kswapd reclaim write file async I/O 91975 32317 28022 29628 Kswapd reclaim write anon async I/O 1992022 789307 829745 849769 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4588.93 2467.16 2495.41 2547.07 Time kswapd awake (seconds) 2497.66 1020.16 1098.06 1176.82 Total pages scanned 7425913 15460762 12025818 13804461 Total pages reclaimed 3675935 3190031 3142255 3233822 %age total pages scanned/reclaimed 49.50% 20.63% 26.13% 23.43% %age total pages scanned/written 28.66% 5.41% 7.28% 6.47% %age file pages scanned/written 1.25% 0.21% 0.24% 0.22% Percentage Time Spent Direct Reclaim 57.33% 42.15% 42.41% 42.99% Percentage Time kswapd Awake 43.56% 27.87% 29.76% 31.25% Scanned/reclaimed ratios again look good with big improvements in efficiency. The Scanned/written ratios also look much improved. With a better scanned/written ration, there is an expectation that IO would be more efficient and indeed, the time spent in direct reclaim is much reduced by the full series and kswapd spends a little less time awake. Overall, indications here are that allocations were happening much faster and this can be seen with a graph of the latency figures as the allocations were taking place http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-hydra-mean.ps FTrace Reclaim Statistics: congestion_wait Direct number congest waited 1333 204 169 4 Direct time congest waited 78896ms 8288ms 7260ms 200ms Direct full congest waited 756 92 69 2 Direct number conditional waited 0 0 26 186 Direct time conditional waited 0ms 0ms 0ms 2504ms Direct full conditional waited 0 0 0 25 KSwapd number congest waited 4 395 227 282 KSwapd time congest waited 384ms 25136ms 10508ms 18380ms KSwapd full congest waited 3 232 98 176 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 KSwapd full conditional waited 318 0 312 9 Overall, the time spent speeping is reduced. kswapd is still hitting congestion_wait() but that is because there are callers remaining where it wasn't clear in advance if they should be changed to wait_iff_congested() or not. Overall the sleep imes are reduced though - from 79ish seconds to about 19. 
MMTests Statistics: duration User/Sys Time Running Test (seconds) 3415.43 3386.65 3388.39 3377.5 Total Elapsed Time (seconds) 5733.48 3660.33 3689.41 3765.39 With the full series, the time to complete the tests are reduced by 30% PPC64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 17.00 ( 0.00%) 34.00 (17.00%) 38.00 (21.00%) 43.00 (26.00%) Pass 2 25.00 ( 0.00%) 37.00 (12.00%) 42.00 (17.00%) 46.00 (21.00%) At Rest 49.00 ( 0.00%) 43.00 (-6.00%) 45.00 (-4.00%) 51.00 ( 2.00%) Success rates there are *way* up particularly considering that the 16MB huge pages on PPC64 mean that it's always much harder to allocate them. FTrace Reclaim Statistics: vmscan stress-highalloc stress-highalloc stress-highalloc stress-highalloc traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 499 505 564 509 Direct reclaim pages scanned 223478 41898 51818 45605 Direct reclaim pages reclaimed 137730 21148 27161 23455 Direct reclaim write file async I/O 399 136 162 136 Direct reclaim write anon async I/O 46977 2865 4686 3998 Direct reclaim write file sync I/O 29 0 1 3 Direct reclaim write anon sync I/O 31023 159 237 239 Wake kswapd requests 420 351 360 326 Kswapd wakeups 185 294 249 277 Kswapd pages scanned 15703488 16392500 17821724 17598737 Kswapd pages reclaimed 5808466 2908858 3139386 3145435 Kswapd reclaim write file async I/O 159938 18400 18717 13473 Kswapd reclaim write anon async I/O 3467554 228957 322799 234278 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 9665.35 1707.81 2374.32 1871.23 Time kswapd awake (seconds) 9401.21 1367.86 1951.75 1328.88 Total pages scanned 15926966 16434398 17873542 17644342 Total pages reclaimed 5946196 2930006 3166547 3168890 %age total pages scanned/reclaimed 37.33% 17.83% 17.72% 17.96% %age total pages scanned/written 23.27% 1.52% 1.94% 1.43% %age file pages scanned/written 1.01% 0.11% 0.11% 0.08% Percentage Time Spent Direct Reclaim 44.55% 35.10% 41.42% 36.91% Percentage Time kswapd Awake 86.71% 43.58% 52.67% 41.14% While the scanning rates are slightly up, the scanned/reclaimed and scanned/written figures are much improved. The time spent in direct reclaim and with kswapd are massively reduced, mostly by the lowlumpy patches. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 725 303 126 3 Direct time congest waited 45524ms 9180ms 5936ms 300ms Direct full congest waited 487 190 52 3 Direct number conditional waited 0 0 200 301 Direct time conditional waited 0ms 0ms 0ms 1904ms Direct full conditional waited 0 0 0 19 KSwapd number congest waited 0 2 23 4 KSwapd time congest waited 0ms 200ms 420ms 404ms KSwapd full congest waited 0 2 2 4 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Not as dramatic a story here but the time spent asleep is reduced and we can still see what wait_iff_congested is going to sleep when necessary. MMTests Statistics: duration User/Sys Time Running Test (seconds) 12028.09 3157.17 3357.79 3199.16 Total Elapsed Time (seconds) 10842.07 3138.72 3705.54 3229.85 The time to complete this test goes way down. With the full series, we are allocating over twice the number of huge pages in 30% of the time and there is a corresponding impact on the allocation latency graph available at. 
http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-powyah-mean.ps This patch: Add a trace event for shrink_inactive_list() and updates the sample postprocessing script appropriately. It can be used to determine how many pages were reclaimed and for non-lumpy reclaim where exactly the pages were reclaimed from. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../postprocess/trace-vmscan-postprocess.pl | 39 ++++++++++++----- include/trace/events/vmscan.h | 42 +++++++++++++++++++ mm/vmscan.c | 6 +++ 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 1b55146d1c8d..b3e73ddb1567 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -46,7 +46,7 @@ use constant HIGH_KSWAPD_LATENCY => 20; use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; -use constant HIGH_NR_RECLAIM => 24; +use constant HIGH_NR_RECLAIMED => 24; use constant HIGH_NR_CONTIG_DIRTY => 25; my %perprocesspid; @@ -58,11 +58,13 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); my ($total_direct_latency, $total_kswapd_latency); +my ($total_direct_nr_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); +my ($total_kswapd_nr_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -104,7 +106,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -203,8 +205,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "zid", - "lru", - "nr_scanned", "nr_reclaimed", "priority"); + "nr_scanned", "nr_reclaimed", "priority", + "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, @@ -375,6 +377,16 @@ EVENT_PROCESS: my $nr_contig_dirty = $7; $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty; + } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { + $details = $5; + if 
($details !~ /$regex_lru_shrink_inactive/o) { + print "WARNING: Failed to parse mm_vmscan_lru_shrink_inactive as expected\n"; + print " $details\n"; + print " $regex_lru_shrink_inactive/o\n"; + next; + } + my $nr_reclaimed = $4; + $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $5; if ($details !~ /$regex_writepage/o) { @@ -464,8 +476,8 @@ sub dump_stats { # Print out process activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Time"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Sync-IO", "ASync-IO", "Stalled"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Pages", "Time"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Rclmed", "Sync-IO", "ASync-IO", "Stalled"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) { @@ -475,6 +487,7 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -489,11 +502,12 @@ sub dump_stats { $index++; } - printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8.3f", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8u %8.3f", $process_pid, $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -529,8 +543,8 @@ sub dump_stats { # Print out kswapd activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages", "Pages"); + printf("%-" . $max_strlen . 
"s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Rclmed", "Sync-IO", "ASync-IO"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) { @@ -539,16 +553,18 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; $total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}; - printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8i %8u", $process_pid, $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -579,6 +595,7 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -588,6 +605,7 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -612,6 +630,7 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 370aa5a87322..ecf952192a93 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -10,6 +10,7 @@ #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u +#define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u #define RECLAIM_WB_ASYNC 0x0008u @@ -17,6 +18,7 @@ (flags) ? 
__print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ + {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" @@ -26,6 +28,12 @@ (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) +#define trace_shrink_flags(file, sync) ( \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + ) + TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -269,6 +277,40 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_inactive, + + TP_PROTO(int nid, int zid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + int priority, int reclaim_flags), + + TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, zid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->zid = zid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->priority = priority; + __entry->reclaim_flags = reclaim_flags; + ), + + TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->zid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 28b0521e4bfd..4a6dccb57586 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1358,6 +1358,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); return nr_reclaimed; } From 52bb9198668968506f9d12bf35d7f5d3f094921e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: [PATCH 035/176] writeback: account for time spent congestion_waited There is strong evidence to indicate a lot of time is being spent in congestion_wait(), some of it unnecessarily. This patch adds a tracepoint for congestion_wait to record when congestion_wait() was called, how long the timeout was for and how long it actually slept. 
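To make the measurement concrete: the pattern this change relies on is simply to sample jiffies before the sleep and then report both the requested timeout and the observed delay in microseconds. The sketch below is illustrative only and is not part of the patch (the real change is the congestion_wait() hunk in mm/backing-dev.c further down); the helper name traced_congestion_sleep is invented for the example and the wait-queue setup done by congestion_wait() is omitted for brevity.

#include <linux/jiffies.h>
#include <linux/sched.h>
#include <trace/events/writeback.h>

/* Illustrative sketch: time a sleep and feed the new tracepoint. */
static long traced_congestion_sleep(long timeout)
{
	unsigned long start = jiffies;		/* sample before sleeping */
	long ret = io_schedule_timeout(timeout);

	/* requested timeout and actual delay, both converted to usecs */
	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));
	return ret;
}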
Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 28 ++++++++++++++++++++++++++++ mm/backing-dev.c | 5 +++++ 2 files changed, 33 insertions(+) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0bb01ab2e984..d2b2654606ec 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -151,6 +151,34 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +DECLARE_EVENT_CLASS(writeback_congest_waited_template, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed), + + TP_STRUCT__entry( + __field( unsigned int, usec_timeout ) + __field( unsigned int, usec_delayed ) + ), + + TP_fast_assign( + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + ), + + TP_printk("usec_timeout=%u usec_delayed=%u", + __entry->usec_timeout, + __entry->usec_delayed) +); + +DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d420499a61..55627306abe0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -764,12 +764,17 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); From bc57e00f5e0b2480ef222c775c49552d3a930db7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: [PATCH 036/176] vmscan: synchronous lumpy reclaim should not call congestion_wait() congestion_wait() means "wait until queue congestion is cleared". However, synchronous lumpy reclaim does not need this congestion_wait() as shrink_page_list(PAGEOUT_IO_SYNC) uses wait_on_page_writeback() and it provides the necessary waiting. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Reviewed-by: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4a6dccb57586..399d54e8a82c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1340,8 +1340,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - /* * The attempt at page out may have made some * of the pages active, mark them inactive again. 
From 7d3579e8e61937cbba268ea9b218d006b6d64221 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:42 -0700 Subject: [PATCH 037/176] vmscan: narrow the scenarios in which lumpy reclaim uses synchronous reclaim shrink_page_list() can decide to give up reclaiming a page under a number of conditions such as 1. trylock_page() failure 2. page is unevictable 3. zone reclaim and page is mapped 4. PageWriteback() is true 5. page is swapbacked and swap is full 6. add_to_swap() failure 7. page is dirty and the gfp mask does not include GFP_IO, GFP_FS 8. page is pinned 9. IO queue is congested 10. pageout() started IO, but it has not finished With lumpy reclaim, failures result in entering synchronous lumpy reclaim but this can be unnecessary. In cases (2), (3), (5), (6), (7) and (8), there is no point retrying. This patch causes lumpy reclaim to abort when it is known it will fail. Case (9) is more interesting. The current behavior is: 1. start shrink_page_list(async) 2. find queue_congested() 3. skip the pageout write 4. still start shrink_page_list(sync) 5. wait on a lot of pages 6. find queue_congested() again 7. give up the pageout write again So the synchronous pass is just wasted time. However, simply skipping the page writes is also not good because, for example, a huge page allocation on x86 needs 512 pages, which can be more dirty pages than the queue congestion threshold (~=128). After this patch, pageout() behaves as follows: - If order > PAGE_ALLOC_COSTLY_ORDER, always ignore queue congestion. - If order <= PAGE_ALLOC_COSTLY_ORDER, skip the write and disable lumpy reclaim. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +- mm/vmscan.c | 120 +++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 48 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ecf952192a93..c255fcc587bf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 399d54e8a82c..d9fc2dce93af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,6 +51,12 @@ #define CREATE_TRACE_POINTS #include +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -82,7 +88,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation.
*/ - bool lumpy_reclaim_mode; + enum lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -330,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -366,8 +401,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) { + disable_lumpy_reclaim_mode(sc); return PAGE_KEEP; + } if (clear_page_dirty_for_io(page)) { int res; @@ -393,7 +430,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. 
*/ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -401,7 +439,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sync_writeback)); + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -579,7 +617,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) return PAGEREF_RECLAIM; /* @@ -643,8 +681,7 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -693,10 +730,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -750,14 +790,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -840,6 +883,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -852,6 +896,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1252,7 +1298,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (!sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1297,15 +1343,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - + set_lumpy_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 
+ ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1317,8 +1363,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); /* @@ -1336,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + nr_reclaimed = shrink_page_list(&page_list, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { @@ -1347,7 +1393,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_active = clear_active_flags(&page_list, NULL); count_vm_events(PGDEACTIVATE, nr_active); - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, sc); } local_irq_disable(); @@ -1739,21 +1786,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; -} - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1768,8 +1800,6 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { From 47185052165a4c5de0a461018238375dd982c2ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:43 -0700 Subject: [PATCH 038/176] vmscan: remove dead code in shrink_inactive_list() After synchronous lumpy reclaim, the page_list is guaranteed not to have active pages as page activation in shrink_page_list() disables lumpy reclaim. Remove the dead code. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Cc: Johannes Weiner Cc: Wu Fengguang Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d9fc2dce93af..cec8081bbd95 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1331,7 +1331,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_active; unsigned long nr_anon; unsigned long nr_file; @@ -1386,13 +1385,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again.
- */ - nr_active = clear_active_flags(&page_list, NULL); - count_vm_events(PGDEACTIVATE, nr_active); - set_lumpy_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, sc); } From 08fc468f4eaf6683bae5bdb94743a09d8630cb80 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:44 -0700 Subject: [PATCH 039/176] vmscan: isolate_lru_pages(): stop neighbour search if neighbour cannot be isolated isolate_lru_pages() does not just isolate LRU tail pages, but also isolates neighbour pages of the eviction page. The neighbour search does not stop even if neighbours cannot be isolated which is excessive as the lumpy reclaim will no longer result in a successful higher order allocation. This patch stops the PFN neighbour pages if an isolation fails and moves on to the next block. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cec8081bbd95..130ad0239f52 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1051,7 +1051,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; /* * If we don't have enough swap space, reclaiming of @@ -1059,8 +1059,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * pointless. */ if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - continue; + !PageSwapCache(cursor_page)) + break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); @@ -1071,11 +1071,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_dirty++; scan++; } else { - if (mode == ISOLATE_BOTH && - page_count(cursor_page)) - nr_lumpy_failed++; + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; From 0e093d99763eb4cea09f8ca4f1d01f34e121d10b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: [PATCH 040/176] writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone If congestion_wait() is called with no BDI congested, the caller will sleep for the full timeout and this may be an unnecessary sleep. This patch adds a wait_iff_congested() that checks congestion and only sleeps if a BDI is congested else, it calls cond_resched() to ensure the caller is not hogging the CPU longer than its quota but otherwise will not sleep. This is aimed at reducing some of the major desktop stalls reported during IO. For example, while kswapd is operating, it calls congestion_wait() but it could just have been reclaiming clean page cache pages with no congestion. Without this patch, it would sleep for a full timeout but after this patch, it'll just call schedule() if it has been on the CPU too long. Similar logic applies to direct reclaimers that are not making enough progress. 
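Reduced to its core, the decision described above is: if no BDI is congested, or the zone of interest has not recently seen congestion, just yield; otherwise sleep as congestion_wait() would. The sketch below only illustrates that flow and is not part of the patch (the real wait_iff_congested() in the mm/backing-dev.c hunk further down also computes the remaining timeout and emits a tracepoint); nr_bdi_congested and zone_is_reclaim_congested() are the counter and zone-flag helper this patch introduces.

/* Illustrative sketch of the wait_iff_congested() decision flow. */
static long wait_iff_congested_sketch(struct zone *zone, int sync, long timeout)
{
	/* Nothing congested anywhere, or this zone is not marked congested */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
	    !zone_is_reclaim_congested(zone)) {
		cond_resched();	/* give up the CPU if needed, but do not sleep */
		return timeout;	/* sketch: report "did not sleep" */
	}

	/* Heavy congestion: fall back to a real sleep on the congestion queue */
	return congestion_wait(sync, timeout);
}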
Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/mmzone.h | 8 +++++ include/trace/events/writeback.h | 7 ++++ mm/backing-dev.c | 61 ++++++++++++++++++++++++++++++-- mm/page_alloc.c | 4 +-- mm/vmscan.c | 42 ++++++++++++++++++---- 6 files changed, 112 insertions(+), 12 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 35b00746c712..f1b402a50679 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -285,7 +285,7 @@ enum { void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); - +long wait_iff_congested(struct zone *zone, int sync, long timeout); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c3c17fb675ee..39c24ebe9cfd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -423,6 +423,9 @@ struct zone { typedef enum { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ + ZONE_CONGESTED, /* zone has many dirty pages backed by + * a congested BDI + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) clear_bit(flag, &zone->flags); } +static inline int zone_is_reclaim_congested(const struct zone *zone) +{ + return test_bit(ZONE_CONGESTED, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index d2b2654606ec..89a2b2db4375 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); +DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 55627306abe0..5ad3c106606b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? 
BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout) } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a683f819439..b13bc5e5bd7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -2095,7 +2095,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 130ad0239f52..30fd658bb289 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) { - disable_lumpy_reclaim_mode(sc); + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; - } if (clear_page_dirty_for_io(page)) { int res; @@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static 
unsigned long shrink_page_list(struct list_head *page_list, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); @@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -902,6 +907,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + free_page_list(&free_pages); list_splice(&ret_pages, page_list); @@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_lumpy_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; + + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); + } } out: @@ -2282,6 +2301,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); } } From 4cbec4c8b9fda9ec784086fe7f74cd32a8adda95 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: [PATCH 041/176] writeback: remove the internal 5% low bound on dirty_ratio The dirty_ratio was silently limited in global_dirty_limits() to >= 5%. This is not a user expected behavior. And it's inconsistent with calc_period_shift(), which uses the plain vm_dirty_ratio value. Let's remove the internal bound. At the same time, fix balance_dirty_pages() to work with the dirty_thresh=0 case. 
This allows applications to proceed when dirty+writeback pages are all cleaned. And ">" fits with the name "exceeded" better than ">=" does. Neil thinks it is an aesthetic improvement as well as a functional one :) Signed-off-by: Wu Fengguang Cc: Jan Kara Proposed-by: Con Kolivas Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Neil Brown Reviewed-by: KOSAKI Motohiro Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- mm/page-writeback.c | 16 +++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a1..97d2951bd4d1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -582,7 +582,7 @@ static inline bool over_bground_thresh(void) global_dirty_limits(&background_thresh, &dirty_thresh); return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) >= background_thresh); + global_page_state(NR_UNSTABLE_NFS) > background_thresh); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4dd91f7fd39f..b840afa89761 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; @@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping, * the last resort safeguard. */ dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) - || (nr_reclaimable + nr_writeback >= dirty_thresh); + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; From 2e30244a7cc1ff09013a1238d415b4076406388e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:46 -0700 Subject: [PATCH 042/176] vmscan,tmpfs: treat used once pages on tmpfs as used once When a page has PG_referenced, shrink_page_list() discards it only if it is not dirty. This rule works fine if the backing filesystem is a regular one. PG_dirty is a good signal that the page was used recently because the flusher threads clean pages periodically. In addition, page writeback is costlier than simple page discard. However, when a page is on tmpfs this heuristic doesn't work because flusher threads don't write back tmpfs pages. Consequently tmpfs pages always rotate around the LRU at least twice and add unnecessary LRU churn. Simple tmpfs streaming I/O shouldn't cause large anonymous page swap-out. Remove this unnecessary reclaim bonus of tmpfs pages.
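In code, the resulting heuristic is just one extra condition; the sketch below restates it outside of page_check_references() for clarity (the helper name used_once_policy is invented for the example; the actual one-line change to mm/vmscan.c is in the hunk further down).

/*
 * Sketch: a referenced-but-clean page is an easy reclaim candidate only
 * when the flusher threads could actually have cleaned it, i.e. when it
 * is not swap-backed (tmpfs/shmem pages are never written back by them).
 */
static enum page_references used_once_policy(struct page *page, int referenced_page)
{
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;
	return PAGEREF_RECLAIM;
}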
Signed-off-by: KOSAKI Motohiro Cc: Hugh Dickins Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 30fd658bb289..b8a6fdc21312 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -651,7 +651,7 @@ static enum page_references page_check_references(struct page *page, } /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page) + if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; From 61ecdb801ef2cd28e32442383106d7837d76deac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:47 -0700 Subject: [PATCH 043/176] mm: strictly nested kmap_atomic() Ensure kmap_atomic() usage is strictly nested Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- crypto/async_tx/async_memcpy.c | 2 +- crypto/blkcipher.c | 2 +- drivers/block/loop.c | 4 ++-- include/linux/highmem.h | 4 ++-- kernel/power/snapshot.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c index 0ec1fb69d4ea..518c22bd9562 100644 --- a/crypto/async_tx/async_memcpy.c +++ b/crypto/async_tx/async_memcpy.c @@ -83,8 +83,8 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, memcpy(dest_buf, src_buf, len); - kunmap_atomic(dest_buf, KM_USER0); kunmap_atomic(src_buf, KM_USER1); + kunmap_atomic(dest_buf, KM_USER0); async_tx_sync_epilog(submit); } diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c index 90d26c91f4e9..7a7219266e3c 100644 --- a/crypto/blkcipher.c +++ b/crypto/blkcipher.c @@ -89,9 +89,9 @@ static inline unsigned int blkcipher_done_fast(struct blkcipher_walk *walk, memcpy(walk->dst.virt.addr, walk->page, n); blkcipher_unmap_dst(walk); } else if (!(walk->flags & BLKCIPHER_WALK_PHYS)) { - blkcipher_unmap_src(walk); if (walk->flags & BLKCIPHER_WALK_DIFF) blkcipher_unmap_dst(walk); + blkcipher_unmap_src(walk); } scatterwalk_advance(&walk->in, n); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6c48b3545f84..450c958b514f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -101,8 +101,8 @@ static int transfer_none(struct loop_device *lo, int cmd, else memcpy(raw_buf, loop_buf, size); - kunmap_atomic(raw_buf, KM_USER0); kunmap_atomic(loop_buf, KM_USER1); + kunmap_atomic(raw_buf, KM_USER0); cond_resched(); return 0; } @@ -130,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, for (i = 0; i < size; i++) *out++ = *in++ ^ key[(i & 511) % keysize]; - kunmap_atomic(raw_buf, KM_USER0); kunmap_atomic(loop_buf, KM_USER1); + kunmap_atomic(raw_buf, KM_USER0); cond_resched(); return 0; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e3060ef85b6d..283cd47bb34c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -201,8 +201,8 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_atomic(from, KM_USER0); vto = kmap_atomic(to, KM_USER1); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); + kunmap_atomic(vfrom, KM_USER0); } 
#endif @@ -214,8 +214,8 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_atomic(from, KM_USER0); vto = kmap_atomic(to, KM_USER1); copy_page(vto, vfrom); - kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); + kunmap_atomic(vfrom, KM_USER0); } #endif /* _LINUX_HIGHMEM_H */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ac7eb109f196..9e3581f4619a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) src = kmap_atomic(s_page, KM_USER0); dst = kmap_atomic(d_page, KM_USER1); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel @@ -2273,8 +2273,8 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) memcpy(buf, kaddr1, PAGE_SIZE); memcpy(kaddr1, kaddr2, PAGE_SIZE); memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); kunmap_atomic(kaddr2, KM_USER1); + kunmap_atomic(kaddr1, KM_USER0); } /** From 3e4d3af501cccdc8a8cca41bdbe57d54ad7e7e73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:51 -0700 Subject: [PATCH 044/176] mm: stack based kmap_atomic() Keep the current interface but ignore the KM_type and use a stack based approach. The advantage is that we get rid of crappy code like: #define __KM_PTE \ (in_nmi() ? KM_NMI_PTE : \ in_irq() ? KM_IRQ_PTE : \ KM_PTE0) and in general can stop worrying about what context we're in and what kmap slots might be appropriate for that. The downside is that FRV kmap_atomic() gets more expensive. For now we use a CPP trick suggested by Andrew: #define kmap_atomic(page, args...) __kmap_atomic(page) to avoid having to touch all kmap_atomic() users in a single patch. [ not compiled on: - mn10300: the arch doesn't actually build with highmem to begin with ] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix up drivers/gpu/drm/i915/intel_overlay.c] Acked-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Airlie Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/highmem.h | 6 +- arch/arm/mm/highmem.c | 23 ++++--- arch/frv/include/asm/highmem.h | 25 ++------ arch/frv/mb93090-mb00/pci-dma.c | 4 +- arch/frv/mm/cache-page.c | 8 +-- arch/frv/mm/highmem.c | 50 +++++++++++++++ arch/mips/include/asm/highmem.h | 18 ++---- arch/mips/mm/highmem.c | 50 ++++++++------- arch/mn10300/include/asm/highmem.h | 42 ++++++++----- arch/powerpc/include/asm/highmem.h | 9 ++- arch/powerpc/mm/highmem.c | 35 ++++++----- arch/sparc/include/asm/highmem.h | 4 +- arch/sparc/mm/highmem.c | 48 ++++++++------- arch/tile/include/asm/highmem.h | 10 +-- arch/tile/mm/highmem.c | 85 +++++++------------------- arch/x86/include/asm/highmem.h | 11 ++-- arch/x86/include/asm/iomap.h | 4 +- arch/x86/kernel/crash_dump_32.c | 2 +- arch/x86/mm/highmem_32.c | 83 +++++++++++++------------ arch/x86/mm/iomap_32.c | 42 +++++++------ drivers/gpu/drm/i915/i915_gem.c | 25 ++++---- drivers/gpu/drm/i915/i915_irq.c | 5 +- drivers/gpu/drm/i915/intel_overlay.c | 5 +- drivers/gpu/drm/nouveau/nouveau_bios.c | 8 +-- drivers/gpu/drm/ttm/ttm_bo_util.c | 8 +-- include/linux/highmem.h | 61 +++++++++++------- include/linux/io-mapping.h | 14 ++--- mm/highmem.c | 62 ++----------------- 28 files changed, 371 insertions(+), 376 deletions(-) diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 5aff58126602..1fc684e70ab6 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -35,9 +35,9 @@ extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte); #ifdef CONFIG_HIGHMEM extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); +extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(const void *ptr); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 1fbdb55bfd1b..c00f119babbf 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -36,18 +36,17 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { unsigned int idx; unsigned long vaddr; void *kmap; + int type; pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -61,6 +60,8 @@ void *kmap_atomic(struct page *page, enum km_type type) if (kmap) return kmap; + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -80,14 +81,17 @@ void *kmap_atomic(struct page *page, enum km_type type) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - unsigned int idx = type + KM_TYPE_NR * smp_processor_id(); + int idx, type; if (kvaddr >= (void *)FIXADDR_START) { + type = 
kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM @@ -103,15 +107,16 @@ void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) } pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - unsigned int idx; unsigned long vaddr; + int idx, type; pagefault_disable(); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index cb4c317eaecc..a8d6565d415d 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -112,12 +112,11 @@ extern struct page *kmap_atomic_to_page(void *ptr); (void *) damlr; \ }) -static inline void *kmap_atomic(struct page *page, enum km_type type) +static inline void *kmap_atomic_primary(struct page *page, enum km_type type) { unsigned long paddr; pagefault_disable(); - debug_kmap_atomic(type); paddr = page_to_phys(page); switch (type) { @@ -125,14 +124,6 @@ static inline void *kmap_atomic(struct page *page, enum km_type type) case 1: return __kmap_atomic_primary(1, paddr, 3); case 2: return __kmap_atomic_primary(2, paddr, 4); case 3: return __kmap_atomic_primary(3, paddr, 5); - case 4: return __kmap_atomic_primary(4, paddr, 6); - case 5: return __kmap_atomic_primary(5, paddr, 7); - case 6: return __kmap_atomic_primary(6, paddr, 8); - case 7: return __kmap_atomic_primary(7, paddr, 9); - case 8: return __kmap_atomic_primary(8, paddr, 10); - - case 9 ... 9 + NR_TLB_LINES - 1: - return __kmap_atomic_secondary(type - 9, paddr); default: BUG(); @@ -152,22 +143,13 @@ do { \ asm volatile("tlbpr %0,gr0,#4,#1" : : "r"(vaddr) : "memory"); \ } while(0) -static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +static inline void kunmap_atomic_primary(void *kvaddr, enum km_type type) { switch (type) { case 0: __kunmap_atomic_primary(0, 2); break; case 1: __kunmap_atomic_primary(1, 3); break; case 2: __kunmap_atomic_primary(2, 4); break; case 3: __kunmap_atomic_primary(3, 5); break; - case 4: __kunmap_atomic_primary(4, 6); break; - case 5: __kunmap_atomic_primary(5, 7); break; - case 6: __kunmap_atomic_primary(6, 8); break; - case 7: __kunmap_atomic_primary(7, 9); break; - case 8: __kunmap_atomic_primary(8, 10); break; - - case 9 ... 
9 + NR_TLB_LINES - 1: - __kunmap_atomic_secondary(type - 9, kvaddr); - break; default: BUG(); @@ -175,6 +157,9 @@ static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) pagefault_enable(); } +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 85d110b71cf7..41098a3803a2 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -61,14 +61,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, dampr2 = __get_DAMPR(2); for (i = 0; i < nents; i++) { - vaddr = kmap_atomic(sg_page(&sg[i]), __KM_CACHE); + vaddr = kmap_atomic_primary(sg_page(&sg[i]), __KM_CACHE); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); } - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); __set_IAMPR(2, dampr2); diff --git a/arch/frv/mm/cache-page.c b/arch/frv/mm/cache-page.c index 0261cbe153b5..b24ade27a0f0 100644 --- a/arch/frv/mm/cache-page.c +++ b/arch/frv/mm/cache-page.c @@ -26,11 +26,11 @@ void flush_dcache_page(struct page *page) dampr2 = __get_DAMPR(2); - vaddr = kmap_atomic(page, __KM_CACHE); + vaddr = kmap_atomic_primary(page, __KM_CACHE); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); @@ -54,12 +54,12 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, dampr2 = __get_DAMPR(2); - vaddr = kmap_atomic(page, __KM_CACHE); + vaddr = kmap_atomic_primary(page, __KM_CACHE); start = (start & ~PAGE_MASK) | (unsigned long) vaddr; frv_cache_wback_inv(start, start + len); - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index eadd07658075..61088dcc1594 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -36,3 +36,53 @@ struct page *kmap_atomic_to_page(void *ptr) { return virt_to_page(ptr); } + +void *__kmap_atomic(struct page *page) +{ + unsigned long paddr; + int type; + + pagefault_disable(); + type = kmap_atomic_idx_push(); + paddr = page_to_phys(page); + + switch (type) { + /* + * The first 4 primary maps are reserved for architecture code + */ + case 0: return __kmap_atomic_primary(4, paddr, 6); + case 1: return __kmap_atomic_primary(5, paddr, 7); + case 2: return __kmap_atomic_primary(6, paddr, 8); + case 3: return __kmap_atomic_primary(7, paddr, 9); + case 4: return __kmap_atomic_primary(8, paddr, 10); + + case 5 ... 5 + NR_TLB_LINES - 1: + return __kmap_atomic_secondary(type - 5, paddr); + + default: + BUG(); + return NULL; + } +} +EXPORT_SYMBOL(__kmap_atomic); + +void __kunmap_atomic(void *kvaddr) +{ + int type = kmap_atomic_idx_pop(); + switch (type) { + case 0: __kunmap_atomic_primary(4, 6); break; + case 1: __kunmap_atomic_primary(5, 7); break; + case 2: __kunmap_atomic_primary(6, 8); break; + case 3: __kunmap_atomic_primary(7, 9); break; + case 4: __kunmap_atomic_primary(8, 10); break; + + case 5 ... 
5 + NR_TLB_LINES - 1: + __kunmap_atomic_secondary(type - 5, kvaddr); + break; + + default: + BUG(); + } + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 75753ca73bfd..77e644082a3b 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -45,18 +45,12 @@ extern pte_t *pkmap_page_table; extern void * kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *__kmap(struct page *page); -extern void __kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page, enum km_type type); -extern void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -extern struct page *__kmap_atomic_to_page(void *ptr); - -#define kmap __kmap -#define kunmap __kunmap -#define kmap_atomic __kmap_atomic -#define kunmap_atomic_notypecheck __kunmap_atomic_notypecheck -#define kmap_atomic_to_page __kmap_atomic_to_page +extern void *kmap(struct page *page); +extern void kunmap(struct page *page); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); +extern void *kmap_atomic_pfn(unsigned long pfn); +extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 6a2b1bf9ef11..1e69b1fb4b85 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -9,7 +9,7 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *__kmap(struct page *page) +void *kmap(struct page *page) { void *addr; @@ -21,16 +21,16 @@ void *__kmap(struct page *page) return addr; } -EXPORT_SYMBOL(__kmap); +EXPORT_SYMBOL(kmap); -void __kunmap(struct page *page) +void kunmap(struct page *page) { BUG_ON(in_interrupt()); if (!PageHighMem(page)) return; kunmap_high(page); } -EXPORT_SYMBOL(__kunmap); +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -41,17 +41,17 @@ EXPORT_SYMBOL(__kunmap); * kmaps are appropriate for short, tight code paths only. 
*/ -void *__kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -64,43 +64,47 @@ void *__kmap_atomic(struct page *page, enum km_type type) } EXPORT_SYMBOL(__kmap_atomic); -void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + type = kmap_atomic_idx_pop(); +#ifdef CONFIG_DEBUG_HIGHMEM + { + int idx = type + KM_TYPE_NR * smp_processor_id(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_one(vaddr); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_one(vaddr); + } #endif - pagefault_enable(); } -EXPORT_SYMBOL(__kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); @@ -109,7 +113,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) return (void*) vaddr; } -struct page *__kmap_atomic_to_page(void *ptr) +struct page *kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index b0b187a29b88..f577ba2268ca 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -70,15 +70,16 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -static inline unsigned long kmap_atomic(struct page *page, enum km_type type) +static inline unsigned long __kmap_atomic(struct page *page) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; + pagefault_disable(); if (page < highmem_start_page) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #if HIGHMEM_DEBUG @@ -91,26 +92,35 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type) return vaddr; } -static inline void kunmap_atomic_notypecheck(unsigned long vaddr, enum km_type type) +static inline void __kunmap_atomic(unsigned long vaddr) { -#if HIGHMEM_DEBUG - enum fixed_addresses idx = type + KM_TYPE_NR * smp_processor_id(); + int type; - if (vaddr < FIXADDR_START) /* FIXME */ + if (vaddr < FIXADDR_START) { /* FIXME */ + pagefault_enable(); return; + } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)) - BUG(); + type = kmap_atomic_idx_pop(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(kmap_pte - idx); - __flush_tlb_one(vaddr); +#if HIGHMEM_DEBUG + { + unsigned int idx; + idx = type + KM_TYPE_NR * smp_processor_id(); + + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)) + BUG(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte - idx); + __flush_tlb_one(vaddr); + } #endif + pagefault_enable(); } - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index d10d64a4be38..dbc264010d0b 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -60,9 +60,8 @@ extern pte_t *pkmap_page_table; extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, enum km_type type, - pgprot_t prot); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); +extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void __kunmap_atomic(void *kvaddr); static inline void *kmap(struct page *page) { @@ -80,9 +79,9 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -static inline void *kmap_atomic(struct page *page, enum km_type type) +static inline void *__kmap_atomic(struct page *page) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot(page, kmap_prot); } static inline struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 857d4173f9c6..b0848b462bbc 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -29,17 +29,17 @@ * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - unsigned int idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -52,26 +52,33 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + type = kmap_atomic_idx_pop(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned int idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_page(NULL, vaddr); + } #endif pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index ec23b0a87b98..3d7afbb7f4bb 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -70,8 +70,8 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); extern struct page *kmap_atomic_to_page(void *vaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index e139e9cbf5f7..5e50c09b7dce 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -29,17 +29,17 @@ #include #include -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - unsigned long idx; unsigned long vaddr; + long idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -63,44 +63,50 @@ void *kmap_atomic(struct page *page, enum km_type type) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - unsigned long idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + type = 
kmap_atomic_idx_pop(); -/* XXX Fix - Anton */ +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned long idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + + /* XXX Fix - Anton */ #if 0 - __flush_cache_one(vaddr); + __flush_cache_one(vaddr); #else - flush_cache_all(); + flush_cache_all(); #endif - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); -/* XXX Fix - Anton */ + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + /* XXX Fix - Anton */ #if 0 - __flush_tlb_one(vaddr); + __flush_tlb_one(vaddr); #else - flush_tlb_all(); + flush_tlb_all(); #endif + } #endif - pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* We may be fed a pagetable here by ptep_to_xxx and others. */ struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h index d155db6fa9bd..e0f7ee186721 100644 --- a/arch/tile/include/asm/highmem.h +++ b/arch/tile/include/asm/highmem.h @@ -60,12 +60,12 @@ void *kmap_fix_kpte(struct page *page, int finished); /* This macro is used only in map_new_virtual() to map "page". */ #define kmap_prot page_to_kpgprot(page) -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); +void *kmap_atomic_prot(struct page *page, pgprot_t prot); void kmap_atomic_fix_kpte(struct page *page, int finished); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 12ab137e7d4f..8ef6595e162c 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -56,50 +56,6 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && - /* type != KM_BIO_DST_IRQ && */ - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * Describe a single atomic mapping of a page on a given cpu at a * given address, and allow it to be linked into a list. 
@@ -240,10 +196,10 @@ void kmap_atomic_fix_kpte(struct page *page, int finished) * When holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pte_t *pte; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ @@ -255,8 +211,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); pte = kmap_get_pte(vaddr); @@ -269,25 +224,31 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { /* PAGE_NONE is a magic value that tells us to check immutability. */ return kmap_atomic_prot(page, type, PAGE_NONE); } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they try to access this pte without - * first remapping it. Keeping stale mappings around is a bad idea. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) { + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { pte_t *pte = kmap_get_pte(vaddr); pte_t pteval = *pte; + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR*smp_processor_id(); + + /* + * Force other mappings to Oops if they try to access this pte + * without first remapping it. Keeping stale mappings around + * is a bad idea. + */ BUG_ON(!pte_present(pteval) && !pte_migrating(pteval)); kmap_atomic_unregister(pte_page(pteval), vaddr); kpte_clear_flush(pte, vaddr); @@ -300,19 +261,19 @@ void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) arch_flush_lazy_mmu_mode(); pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* * This API is supposed to allow us to map memory without a "struct page". * Currently we don't support this, though this may change in the future. 
*/ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic(pfn_to_page(pfn), type); + return kmap_atomic(pfn_to_page(pfn)); } -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - return kmap_atomic_prot(pfn_to_page(pfn), type, prot); + return kmap_atomic_prot(pfn_to_page(pfn), prot); } struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 8caac76ac324..3bd04022fd0c 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c4191b3b7056..363e33eb6ec1 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -27,10 +27,10 @@ #include void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef861..d723e369003c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,56 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - return kmap_atomic_prot(page, type, kmap_prot); -} - -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) - kpte_clear_flush(kmap_pte-idx, vaddr); - else { -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); -#endif - } - - pagefault_enable(); + return kmap_atomic_prot(page, kmap_prot); } +EXPORT_SYMBOL(__kmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ + kpte_clear_flush(kmap_pte-idx, vaddr); + } +#ifdef CONFIG_DEBUG_HIGHMEM + else { + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); + } +#endif + + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,12 +111,6 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..75a3d7f24a2c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) } /* - * Map 'pfn' using fixed map 'type' and protections 'prot' + * Map 'pfn' using protections 'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. @@ -86,24 +85,33 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); + } pagefault_enable(); } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 90b1d6753b9d..eb6c473c6d1b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -155,11 +155,11 @@ fast_shmem_read(struct page **pages, char __iomem *vaddr; int unwritten; - vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0); + vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT]); if (vaddr == NULL) return -ENOMEM; unwritten = __copy_to_user_inatomic(data, vaddr + page_offset, length); - kunmap_atomic(vaddr, KM_USER0); + kunmap_atomic(vaddr); if (unwritten) return -EFAULT; @@ -509,10 +509,10 @@ fast_user_write(struct io_mapping *mapping, char *vaddr_atomic; unsigned long unwritten; - vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base, KM_USER0); + vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, user_data, length); - io_mapping_unmap_atomic(vaddr_atomic, KM_USER0); + io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; return 0; @@ -551,11 +551,11 @@ fast_shmem_write(struct page **pages, char __iomem *vaddr; unsigned long unwritten; - vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0); + vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT]); if (vaddr == NULL) return -ENOMEM; unwritten = __copy_from_user_inatomic(vaddr + page_offset, data, length); - kunmap_atomic(vaddr, KM_USER0); + kunmap_atomic(vaddr); if (unwritten) return -EFAULT; @@ -3346,8 +3346,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, reloc_offset = obj_priv->gtt_offset + reloc->offset; reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, (reloc_offset & - ~(PAGE_SIZE - 1)), - KM_USER0); + ~(PAGE_SIZE - 1))); reloc_entry = (uint32_t __iomem *)(reloc_page + (reloc_offset & (PAGE_SIZE - 1))); reloc_val = target_obj_priv->gtt_offset + reloc->delta; @@ -3358,7 +3357,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, readl(reloc_entry), reloc_val); #endif writel(reloc_val, reloc_entry); - io_mapping_unmap_atomic(reloc_page, KM_USER0); + io_mapping_unmap_atomic(reloc_page); /* The updated presumed offset for this entry will be * copied back out to the user. 
@@ -4772,11 +4771,11 @@ void i915_gem_detach_phys_object(struct drm_device *dev, page_count = obj->size / PAGE_SIZE; for (i = 0; i < page_count; i++) { - char *dst = kmap_atomic(obj_priv->pages[i], KM_USER0); + char *dst = kmap_atomic(obj_priv->pages[i]); char *src = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE); memcpy(dst, src, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } drm_clflush_pages(obj_priv->pages, page_count); drm_agp_chipset_flush(dev); @@ -4833,11 +4832,11 @@ i915_gem_attach_phys_object(struct drm_device *dev, page_count = obj->size / PAGE_SIZE; for (i = 0; i < page_count; i++) { - char *src = kmap_atomic(obj_priv->pages[i], KM_USER0); + char *src = kmap_atomic(obj_priv->pages[i]); char *dst = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE); memcpy(dst, src, PAGE_SIZE); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); } i915_gem_object_put_pages(obj); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 744225ebb4b2..b80010f0c4c9 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -456,10 +456,9 @@ i915_error_object_create(struct drm_device *dev, local_irq_save(flags); s = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, - reloc_offset, - KM_IRQ0); + reloc_offset); memcpy_fromio(d, s, PAGE_SIZE); - io_mapping_unmap_atomic(s, KM_IRQ0); + io_mapping_unmap_atomic(s); local_irq_restore(flags); dst->pages[page] = d; diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 1d306a458be6..3264bbd47e65 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -187,8 +187,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over if (OVERLAY_NONPHYSICAL(overlay->dev)) { regs = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, - overlay->reg_bo->gtt_offset, - KM_USER0); + overlay->reg_bo->gtt_offset); if (!regs) { DRM_ERROR("failed to map overlay regs in GTT\n"); @@ -203,7 +202,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over static void intel_overlay_unmap_regs_atomic(struct intel_overlay *overlay) { if (OVERLAY_NONPHYSICAL(overlay->dev)) - io_mapping_unmap_atomic(overlay->virt_addr, KM_USER0); + io_mapping_unmap_atomic(overlay->virt_addr); overlay->virt_addr = NULL; diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 974b0f8ae048..8fa339600fe3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -2167,11 +2167,11 @@ peek_fb(struct drm_device *dev, struct io_mapping *fb, if (off < pci_resource_len(dev->pdev, 1)) { uint8_t __iomem *p = - io_mapping_map_atomic_wc(fb, off & PAGE_MASK, KM_USER0); + io_mapping_map_atomic_wc(fb, off & PAGE_MASK); val = ioread32(p + (off & ~PAGE_MASK)); - io_mapping_unmap_atomic(p, KM_USER0); + io_mapping_unmap_atomic(p); } return val; @@ -2183,12 +2183,12 @@ poke_fb(struct drm_device *dev, struct io_mapping *fb, { if (off < pci_resource_len(dev->pdev, 1)) { uint8_t __iomem *p = - io_mapping_map_atomic_wc(fb, off & PAGE_MASK, KM_USER0); + io_mapping_map_atomic_wc(fb, off & PAGE_MASK); iowrite32(val, p + (off & ~PAGE_MASK)); wmb(); - io_mapping_unmap_atomic(p, KM_USER0); + io_mapping_unmap_atomic(p); } } diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 3451a82adba7..e8a73e65da69 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -170,7 +170,7 @@ 
static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); #ifdef CONFIG_X86 - dst = kmap_atomic_prot(d, KM_USER0, prot); + dst = kmap_atomic_prot(d, prot); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) dst = vmap(&d, 1, 0, prot); @@ -183,7 +183,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, memcpy_fromio(dst, src, PAGE_SIZE); #ifdef CONFIG_X86 - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(dst); @@ -206,7 +206,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); #ifdef CONFIG_X86 - src = kmap_atomic_prot(s, KM_USER0, prot); + src = kmap_atomic_prot(s, prot); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) src = vmap(&s, 1, 0, prot); @@ -219,7 +219,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, memcpy_toio(dst, src, PAGE_SIZE); #ifdef CONFIG_X86 - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(src); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 283cd47bb34c..8a85ec109a3a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -28,18 +28,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type); - -#else - -static inline void debug_kmap_atomic(enum km_type type) -{ -} - -#endif - #ifdef CONFIG_HIGHMEM #include @@ -49,6 +37,27 @@ extern unsigned long totalhigh_pages; void kmap_flush_unused(void); +DECLARE_PER_CPU(int, __kmap_atomic_idx); + +static inline int kmap_atomic_idx_push(void) +{ + int idx = __get_cpu_var(__kmap_atomic_idx)++; +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx > KM_TYPE_NR); +#endif + return idx; +} + +static inline int kmap_atomic_idx_pop(void) +{ + int idx = --__get_cpu_var(__kmap_atomic_idx); +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(idx < 0); +#endif + return idx; +} + #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } @@ -66,19 +75,19 @@ static inline void kunmap(struct page *page) { } -static inline void *kmap_atomic(struct page *page, enum km_type idx) +static inline void *__kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, idx, prot) kmap_atomic(page, idx) +#define kmap_atomic_prot(page, prot) __kmap_atomic(page) -static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) +static inline void __kunmap_atomic(void *addr) { pagefault_enable(); } -#define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx)) +#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) #define kmap_flush_unused() do {} while(0) @@ -86,12 +95,20 @@ static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) #endif /* CONFIG_HIGHMEM */ -/* Prevent people trying to call kunmap_atomic() as if it were kunmap() */ -/* kunmap_atomic() should get the return value of kmap_atomic, not the page. */ -#define kunmap_atomic(addr, idx) do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_notypecheck((addr), (idx)); \ - } while (0) +/* + * Make both: kmap_atomic(page, idx) and kmap_atomic(page) work. + */ +#define kmap_atomic(page, args...) 
__kmap_atomic(page) + +/* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +#define kunmap_atomic(addr, args...) \ +do { \ + BUILD_BUG_ON(__same_type((addr), struct page *)); \ + __kunmap_atomic(addr); \ +} while (0) /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7fb592793738..8cdcc2a199ad 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -81,8 +81,7 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset, - int slot) + unsigned long offset) { resource_size_t phys_addr; unsigned long pfn; @@ -90,13 +89,13 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; pfn = (unsigned long) (phys_addr >> PAGE_SHIFT); - return iomap_atomic_prot_pfn(pfn, slot, mapping->prot); + return iomap_atomic_prot_pfn(pfn, mapping->prot); } static inline void -io_mapping_unmap_atomic(void __iomem *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr) { - iounmap_atomic(vaddr, slot); + iounmap_atomic(vaddr); } static inline void __iomem * @@ -137,14 +136,13 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset, - int slot) + unsigned long offset) { return ((char __force __iomem *) mapping) + offset; } static inline void -io_mapping_unmap_atomic(void __iomem *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr) { } diff --git a/mm/highmem.c b/mm/highmem.c index 7a0aa1be4993..781e754a75ac 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -42,6 +42,10 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +DEFINE_PER_CPU(int, __kmap_atomic_idx); +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -422,61 +426,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#ifdef CONFIG_KGDB_KDB - if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { - 
WARN_ON(1); - warn_count--; - } -#endif /* CONFIG_KGDB_KDB */ -} - -#endif From ece0e2b6406a995c371e0311190631ea34ad851a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:52 -0700 Subject: [PATCH 045/176] mm: remove pte_*map_nested() Since we no longer need to provide KM_type, the whole pte_*map_nested() API is now redundant, remove it. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 2 -- arch/arm/include/asm/pgtable.h | 14 ++++++-------- arch/arm/mm/fault-armv.c | 4 ++-- arch/arm/mm/pgd.c | 4 ++-- arch/avr32/include/asm/pgtable.h | 2 -- arch/cris/include/asm/pgtable.h | 2 -- arch/frv/include/asm/pgtable.h | 9 ++------- arch/ia64/include/asm/pgtable.h | 2 -- arch/m32r/include/asm/pgtable.h | 2 -- arch/m68k/include/asm/motorola_pgtable.h | 2 -- arch/m68k/include/asm/sun3_pgtable.h | 2 -- arch/microblaze/include/asm/pgtable.h | 7 ++----- arch/mips/include/asm/pgtable-32.h | 3 --- arch/mips/include/asm/pgtable-64.h | 3 --- arch/mn10300/include/asm/pgtable.h | 2 -- arch/parisc/include/asm/pgtable.h | 2 -- arch/powerpc/include/asm/pgtable-ppc32.h | 8 ++------ arch/powerpc/include/asm/pgtable-ppc64.h | 2 -- arch/s390/include/asm/pgtable.h | 2 -- arch/score/include/asm/pgtable.h | 3 --- arch/sh/include/asm/pgtable_32.h | 3 --- arch/sh/include/asm/pgtable_64.h | 2 -- arch/sparc/include/asm/pgtable_32.h | 3 --- arch/sparc/include/asm/pgtable_64.h | 2 -- arch/tile/include/asm/pgtable.h | 5 ----- arch/um/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_32.h | 14 ++------------ arch/x86/include/asm/pgtable_64.h | 2 -- arch/xtensa/include/asm/pgtable.h | 3 --- mm/memory.c | 4 ++-- mm/mremap.c | 4 ++-- 31 files changed, 22 insertions(+), 99 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 71a243294142..de98a732683d 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -318,9 +318,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) } #define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir),(addr)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) extern pgd_t swapper_pg_dir[1024]; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index a9672e8406a3..b155414192da 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -263,17 +263,15 @@ extern struct page *empty_zero_page; #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) #define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + __pte_index(addr)) -#define pte_offset_map(dir,addr) (__pte_map(dir, KM_PTE0) + __pte_index(addr)) -#define pte_offset_map_nested(dir,addr) (__pte_map(dir, KM_PTE1) + __pte_index(addr)) -#define pte_unmap(pte) __pte_unmap(pte, KM_PTE0) -#define pte_unmap_nested(pte) __pte_unmap(pte, KM_PTE1) +#define pte_offset_map(dir,addr) (__pte_map(dir) + __pte_index(addr)) +#define pte_unmap(pte) __pte_unmap(pte) #ifndef CONFIG_HIGHPTE -#define __pte_map(dir,km) pmd_page_vaddr(*(dir)) -#define __pte_unmap(pte,km) do { } while (0) +#define __pte_map(dir) pmd_page_vaddr(*(dir)) +#define __pte_unmap(pte) do { } 
while (0) #else -#define __pte_map(dir,km) ((pte_t *)kmap_atomic(pmd_page(*(dir)), km) + PTRS_PER_PTE) -#define __pte_unmap(pte,km) kunmap_atomic((pte - PTRS_PER_PTE), km) +#define __pte_map(dir) ((pte_t *)kmap_atomic(pmd_page(*(dir))) + PTRS_PER_PTE) +#define __pte_unmap(pte) kunmap_atomic((pte - PTRS_PER_PTE)) #endif #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 8440d952ba6d..c493d7244d3d 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -89,13 +89,13 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, * open-code the spin-locking. */ ptl = pte_lockptr(vma->vm_mm, pmd); - pte = pte_offset_map_nested(pmd, address); + pte = pte_offset_map(pmd, address); spin_lock(ptl); ret = do_adjust_pte(vma, address, pfn, pte); spin_unlock(ptl); - pte_unmap_nested(pte); + pte_unmap(pte); return ret; } diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index be5f58e153bf..69bbfc6645a6 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -57,9 +57,9 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) goto no_pte; init_pmd = pmd_offset(init_pgd, 0); - init_pte = pte_offset_map_nested(init_pmd, 0); + init_pte = pte_offset_map(init_pmd, 0); set_pte_ext(new_pte, *init_pte, 0); - pte_unmap_nested(init_pte); + pte_unmap(init_pte); pte_unmap(new_pte); } diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index a9ae30c41e74..6fbfea61f7bb 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h @@ -319,9 +319,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) struct vm_area_struct; extern void update_mmu_cache(struct vm_area_struct * vma, diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index f63d6fccbc6c..9eaae217b21b 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -248,10 +248,8 @@ static inline pgd_t * pgd_offset(const struct mm_struct *mm, unsigned long addre ((pte_t *) pmd_page_vaddr(*(dir)) + __pte_offset(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_pfn(x) ((unsigned long)(__va((x).pte)) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index c18b0d32e636..6bc241e4b4f8 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -451,17 +451,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + pte_index(address)) 
+#define pte_unmap(pte) kunmap_atomic(pte) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index c3286f42e501..1a97af31ef17 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -406,9 +406,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir,addr) ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel(dir, addr) -#define pte_offset_map_nested(dir,addr) pte_offset_map(dir, addr) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* atomic versions of the some PTE manipulations: */ diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index e6359c566b50..8a28cfea2729 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h @@ -332,9 +332,7 @@ static inline void pmd_set(pmd_t * pmdp, pte_t * ptep) ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 2) & 0x1f) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 8e9a8a754dde..45bd3f589bf0 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -221,9 +221,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmdp, unsigned long address) } #define pte_offset_map(pmdp,address) ((pte_t *)__pmd_page(*pmdp) + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) -#define pte_offset_map_nested(pmdp, address) pte_offset_map(pmdp, address) #define pte_unmap(pte) ((void)0) -#define pte_unmap_nested(pte) ((void)0) /* * Allocate and free page tables. The xxx_kernel() versions are diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index f847ec732d62..cf5fad9b5250 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -219,9 +219,7 @@ static inline pte_t pgoff_to_pte(unsigned off) #define pte_offset_kernel(pmd, address) ((pte_t *) __pmd_page(*pmd) + pte_index(address)) /* FIXME: should we bother with kmap() here? */ #define pte_offset_map(pmd, address) ((pte_t *)kmap(pmd_page(*pmd)) + pte_index(address)) -#define pte_offset_map_nested(pmd, address) pte_offset_map(pmd, address) #define pte_unmap(pte) kunmap(pte) -#define pte_unmap_nested(pte) kunmap(pte) /* Macros to (de)construct the fake PTEs representing swap pages. 
*/ #define __swp_type(x) ((x).val & 0x7F) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index d4f421672d3b..cae268c22ba2 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -504,12 +504,9 @@ static inline pmd_t *pmd_offset(pgd_t *dir, unsigned long address) #define pte_offset_kernel(dir, addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(addr)) #define pte_offset_map(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE0) + pte_index(addr)) -#define pte_offset_map_nested(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE1) + pte_index(addr)) + ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) +#define pte_unmap(pte) kunmap_atomic(pte) /* Encode and decode a nonlinear file mapping entry */ #define PTE_FILE_MAX_BITS 29 diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index ae90412556d0..8a153d2fa62a 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -154,10 +154,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) #if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 1be4b0fa30da..f00896087dda 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -257,10 +257,7 @@ static inline pmd_t *pmd_offset(pud_t * pud, unsigned long address) ((pte_t *) pmd_page_vaddr(*(dir)) + __pte_offset(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) /* * Initialize a new pgd / pmd table with invalid pointers. 
diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 16d88577f3e0..b049a8bd1577 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h @@ -457,9 +457,7 @@ static inline int set_kernel_exec(unsigned long vaddr, int enable) #define pte_offset_map(dir, address) \ ((pte_t *) page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do {} while (0) -#define pte_unmap_nested(pte) do {} while (0) /* * The MN10300 has external MMU info in the form of a TLB: this is adapted from diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 01c15035e783..865f37a8a881 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -397,9 +397,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(pmd, address) \ ((pte_t *) pmd_page_vaddr(*(pmd)) + pte_index(address)) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index a7db96f2b5c3..47edde8c3556 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -308,12 +308,8 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) #define pte_offset_kernel(dir, addr) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE0) + pte_index(addr)) -#define pte_offset_map_nested(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE1) + pte_index(addr)) - -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr)) +#define pte_unmap(pte) kunmap_atomic(pte) /* * Encode and decode a swap entry. 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 49865045d56f..2b09cd522d33 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -193,9 +193,7 @@ (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) -#define pte_unmap_nested(pte) do { } while(0) /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 986dc9476c21..02ace3491c51 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1094,9 +1094,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) #define pte_offset_kernel(pmd, address) pte_offset(pmd,address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* * 31 bit swap entry format: diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index ccf38f06c57d..2fd469807683 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h @@ -88,10 +88,7 @@ static inline void pmd_clear(pmd_t *pmdp) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) /* * Bits 9(_PAGE_PRESENT) and 10(_PAGE_FILE)are taken, diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index e172d696e52b..69fdfbf14ea5 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -429,10 +429,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address) - #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #ifdef CONFIG_X2TLB #define pte_ERROR(e) \ diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h index 0ee46776dad6..10a48111226d 100644 --- a/arch/sh/include/asm/pgtable_64.h +++ b/arch/sh/include/asm/pgtable_64.h @@ -84,9 +84,7 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) ((pte_t *) ((pmd_val(*(dir))) & PAGE_MASK) + pte_index((addr))) #define pte_offset_map(dir,addr) pte_offset_kernel(dir, addr) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel(dir, addr) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #ifndef __ASSEMBLY__ #define IOBASE_VADDR 0xff000000 diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 0ece77f47753..303bd4dc8292 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -304,10 +304,7 @@ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) * and 
sun4c is guaranteed to have no highmem anyway. */ #define pte_offset_map(d, a) pte_offset_kernel(d,a) -#define pte_offset_map_nested(d, a) pte_offset_kernel(d,a) - #define pte_unmap(pte) do{}while(0) -#define pte_unmap_nested(pte) do{}while(0) /* Certain architectures need to do special things when pte's * within a page table are directly modified. Thus, the following diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f5b5fa76c02d..f8dddb7045bb 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -652,9 +652,7 @@ static inline int pte_special(pte_t pte) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_kernel pte_index #define pte_offset_map pte_index -#define pte_offset_map_nested pte_index #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* Actual page table PTE updates. */ extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig); diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index b3367379d537..dc4ccdd855bc 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -347,15 +347,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern pte_t *_pte_offset_map(pmd_t *, unsigned long address, enum km_type); #define pte_offset_map(dir, address) \ _pte_offset_map(dir, address, KM_PTE0) -#define pte_offset_map_nested(dir, address) \ - _pte_offset_map(dir, address, KM_PTE1) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a non-executable kernel PTE and flush it from the TLB. */ diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index a9f7251b4a8d..41474fb5eee7 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -338,9 +338,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 8abde9ec90bf..0c92113c4cb6 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #endif #if defined(CONFIG_HIGHPTE) -#define __KM_PTE \ - (in_nmi() ? KM_NMI_PTE : \ - in_irq() ? 
KM_IRQ_PTE : \ - KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f96ac9bedf75..f86da20347f2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -127,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 76bf35554117..b03c043ce75b 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -324,10 +324,7 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define pte_offset_kernel(dir,addr) \ ((pte_t*) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir),(addr)) - #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* diff --git a/mm/memory.c b/mm/memory.c index af82741caaa4..92cc54e94137 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -736,7 +736,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; @@ -767,7 +767,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(orig_src_pte); + pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); diff --git a/mm/mremap.c b/mm/mremap.c index cde56ee51ef7..563fbdd6293a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. 
*/ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); From 7a837d1bb7cb2bceb093ec639068626586a89234 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:53 -0700 Subject: [PATCH 046/176] perf, x86: Fix up kmap_atomic() type Now that the KM_type stuff is history, clean up the compiler warning. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9a..c1e8c7a51164 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; From d65bfacb046f3df8aa11a9cb9b6e448f6171174d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:54 -0700 Subject: [PATCH 047/176] mm: highmem documentation Document outlining some of the highmem issues, started by me, edited by David. Signed-off-by: David Howells Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/highmem.txt | 162 +++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 Documentation/vm/highmem.txt diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.txt new file mode 100644 index 000000000000..4324d24ffacd --- /dev/null +++ b/Documentation/vm/highmem.txt @@ -0,0 +1,162 @@ + + ==================== + HIGH MEMORY HANDLING + ==================== + +By: Peter Zijlstra + +Contents: + + (*) What is high memory? + + (*) Temporary virtual mappings. + + (*) Using kmap_atomic. + + (*) Cost of temporary mappings. + + (*) i386 PAE. + + +==================== +WHAT IS HIGH MEMORY? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. 
At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +========================== +TEMPORARY VIRTUAL MAPPINGS +========================== + +The kernel contains several ways of creating temporary mappings: + + (*) vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. + + (*) kmap(). This permits a short duration mapping of a single page. It needs + global synchronization, but is amortized somewhat. It is also prone to + deadlocks when using in a nested fashion, and so it is not recommended for + new code. + + (*) kmap_atomic(). This permits a very short duration mapping of a single + page. Since the mapping is restricted to the CPU that issued it, it + performs well, but the issuing task is therefore required to stay on that + CPU until it has finished, lest some other task displace its mappings. + + kmap_atomic() may also be used by interrupt contexts, since it is does not + sleep and the caller may not sleep until after kunmap_atomic() is called. + + It may be assumed that k[un]map_atomic() won't fail. + + +================= +USING KMAP_ATOMIC +================= + +When and where to use kmap_atomic() is straightforward. It is used when code +wants to access the contents of a page that might be allocated from high memory +(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two +functions, and they can be used in a manner similar to the following: + + /* Find the page of interest. */ + struct page *page = find_get_page(mapping, offset); + + /* Gain access to the contents of that page. */ + void *vaddr = kmap_atomic(page); + + /* Do something to the contents of that page. */ + memset(vaddr, 0, PAGE_SIZE); + + /* Unmap that page. */ + kunmap_atomic(vaddr); + +Note that the kunmap_atomic() call takes the result of the kmap_atomic() call +not the argument. 
+ +If you need to map two pages because you want to copy from one page to +another you need to keep the kmap_atomic calls strictly nested, like: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +========================== +COST OF TEMPORARY MAPPINGS +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +======== +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + + (*) Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + + (*) you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + + (*) PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. From ea05c8444e451f1cfbf78c68733e717ad7b8602b Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Tue, 26 Oct 2010 14:21:54 -0700 Subject: [PATCH 048/176] mm: add a might_sleep_if() to dma_pool_alloc() Buggy drivers (e.g. fsl_udc) could call dma_pool_alloc from atomic context with GFP_KERNEL. In most instances, the first pool_alloc_page call would succeed and the sleeping functions would never be called. This allowed the buggy drivers to slip through the cracks. Add a might_sleep_if() checking for __GFP_WAIT in flags. Signed-off-by: Dima Zavin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df063706f53..4df2de77e069 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { From 182fea8f48332de085c0ae936605cb72671db9f2 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Tue, 26 Oct 2010 14:21:55 -0700 Subject: [PATCH 049/176] mm: remove alignment padding from anon_vma on (some) 64 bit builds Reorder structure anon_vma to remove alignment padding on 64 builds when (CONFIG_KSM || CONFIG_MIGRATION). This will shrink the size of the anon_vma structure from 40 to 32 bytes & allow more objects per slab in its kmem_cache. 
Under slub the objects in the anon_vma kmem_cache will then be 40 bytes with 102 objects per slab. (On v2.6.36 without this patch,the size is 48 bytes and 85 objects/slab.) Signed-off-by: Richard Kennedy Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 31b2fd75dcba..5c98df68a953 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - spinlock_t lock; /* Serialize access to vma list */ struct anon_vma *root; /* Root of this anon_vma tree */ + spinlock_t lock; /* Serialize access to vma list */ #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) /* From b522c94da5d9cbc73f708be5e530ebc3bbd4a031 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:56 -0700 Subject: [PATCH 050/176] mm: filemap_fault: unique path for locking page Introduce a single location where filemap_fault() locks the desired page. There used to be two such places, depending if the initial find_get_page() was successful or not. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Acked-by: Linus Torvalds Cc: Nick Piggin Reviewed-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e4221..8ed709a83eb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1539,25 +1539,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + lock_page(page); + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. From d065bd810b6deb67d4897a14bfe21f8eb526ba99 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:57 -0700 Subject: [PATCH 051/176] mm: retry page fault when blocking on disk transfer This change reduces mmap_sem hold times that are caused by waiting for disk transfers when accessing file mapped VMAs. It introduces the VM_FAULT_ALLOW_RETRY flag, which indicates that the call site wants mmap_sem to be released if blocking on a pending disk transfer. In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and do_page_fault() will then re-acquire mmap_sem and retry the page fault. It is expected that the retry will hit the same page which will now be cached, and thus it will complete with a low mmap_sem hold time. Tests: - microbenchmark: thread A mmaps a large file and does random read accesses to the mmaped area - achieves about 55 iterations/s. 
Thread B does mmap/munmap in a loop at a separate location - achieves 55 iterations/s before, 15000 iterations/s after. - We are seeing related effects in some applications in house, which show significant performance regressions when running without this change. [akpm@linux-foundation.org: fix warning & crash] Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Acked-by: Linus Torvalds Cc: Nick Piggin Reviewed-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 38 ++++++++++++++++++++++++++------------ include/linux/mm.h | 2 ++ include/linux/pagemap.h | 13 +++++++++++++ mm/filemap.c | 16 +++++++++++++++- mm/memory.c | 10 ++++++++-- 5 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 852b319edbdc..9b2345c9e0c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -956,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct task_struct *tsk; unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); tsk = current; mm = tsk->mm; @@ -1068,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1111,8 +1114,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -1123,21 +1124,34 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. 
*/ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2862009f9573..3bf46655b50a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -723,6 +724,7 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ +#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e12cdc6d79ee..2d1ffe3cf1ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -299,6 +299,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern void __lock_page_nosync(struct page *page); +extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags); extern void unlock_page(struct page *page); static inline void __set_page_locked(struct page *page) @@ -350,6 +352,17 @@ static inline void lock_page_nosync(struct page *page) __lock_page_nosync(page); } +/* + * lock_page_or_retry - Lock the page, unless this would block and the + * caller indicated that it can handle a retry. + */ +static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + might_sleep(); + return trylock_page(page) || __lock_page_or_retry(page, mm, flags); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! diff --git a/mm/filemap.c b/mm/filemap.c index 8ed709a83eb7..33f81252a744 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -1550,7 +1563,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto no_cached_page; } - lock_page(page); + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return ret | VM_FAULT_RETRY; /* Did it get truncated? 
*/ if (unlikely(page->mapping != mapping)) { diff --git a/mm/memory.c b/mm/memory.c index 92cc54e94137..714c4438d887 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2627,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; int exclusive = 0; int ret = 0; @@ -2677,8 +2678,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not @@ -2927,7 +2932,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; if (unlikely(PageHWPoison(vmf.page))) { From 68da336a14e16c2de95e987f3200995b707d7038 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:58 -0700 Subject: [PATCH 052/176] x86: access_error API cleanup access_error() already takes error_code as an argument, so there is no need for an additional write flag. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Nick Piggin Acked-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9b2345c9e0c3..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -919,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1114,7 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } From 0116651c85e671a693dd2f56e95dd651f746c973 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:58 -0700 Subject: [PATCH 053/176] mm: remove temporary variable on generic_file_direct_write() 'end' shadows earlier one and is not necessary at all. Remove it and use 'pos' instead. 
This removes following sparse warnings: mm/filemap.c:2180:24: warning: symbol 'end' shadows an earlier one mm/filemap.c:2132:25: originally declared here Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 33f81252a744..75572b5f2374 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2193,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; From e6223a3b19421e3a8df1352d21fd0d71093f44ae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:59 -0700 Subject: [PATCH 054/176] mm: add casts to/from gfp_t in gfp_to_alloc_flags() This removes following warning from sparse: mm/page_alloc.c:1934:9: warning: restricted gfp_t degrades to integer Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b13bc5e5bd7d..07a654486f75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1932,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1940,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; From 25ca1d6c02fe1c6d90d918867ef670d323725458 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:59 -0700 Subject: [PATCH 055/176] mm: wrap get_locked_pte() using __cond_lock() The get_locked_pte() conditionally grabs 'ptl' in case of returning non-NULL. This leads sparse to complain about context imbalance. Rename and wrap it using __cond_lock() to make sparse happy. 
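For illustration only (not part of this patch), a typical caller of the
wrapped helper is unchanged: on success the pte comes back mapped and
locked, and must be released with pte_unmap_unlock():

	spinlock_t *ptl;
	pte_t *pte;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	/* ... examine or update *pte while holding *ptl ... */
	pte_unmap_unlock(pte, ptl);

The __cond_lock() wrapper only tells sparse that *ptl is acquired exactly
when a non-NULL pte is returned; outside __CHECKER__ builds it expands to
the plain call and generates no extra code.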
Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- mm/memory.c | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bf46655b50a..721f451c3029 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1034,7 +1034,15 @@ extern void unregister_shrinker(struct shrinker *); int vma_wants_writenotify(struct vm_area_struct *vma); -extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); +extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl); +static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pte_t *ptep; + __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); + return ptep; +} #ifdef __PAGETABLE_PUD_FOLDED static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, diff --git a/mm/memory.c b/mm/memory.c index 714c4438d887..4ce24a4d5d48 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1591,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); From e6219ec8195efd5640765e657810f262ad9d1a92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:00 -0700 Subject: [PATCH 056/176] mm: add lock release annotation on do_wp_page() The do_wp_page() releases @ptl but was missing proper annotation. Add it. This removes following warnings from sparse: mm/memory.c:2337:9: warning: context imbalance in 'do_wp_page' - unexpected unlock mm/memory.c:3142:19: warning: context imbalance in 'handle_mm_fault' - different lock contexts for basic block Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory.c b/mm/memory.c index 4ce24a4d5d48..01bdf9dbbfac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2108,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; From 1b36ba815bd91f17e31277a44dd5c6b6a5a8d97e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:00 -0700 Subject: [PATCH 057/176] mm: wrap follow_pte() using __cond_lock() The follow_pte() conditionally grabs *@ptlp in case of returning 0. 
Rename and wrap it using __cond_lock() removes following warnings: mm/memory.c:2337:9: warning: context imbalance in 'do_wp_page' - unexpected unlock mm/memory.c:3142:19: warning: context imbalance in 'handle_mm_fault' - different lock contexts for basic block Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 01bdf9dbbfac..861f7982dd54 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3388,6 +3388,17 @@ static int follow_pte(struct mm_struct *mm, unsigned long address, return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping From ea4525b6008fb29553306ec6719f8e6930ac9499 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:01 -0700 Subject: [PATCH 058/176] rmap: annotate lock context change on page_[un]lock_anon_vma() The page_lock_anon_vma() conditionally grabs RCU and anon_vma lock but page_unlock_anon_vma() releases them unconditionally. This leads sparse to complain about context imbalance. Annotate them. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 15 ++++++++++++++- mm/rmap.c | 4 +++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5c98df68a953..07ea89c16761 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -230,7 +230,20 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma(struct page *page); +struct anon_vma *__page_lock_anon_vma(struct page *page); + +static inline struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma; + + __cond_lock(RCU, anon_vma = __page_lock_anon_vma(page)); + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(&anon_vma->root->lock, anon_vma); + + return anon_vma; +} + void page_unlock_anon_vma(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index f5ad996a4a8f..0995a8f68866 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -314,7 +314,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. 
*/ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; @@ -348,6 +348,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) } void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { anon_vma_unlock(anon_vma); rcu_read_unlock(); From e9a81a821d7f9c5d899cc3acdeafbd884c2c48bb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:01 -0700 Subject: [PATCH 059/176] rmap: wrap page_check_address() using __cond_lock() The page_check_address() conditionally grabs *@ptlp in case of returning non-NULL. Rename and wrap it using __cond_lock() removes following warnings from sparse: mm/rmap.c:472:9: warning: context imbalance in 'page_mapped_in_vma' - unexpected unlock mm/rmap.c:524:9: warning: context imbalance in 'page_referenced_one' - unexpected unlock mm/rmap.c:706:9: warning: context imbalance in 'page_mkclean_one' - unexpected unlock mm/rmap.c:1066:9: warning: context imbalance in 'try_to_unmap_one' - unexpected unlock Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 13 ++++++++++++- mm/rmap.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 07ea89c16761..bb83c0da2071 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -205,9 +205,20 @@ int try_to_unmap_one(struct page *, struct vm_area_struct *, /* * Called from mm/filemap_xip.c to unmap empty zero page */ -pte_t *page_check_address(struct page *, struct mm_struct *, +pte_t *__page_check_address(struct page *, struct mm_struct *, unsigned long, spinlock_t **, int); +static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, + unsigned long address, + spinlock_t **ptlp, int sync) +{ + pte_t *ptep; + + __cond_lock(*ptlp, ptep = __page_check_address(page, mm, address, + ptlp, sync)); + return ptep; +} + /* * Used by swapoff to help locate where page is expected in vma. */ diff --git a/mm/rmap.c b/mm/rmap.c index 0995a8f68866..bfeffbddb712 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; From e574b5fd20027b422aa80790f710d695699b4fba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:02 -0700 Subject: [PATCH 060/176] rmap: make anon_vma_chain_free() static Make anon_vma_chain_free() static. It is called only in rmap.c and the corresponding alloc function is already static. 
Signed-off-by: Namhyung Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index bfeffbddb712..1a8bf76bfd03 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void) return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); } -void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } From 170168d0a351c045adc0bee0987e51dfc82890c0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:02 -0700 Subject: [PATCH 061/176] vmalloc: rename temporary variable in __insert_vmap_area() Rename redundant 'tmp' to fix following sparse warnings: mm/vmalloc.c:296:34: warning: symbol 'tmp' shadows an earlier one mm/vmalloc.c:293:24: originally declared here Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9f909622a25e..55c7a47391f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); From e199b5d1fed13f5e8f47a0ee8216f36244dad1f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:03 -0700 Subject: [PATCH 062/176] vmalloc: annotate lock context change on s_start/stop() s_start() and s_stop() grab/release vmlist_lock but were missing proper annotations. Add them. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 55c7a47391f6..f492c774fa7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2350,6 +2350,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -2376,6 +2377,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } From 92c09c041f15fc88b35f8628e07639f52e1fbb38 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:03 -0700 Subject: [PATCH 063/176] mm: declare some external symbols Declare 'bdi_pending_list' and 'tag_pages_for_writeback()' to remove following sparse warnings: mm/backing-dev.c:46:1: warning: symbol 'bdi_pending_list' was not declared. Should it be static? mm/page-writeback.c:825:6: warning: symbol 'tag_pages_for_writeback' was not declared. Should it be static? 
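As an aside (illustrative sketch, not taken from the diff below): the
warning appears because a non-static symbol is defined without any prior
declaration in scope, so the fix is to declare it in a header included by
the defining file, along the lines of:

	/* mm/backing-dev.c */
	LIST_HEAD(bdi_pending_list);

	/* include/linux/backing-dev.h */
	extern struct list_head bdi_pending_list;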
Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 1 + include/linux/writeback.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f1b402a50679..4ce34fa937d4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -111,6 +111,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..c7299d2ace6b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -149,6 +149,8 @@ int write_cache_pages(struct address_space *mapping, int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void set_page_dirty_balance(struct page *page, int page_mkwrite); void writeback_set_ratelimit(void); +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl From 36deb0be314702627aeae1f5737fc84d01dc26c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:04 -0700 Subject: [PATCH 064/176] vmstat: include compaction.h when CONFIG_COMPACTION This removes following warning from sparse: mm/vmstat.c:466:5: warning: symbol 'fragmentation_index' was not declared. Should it be static? [akpm@linux-foundation.org: move the include to top-of-file] Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index baa4ab387db7..cd2e42be7b68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -395,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_COMPACTION + struct contig_page_info { unsigned long free_pages; unsigned long free_blocks_total; From 16b56cf4b8a0fa9acc21bd2ad19839b917999b96 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:04 -0700 Subject: [PATCH 065/176] mm: fix sparse warnings on GFP_ZONE_TABLE/BAD Introduce ___GFP_* masks in order for gfp_t to not be mixed with plain integers which causes a lot of warnings like the following: warning: restricted gfp_t degrades to integer Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 105 ++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 975609cb8548..e8713d55360a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -9,6 +9,32 @@ struct vm_area_struct; +/* Plain integer GFP bitmasks. Do not use this directly. 
*/ +#define ___GFP_DMA 0x01u +#define ___GFP_HIGHMEM 0x02u +#define ___GFP_DMA32 0x04u +#define ___GFP_MOVABLE 0x08u +#define ___GFP_WAIT 0x10u +#define ___GFP_HIGH 0x20u +#define ___GFP_IO 0x40u +#define ___GFP_FS 0x80u +#define ___GFP_COLD 0x100u +#define ___GFP_NOWARN 0x200u +#define ___GFP_REPEAT 0x400u +#define ___GFP_NOFAIL 0x800u +#define ___GFP_NORETRY 0x1000u +#define ___GFP_COMP 0x4000u +#define ___GFP_ZERO 0x8000u +#define ___GFP_NOMEMALLOC 0x10000u +#define ___GFP_HARDWALL 0x20000u +#define ___GFP_THISNODE 0x40000u +#define ___GFP_RECLAIMABLE 0x80000u +#ifdef CONFIG_KMEMCHECK +#define ___GFP_NOTRACK 0x200000u +#else +#define ___GFP_NOTRACK 0 +#endif + /* * GFP bitmasks.. * @@ -18,10 +44,10 @@ struct vm_area_struct; * without the underscores and use them consistently. The definitions here may * be used in bit comparisons. */ -#define __GFP_DMA ((__force gfp_t)0x01u) -#define __GFP_HIGHMEM ((__force gfp_t)0x02u) -#define __GFP_DMA32 ((__force gfp_t)0x04u) -#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define __GFP_DMA ((__force gfp_t)___GFP_DMA) +#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) +#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) +#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning @@ -38,27 +64,22 @@ struct vm_area_struct; * __GFP_MOVABLE: Flag that this page will be movable by the page migration * mechanism or reclaimed */ -#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */ -#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */ -#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */ -#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */ -#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */ -#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */ -#define __GFP_REPEAT ((__force gfp_t)0x400u) /* See above */ -#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* See above */ -#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* See above */ -#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */ -#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ -#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ -#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ -#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ -#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ - -#ifdef CONFIG_KMEMCHECK -#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ -#else -#define __GFP_NOTRACK ((__force gfp_t)0) -#endif +#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */ +#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ +#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ +#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? 
*/ +#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */ +#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */ +#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ +#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ +#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ +#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ +#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */ +#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ +#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ +#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ /* * This may seem redundant, but it's a way of annotating false positives vs. @@ -186,14 +207,14 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) #endif #define GFP_ZONE_TABLE ( \ - (ZONE_NORMAL << 0 * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ - | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ - | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ - | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ - | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \ ) /* @@ -203,20 +224,20 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) * allowed. 
*/ #define GFP_ZONE_BAD ( \ - 1 << (__GFP_DMA | __GFP_HIGHMEM) \ - | 1 << (__GFP_DMA | __GFP_DMA32) \ - | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ + 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ + | 1 << (___GFP_DMA | ___GFP_DMA32) \ + | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; - int bit = flags & GFP_ZONEMASK; + int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & ((1 << ZONES_SHIFT) - 1); From 74ce002d9aee23031b4967e1dd1c1966ddc60749 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:05 -0700 Subject: [PATCH 066/176] fs/fs-writeback.c: restore lost comment I had to go back to a 2.6.20 tree to work out why we're adding a number-of-inodes into a number-of-pages count. Restore the lost comment. Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 97d2951bd4d1..b5aae4bd0aca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -721,6 +721,10 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; + /* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); From 7bbc0905ea4f7a471a7f79d0bea5d538f5114fc9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:05 -0700 Subject: [PATCH 067/176] mm/memory_hotplug.c: make scan_lru_pages() static Reported-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b0dc65452973..bb63b36c4413 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -646,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; From e1ca7788dec6773b1a2bce51b7141948f2b8bccf Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 26 Oct 2010 14:22:06 -0700 Subject: [PATCH 068/176] mm: add vzalloc() and vzalloc_node() helpers Add vzalloc() and vzalloc_node() to encapsulate the vmalloc-then-memset-zero operation. Use __GFP_ZERO to zero fill the allocated memory. 
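As a sketch of the intent (illustrative, not taken from the diff), callers
can replace the open-coded sequence:

	buf = vmalloc(size);
	if (buf)
		memset(buf, 0, size);

with:

	buf = vzalloc(size);

The memory is released with vfree() in both cases.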
Signed-off-by: Dave Young Cc: Christoph Lameter Acked-by: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 ++ mm/nommu.c | 49 ++++++++++++++++++++++++++++++++++++++++- mm/vmalloc.c | 46 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 63a4fe6d51bd..a03dcf62ca9d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,8 +53,10 @@ static inline void vmalloc_init(void) #endif extern void *vmalloc(unsigned long size); +extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); +extern void *vzalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/mm/nommu.c b/mm/nommu.c index 88ff091eb07a..30b5c20eec15 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -293,11 +293,58 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } -EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
+ */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f492c774fa7b..a3d66b3dc5cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1596,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1607,11 +1614,27 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); +/** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -1653,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif From 66d7dd518ae413a383ab2c6c263cc30617329842 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 26 Oct 2010 14:22:06 -0700 Subject: [PATCH 069/176] /proc/swaps: support polling System management wants to subscribe to changes in swap configuration. Make /proc/swaps pollable like /proc/mounts. 
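A minimal userspace sketch of the intended use (illustrative only,
following the /proc/mounts convention; needs <poll.h>, <fcntl.h> and
<unistd.h>): poll for an exceptional event, then re-read the file from the
start when woken:

	struct pollfd pfd = {
		.fd	= open("/proc/swaps", O_RDONLY),
		.events	= POLLPRI,
	};

	while (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & (POLLERR | POLLPRI)) {
			lseek(pfd.fd, 0, SEEK_SET);
			/* read() and reparse the swap list */
		}
	}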
[akpm@linux-foundation.org: document proc_poll_event] Signed-off-by: Kay Sievers Acked-by: Greg KH Cc: Jonathan Corbet Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fc7bac7db0c..67ddaaf98c74 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1688,6 +1695,25 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { @@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: From 70384dc6dcc6aa76762200262820bdb8b724ecd5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 26 Oct 2010 14:22:07 -0700 Subject: [PATCH 070/176] mm: fix error reporting in move_pages() syscall The vma returned by find_vma does not necessarily include the target address. If this happens the code tries to follow a page outside of any vma and returns ENOENT instead of EFAULT. 
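The semantics being relied on (illustrative restatement): find_vma(mm, addr)
returns the first vma with vm_end > addr, which can lie entirely above addr
when addr falls in an unmapped hole, so a caller that needs addr to actually
be mapped must also check vm_start:

	vma = find_vma(mm, addr);
	if (!vma || addr < vma->vm_start)
		return -EFAULT;		/* addr is in a hole, not a missing page */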
Signed-off-by: Gleb Natapov Acked-by: Christoph Lameter Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 35e454189966..fe5a3c6a5426 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1037,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); @@ -1204,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); From 44e2aa937e698ea95dd86b2a4fabd734ef2c76db Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Tue, 26 Oct 2010 14:22:08 -0700 Subject: [PATCH 071/176] mm/hugetlb.c: add missing spin_lock() to hugetlb_cow() Add missing spin_lock() of the page_table_lock before an error return in hugetlb_cow(). Callers of hugetlb_cow() expect it to be held upon return. Signed-off-by: Dean Nelson Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96991ded82fe..c4a3558589ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2448,8 +2448,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; + } copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); From 572438f9b52236bd8938b1647cc15e027d27ef55 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:22:08 -0700 Subject: [PATCH 072/176] mm: fix is_mem_section_removable() page_order BUG_ON check page_order() is called by memory hotplug's user interface to check whether the section is removable or not. (is_mem_section_removable()) It calls page_order() without holding zone->lock. So, even if the caller does if (PageBuddy(page)) ret = page_order(page) ... The caller may hit BUG_ON(). For fixing this, there are 2 choices. 1. add zone->lock. 2. remove BUG_ON(). is_mem_section_removable() is used for some "advice" and doesn't need to be 100% accurate. It can be called from a user program, and we don't want to hold this important lock for long at a user's request. So, this patch removes BUG_ON().
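Spelled out as a sketch (illustrative, not the exact hotplug loop), the caller-side rule the new comment documents is: only read page_private() after a PageBuddy() check, and treat the result as a hint, since without zone->lock the page can be allocated between the two reads:

	if (PageBuddy(page)) {
		/* racy without zone->lock: use the order only as an advisory hint */
		order = page_order(page);
	}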
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..dedb0aff673f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } From f6a3607e5f30dc642bead8cd95c48d47b6b4bfbb Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:09 -0700 Subject: [PATCH 073/176] mm: page_isolation: codeclean fix comment and rm unneeded val init __test_page_isolated_in_pageblock() returns 1 if all pages in the range are isolated, so fix the comment. Variable `pfn' will be initialised in the following loop so remove it. Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd967452..4ae42bb40892 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. From 809c444977adb7313e0612e9e3af4b73ba3f5746 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:10 -0700 Subject: [PATCH 074/176] mm: do_migrate_range: exit loop if not_managed is true If not_managed is true all pages will be putback to lru, so break the loop earlier to skip other pages isolate. Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bb63b36c4413..e4af144ee409 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -696,15 +696,17 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { - /* Becasue we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) - not_managed++; #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); dump_page(page); #endif + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + break; + } } } ret = -EBUSY; From f3ab2636c5c1dd9ab0ff53a46d8354d5769ffdd4 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:10 -0700 Subject: [PATCH 075/176] mm: do_migrate_range: reduce list_empty() check Simple code for reducing list_empty(&source) check. 
Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4af144ee409..9260314a221e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -705,24 +705,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) check this again here. */ if (page_count(page)) { not_managed++; + ret = -EBUSY; break; } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); - if (ret) - putback_lru_pages(&source); - out: return ret; } From a56d5318716d120e040294bb258901ba89fb9c90 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 26 Oct 2010 14:22:11 -0700 Subject: [PATCH 076/176] hpet: unmap unused I/O space When the initialization code in hpet finds a memory resource and does not find an IRQ, it does not unmap the memory resource previously mapped. There are buggy BIOSes which report resources exactly like this and what is worse the memory region bases point to normal RAM. This normally would not matter since the space is not touched. But when PAT is turned on, ioremap causes the page to be uncached and sets this bit in page->flags. Then when the page is about to be used by the allocator, it is reported as: BUG: Bad page state in process md5sum pfn:3ed00 page:ffffea0000dbd800 count:0 mapcount:0 mapping:(null) index:0x0 page flags: 0x20000001000000(uncached) Pid: 7956, comm: md5sum Not tainted 2.6.34-12-desktop #1 Call Trace: [] bad_page+0xb1/0x100 [] prep_new_page+0x1a5/0x1c0 [] get_page_from_freelist+0x3a1/0x640 [] __alloc_pages_nodemask+0x10f/0x6b0 ... In this particular case: 1) HPET returns 3ed00000 as memory region base, but it is not in reserved ranges reported by the BIOS (excerpt): BIOS-e820: 0000000000100000 - 00000000af6cf000 (usable) BIOS-e820: 00000000af6cf000 - 00000000afdcf000 (reserved) 2) there is no IRQ resource reported by HPET method. On the other hand, the Intel HPET specs (1.0a) says (3.2.5.1): _CRS ( // Report 1K of memory consumed by this Timer Block memory range consumed // Optional: only used if BIOS allocates Interrupts [1] IRQs consumed ) [1] For case where Timer Block is configured to consume IRQ0/IRQ8 AND Legacy 8254/Legacy RTC hardware still exists, the device objects associated with 8254 & RTC devices should not report IRQ0/IRQ8 as "consumed resources". So in theory we should check whether if it is the case and use those interrupts instead. Anyway the address reported by the BIOS here is bogus, so non-presence of IRQ doesn't mean the "optional" part in point 2). Since I got no reply previously, fix this by simply unmapping the space when IRQ is not found and memory region was mapped previously. It would be probably more safe to walk the resources again and unmap appropriately depending on type. But as we now use only ioremap for both 2 memory resource types, it is not necessarily needed right now. 
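As a sketch of the pattern only (the names here are illustrative, not the driver's exact variables): whatever was ioremap()ed earlier in the probe path has to be unmapped again on any later error exit, otherwise the mapping (and, with PAT, the uncached page attribute) is leaked:

	base = ioremap(res->start, resource_size(res));
	if (!base)
		return -ENOMEM;

	if (!nirqs) {
		iounmap(base);		/* undo the earlier mapping before failing */
		return -ENODEV;
	}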
Addresses https://bugzilla.novell.com/show_bug.cgi?id=629908 Reported-by: Olaf Hering Signed-off-by: Jiri Slaby Acked-by: Clemens Ladisch Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index a4eee324eb1e..84e6a7e673b7 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -1000,6 +1000,8 @@ static int hpet_acpi_add(struct acpi_device *device) return -ENODEV; if (!data.hd_address || !data.hd_nirqs) { + if (data.hd_address) + iounmap(data.hd_address); printk("%s: no address or irqs in _CRS\n", __func__); return -ENODEV; } From 96e9694df446d1154ec2f4fdba8908588b9cba38 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Tue, 26 Oct 2010 14:22:13 -0700 Subject: [PATCH 077/176] hpet: fix unwanted interrupt due to stale irq status bit Jaswinder Singh Rajput wrote: > By executing Documentation/timers/hpet_example.c > > for polling, I requested for 3 iterations but it seems iteration work > for only 2 as first expired time is always very small. > > # ./hpet_example poll /dev/hpet 10 3 > -hpet: executing poll > hpet_poll: info.hi_flags 0x0 > hpet_poll: expired time = 0x13 > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 > hpet_poll: expired time = 0x1868c > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 > hpet_poll: expired time = 0x18645 > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 Clearing the HPET interrupt enable bit disables interrupt generation but does not disable the timer, so the interrupt status bit will still be set when the timer elapses. If another interrupt arrives before the timer has been correctly programmed (due to some other device on the same interrupt line, or CONFIG_DEBUG_SHIRQ), this results in an extra unwanted interrupt event because the status bit is likely to be set from comparator matches that happened before the device was opened. Therefore, we have to ensure that the interrupt status bit is and stays cleared until we actually program the timer. Signed-off-by: Clemens Ladisch Reported-by: Jaswinder Singh Rajput Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Bob Picco Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 84e6a7e673b7..a2cbb828e92a 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -465,6 +465,21 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp) if (irq) { unsigned long irq_flags; + if (devp->hd_flags & HPET_SHARED_IRQ) { + /* + * To prevent the interrupt handler from seeing an + * unwanted interrupt status bit, program the timer + * so that it will not fire in the near future ... + */ + writel(readl(&timer->hpet_config) & ~Tn_TYPE_CNF_MASK, + &timer->hpet_config); + write_counter(read_counter(&hpet->hpet_mc), + &timer->hpet_compare); + /* ... and clear any left-over status. */ + isr = 1 << (devp - devp->hd_hpets->hp_dev); + writel(isr, &hpet->hpet_isr); + } + sprintf(devp->hd_name, "hpet%d", (int)(devp - hpetp->hp_dev)); irq_flags = devp->hd_flags & HPET_SHARED_IRQ ? 
IRQF_SHARED : IRQF_DISABLED; From 0ca01763a028d0034042a9397534bc1f27848652 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 26 Oct 2010 14:22:13 -0700 Subject: [PATCH 078/176] hpet: fix style problems Fix the following style problems: WARNING: Use #include instead of WARNING: Use #include instead of ERROR: code indent should use tabs where possible ERROR: do not initialise statics to 0 or NULL Signed-off-by: Jaswinder Singh Rajput Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index a2cbb828e92a..fcb5f0d6ce70 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -32,12 +32,12 @@ #include #include #include +#include #include +#include #include -#include #include -#include #include #include @@ -81,13 +81,13 @@ static cycle_t read_hpet(struct clocksource *cs) } static struct clocksource clocksource_hpet = { - .name = "hpet", - .rating = 250, - .read = read_hpet, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be calculated */ - .shift = 10, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be calculated */ + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct clocksource *hpet_clocksource; #endif @@ -826,7 +826,7 @@ int hpet_alloc(struct hpet_data *hdp) struct hpets *hpetp; size_t siz; struct hpet __iomem *hpet; - static struct hpets *last = NULL; + static struct hpets *last; unsigned long period; unsigned long long temp; u32 remainder; From aaaddfe0b3bb449b8734bf29bbd36141076e5277 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 26 Oct 2010 14:22:14 -0700 Subject: [PATCH 079/176] Documentation/timers/hpet_example.c: add supporting info for hpet_example $./hpet_example info /dev/hpet -hpet: executing info hpet_info: hi_irqfreq 0x0 hi_flags 0x0 hi_hpet 0 hi_timer 2 Signed-off-by: Jaswinder Singh Rajput Cc: Clemens Ladisch Cc: "Venkatesh Pallipadi (Venki)" Cc: john stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/timers/hpet_example.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c index 4bfafb7bc4c5..9a3e7012c190 100644 --- a/Documentation/timers/hpet_example.c +++ b/Documentation/timers/hpet_example.c @@ -97,6 +97,33 @@ hpet_open_close(int argc, const char **argv) void hpet_info(int argc, const char **argv) { + struct hpet_info info; + int fd; + + if (argc != 1) { + fprintf(stderr, "hpet_info: device-name\n"); + return; + } + + fd = open(argv[0], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]); + return; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_info: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ", + info.hi_ireqfreq, info.hi_flags); + fprintf(stderr, "hi_hpet %d hi_timer %d\n", + info.hi_hpet, info.hi_timer); + +out: + close(fd); + return; } void From dae512edc6e945e127f0848aa757055265d70aa2 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Oct 2010 14:22:15 -0700 Subject: [PATCH 080/176] drivers/char/hpet.c: fix information leak to userland Structure info is copied to userland with some padding fields unitialized. It leads to leaking of stack memory. 
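The general shape of the fix, sketched with simplified context rather than the exact ioctl code: zero the whole structure first, so that compiler-inserted padding can never carry stack garbage out through the later copy_to_user():

	struct hpet_info info;

	memset(&info, 0, sizeof(info));		/* clears the padding bytes too */
	info.hi_hpet = hpetp->hp_which;		/* then fill only the fields meant to be visible */
	if (copy_to_user(uinfo, &info, sizeof(info)))
		return -EFAULT;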
[akpm@linux-foundation.org: remove now-unneeded zeroing of info->hi_ireqfreq] Signed-off-by: Vasiliy Kulikov Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index fcb5f0d6ce70..55b8667f739f 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -596,11 +596,10 @@ hpet_ioctl_common(struct hpet_dev *devp, int cmd, unsigned long arg, break; case HPET_INFO: { + memset(info, 0, sizeof(*info)); if (devp->hd_ireqfreq) info->hi_ireqfreq = hpet_time_div(hpetp, devp->hd_ireqfreq); - else - info->hi_ireqfreq = 0; info->hi_flags = readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK; info->hi_hpet = hpetp->hp_which; From 947272dd3e959c69ff0fc54e62e44163b729b796 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 26 Oct 2010 14:22:15 -0700 Subject: [PATCH 081/176] alpha: enable ARCH_DMA_ADDR_T_64BIT Signed-off-by: FUJITA Tomonori Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index d04ccd73af45..28f93a6c0fde 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -55,6 +55,9 @@ config ZONE_DMA bool default y +config ARCH_DMA_ADDR_T_64BIT + def_bool y + config NEED_DMA_MAP_STATE def_bool y From 98c532ecbe582586e204688c6cde7e27580cc43f Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 26 Oct 2010 14:22:17 -0700 Subject: [PATCH 082/176] alpha: use single HAE window on T2 core logic (gamma, sable) T2 are the only alpha SMP systems that do HAE switching at runtime, which is fundamentally racy on SMP. This patch limits MMIO space on T2 to HAE0 only, like we did on MCPCIA (rawhide) long ago. This leaves us with only 112 Mb of PCI MMIO (128 Mb HAE aperture minus 16 Mb reserved for EISA), but since linux PCI allocations are reasonably tight, it should be enough for sane hardware configurations. Also, fix a typo in MCPCIA_FROB_MMIO macro which shouldn't call set_hae() if MCPCIA_ONE_HAE_WINDOW is defined. It's more for correctness, as set_hae() is a no-op anyway in that case. Signed-off-by: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/core_mcpcia.h | 2 +- arch/alpha/include/asm/core_t2.h | 54 ++++++++++------------------ arch/alpha/kernel/core_t2.c | 11 ++++-- arch/alpha/kernel/machvec_impl.h | 3 ++ 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/arch/alpha/include/asm/core_mcpcia.h b/arch/alpha/include/asm/core_mcpcia.h index 21ac53383b37..9f67a056b461 100644 --- a/arch/alpha/include/asm/core_mcpcia.h +++ b/arch/alpha/include/asm/core_mcpcia.h @@ -247,7 +247,7 @@ struct el_MCPCIA_uncorrected_frame_mcheck { #define vip volatile int __force * #define vuip volatile unsigned int __force * -#ifdef MCPCIA_ONE_HAE_WINDOW +#ifndef MCPCIA_ONE_HAE_WINDOW #define MCPCIA_FROB_MMIO \ if (__mcpcia_is_mmio(hose)) { \ set_hae(hose & 0xffffffff); \ diff --git a/arch/alpha/include/asm/core_t2.h b/arch/alpha/include/asm/core_t2.h index 471c07292e0b..91b46801b290 100644 --- a/arch/alpha/include/asm/core_t2.h +++ b/arch/alpha/include/asm/core_t2.h @@ -1,6 +1,9 @@ #ifndef __ALPHA_T2__H__ #define __ALPHA_T2__H__ +/* Fit everything into one 128MB HAE window. 
*/ +#define T2_ONE_HAE_WINDOW 1 + #include #include #include @@ -19,7 +22,7 @@ * */ -#define T2_MEM_R1_MASK 0x07ffffff /* Mem sparse region 1 mask is 26 bits */ +#define T2_MEM_R1_MASK 0x07ffffff /* Mem sparse region 1 mask is 27 bits */ /* GAMMA-SABLE is a SABLE with EV5-based CPUs */ /* All LYNX machines, EV4 or EV5, use the GAMMA bias also */ @@ -85,7 +88,9 @@ #define T2_DIR (IDENT_ADDR + GAMMA_BIAS + 0x38e0004a0UL) #define T2_ICE (IDENT_ADDR + GAMMA_BIAS + 0x38e0004c0UL) +#ifndef T2_ONE_HAE_WINDOW #define T2_HAE_ADDRESS T2_HAE_1 +#endif /* T2 CSRs are in the non-cachable primary IO space from 3.8000.0000 to 3.8fff.ffff @@ -429,13 +434,15 @@ extern inline void t2_outl(u32 b, unsigned long addr) * */ +#ifdef T2_ONE_HAE_WINDOW +#define t2_set_hae +#else #define t2_set_hae { \ - msb = addr >> 27; \ + unsigned long msb = addr >> 27; \ addr &= T2_MEM_R1_MASK; \ set_hae(msb); \ } - -extern raw_spinlock_t t2_hae_lock; +#endif /* * NOTE: take T2_DENSE_MEM off in each readX/writeX routine, since @@ -446,28 +453,22 @@ extern raw_spinlock_t t2_hae_lock; __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vip) ((addr << 5) + T2_SPARSE_MEM + 0x00); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return __kernel_extbl(result, addr & 3); } __EXTERN_INLINE u16 t2_readw(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x08); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return __kernel_extwl(result, addr & 3); } @@ -478,59 +479,47 @@ __EXTERN_INLINE u16 t2_readw(const volatile void __iomem *xaddr) __EXTERN_INLINE u32 t2_readl(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x18); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return result & 0xffffffffUL; } __EXTERN_INLINE u64 t2_readq(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long r0, r1, work, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long r0, r1, work; t2_set_hae; work = (addr << 5) + T2_SPARSE_MEM + 0x18; r0 = *(vuip)(work); r1 = *(vuip)(work + (4 << 5)); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return r1 << 32 | r0; } __EXTERN_INLINE void t2_writeb(u8 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, w; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long w; t2_set_hae; w = __kernel_insbl(b, addr & 3); *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x00) = w; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void t2_writew(u16 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, w; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long w; t2_set_hae; w = __kernel_inswl(b, addr & 3); *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x08) = w; - 
raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } /* @@ -540,29 +529,22 @@ __EXTERN_INLINE void t2_writew(u16 b, volatile void __iomem *xaddr) __EXTERN_INLINE void t2_writel(u32 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); t2_set_hae; *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x18) = b; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void t2_writeq(u64 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, work; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long work; t2_set_hae; work = (addr << 5) + T2_SPARSE_MEM + 0x18; *(vuip)work = b; *(vuip)(work + (4 << 5)) = b >> 32; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void __iomem *t2_ioportmap(unsigned long addr) diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c index e6d90568b65d..2f770e994289 100644 --- a/arch/alpha/kernel/core_t2.c +++ b/arch/alpha/kernel/core_t2.c @@ -74,8 +74,6 @@ # define DBG(args) #endif -DEFINE_RAW_SPINLOCK(t2_hae_lock); - static volatile unsigned int t2_mcheck_any_expected; static volatile unsigned int t2_mcheck_last_taken; @@ -406,6 +404,7 @@ void __init t2_init_arch(void) { struct pci_controller *hose; + struct resource *hae_mem; unsigned long temp; unsigned int i; @@ -433,7 +432,13 @@ t2_init_arch(void) */ pci_isa_hose = hose = alloc_pci_controller(); hose->io_space = &ioport_resource; - hose->mem_space = &iomem_resource; + hae_mem = alloc_resource(); + hae_mem->start = 0; + hae_mem->end = T2_MEM_R1_MASK; + hae_mem->name = pci_hae0_name; + if (request_resource(&iomem_resource, hae_mem) < 0) + printk(KERN_ERR "Failed to request HAE_MEM\n"); + hose->mem_space = hae_mem; hose->index = 0; hose->sparse_mem_base = T2_SPARSE_MEM - IDENT_ADDR; diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h index 512685f78097..7fa62488bd16 100644 --- a/arch/alpha/kernel/machvec_impl.h +++ b/arch/alpha/kernel/machvec_impl.h @@ -25,6 +25,9 @@ #ifdef MCPCIA_ONE_HAE_WINDOW #define MCPCIA_HAE_ADDRESS (&alpha_mv.hae_cache) #endif +#ifdef T2_ONE_HAE_WINDOW +#define T2_HAE_ADDRESS (&alpha_mv.hae_cache) +#endif /* Only a few systems don't define IACK_SC, handling all interrupts through the SRM console. But splitting out that one case from IO() below From d911202e3f722c29065942400ea108b7096bdac7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 26 Oct 2010 14:22:18 -0700 Subject: [PATCH 083/176] uml: define CONFIG_NO_DMA I think that it's better to detect DMA misuse at build time rather than calling BUG_ON. Architectures that can't do DMA need to define CONFIG_NO_DMA. Thanks to Sam Ravnborg for explaining how CONFIG_NO_DMA and CONFIG_HAS_DMA work: http://marc.info/?l=linux-kernel&m=128359913825550&w=2 HAS_DMA is defined like this: config HAS_DMA boolean depends on !NO_DMA default y So to set HAS_DMA to true an arch should do: 1) Do not define NO_DMA 2) Define NO_DMA abd set it to 'n' Must archs - including um - used principle 1). In the um case we want to say that we do NOT have any DMA. This can be done in two ways. a) define NO_DMA and set it to 'y' b) redefine HAS_DMA and set it to 'n'. The patch you provided used principle b) where other archs use principle a). So I suggest you should use principle a) for um too. 
Signed-off-by: FUJITA Tomonori Cc: Miklos Szeredi Cc: Jeff Dike Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig.um | 3 + arch/um/defconfig | 1 - arch/um/include/asm/dma-mapping.h | 112 ------------------------------ 3 files changed, 3 insertions(+), 113 deletions(-) delete mode 100644 arch/um/include/asm/dma-mapping.h diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index ec2b8da1aba4..ffe0934a3ccf 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -147,3 +147,6 @@ config KERNEL_STACK_ORDER This option determines the size of UML kernel stacks. They will be 1 << order pages. The default is OK unless you're running Valgrind on UML, in which case, set this to 3. + +config NO_DMA + def_bool y diff --git a/arch/um/defconfig b/arch/um/defconfig index 6bd456f96f90..564f3de65b4a 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -566,7 +566,6 @@ CONFIG_CRC32=m # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_PLIST=y -CONFIG_HAS_DMA=y # # SCSI device support diff --git a/arch/um/include/asm/dma-mapping.h b/arch/um/include/asm/dma-mapping.h deleted file mode 100644 index 1f469e80fdd3..000000000000 --- a/arch/um/include/asm/dma-mapping.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef _ASM_DMA_MAPPING_H -#define _ASM_DMA_MAPPING_H - -#include - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - BUG(); - return(0); -} - -static inline int -dma_set_mask(struct device *dev, u64 dma_mask) -{ - BUG(); - return(0); -} - -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) -{ - BUG(); - return((void *) 0); -} - -static inline void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG(); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_handle) -{ - BUG(); - return 0; -} - -#endif From aa5fb4dbfd121296ca97c68cf90043a7ea97579d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 26 Oct 
2010 14:22:19 -0700 Subject: [PATCH 084/176] uml: fix CONFIG_STATIC_LINK=y build failure with newer glibc With glibc 2.11 or later that was built with --enable-multi-arch, the UML link fails with undefined references to __rel_iplt_start and similar symbols. In recent binutils, the default linker script defines these symbols (see ld --verbose). Fix the UML linker scripts to match the new defaults for these sections. Signed-off-by: Roland McGrath Cc: Jeff Dike Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/dyn.lds.S | 14 ++++++++++++-- arch/um/kernel/uml.lds.S | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 69268014dd8e..a3cab6d3ae02 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -50,8 +50,18 @@ SECTIONS .rela.got : { *(.rela.got) } .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } - .rel.plt : { *(.rel.plt) } - .rela.plt : { *(.rela.plt) } + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } .init : { KEEP (*(.init)) } =0x90909090 diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 9a873d765615..fbd99402d4d2 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -43,6 +43,23 @@ SECTIONS __syscall_stub_end = .; } + /* + * These are needed even in a static link, even if they wind up being empty. + * Newer glibc needs these __rel{,a}_iplt_{start,end} symbols. + */ + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } + #include "asm/common.lds.S" init.data : { INIT_DATA } From be76d81f99c4c120adcd201a7316e4dd7dbe3c11 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:22:20 -0700 Subject: [PATCH 085/176] um: migrate from __do_IRQ() to generic_handle_irq() This patch removes __do_IRQ() from user mode linux. __do_IRQ is deprecated. Signed-off-by: Richard Weinberger Cc: Jeff Dike Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig.um | 3 +++ arch/um/kernel/irq.c | 15 ++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index ffe0934a3ccf..50d6aa20c353 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -120,6 +120,9 @@ config SMP If you don't know what to do, say N. 
+config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + config NR_CPUS int "Maximum number of CPUs (2-32)" range 2 32 diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index a746e3037a5b..3f0ac9e0c966 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -334,7 +334,7 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); irq_enter(); - __do_IRQ(irq); + generic_handle_irq(irq); irq_exit(); set_irq_regs(old_regs); return 1; @@ -391,17 +391,10 @@ void __init init_IRQ(void) { int i; - irq_desc[TIMER_IRQ].status = IRQ_DISABLED; - irq_desc[TIMER_IRQ].action = NULL; - irq_desc[TIMER_IRQ].depth = 1; - irq_desc[TIMER_IRQ].chip = &SIGVTALRM_irq_type; - enable_irq(TIMER_IRQ); + set_irq_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); + for (i = 1; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - irq_desc[i].chip = &normal_irq_type; - enable_irq(i); + set_irq_chip_and_handler(i, &normal_irq_type, handle_edge_irq); } } From c5c6dd4e2dce3cb4d705d97183bc42b4e33e2606 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:22:20 -0700 Subject: [PATCH 086/176] hostfs: code cleanups Some code cleanups for hostfs. Signed-off-by: Richard Weinberger Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs_user.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 8d02683585e0..d51a98384bc0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out) dir = opendir(path); *err_out = errno; - if (dir == NULL) - return NULL; + return dir; } @@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) if (attrs->ia_valid & HOSTFS_ATTR_MODE) { if (fd >= 0) { if (fchmod(fd, attrs->ia_mode) != 0) - return (-errno); + return -errno; } else if (chmod(file, attrs->ia_mode) != 0) { return -errno; } From f27c85c56b32c42bcc54a43189c1e00fdceb23ec Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Tue, 26 Oct 2010 14:22:21 -0700 Subject: [PATCH 087/176] kernel.h: add {min,max}3 macros Introduce two additional min/max macros to compare three operands. This will save some cycles as well as some bytes on the stack and last but not least more pleasing as macro nesting. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Hagen Paul Pfeifer Cc: Joe Perches Cc: Ingo Molnar Cc: Hartley Sweeten Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Herbert Xu Cc: Roland Dreier Cc: Sean Hefty Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index edef168a0406..8e786a27cfe6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -651,6 +651,24 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; }) +#define min3(x, y, z) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + typeof(z) _min3 = (z); \ + (void) (&_min1 == &_min2); \ + (void) (&_min1 == &_min3); \ + _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ + (_min2 < _min3 ? 
_min2 : _min3); }) + +#define max3(x, y, z) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + typeof(z) _max3 = (z); \ + (void) (&_max1 == &_max2); \ + (void) (&_max1 == &_max3); \ + _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ + (_max2 > _max3 ? _max2 : _max3); }) + /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero * @x: value1 From 732eacc0542d0aa48797f675888b85d6065af837 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Tue, 26 Oct 2010 14:22:23 -0700 Subject: [PATCH 088/176] replace nested max/min macros with {max,min}3 macro Use the new {max,min}3 macros to save some cycles and bytes on the stack. This patch substitutes trivial nested macros with their counterpart. Signed-off-by: Hagen Paul Pfeifer Cc: Joe Perches Cc: Ingo Molnar Cc: Hartley Sweeten Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Herbert Xu Cc: Roland Dreier Cc: Sean Hefty Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ep93xx/clock.c | 3 +-- arch/powerpc/kernel/vio.c | 4 +--- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + drivers/crypto/hifn_795x.c | 4 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 +-- drivers/macintosh/windfarm_pm121.c | 2 +- drivers/video/au1200fb.c | 2 +- mm/slab.c | 2 +- 8 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c index 4566bd1c8660..ef06c66a6f16 100644 --- a/arch/arm/mach-ep93xx/clock.c +++ b/arch/arm/mach-ep93xx/clock.c @@ -358,8 +358,7 @@ static int calc_clk_div(struct clk *clk, unsigned long rate, int i, found = 0, __div = 0, __pdiv = 0; /* Don't exceed the maximum rate */ - max_rate = max(max(clk_pll1.rate / 4, clk_pll2.rate / 4), - clk_xtali.rate / 4); + max_rate = max3(clk_pll1.rate / 4, clk_pll2.rate / 4, clk_xtali.rate / 4); rate = min(rate, max_rate); /* diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index d692989a4318..441d2a722f06 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -238,9 +238,7 @@ static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size) * memory in this pool does not change. 
*/ if (spare_needed && reserve_freed) { - tmp = min(spare_needed, min(reserve_freed, - (viodev->cmo.entitled - - VIO_CMO_MIN_ENT))); + tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT)); vio_cmo.spare += tmp; viodev->cmo.entitled -= tmp; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d03..17ad03366211 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 0eac3da566ba..a84250a5dd51 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -1467,7 +1467,7 @@ static int ablkcipher_add(unsigned int *drestp, struct scatterlist *dst, return -EINVAL; while (size) { - copy = min(drest, min(size, dst->length)); + copy = min3(drest, size, dst->length); size -= copy; drest -= copy; @@ -1729,7 +1729,7 @@ static int ablkcipher_get(void *saddr, unsigned int *srestp, unsigned int offset return -EINVAL; while (size) { - copy = min(srest, min(dst->length, size)); + copy = min3(srest, dst->length, size); daddr = kmap_atomic(sg_page(dst), KM_IRQ0); memcpy(daddr + dst->offset + offset, saddr, copy); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b4b22576f12a..19c5d3b191f4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1409,8 +1409,7 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); - ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, - IPOIB_MIN_QUEUE_SIZE)); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c index 947d4afa25ca..30e6195e19d4 100644 --- a/drivers/macintosh/windfarm_pm121.c +++ b/drivers/macintosh/windfarm_pm121.c @@ -482,7 +482,7 @@ static s32 pm121_correct(s32 new_setpoint, new_min += correction->offset; new_min = (new_min >> 16) + min; - return max(new_setpoint, max(new_min, 0)); + return max3(new_setpoint, new_min, 0); } static s32 pm121_connect(unsigned int control_id, s32 setpoint) diff --git a/drivers/video/au1200fb.c b/drivers/video/au1200fb.c index e77e8e4280fb..4ea187d93768 100644 --- a/drivers/video/au1200fb.c +++ b/drivers/video/au1200fb.c @@ -1079,7 +1079,7 @@ static int au1200fb_fb_check_var(struct fb_var_screeninfo *var, * clock can only be obtain by dividing this value by an even integer. * Fallback to a slower pixel clock if necessary. 
*/ pixclock = max((u32)(PICOS2KHZ(var->pixclock) * 1000), fbi->monspecs.dclkmin); - pixclock = min(pixclock, min(fbi->monspecs.dclkmax, (u32)AU1200_LCD_MAX_CLK/2)); + pixclock = min3(pixclock, fbi->monspecs.dclkmax, (u32)AU1200_LCD_MAX_CLK/2); if (AU1200_LCD_MAX_CLK % pixclock) { int diff = AU1200_LCD_MAX_CLK % pixclock; diff --git a/mm/slab.c b/mm/slab.c index fcae9815d3b3..b1e40dafbab3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; From c925cf0b80cb486b31e1ec0e9f981d75a4b38453 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Tue, 26 Oct 2010 14:22:23 -0700 Subject: [PATCH 089/176] m68k{nommu}/blackfin : remove old assembler-only flags bit definitions Long ago, PT_TRACESYS_OFF and friends were introduced as hard defines to avoid straight constants in assembler parts of linux m68k. They are not used anymore, and were not updated to follow changes in linux kernel. Remove them. When similar constants are needed, they are now generated using asm-offsets.c. Signed-off-by: Philippe De Muyter Acked-by: Mike Frysinger Acked-by: Geert Uytterhoeven Acked-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/entry.h | 8 -------- arch/m68k/include/asm/entry_mm.h | 8 -------- arch/m68k/include/asm/entry_no.h | 10 ---------- 3 files changed, 26 deletions(-) diff --git a/arch/blackfin/include/asm/entry.h b/arch/blackfin/include/asm/entry.h index a6886f6e4819..4104d5783e2c 100644 --- a/arch/blackfin/include/asm/entry.h +++ b/arch/blackfin/include/asm/entry.h @@ -15,14 +15,6 @@ #define LFLUSH_I_AND_D 0x00000808 #define LSIGTRAP 5 -/* process bits for task_struct.flags */ -#define PF_TRACESYS_OFF 3 -#define PF_TRACESYS_BIT 5 -#define PF_PTRACED_OFF 3 -#define PF_PTRACED_BIT 4 -#define PF_DTRACE_OFF 1 -#define PF_DTRACE_BIT 5 - /* * NOTE! The single-stepping code assumes that all interrupt handlers * start by saving SYSCFG on the stack with their first instruction. diff --git a/arch/m68k/include/asm/entry_mm.h b/arch/m68k/include/asm/entry_mm.h index e41fea399bfe..73b8c8fbed9c 100644 --- a/arch/m68k/include/asm/entry_mm.h +++ b/arch/m68k/include/asm/entry_mm.h @@ -50,14 +50,6 @@ LFLUSH_I_AND_D = 0x00000808 -/* process bits for task_struct.ptrace */ -PT_TRACESYS_OFF = 3 -PT_TRACESYS_BIT = 1 -PT_PTRACED_OFF = 3 -PT_PTRACED_BIT = 0 -PT_DTRACE_OFF = 3 -PT_DTRACE_BIT = 2 - #define SAVE_ALL_INT save_all_int #define SAVE_ALL_SYS save_all_sys #define RESTORE_ALL restore_all diff --git a/arch/m68k/include/asm/entry_no.h b/arch/m68k/include/asm/entry_no.h index 80e41492aa2a..26be277394f9 100644 --- a/arch/m68k/include/asm/entry_no.h +++ b/arch/m68k/include/asm/entry_no.h @@ -32,16 +32,6 @@ #ifdef __ASSEMBLY__ -/* process bits for task_struct.flags */ -PF_TRACESYS_OFF = 3 -PF_TRACESYS_BIT = 5 -PF_PTRACED_OFF = 3 -PF_PTRACED_BIT = 4 -PF_DTRACE_OFF = 1 -PF_DTRACE_BIT = 5 - -LENOSYS = 38 - #define SWITCH_STACK_SIZE (6*4+4) /* Includes return address */ /* From a55621f15bc61826969a29e111ba131a55ef45de Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:25 -0700 Subject: [PATCH 090/176] include/linux/kernel.h: add __must_check to strict_strto*() The whole point to using the strict functions is to check the return value. 
If you don't, strict_strto*() will return you uninitialised garbage. Offenders have been observed in the wild. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8e786a27cfe6..e9b492b33032 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -203,10 +203,10 @@ extern unsigned long simple_strtoul(const char *,char **,unsigned int); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -extern int strict_strtoul(const char *, unsigned int, unsigned long *); -extern int strict_strtol(const char *, unsigned int, long *); -extern int strict_strtoull(const char *, unsigned int, unsigned long long *); -extern int strict_strtoll(const char *, unsigned int, long long *); +extern int __must_check strict_strtoul(const char *, unsigned int, unsigned long *); +extern int __must_check strict_strtol(const char *, unsigned int, long *); +extern int __must_check strict_strtoull(const char *, unsigned int, unsigned long long *); +extern int __must_check strict_strtoll(const char *, unsigned int, long long *); extern int sprintf(char * buf, const char * fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int vsprintf(char *buf, const char *, va_list) From b6472776816af1ed52848c93d26e3edb3b17adab Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:26 -0700 Subject: [PATCH 091/176] modules: no need to align .modinfo strings gcc aligns strings as a performance consideration for those cases where strings are being used a lot. Their use is not performance critical, and hence it seems better to save some space. Signed-off-by: Jan Beulich Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 9d2f1837b3d8..112adf8bd47d 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -21,8 +21,8 @@ #define __module_cat(a,b) ___module_cat(a,b) #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ - __used \ - __attribute__((section(".modinfo"),unused)) = __stringify(tag) "=" info + __used __attribute__((section(".modinfo"), unused, aligned(1))) \ + = __stringify(tag) "=" info #else /* !MODULE */ #define __MODULE_INFO(tag, name, info) #endif From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: [PATCH 092/176] use clear_page()/copy_page() in favor of memset()/memcpy() on whole pages After all that's what they are intended for. Signed-off-by: Jan Beulich Cc: Miklos Szeredi Cc: "Eric W. Biederman" Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 2 +- kernel/kexec.c | 2 +- kernel/power/snapshot.c | 14 +++++++------- kernel/power/swap.c | 6 +++--- mm/memory.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cde755cca564..69037118b2c9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -811,7 +811,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); - memset(mapaddr, 0, PAGE_SIZE); + clear_page(mapaddr); kunmap_atomic(mapaddr, KM_USER1); } while (count) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d6730..b55045bc7563 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9e3581f4619a..0dac75ea4456 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,9 +2270,9 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); kunmap_atomic(kaddr1, KM_USER0); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa790399..a0e4a86ccf94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -910,7 +910,7 @@ int swsusp_check(void) hib_resume_bdev = 
open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) diff --git a/mm/memory.c b/mm/memory.c index 861f7982dd54..02e48aa0ed13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2080,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else From b6777c40c79168d938c30b5b7471fbd64bca109c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: [PATCH 093/176] fuse: use clear_highpage() and KM_USER0 instead of KM_USER1 Commit 7909b1c640 ("fuse: don't use atomic kmap") removed KM_USER0 usage from fuse/dev.c. Switch KM_USER1 uses to KM_USER0 for clarity. Also replace open coded clear_highpage(). Signed-off-by: Miklos Szeredi Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69037118b2c9..b98664275f02 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, int err; struct page *page = *pagep; - if (page && zeroing && count < PAGE_SIZE) { - void *mapaddr = kmap_atomic(page, KM_USER1); - clear_page(mapaddr); - kunmap_atomic(mapaddr, KM_USER1); - } + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + while (count) { if (cs->write && cs->pipebufs && page) { return fuse_ref_page(cs, page, offset, count); @@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page, KM_USER1); + void *mapaddr = kmap_atomic(page, KM_USER0); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr, KM_USER1); + kunmap_atomic(mapaddr, KM_USER0); } else offset += fuse_copy_do(cs, NULL, &count); } From 2473238eac95ba6dd2c4ba19cc36aaf01465076b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Oct 2010 14:22:28 -0700 Subject: [PATCH 094/176] ihex: add support for CS:IP/EIP records ihex firmwares can include a jump address for starting execution. Add a -j option which will cause this to be written into the generated file as a record with address zero and data consisting of the address to jump to, allowing drivers to make use of this information. This format is chosen because it most closely follows the original ihex format, though it may make more sense to write a record with length zero and the address stored as the address. The records are not omitted by default since our ihex format does not include record type information and so including additional records may lead to confusion. 
Signed-off-by: Mark Brown Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- firmware/ihex2fw.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/firmware/ihex2fw.c b/firmware/ihex2fw.c index 5a03ba8c8364..ba0cf0b601bb 100644 --- a/firmware/ihex2fw.c +++ b/firmware/ihex2fw.c @@ -55,6 +55,7 @@ static int output_records(int outfd); static int sort_records = 0; static int wide_records = 0; +static int include_jump = 0; static int usage(void) { @@ -63,6 +64,7 @@ static int usage(void) fprintf(stderr, "usage: ihex2fw [] \n"); fprintf(stderr, " -w: wide records (16-bit length)\n"); fprintf(stderr, " -s: sort records by address\n"); + fprintf(stderr, " -j: include records for CS:IP/EIP address\n"); return 1; } @@ -73,7 +75,7 @@ int main(int argc, char **argv) uint8_t *data; int opt; - while ((opt = getopt(argc, argv, "ws")) != -1) { + while ((opt = getopt(argc, argv, "wsj")) != -1) { switch (opt) { case 'w': wide_records = 1; @@ -81,7 +83,9 @@ int main(int argc, char **argv) case 's': sort_records = 1; break; - default: + case 'j': + include_jump = 1; + break; return usage(); } } @@ -128,6 +132,7 @@ static int process_ihex(uint8_t *data, ssize_t size) { struct ihex_binrec *record; uint32_t offset = 0; + uint32_t data32; uint8_t type, crc = 0, crcbyte = 0; int i, j; int line = 1; @@ -223,8 +228,14 @@ static int process_ihex(uint8_t *data, ssize_t size) return -EINVAL; } + memcpy(&data32, &record->data[0], sizeof(data32)); + data32 = htonl(data32); + memcpy(&record->data[0], &data32, sizeof(data32)); + /* These records contain the CS/IP or EIP where execution - * starts. Don't really know what to do with them. */ + * starts. If requested output this as a record. */ + if (include_jump) + file_record(record); goto next_record; default: From cd1c584f380183aca268d454c14b22d8454eac4f Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 26 Oct 2010 14:22:28 -0700 Subject: [PATCH 095/176] fs/direct-io.c: fix truncation error in dio_complete() return Fix up truncation (ssize_t->int). This only matters with >2G reads/writes, which the kernel doesn't permit. Signed-off-by: Edward Shishkin Reviewed-by: Christoph Hellwig Acked-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 48d74c7391d1..85882f6ba5f7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) { ssize_t transferred = 0; From d356c0b680d15e0187de48aa303e541b461ea794 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Oct 2010 14:22:29 -0700 Subject: [PATCH 096/176] vmlinux.lds.h: gather .data..shared_aligned sections in DATA_DATA With the recent change "net: remove time limit in process_backlog()", the softnet_data variable changed from "DEFINE_PER_CPU()" to "DEFINE_PER_CPU_ALIGNED()" which moved it from the .data section to the .data.shared_align section. I'm not saying this patch is wrong, just that is what caused me to notice this larger problem. No one else in the kernel is using this aligned macro variant, so I imagine that's why no one has noticed yet. 
Since .data..shared_align isn't declared in any vmlinux files that I can see, the linker just places it last. This "just works" for most people, but when building a ROM kernel on Blackfin systems, it causes section overlap errors: bfin-uclinux-ld.real: section .init.data [00000000202e06b8 -> 00000000202e48b7] overlaps section .data.shared_aligned [00000000202e06b8 -> 00000000202e0723] I imagine other arches which support the ROM config option and thus do funky placement would see similar issues ... On x86, it is stuck in a dedicated section at the end: [8] .data PROGBITS ffffffff810ec000 2ec0000303a8 00 WA 0 0 4096 [9] .data.shared_alig PROGBITS ffffffff8111c3c0 31c3c00000c8 00 WA 0 0 64 So make sure we include this section in the DATA_DATA macro so that it is placed in the right location. Signed-off-by: Mike Frysinger Cc: Sam Ravnborg Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Alan Jenkins Cc: Greg Ungerer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f4229fb315e1..aaf1105d9d91 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -150,6 +150,7 @@ #define DATA_DATA \ *(.data) \ *(.ref.data) \ + *(.data..shared_aligned) /* percpu related */ \ DEV_KEEP(init.data) \ DEV_KEEP(exit.data) \ CPU_KEEP(init.data) \ From d88262623fb58853ed60fb110141c435de26e3de Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Oct 2010 14:22:30 -0700 Subject: [PATCH 097/176] vmlinux.lds.h: lower init ramfs alignment to 4 The new init ramfs format (cpio based) requires an alignment of 4 (per the documentation and per the source files themselves). As for compressed sources, the decompressors can all deal with unaligned buffers. The cpio source is also found in the __init sections of the kernel, so once they are read and expanded into a tmpfs, the source is freed. That means there is no need to force page alignment here either. This has been used on Blackfin systems for many releases without issue. Signed-off-by: Mike Frysinger Cc: Al Viro Cc: Sam Ravnborg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index aaf1105d9d91..2c0fc10956ba 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -637,7 +637,7 @@ #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ - . = ALIGN(PAGE_SIZE); \ + . = ALIGN(4); \ VMLINUX_SYMBOL(__initramfs_start) = .; \ *(.init.ramfs) \ VMLINUX_SYMBOL(__initramfs_end) = .; From 1df79da85657aecde2ecff052ff0cf9910311078 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:31 -0700 Subject: [PATCH 098/176] fs/buffer.c: remove duplicated assignment to b_private bh->b_private is initialized within init_buffer(), thus this assignment is redundant. 
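For reference, init_buffer() is the trivial helper sketched below (reproduced
from memory, so treat it as illustrative rather than a quotation), and
alloc_page_buffers() calls init_buffer(bh, NULL, NULL) a few lines after the
assignment being deleted, so b_private still ends up NULL exactly once:

    void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
    {
            bh->b_end_io  = handler;
            bh->b_private = private;
    }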
Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index ec21a92e08b4..8d595ab2aed1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ From 4199ca77cc44c418972e8f30a36be7d26d21a0d2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Oct 2010 14:22:32 -0700 Subject: [PATCH 099/176] fs: move exportfs since it is not a networking filesystem Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block since it provides a library function that can be (and is) used by other (non-network) filesystems. This also eliminates a kconfig dependency warning: warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS) Signed-off-by: Randy Dunlap Cc: Dave Chinner Cc: Al Viro Cc: Alex Elder Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 65781de44fc0..b5e582bd769d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -222,9 +225,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL From 99dc829256bb8cfcb1f58b7f118893fdbf608e60 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 26 Oct 2010 14:22:33 -0700 Subject: [PATCH 100/176] procfs: fix numbering in /proc/locks The lock number in /proc/locks (first field) is implemented by a counter (private field of struct seq_file) which is incremented at each call of locks_show() and reset to 1 in locks_start() whatever the offset is. It should be reset according to the actual position in the list. Because of this, the numbering erratically restarts at 1 several times when reading a long /proc/locks file. Moreover, locks_show() can be called twice to print a single line thus skipping a number. The counter should be incremented in locks_next(). And last, pos is a loff_t, which can be bigger than a pointer, so we don't use the pointer as an integer anymore, and allocate a loff_t instead. Signed-off-by: Jerome Marchand Cc: Pavel Emelyanov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 8b2b6ad56a09..4de3a2666810 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2109,7 +2109,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include static void lock_get_status(struct seq_file *f, struct file_lock *fl, - int id, char *pfx) + loff_t id, char *pfx) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2122,7 +2122,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - seq_printf(f, "%d:%s ", id, pfx); + seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? 
"ACCESS" : "POSIX ", @@ -2185,24 +2185,27 @@ static int locks_show(struct seq_file *f, void *v) fl = list_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, (long)f->private, ""); + lock_get_status(f, fl, *((loff_t *)f->private), ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, (long)f->private, " ->"); + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); - f->private++; return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { + loff_t *p = f->private; + lock_flocks(); - f->private = (void *)1; + *p = (*pos + 1); return seq_list_start(&file_lock_list, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { + loff_t *p = f->private; + ++*p; return seq_list_next(v, &file_lock_list, pos); } @@ -2220,14 +2223,14 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open(filp, &locks_seq_operations); + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); } static const struct file_operations proc_locks_operations = { .open = locks_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static int __init proc_locks_init(void) From ca1cab37d91cbe8a8333732540d43cabb54cfa85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:34 -0700 Subject: [PATCH 101/176] workqueues: s/ON_STACK/ONSTACK/ Silly though it is, completions and wait_queue_heads use foo_ONSTACK (COMPLETION_INITIALIZER_ONSTACK, DECLARE_COMPLETION_ONSTACK, __WAIT_QUEUE_HEAD_INIT_ONSTACK and DECLARE_WAIT_QUEUE_HEAD_ONSTACK) so I guess workqueues should do the same thing. s/INIT_WORK_ON_STACK/INIT_WORK_ONSTACK/ s/INIT_DELAYED_WORK_ON_STACK/INIT_DELAYED_WORK_ONSTACK/ Cc: Peter Zijlstra Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index aff0b3c27509..ae03cab4352e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -713,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4a..6c7faecd9e4a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -747,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 0b61792a2780..2129cdb115dc 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -254,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, * Issue the synchronous I/O from a different thread * to avoid generic_make_request recursion. 
*/ - INIT_WORK_ON_STACK(&req.work, do_metadata); + INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 070bb7a88936..0c0771f06bfa 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -190,7 +190,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } __INIT_WORK((_work), (_func), 0); \ } while (0) -#define INIT_WORK_ON_STACK(_work, _func) \ +#define INIT_WORK_ONSTACK(_work, _func) \ do { \ __INIT_WORK((_work), (_func), 1); \ } while (0) @@ -201,9 +201,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } init_timer(&(_work)->timer); \ } while (0) -#define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ +#define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ do { \ - INIT_WORK_ON_STACK(&(_work)->work, (_func)); \ + INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ init_timer_on_stack(&(_work)->timer); \ } while (0) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e5ff2cbaadc2..90db1bd1a978 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * checks and call back into the fixup functions where we * might deadlock. */ - INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); From 10ad5278bbc961c9df8260f3e116d60eaaa3fb18 Mon Sep 17 00:00:00 2001 From: Rahul Ruikar Date: Tue, 26 Oct 2010 14:22:35 -0700 Subject: [PATCH 102/176] drivers/misc/phantom.c: add missing warning messages in phantom_probe() phantom_probe() can fail in many places. Add missing warning messages in pci_enable_device() and pci_request_regions(). Signed-off-by: Rahul Ruikar Cc: Jiri Slaby Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/phantom.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index 4197a3cb26ba..b05db55c8c8e 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -343,8 +343,10 @@ static int __devinit phantom_probe(struct pci_dev *pdev, int retval; retval = pci_enable_device(pdev); - if (retval) + if (retval) { + dev_err(&pdev->dev, "pci_enable_device failed!\n"); goto err; + } minor = phantom_get_free(); if (minor == PHANTOM_MAX_MINORS) { @@ -356,8 +358,10 @@ static int __devinit phantom_probe(struct pci_dev *pdev, phantom_devices[minor] = 1; retval = pci_request_regions(pdev, "phantom"); - if (retval) + if (retval) { + dev_err(&pdev->dev, "pci_request_regions failed!\n"); goto err_null; + } retval = -ENOMEM; pht = kzalloc(sizeof(*pht), GFP_KERNEL); From 5f400cf40fc703673aa791966ffb1c628c1ff45a Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:22:35 -0700 Subject: [PATCH 103/176] drivers/misc/ad525x_dpot.c: fix part name typos in defines There is no runtime effect by this change. It frees up namespace for defines erroneously used. This is required to actually support devices requiring the namespace, added with "drivers/misc/ad525x_dpot.c: new features". All defines touched have the same value defined, after the change. 
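A quick way to convince oneself that the renames are purely textual
(illustrative only, not part of the patch) is a compile-time check against the
values visible in the header diff below; each new name expands to the same
constant as the name it replaces:

    /* could be dropped into any function in ad525x_dpot.c */
    BUILD_BUG_ON(DPOT_AD5282_RDAC_AB  != 0x80);   /* was DPOT_AD5291_RDAC_AB  */
    BUILD_BUG_ON(DPOT_AD5172_3_A0     != 0x08);   /* was DPOT_AD5272_3_A0     */
    BUILD_BUG_ON(DPOT_AD5170_2_3_FUSE != 0x20);   /* was DPOT_AD5270_2_3_FUSE */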
Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot.c | 10 +++++----- drivers/misc/ad525x_dpot.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c index 5e6fa8449e8b..9698c7546911 100644 --- a/drivers/misc/ad525x_dpot.c +++ b/drivers/misc/ad525x_dpot.c @@ -166,7 +166,7 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) case DPOT_UID(AD5280_ID): case DPOT_UID(AD5282_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5291_RDAC_AB; + 0 : DPOT_AD5282_RDAC_AB; return dpot_read_r8d8(dpot, ctrl); case DPOT_UID(AD5170_ID): case DPOT_UID(AD5171_ID): @@ -175,7 +175,7 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) case DPOT_UID(AD5172_ID): case DPOT_UID(AD5173_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5272_3_A0; + 0 : DPOT_AD5172_3_A0; return dpot_read_r8d8(dpot, ctrl); default: if ((reg & DPOT_REG_TOL) || (dpot->max_pos > 256)) @@ -273,7 +273,7 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) case DPOT_UID(AD5280_ID): case DPOT_UID(AD5282_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5291_RDAC_AB; + 0 : DPOT_AD5282_RDAC_AB; return dpot_write_r8d8(dpot, ctrl, value); break; case DPOT_UID(AD5171_ID): @@ -289,12 +289,12 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) case DPOT_UID(AD5172_ID): case DPOT_UID(AD5173_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5272_3_A0; + 0 : DPOT_AD5172_3_A0; if (reg & DPOT_ADDR_OTP) { tmp = dpot_read_r8d16(dpot, ctrl); if (tmp >> 14) /* Ready to Program? */ return -EFAULT; - ctrl |= DPOT_AD5270_2_3_FUSE; + ctrl |= DPOT_AD5170_2_3_FUSE; } return dpot_write_r8d8(dpot, ctrl, value); break; diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 78b89fd2e2fd..7609d49efd31 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -166,14 +166,14 @@ enum dpot_devid { #define DPOT_AD5291_RDAC 0x01 #define DPOT_AD5291_READ_RDAC 0x02 -/* AD524x use special commands */ #define DPOT_AD5291_RDAC_AB 0x80 +#define DPOT_AD5282_RDAC_AB 0x80 #define DPOT_AD5273_FUSE 0x80 -#define DPOT_AD5270_2_3_FUSE 0x20 -#define DPOT_AD5270_2_3_OW 0x08 -#define DPOT_AD5272_3_A0 0x08 -#define DPOT_AD5270_2FUSE 0x80 +#define DPOT_AD5170_2_3_FUSE 0x20 +#define DPOT_AD5170_2_3_OW 0x08 +#define DPOT_AD5172_3_A0 0x08 +#define DPOT_AD5170_2FUSE 0x80 struct dpot_data; From a4bd394956f20d0bfc0ca6ecfac2af4150da274a Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:22:36 -0700 Subject: [PATCH 104/176] drivers/misc/ad525x_dpot.c: new features Add support for AD5270, AD5271, AD5272, AD5274 digital potentiometers. Add 20-TP feature for AD5291 and AD5292 parts, and update feature list. AD5291 rdac read back must be shifted by two. 
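On the shift: the AD5291 is a 256-position part sharing the 10-bit register
layout of the AD5292/AD5293, so its 8-bit wiper value occupies the upper bits
of the data field.  The driver does this inline with "value << 2" on writes
and "value >> 2" on read-back; an equivalent pair of helpers (illustrative
only, names invented here) would be:

    static inline u16 ad5291_pos_to_reg(u8 pos) { return (u16)pos << 2; }
    static inline u8 ad5291_reg_to_pos(u16 reg) { return reg >> 2; }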
Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 3 +- drivers/misc/ad525x_dpot-i2c.c | 2 + drivers/misc/ad525x_dpot-spi.c | 2 + drivers/misc/ad525x_dpot.c | 107 ++++++++++++++++++++++++++++++--- drivers/misc/ad525x_dpot.h | 23 ++++++- 5 files changed, 123 insertions(+), 14 deletions(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 1f69743b12ec..3d57adfc8703 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -24,7 +24,8 @@ config AD525X_DPOT AD5260, AD5262, AD5263, AD5290, AD5291, AD5292, AD5293, AD7376, AD8400, AD8402, AD8403, ADN2850, AD5241, AD5242, AD5243, AD5245, AD5246, AD5247, AD5248, AD5280, AD5282, - ADN2860, AD5273, AD5171, AD5170, AD5172, AD5173 + ADN2860, AD5273, AD5171, AD5170, AD5172, AD5173, AD5270, + AD5271, AD5272, AD5274 digital potentiometer chips. See Documentation/misc-devices/ad525x_dpot.txt for the diff --git a/drivers/misc/ad525x_dpot-i2c.c b/drivers/misc/ad525x_dpot-i2c.c index 374352af7979..4ff73c215746 100644 --- a/drivers/misc/ad525x_dpot-i2c.c +++ b/drivers/misc/ad525x_dpot-i2c.c @@ -102,6 +102,8 @@ static const struct i2c_device_id ad_dpot_id[] = { {"ad5170", AD5170_ID}, {"ad5172", AD5172_ID}, {"ad5173", AD5173_ID}, + {"ad5272", AD5272_ID}, + {"ad5274", AD5274_ID}, {} }; MODULE_DEVICE_TABLE(i2c, ad_dpot_id); diff --git a/drivers/misc/ad525x_dpot-spi.c b/drivers/misc/ad525x_dpot-spi.c index 6cfcb636577a..7f9a55afe05d 100644 --- a/drivers/misc/ad525x_dpot-spi.c +++ b/drivers/misc/ad525x_dpot-spi.c @@ -38,6 +38,8 @@ static const struct ad_dpot_id ad_dpot_spi_devlist[] = { {.name = "ad8402", .devid = AD8402_ID}, {.name = "ad8403", .devid = AD8403_ID}, {.name = "adn2850", .devid = ADN2850_ID}, + {.name = "ad5270", .devid = AD5270_ID}, + {.name = "ad5271", .devid = AD5271_ID}, {} }; diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c index 9698c7546911..7cb911028d09 100644 --- a/drivers/misc/ad525x_dpot.c +++ b/drivers/misc/ad525x_dpot.c @@ -29,9 +29,9 @@ * AD5262 2 256 20, 50, 200 * AD5263 4 256 20, 50, 200 * AD5290 1 256 10, 50, 100 - * AD5291 1 256 20 - * AD5292 1 1024 20 - * AD5293 1 1024 20 + * AD5291 1 256 20, 50, 100 (20-TP) + * AD5292 1 1024 20, 50, 100 (20-TP) + * AD5293 1 1024 20, 50, 100 * AD7376 1 128 10, 50, 100, 1M * AD8400 1 256 1, 10, 50, 100 * AD8402 2 256 1, 10, 50, 100 @@ -52,6 +52,10 @@ * AD5170 1 256 2.5, 10, 50, 100 (OTP) * AD5172 2 256 2.5, 10, 50, 100 (OTP) * AD5173 2 256 2.5, 10, 50, 100 (OTP) + * AD5270 1 1024 20, 50, 100 (50-TP) + * AD5271 1 256 20, 50, 100 (50-TP) + * AD5272 1 1024 20, 50, 100 (50-TP) + * AD5274 1 256 20, 50, 100 (50-TP) * * See Documentation/misc-devices/ad525x_dpot.txt for more info. 
* @@ -126,18 +130,38 @@ static inline int dpot_write_r8d16(struct dpot_data *dpot, u8 reg, u16 val) static s32 dpot_read_spi(struct dpot_data *dpot, u8 reg) { unsigned ctrl = 0; + int value; if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) { if (dpot->feat & F_RDACS_WONLY) return dpot->rdac_cache[reg & DPOT_RDAC_MASK]; - if (dpot->uid == DPOT_UID(AD5291_ID) || dpot->uid == DPOT_UID(AD5292_ID) || - dpot->uid == DPOT_UID(AD5293_ID)) - return dpot_read_r8d8(dpot, + dpot->uid == DPOT_UID(AD5293_ID)) { + + value = dpot_read_r8d8(dpot, DPOT_AD5291_READ_RDAC << 2); + if (dpot->uid == DPOT_UID(AD5291_ID)) + value = value >> 2; + + return value; + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == DPOT_UID(AD5271_ID)) { + + value = dpot_read_r8d8(dpot, + DPOT_AD5270_1_2_4_READ_RDAC << 2); + + if (value < 0) + return value; + + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value >> 2; + + return value; + } + ctrl = DPOT_SPI_READ_RDAC; } else if (reg & DPOT_ADDR_EEPROM) { ctrl = DPOT_SPI_READ_EEPROM; @@ -153,6 +177,7 @@ static s32 dpot_read_spi(struct dpot_data *dpot, u8 reg) static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) { + int value; unsigned ctrl = 0; switch (dpot->uid) { case DPOT_UID(AD5246_ID): @@ -177,6 +202,25 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? 0 : DPOT_AD5172_3_A0; return dpot_read_r8d8(dpot, ctrl); + case DPOT_UID(AD5272_ID): + case DPOT_UID(AD5274_ID): + dpot_write_r8d8(dpot, + (DPOT_AD5270_1_2_4_READ_RDAC << 2), 0); + + value = dpot_read_r8d16(dpot, + DPOT_AD5270_1_2_4_RDAC << 2); + + if (value < 0) + return value; + /* + * AD5272/AD5274 returns high byte first, however + * underling smbus expects low byte first. + */ + value = swab16(value); + + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value >> 2; + return value; default: if ((reg & DPOT_REG_TOL) || (dpot->max_pos > 256)) return dpot_read_r8d16(dpot, (reg & 0xF8) | @@ -198,7 +242,7 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) { unsigned val = 0; - if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) { + if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD | DPOT_ADDR_OTP))) { if (dpot->feat & F_RDACS_WONLY) dpot->rdac_cache[reg & DPOT_RDAC_MASK] = value; @@ -219,11 +263,30 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) } else { if (dpot->uid == DPOT_UID(AD5291_ID) || dpot->uid == DPOT_UID(AD5292_ID) || - dpot->uid == DPOT_UID(AD5293_ID)) + dpot->uid == DPOT_UID(AD5293_ID)) { + + dpot_write_r8d8(dpot, DPOT_AD5291_CTRLREG << 2, + DPOT_AD5291_UNLOCK_CMD); + + if (dpot->uid == DPOT_UID(AD5291_ID)) + value = value << 2; + return dpot_write_r8d8(dpot, (DPOT_AD5291_RDAC << 2) | (value >> 8), value & 0xFF); + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == DPOT_UID(AD5271_ID)) { + dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_CTRLREG << 2, + DPOT_AD5270_1_2_4_UNLOCK_CMD); + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value << 2; + + return dpot_write_r8d8(dpot, + (DPOT_AD5270_1_2_4_RDAC << 2) | + (value >> 8), value & 0xFF); + } val = DPOT_SPI_RDAC | (reg & DPOT_RDAC_MASK); } } else if (reg & DPOT_ADDR_EEPROM) { @@ -243,6 +306,16 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) val = DPOT_SPI_INC_ALL; break; } + } else if (reg & DPOT_ADDR_OTP) { + if (dpot->uid == DPOT_UID(AD5291_ID) || + dpot->uid == DPOT_UID(AD5292_ID)) { + return dpot_write_r8d8(dpot, + DPOT_AD5291_STORE_XTPM << 2, 0); + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == 
DPOT_UID(AD5271_ID)) { + return dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_STORE_XTPM << 2, 0); + } } else BUG(); @@ -303,10 +376,25 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) tmp = dpot_read_r8d16(dpot, tmp); if (tmp >> 14) /* Ready to Program? */ return -EFAULT; - ctrl = DPOT_AD5270_2_3_FUSE; + ctrl = DPOT_AD5170_2_3_FUSE; } return dpot_write_r8d8(dpot, ctrl, value); break; + case DPOT_UID(AD5272_ID): + case DPOT_UID(AD5274_ID): + dpot_write_r8d8(dpot, DPOT_AD5270_1_2_4_CTRLREG << 2, + DPOT_AD5270_1_2_4_UNLOCK_CMD); + + if (reg & DPOT_ADDR_OTP) + return dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_STORE_XTPM << 2, 0); + + if (dpot->uid == DPOT_UID(AD5274_ID)) + value = value << 2; + + return dpot_write_r8d8(dpot, (DPOT_AD5270_1_2_4_RDAC << 2) | + (value >> 8), value & 0xFF); + break; default: if (reg & DPOT_ADDR_CMD) return dpot_write_d8(dpot, reg); @@ -320,7 +408,6 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) } } - static s32 dpot_write(struct dpot_data *dpot, u8 reg, u16 value) { if (dpot->feat & F_SPI) diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 7609d49efd31..3b9e355489ae 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -93,8 +93,10 @@ enum dpot_devid { BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 8, 23), AD5290_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT, BRDAC0, 8, 24), - AD5291_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 8, 25), - AD5292_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 26), + AD5291_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT | F_CMD_OTP, + BRDAC0, 8, 25), + AD5292_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT | F_CMD_OTP, + BRDAC0, 10, 26), AD5293_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 27), AD7376_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT, BRDAC0, 7, 28), @@ -122,6 +124,12 @@ enum dpot_devid { AD5170_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 8, 45), AD5172_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 46), AD5173_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 47), + AD5270_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP | F_SPI_16BIT, + BRDAC0, 10, 48), + AD5271_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP | F_SPI_16BIT, + BRDAC0, 8, 49), + AD5272_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 10, 50), + AD5274_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 8, 51), }; #define DPOT_RDAC0 0 @@ -165,10 +173,19 @@ enum dpot_devid { /* AD5291/2/3 use special commands */ #define DPOT_AD5291_RDAC 0x01 #define DPOT_AD5291_READ_RDAC 0x02 +#define DPOT_AD5291_STORE_XTPM 0x03 +#define DPOT_AD5291_CTRLREG 0x06 +#define DPOT_AD5291_UNLOCK_CMD 0x03 -#define DPOT_AD5291_RDAC_AB 0x80 +/* AD5270/1/2/4 use special commands */ +#define DPOT_AD5270_1_2_4_RDAC 0x01 +#define DPOT_AD5270_1_2_4_READ_RDAC 0x02 +#define DPOT_AD5270_1_2_4_STORE_XTPM 0x03 +#define DPOT_AD5270_1_2_4_CTRLREG 0x07 +#define DPOT_AD5270_1_2_4_UNLOCK_CMD 0x03 #define DPOT_AD5282_RDAC_AB 0x80 + #define DPOT_AD5273_FUSE 0x80 #define DPOT_AD5170_2_3_FUSE 0x20 #define DPOT_AD5170_2_3_OW 0x08 From 4b068de9ab1c404734fde90ce5d3f4f5b4f0b9d5 Mon Sep 17 00:00:00 2001 From: steven miao Date: Tue, 26 Oct 2010 14:22:37 -0700 Subject: [PATCH 105/176] ad525x_dpot: use correct rdac channel for ad5251/ad5252 The ad5251/ad5252 devices have rdac1 and rdac3, but no rdac0. So make sure we use the right channels so userspace gets correct data and not just garbage. 
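The mask matters because it selects which rdacN sysfs channels the core
registers for the part.  In rough, hypothetical form (field and helper names
approximate; the real loop lives in the ad_dpot probe path):

    for (i = DPOT_RDAC0; i <= DPOT_RDAC3; i++)
            if (dpot->rdacs & (1 << i))
                    create_rdac_files(dev, i);      /* hypothetical helper */

With BRDAC0 advertised, an rdac0 channel is created even though the
ad5251/ad5252 silicon has no wiper 0 behind it, which is where the garbage
readings came from.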
Signed-off-by: steven miao Signed-off-by: Mike Frysinger Cc: Michael Hennerich Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 3b9e355489ae..a662f5987b68 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -47,9 +47,9 @@ enum dpot_devid { AD5258_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 6, 0), /* I2C */ AD5259_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 8, 1), AD5251_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, - BRDAC0 | BRDAC3, 6, 2), + BRDAC1 | BRDAC3, 6, 2), AD5252_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, - BRDAC0 | BRDAC3, 8, 3), + BRDAC1 | BRDAC3, 8, 3), AD5253_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 6, 4), AD5254_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, From 190420ab34ab4c077c641893ac19f364cf3606e4 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:37 -0700 Subject: [PATCH 106/176] drivers/misc: driver for bh1770glc / sfh7770 ALS and proximity sensor This is a driver for ROHM BH1770GLC and OSRAM SFH7770 combined ALS and proximity sensor. Interface is sysfs based. The driver uses interrupts to provide new data. The driver supports pm_runtime and regulator frameworks. See Documentation/misc-devices/bh1770glc.txt for details Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 + drivers/misc/Makefile | 1 + drivers/misc/bh1770glc.c | 1413 +++++++++++++++++++++++++++++++++ include/linux/i2c/bh1770glc.h | 53 ++ 4 files changed, 1477 insertions(+) create mode 100644 drivers/misc/bh1770glc.c create mode 100644 include/linux/i2c/bh1770glc.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3d57adfc8703..3b50106abfa5 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -315,6 +315,16 @@ config SENSORS_BH1780 This driver can also be built as a module. If so, the module will be called bh1780gli. +config SENSORS_BH1770 + tristate "BH1770GLC / SFH7770 combined ALS - Proximity sensor" + depends on I2C + ---help--- + Say Y here if you want to build a driver for BH1770GLC (ROHM) or + SFH7770 (Osram) combined ambient light and proximity sensor chip. + + To compile this driver as a module, choose M here: the + module will be called bh1770glc. If unsure, say N here. + config HMC6352 tristate "Honeywell HMC6352 compass" depends on I2C diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 9f2986b4da2f..5fed6ab533e2 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_TIFM_CORE) += tifm_core.o obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SENSORS_BH1780) += bh1780gli.o +obj-$(CONFIG_SENSORS_BH1770) += bh1770glc.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o diff --git a/drivers/misc/bh1770glc.c b/drivers/misc/bh1770glc.c new file mode 100644 index 000000000000..cee632e645e1 --- /dev/null +++ b/drivers/misc/bh1770glc.c @@ -0,0 +1,1413 @@ +/* + * This file is part of the ROHM BH1770GLC / OSRAM SFH7770 sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). 
+ * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BH1770_ALS_CONTROL 0x80 /* ALS operation mode control */ +#define BH1770_PS_CONTROL 0x81 /* PS operation mode control */ +#define BH1770_I_LED 0x82 /* active LED and LED1, LED2 current */ +#define BH1770_I_LED3 0x83 /* LED3 current setting */ +#define BH1770_ALS_PS_MEAS 0x84 /* Forced mode trigger */ +#define BH1770_PS_MEAS_RATE 0x85 /* PS meas. rate at stand alone mode */ +#define BH1770_ALS_MEAS_RATE 0x86 /* ALS meas. rate at stand alone mode */ +#define BH1770_PART_ID 0x8a /* Part number and revision ID */ +#define BH1770_MANUFACT_ID 0x8b /* Manufacturerer ID */ +#define BH1770_ALS_DATA_0 0x8c /* ALS DATA low byte */ +#define BH1770_ALS_DATA_1 0x8d /* ALS DATA high byte */ +#define BH1770_ALS_PS_STATUS 0x8e /* Measurement data and int status */ +#define BH1770_PS_DATA_LED1 0x8f /* PS data from LED1 */ +#define BH1770_PS_DATA_LED2 0x90 /* PS data from LED2 */ +#define BH1770_PS_DATA_LED3 0x91 /* PS data from LED3 */ +#define BH1770_INTERRUPT 0x92 /* Interrupt setting */ +#define BH1770_PS_TH_LED1 0x93 /* PS interrupt threshold for LED1 */ +#define BH1770_PS_TH_LED2 0x94 /* PS interrupt threshold for LED2 */ +#define BH1770_PS_TH_LED3 0x95 /* PS interrupt threshold for LED3 */ +#define BH1770_ALS_TH_UP_0 0x96 /* ALS upper threshold low byte */ +#define BH1770_ALS_TH_UP_1 0x97 /* ALS upper threshold high byte */ +#define BH1770_ALS_TH_LOW_0 0x98 /* ALS lower threshold low byte */ +#define BH1770_ALS_TH_LOW_1 0x99 /* ALS lower threshold high byte */ + +/* MANUFACT_ID */ +#define BH1770_MANUFACT_ROHM 0x01 +#define BH1770_MANUFACT_OSRAM 0x03 + +/* PART_ID */ +#define BH1770_PART 0x90 +#define BH1770_PART_MASK 0xf0 +#define BH1770_REV_MASK 0x0f +#define BH1770_REV_SHIFT 0 +#define BH1770_REV_0 0x00 +#define BH1770_REV_1 0x01 + +/* Operating modes for both */ +#define BH1770_STANDBY 0x00 +#define BH1770_FORCED 0x02 +#define BH1770_STANDALONE 0x03 +#define BH1770_SWRESET (0x01 << 2) + +#define BH1770_PS_TRIG_MEAS (1 << 0) +#define BH1770_ALS_TRIG_MEAS (1 << 1) + +/* Interrupt control */ +#define BH1770_INT_OUTPUT_MODE (1 << 3) /* 0 = latched */ +#define BH1770_INT_POLARITY (1 << 2) /* 1 = active high */ +#define BH1770_INT_ALS_ENA (1 << 1) +#define BH1770_INT_PS_ENA (1 << 0) + +/* Interrupt status */ +#define BH1770_INT_LED1_DATA (1 << 0) +#define BH1770_INT_LED1_INT (1 << 1) +#define BH1770_INT_LED2_DATA (1 << 2) +#define BH1770_INT_LED2_INT (1 << 3) +#define BH1770_INT_LED3_DATA (1 << 4) +#define BH1770_INT_LED3_INT (1 << 5) +#define BH1770_INT_LEDS_INT ((1 << 1) | (1 << 3) | (1 << 5)) +#define BH1770_INT_ALS_DATA (1 << 6) +#define BH1770_INT_ALS_INT (1 << 7) + +/* Led channels */ +#define BH1770_LED1 0x00 + +#define BH1770_DISABLE 0 +#define BH1770_ENABLE 1 +#define 
BH1770_PROX_CHANNELS 1 + +#define BH1770_LUX_DEFAULT_RATE 1 /* Index to lux rate table */ +#define BH1770_PROX_DEFAULT_RATE 1 /* Direct HW value =~ 50Hz */ +#define BH1770_PROX_DEF_RATE_THRESH 6 /* Direct HW value =~ 5 Hz */ +#define BH1770_STARTUP_DELAY 50 +#define BH1770_RESET_TIME 10 +#define BH1770_TIMEOUT 2100 /* Timeout in 2.1 seconds */ + +#define BH1770_LUX_RANGE 65535 +#define BH1770_PROX_RANGE 255 +#define BH1770_COEF_SCALER 1024 +#define BH1770_CALIB_SCALER 8192 +#define BH1770_LUX_NEUTRAL_CALIB_VALUE (1 * BH1770_CALIB_SCALER) +#define BH1770_LUX_DEF_THRES 1000 +#define BH1770_PROX_DEF_THRES 70 +#define BH1770_PROX_DEF_ABS_THRES 100 +#define BH1770_DEFAULT_PERSISTENCE 10 +#define BH1770_PROX_MAX_PERSISTENCE 50 +#define BH1770_LUX_GA_SCALE 16384 +#define BH1770_LUX_CF_SCALE 2048 /* CF ChipFactor */ +#define BH1770_NEUTRAL_CF BH1770_LUX_CF_SCALE +#define BH1770_LUX_CORR_SCALE 4096 + +#define PROX_ABOVE_THRESHOLD 1 +#define PROX_BELOW_THRESHOLD 0 + +#define PROX_IGNORE_LUX_LIMIT 500 + +struct bh1770_chip { + struct bh1770_platform_data *pdata; + char chipname[10]; + u8 revision; + struct i2c_client *client; + struct regulator_bulk_data regs[2]; + struct mutex mutex; /* avoid parallel access */ + wait_queue_head_t wait; + + bool int_mode_prox; + bool int_mode_lux; + struct delayed_work prox_work; + u32 lux_cf; /* Chip specific factor */ + u32 lux_ga; + u32 lux_calib; + int lux_rate_index; + u32 lux_corr; + u16 lux_data_raw; + u16 lux_threshold_hi; + u16 lux_threshold_lo; + u16 lux_thres_hi_onchip; + u16 lux_thres_lo_onchip; + bool lux_wait_result; + + int prox_enable_count; + u16 prox_coef; + u16 prox_const; + int prox_rate; + int prox_rate_threshold; + u8 prox_persistence; + u8 prox_persistence_counter; + u8 prox_data; + u8 prox_threshold; + u8 prox_threshold_hw; + bool prox_force_update; + u8 prox_abs_thres; + u8 prox_led; +}; + +static const char reg_vcc[] = "Vcc"; +static const char reg_vleds[] = "Vleds"; + +/* + * Supported stand alone rates in ms from chip data sheet + * {10, 20, 30, 40, 70, 100, 200, 500, 1000, 2000}; + */ +static const s16 prox_rates_hz[] = {100, 50, 33, 25, 14, 10, 5, 2}; +static const s16 prox_rates_ms[] = {10, 20, 30, 40, 70, 100, 200, 500}; + +/* Supported IR-led currents in mA */ +static const u8 prox_curr_ma[] = {5, 10, 20, 50, 100, 150, 200}; + +/* + * Supported stand alone rates in ms from chip data sheet + * {100, 200, 500, 1000, 2000}; + */ +static const s16 lux_rates_hz[] = {10, 5, 2, 1, 0}; + +/* + * interrupt control functions are called while keeping chip->mutex + * excluding module probe / remove + */ +static inline int bh1770_lux_interrupt_control(struct bh1770_chip *chip, + int lux) +{ + chip->int_mode_lux = lux; + /* Set interrupt modes, interrupt active low, latched */ + return i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, + (lux << 1) | chip->int_mode_prox); +} + +static inline int bh1770_prox_interrupt_control(struct bh1770_chip *chip, + int ps) +{ + chip->int_mode_prox = ps; + return i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, + (chip->int_mode_lux << 1) | (ps << 0)); +} + +/* chip->mutex is always kept here */ +static int bh1770_lux_rate(struct bh1770_chip *chip, int rate_index) +{ + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* Proper proximity response needs fastest lux rate (100ms) */ + if (chip->prox_enable_count) + rate_index = 0; + + return i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_MEAS_RATE, + rate_index); +} + 
+static int bh1770_prox_rate(struct bh1770_chip *chip, int mode) +{ + int rate; + + rate = (mode == PROX_ABOVE_THRESHOLD) ? + chip->prox_rate_threshold : chip->prox_rate; + + return i2c_smbus_write_byte_data(chip->client, + BH1770_PS_MEAS_RATE, + rate); +} + +/* InfraredLED is controlled by the chip during proximity scanning */ +static inline int bh1770_led_cfg(struct bh1770_chip *chip) +{ + /* LED cfg, current for leds 1 and 2 */ + return i2c_smbus_write_byte_data(chip->client, + BH1770_I_LED, + (BH1770_LED1 << 6) | + (BH1770_LED_5mA << 3) | + chip->prox_led); +} + +/* + * Following two functions converts raw ps values from HW to normalized + * values. Purpose is to compensate differences between different sensor + * versions and variants so that result means about the same between + * versions. + */ +static inline u8 bh1770_psraw_to_adjusted(struct bh1770_chip *chip, u8 psraw) +{ + u16 adjusted; + adjusted = (u16)(((u32)(psraw + chip->prox_const) * chip->prox_coef) / + BH1770_COEF_SCALER); + if (adjusted > BH1770_PROX_RANGE) + adjusted = BH1770_PROX_RANGE; + return adjusted; +} + +static inline u8 bh1770_psadjusted_to_raw(struct bh1770_chip *chip, u8 ps) +{ + u16 raw; + + raw = (((u32)ps * BH1770_COEF_SCALER) / chip->prox_coef); + if (raw > chip->prox_const) + raw = raw - chip->prox_const; + else + raw = 0; + return raw; +} + +/* + * Following two functions converts raw lux values from HW to normalized + * values. Purpose is to compensate differences between different sensor + * versions and variants so that result means about the same between + * versions. Chip->mutex is kept when this is called. + */ +static int bh1770_prox_set_threshold(struct bh1770_chip *chip) +{ + u8 tmp = 0; + + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + tmp = bh1770_psadjusted_to_raw(chip, chip->prox_threshold); + chip->prox_threshold_hw = tmp; + + return i2c_smbus_write_byte_data(chip->client, BH1770_PS_TH_LED1, + tmp); +} + +static inline u16 bh1770_lux_raw_to_adjusted(struct bh1770_chip *chip, u16 raw) +{ + u32 lux; + lux = ((u32)raw * chip->lux_corr) / BH1770_LUX_CORR_SCALE; + return min(lux, (u32)BH1770_LUX_RANGE); +} + +static inline u16 bh1770_lux_adjusted_to_raw(struct bh1770_chip *chip, + u16 adjusted) +{ + return (u32)adjusted * BH1770_LUX_CORR_SCALE / chip->lux_corr; +} + +/* chip->mutex is kept when this is called */ +static int bh1770_lux_update_thresholds(struct bh1770_chip *chip, + u16 threshold_hi, u16 threshold_lo) +{ + u8 data[4]; + int ret; + + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* + * Compensate threshold values with the correction factors if not + * set to minimum or maximum. + * Min & max values disables interrupts. 
+ */ + if (threshold_hi != BH1770_LUX_RANGE && threshold_hi != 0) + threshold_hi = bh1770_lux_adjusted_to_raw(chip, threshold_hi); + + if (threshold_lo != BH1770_LUX_RANGE && threshold_lo != 0) + threshold_lo = bh1770_lux_adjusted_to_raw(chip, threshold_lo); + + if (chip->lux_thres_hi_onchip == threshold_hi && + chip->lux_thres_lo_onchip == threshold_lo) + return 0; + + chip->lux_thres_hi_onchip = threshold_hi; + chip->lux_thres_lo_onchip = threshold_lo; + + data[0] = threshold_hi; + data[1] = threshold_hi >> 8; + data[2] = threshold_lo; + data[3] = threshold_lo >> 8; + + ret = i2c_smbus_write_i2c_block_data(chip->client, + BH1770_ALS_TH_UP_0, + ARRAY_SIZE(data), + data); + return ret; +} + +static int bh1770_lux_get_result(struct bh1770_chip *chip) +{ + u16 data; + int ret; + + ret = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_DATA_0); + if (ret < 0) + return ret; + + data = ret & 0xff; + ret = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_DATA_1); + if (ret < 0) + return ret; + + chip->lux_data_raw = data | ((ret & 0xff) << 8); + + return 0; +} + +/* Calculate correction value which contains chip and device specific parts */ +static u32 bh1770_get_corr_value(struct bh1770_chip *chip) +{ + u32 tmp; + /* Impact of glass attenuation correction */ + tmp = (BH1770_LUX_CORR_SCALE * chip->lux_ga) / BH1770_LUX_GA_SCALE; + /* Impact of chip factor correction */ + tmp = (tmp * chip->lux_cf) / BH1770_LUX_CF_SCALE; + /* Impact of Device specific calibration correction */ + tmp = (tmp * chip->lux_calib) / BH1770_CALIB_SCALER; + return tmp; +} + +static int bh1770_lux_read_result(struct bh1770_chip *chip) +{ + bh1770_lux_get_result(chip); + return bh1770_lux_raw_to_adjusted(chip, chip->lux_data_raw); +} + +/* + * Chip on / off functions are called while keeping mutex except probe + * or remove phase + */ +static int bh1770_chip_on(struct bh1770_chip *chip) +{ + int ret = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (ret < 0) + return ret; + + usleep_range(BH1770_STARTUP_DELAY, BH1770_STARTUP_DELAY * 2); + + /* Reset the chip */ + i2c_smbus_write_byte_data(chip->client, BH1770_ALS_CONTROL, + BH1770_SWRESET); + usleep_range(BH1770_RESET_TIME, BH1770_RESET_TIME * 2); + + /* + * ALS is started always since proximity needs als results + * for realibility estimation. + * Let's assume dark until the first ALS measurement is ready. 
+ */ + chip->lux_data_raw = 0; + chip->prox_data = 0; + ret = i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_CONTROL, BH1770_STANDALONE); + + /* Assume reset defaults */ + chip->lux_thres_hi_onchip = BH1770_LUX_RANGE; + chip->lux_thres_lo_onchip = 0; + + return ret; +} + +static void bh1770_chip_off(struct bh1770_chip *chip) +{ + i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, BH1770_DISABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_CONTROL, BH1770_STANDBY); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDBY); + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +} + +/* chip->mutex is kept when this is called */ +static int bh1770_prox_mode_control(struct bh1770_chip *chip) +{ + if (chip->prox_enable_count) { + chip->prox_force_update = true; /* Force immediate update */ + + bh1770_lux_rate(chip, chip->lux_rate_index); + bh1770_prox_set_threshold(chip); + bh1770_led_cfg(chip); + bh1770_prox_rate(chip, PROX_BELOW_THRESHOLD); + bh1770_prox_interrupt_control(chip, BH1770_ENABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDALONE); + } else { + chip->prox_data = 0; + bh1770_lux_rate(chip, chip->lux_rate_index); + bh1770_prox_interrupt_control(chip, BH1770_DISABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDBY); + } + return 0; +} + +/* chip->mutex is kept when this is called */ +static int bh1770_prox_read_result(struct bh1770_chip *chip) +{ + int ret; + bool above; + u8 mode; + + ret = i2c_smbus_read_byte_data(chip->client, BH1770_PS_DATA_LED1); + if (ret < 0) + goto out; + + if (ret > chip->prox_threshold_hw) + above = true; + else + above = false; + + /* + * when ALS levels goes above limit, proximity result may be + * false proximity. Thus ignore the result. With real proximity + * there is a shadow causing low als levels. 
+ */ + if (chip->lux_data_raw > PROX_IGNORE_LUX_LIMIT) + ret = 0; + + chip->prox_data = bh1770_psraw_to_adjusted(chip, ret); + + /* Strong proximity level or force mode requires immediate response */ + if (chip->prox_data >= chip->prox_abs_thres || + chip->prox_force_update) + chip->prox_persistence_counter = chip->prox_persistence; + + chip->prox_force_update = false; + + /* Persistence filttering to reduce false proximity events */ + if (likely(above)) { + if (chip->prox_persistence_counter < chip->prox_persistence) { + chip->prox_persistence_counter++; + ret = -ENODATA; + } else { + mode = PROX_ABOVE_THRESHOLD; + ret = 0; + } + } else { + chip->prox_persistence_counter = 0; + mode = PROX_BELOW_THRESHOLD; + chip->prox_data = 0; + ret = 0; + } + + /* Set proximity detection rate based on above or below value */ + if (ret == 0) { + bh1770_prox_rate(chip, mode); + sysfs_notify(&chip->client->dev.kobj, NULL, "prox0_raw"); + } +out: + return ret; +} + +static int bh1770_detect(struct bh1770_chip *chip) +{ + struct i2c_client *client = chip->client; + s32 ret; + u8 manu, part; + + ret = i2c_smbus_read_byte_data(client, BH1770_MANUFACT_ID); + if (ret < 0) + goto error; + manu = (u8)ret; + + ret = i2c_smbus_read_byte_data(client, BH1770_PART_ID); + if (ret < 0) + goto error; + part = (u8)ret; + + chip->revision = (part & BH1770_REV_MASK) >> BH1770_REV_SHIFT; + chip->prox_coef = BH1770_COEF_SCALER; + chip->prox_const = 0; + chip->lux_cf = BH1770_NEUTRAL_CF; + + if ((manu == BH1770_MANUFACT_ROHM) && + ((part & BH1770_PART_MASK) == BH1770_PART)) { + snprintf(chip->chipname, sizeof(chip->chipname), "BH1770GLC"); + return 0; + } + + if ((manu == BH1770_MANUFACT_OSRAM) && + ((part & BH1770_PART_MASK) == BH1770_PART)) { + snprintf(chip->chipname, sizeof(chip->chipname), "SFH7770"); + /* Values selected by comparing different versions */ + chip->prox_coef = 819; /* 0.8 * BH1770_COEF_SCALER */ + chip->prox_const = 40; + return 0; + } + + ret = -ENODEV; +error: + dev_dbg(&client->dev, "BH1770 or SFH7770 not found\n"); + + return ret; +} + +/* + * This work is re-scheduled at every proximity interrupt. + * If this work is running, it means that there hasn't been any + * proximity interrupt in time. Situation is handled as no-proximity. + * It would be nice to have low-threshold interrupt or interrupt + * when measurement and hi-threshold are both 0. But neither of those exists. + * This is a workaroud for missing HW feature. + */ + +static void bh1770_prox_work(struct work_struct *work) +{ + struct bh1770_chip *chip = + container_of(work, struct bh1770_chip, prox_work.work); + + mutex_lock(&chip->mutex); + bh1770_prox_read_result(chip); + mutex_unlock(&chip->mutex); +} + +/* This is threaded irq handler */ +static irqreturn_t bh1770_irq(int irq, void *data) +{ + struct bh1770_chip *chip = data; + int status; + int rate = 0; + + mutex_lock(&chip->mutex); + status = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_PS_STATUS); + + /* Acknowledge interrupt by reading this register */ + i2c_smbus_read_byte_data(chip->client, BH1770_INTERRUPT); + + /* + * Check if there is fresh data available for als. + * If this is the very first data, update thresholds after that. 
+ */ + if (status & BH1770_INT_ALS_DATA) { + bh1770_lux_get_result(chip); + if (unlikely(chip->lux_wait_result)) { + chip->lux_wait_result = false; + wake_up(&chip->wait); + bh1770_lux_update_thresholds(chip, + chip->lux_threshold_hi, + chip->lux_threshold_lo); + } + } + + /* Disable interrupt logic to guarantee acknowledgement */ + i2c_smbus_write_byte_data(chip->client, BH1770_INTERRUPT, + (0 << 1) | (0 << 0)); + + if ((status & BH1770_INT_ALS_INT)) + sysfs_notify(&chip->client->dev.kobj, NULL, "lux0_input"); + + if (chip->int_mode_prox && (status & BH1770_INT_LEDS_INT)) { + rate = prox_rates_ms[chip->prox_rate_threshold]; + bh1770_prox_read_result(chip); + } + + /* Re-enable interrupt logic */ + i2c_smbus_write_byte_data(chip->client, BH1770_INTERRUPT, + (chip->int_mode_lux << 1) | + (chip->int_mode_prox << 0)); + mutex_unlock(&chip->mutex); + + /* + * Can't cancel work while keeping mutex since the work uses the + * same mutex. + */ + if (rate) { + /* + * Simulate missing no-proximity interrupt 50ms after the + * next expected interrupt time. + */ + cancel_delayed_work_sync(&chip->prox_work); + schedule_delayed_work(&chip->prox_work, + msecs_to_jiffies(rate + 50)); + } + return IRQ_HANDLED; +} + +static ssize_t bh1770_power_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + size_t ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + if (value) { + pm_runtime_get_sync(dev); + + ret = bh1770_lux_rate(chip, chip->lux_rate_index); + ret |= bh1770_lux_interrupt_control(chip, BH1770_ENABLE); + + if (ret < 0) { + pm_runtime_put(dev); + goto leave; + } + + /* This causes interrupt after the next measurement cycle */ + bh1770_lux_update_thresholds(chip, BH1770_LUX_DEF_THRES, + BH1770_LUX_DEF_THRES); + /* Inform that we are waiting for a result from ALS */ + chip->lux_wait_result = true; + bh1770_prox_mode_control(chip); + } else if (!pm_runtime_suspended(dev)) { + pm_runtime_put(dev); + } + ret = count; +leave: + mutex_unlock(&chip->mutex); + return ret; +} + +static ssize_t bh1770_power_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !pm_runtime_suspended(dev)); +} + +static ssize_t bh1770_lux_result_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + long timeout; + + if (pm_runtime_suspended(dev)) + return -EIO; /* Chip is not enabled at all */ + + timeout = wait_event_interruptible_timeout(chip->wait, + !chip->lux_wait_result, + msecs_to_jiffies(BH1770_TIMEOUT)); + if (!timeout) + return -EIO; + + mutex_lock(&chip->mutex); + ret = sprintf(buf, "%d\n", bh1770_lux_read_result(chip)); + mutex_unlock(&chip->mutex); + + return ret; +} + +static ssize_t bh1770_lux_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", BH1770_LUX_RANGE); +} + +static ssize_t bh1770_prox_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + /* Assume no proximity. 
Sensor will tell real state soon */ + if (!chip->prox_enable_count) + chip->prox_data = 0; + + if (value) + chip->prox_enable_count++; + else if (chip->prox_enable_count > 0) + chip->prox_enable_count--; + else + goto leave; + + /* Run control only when chip is powered on */ + if (!pm_runtime_suspended(dev)) + bh1770_prox_mode_control(chip); +leave: + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_prox_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t len; + + mutex_lock(&chip->mutex); + len = sprintf(buf, "%d\n", chip->prox_enable_count); + mutex_unlock(&chip->mutex); + return len; +} + +static ssize_t bh1770_prox_result_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + + mutex_lock(&chip->mutex); + if (chip->prox_enable_count && !pm_runtime_suspended(dev)) + ret = sprintf(buf, "%d\n", chip->prox_data); + else + ret = -EIO; + mutex_unlock(&chip->mutex); + return ret; +} + +static ssize_t bh1770_prox_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", BH1770_PROX_RANGE); +} + +static ssize_t bh1770_get_prox_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(prox_rates_hz); i++) + pos += sprintf(buf + pos, "%d ", prox_rates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t bh1770_get_prox_rate_above(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", prox_rates_hz[chip->prox_rate_threshold]); +} + +static ssize_t bh1770_get_prox_rate_below(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", prox_rates_hz[chip->prox_rate]); +} + +static int bh1770_prox_rate_validate(int rate) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prox_rates_hz) - 1; i++) + if (rate >= prox_rates_hz[i]) + break; + return i; +} + +static ssize_t bh1770_set_prox_rate_above(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_rate_threshold = bh1770_prox_rate_validate(value); + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_set_prox_rate_below(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_rate = bh1770_prox_rate_validate(value); + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_get_prox_thres(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_threshold); +} + +static ssize_t bh1770_set_prox_thres(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + int ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + if (value > BH1770_PROX_RANGE) + return -EINVAL; + + 
mutex_lock(&chip->mutex); + chip->prox_threshold = value; + ret = bh1770_prox_set_threshold(chip); + mutex_unlock(&chip->mutex); + if (ret < 0) + return ret; + return count; +} + +static ssize_t bh1770_prox_persistence_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", chip->prox_persistence); +} + +static ssize_t bh1770_prox_persistence_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (value > BH1770_PROX_MAX_PERSISTENCE) + return -EINVAL; + + chip->prox_persistence = value; + + return len; +} + +static ssize_t bh1770_prox_abs_thres_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%u\n", chip->prox_abs_thres); +} + +static ssize_t bh1770_prox_abs_thres_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (value > BH1770_PROX_RANGE) + return -EINVAL; + + chip->prox_abs_thres = value; + + return len; +} + +static ssize_t bh1770_chip_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s rev %d\n", chip->chipname, chip->revision); +} + +static ssize_t bh1770_lux_calib_default_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", BH1770_CALIB_SCALER); +} + +static ssize_t bh1770_lux_calib_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t len; + + mutex_lock(&chip->mutex); + len = sprintf(buf, "%u\n", chip->lux_calib); + mutex_unlock(&chip->mutex); + return len; +} + +static ssize_t bh1770_lux_calib_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + u32 old_calib; + u32 new_corr; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + old_calib = chip->lux_calib; + chip->lux_calib = value; + new_corr = bh1770_get_corr_value(chip); + if (new_corr == 0) { + chip->lux_calib = old_calib; + mutex_unlock(&chip->mutex); + return -EINVAL; + } + chip->lux_corr = new_corr; + /* Refresh thresholds on HW after changing correction value */ + bh1770_lux_update_thresholds(chip, chip->lux_threshold_hi, + chip->lux_threshold_lo); + + mutex_unlock(&chip->mutex); + + return len; +} + +static ssize_t bh1770_get_lux_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(lux_rates_hz); i++) + pos += sprintf(buf + pos, "%d ", lux_rates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t bh1770_get_lux_rate(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", lux_rates_hz[chip->lux_rate_index]); +} + +static ssize_t bh1770_set_lux_rate(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long rate_hz; + 
int ret, i; + + if (strict_strtoul(buf, 0, &rate_hz)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(lux_rates_hz) - 1; i++) + if (rate_hz >= lux_rates_hz[i]) + break; + + mutex_lock(&chip->mutex); + chip->lux_rate_index = i; + ret = bh1770_lux_rate(chip, i); + mutex_unlock(&chip->mutex); + + if (ret < 0) + return ret; + + return count; +} + +static ssize_t bh1770_get_lux_thresh_above(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_threshold_hi); +} + +static ssize_t bh1770_get_lux_thresh_below(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_threshold_lo); +} + +static ssize_t bh1770_set_lux_thresh(struct bh1770_chip *chip, u16 *target, + const char *buf) +{ + int ret = 0; + unsigned long thresh; + + if (strict_strtoul(buf, 0, &thresh)) + return -EINVAL; + + if (thresh > BH1770_LUX_RANGE) + return -EINVAL; + + mutex_lock(&chip->mutex); + *target = thresh; + /* + * Don't update values in HW if we are still waiting for + * first interrupt to come after device handle open call. + */ + if (!chip->lux_wait_result) + ret = bh1770_lux_update_thresholds(chip, + chip->lux_threshold_hi, + chip->lux_threshold_lo); + mutex_unlock(&chip->mutex); + return ret; + +} + +static ssize_t bh1770_set_lux_thresh_above(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + int ret = bh1770_set_lux_thresh(chip, &chip->lux_threshold_hi, buf); + if (ret < 0) + return ret; + return len; +} + +static ssize_t bh1770_set_lux_thresh_below(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + int ret = bh1770_set_lux_thresh(chip, &chip->lux_threshold_lo, buf); + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(prox0_raw_en, S_IRUGO | S_IWUSR, bh1770_prox_enable_show, + bh1770_prox_enable_store); +static DEVICE_ATTR(prox0_thresh_above1_value, S_IRUGO | S_IWUSR, + bh1770_prox_abs_thres_show, + bh1770_prox_abs_thres_store); +static DEVICE_ATTR(prox0_thresh_above0_value, S_IRUGO | S_IWUSR, + bh1770_get_prox_thres, + bh1770_set_prox_thres); +static DEVICE_ATTR(prox0_raw, S_IRUGO, bh1770_prox_result_show, NULL); +static DEVICE_ATTR(prox0_sensor_range, S_IRUGO, bh1770_prox_range_show, NULL); +static DEVICE_ATTR(prox0_thresh_above_count, S_IRUGO | S_IWUSR, + bh1770_prox_persistence_show, + bh1770_prox_persistence_store); +static DEVICE_ATTR(prox0_rate_above, S_IRUGO | S_IWUSR, + bh1770_get_prox_rate_above, + bh1770_set_prox_rate_above); +static DEVICE_ATTR(prox0_rate_below, S_IRUGO | S_IWUSR, + bh1770_get_prox_rate_below, + bh1770_set_prox_rate_below); +static DEVICE_ATTR(prox0_rate_avail, S_IRUGO, bh1770_get_prox_rate_avail, NULL); + +static DEVICE_ATTR(lux0_calibscale, S_IRUGO | S_IWUSR, bh1770_lux_calib_show, + bh1770_lux_calib_store); +static DEVICE_ATTR(lux0_calibscale_default, S_IRUGO, + bh1770_lux_calib_default_show, + NULL); +static DEVICE_ATTR(lux0_input, S_IRUGO, bh1770_lux_result_show, NULL); +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO, bh1770_lux_range_show, NULL); +static DEVICE_ATTR(lux0_rate, S_IRUGO | S_IWUSR, bh1770_get_lux_rate, + bh1770_set_lux_rate); +static DEVICE_ATTR(lux0_rate_avail, S_IRUGO, bh1770_get_lux_rate_avail, NULL); +static DEVICE_ATTR(lux0_thresh_above_value, S_IRUGO | S_IWUSR, + 
bh1770_get_lux_thresh_above, + bh1770_set_lux_thresh_above); +static DEVICE_ATTR(lux0_thresh_below_value, S_IRUGO | S_IWUSR, + bh1770_get_lux_thresh_below, + bh1770_set_lux_thresh_below); +static DEVICE_ATTR(chip_id, S_IRUGO, bh1770_chip_id_show, NULL); +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, bh1770_power_state_show, + bh1770_power_state_store); + + +static struct attribute *sysfs_attrs[] = { + &dev_attr_lux0_calibscale.attr, + &dev_attr_lux0_calibscale_default.attr, + &dev_attr_lux0_input.attr, + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_rate.attr, + &dev_attr_lux0_rate_avail.attr, + &dev_attr_lux0_thresh_above_value.attr, + &dev_attr_lux0_thresh_below_value.attr, + &dev_attr_prox0_raw.attr, + &dev_attr_prox0_sensor_range.attr, + &dev_attr_prox0_raw_en.attr, + &dev_attr_prox0_thresh_above_count.attr, + &dev_attr_prox0_rate_above.attr, + &dev_attr_prox0_rate_below.attr, + &dev_attr_prox0_rate_avail.attr, + &dev_attr_prox0_thresh_above0_value.attr, + &dev_attr_prox0_thresh_above1_value.attr, + &dev_attr_chip_id.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group bh1770_attribute_group = { + .attrs = sysfs_attrs +}; + +static int __devinit bh1770_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct bh1770_chip *chip; + int err; + + chip = kzalloc(sizeof *chip, GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->client = client; + + mutex_init(&chip->mutex); + init_waitqueue_head(&chip->wait); + INIT_DELAYED_WORK(&chip->prox_work, bh1770_prox_work); + + if (client->dev.platform_data == NULL) { + dev_err(&client->dev, "platform data is mandatory\n"); + err = -EINVAL; + goto fail1; + } + + chip->pdata = client->dev.platform_data; + chip->lux_calib = BH1770_LUX_NEUTRAL_CALIB_VALUE; + chip->lux_rate_index = BH1770_LUX_DEFAULT_RATE; + chip->lux_threshold_lo = BH1770_LUX_DEF_THRES; + chip->lux_threshold_hi = BH1770_LUX_DEF_THRES; + + if (chip->pdata->glass_attenuation == 0) + chip->lux_ga = BH1770_NEUTRAL_GA; + else + chip->lux_ga = chip->pdata->glass_attenuation; + + chip->prox_threshold = BH1770_PROX_DEF_THRES; + chip->prox_led = chip->pdata->led_def_curr; + chip->prox_abs_thres = BH1770_PROX_DEF_ABS_THRES; + chip->prox_persistence = BH1770_DEFAULT_PERSISTENCE; + chip->prox_rate_threshold = BH1770_PROX_DEF_RATE_THRESH; + chip->prox_rate = BH1770_PROX_DEFAULT_RATE; + chip->prox_data = 0; + + chip->regs[0].supply = reg_vcc; + chip->regs[1].supply = reg_vleds; + + err = regulator_bulk_get(&client->dev, + ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot get regulators\n"); + goto fail1; + } + + err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot enable regulators\n"); + goto fail2; + } + + usleep_range(BH1770_STARTUP_DELAY, BH1770_STARTUP_DELAY * 2); + err = bh1770_detect(chip); + if (err < 0) + goto fail3; + + /* Start chip */ + bh1770_chip_on(chip); + pm_runtime_set_active(&client->dev); + pm_runtime_enable(&client->dev); + + chip->lux_corr = bh1770_get_corr_value(chip); + if (chip->lux_corr == 0) { + dev_err(&client->dev, "Improper correction values\n"); + err = -EINVAL; + goto fail3; + } + + if (chip->pdata->setup_resources) { + err = chip->pdata->setup_resources(); + if (err) { + err = -EINVAL; + goto fail3; + } + } + + err = sysfs_create_group(&chip->client->dev.kobj, + &bh1770_attribute_group); + if (err < 0) { + dev_err(&chip->client->dev, "Sysfs registration failed\n"); + goto 
fail4; + } + + /* + * Chip needs level triggered interrupt to work. However, + * level triggering doesn't work always correctly with power + * management. Select both + */ + err = request_threaded_irq(client->irq, NULL, + bh1770_irq, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT | + IRQF_TRIGGER_LOW, + "bh1770", chip); + if (err) { + dev_err(&client->dev, "could not get IRQ %d\n", + client->irq); + goto fail5; + } + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); + return err; +fail5: + sysfs_remove_group(&chip->client->dev.kobj, + &bh1770_attribute_group); +fail4: + if (chip->pdata->release_resources) + chip->pdata->release_resources(); +fail3: + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +fail2: + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); +fail1: + kfree(chip); + return err; +} + +static int __devexit bh1770_remove(struct i2c_client *client) +{ + struct bh1770_chip *chip = i2c_get_clientdata(client); + + free_irq(client->irq, chip); + + sysfs_remove_group(&chip->client->dev.kobj, + &bh1770_attribute_group); + + if (chip->pdata->release_resources) + chip->pdata->release_resources(); + + cancel_delayed_work_sync(&chip->prox_work); + + if (!pm_runtime_suspended(&client->dev)) + bh1770_chip_off(chip); + + pm_runtime_disable(&client->dev); + pm_runtime_set_suspended(&client->dev); + + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); + kfree(chip); + return 0; +} + +#ifdef CONFIG_PM +static int bh1770_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_off(chip); + + return 0; +} + +static int bh1770_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + int ret = 0; + + bh1770_chip_on(chip); + + if (!pm_runtime_suspended(dev)) { + /* + * If we were enabled at suspend time, it is expected + * everything works nice and smoothly + */ + ret = bh1770_lux_rate(chip, chip->lux_rate_index); + ret |= bh1770_lux_interrupt_control(chip, BH1770_ENABLE); + + /* This causes interrupt after the next measurement cycle */ + bh1770_lux_update_thresholds(chip, BH1770_LUX_DEF_THRES, + BH1770_LUX_DEF_THRES); + /* Inform that we are waiting for a result from ALS */ + chip->lux_wait_result = true; + bh1770_prox_mode_control(chip); + } + return ret; +} + +#else +#define bh1770_suspend NULL +#define bh1770_shutdown NULL +#define bh1770_resume NULL +#endif + +#ifdef CONFIG_PM_RUNTIME +static int bh1770_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_off(chip); + + return 0; +} + +static int bh1770_runtime_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_on(chip); + + return 0; +} +#endif + +static const struct i2c_device_id bh1770_id[] = { + {"bh1770glc", 0 }, + {"sfh7770", 0 }, + {} +}; + +MODULE_DEVICE_TABLE(i2c, bh1770_id); + +static const struct dev_pm_ops bh1770_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(bh1770_suspend, bh1770_resume) + SET_RUNTIME_PM_OPS(bh1770_runtime_suspend, bh1770_runtime_resume, NULL) +}; + +static struct i2c_driver bh1770_driver = { + .driver = { + .name = "bh1770glc", + .owner = THIS_MODULE, + .pm = &bh1770_pm_ops, + }, + .probe = bh1770_probe, + .remove = 
__devexit_p(bh1770_remove), + .id_table = bh1770_id, +}; + +static int __init bh1770_init(void) +{ + return i2c_add_driver(&bh1770_driver); +} + +static void __exit bh1770_exit(void) +{ + i2c_del_driver(&bh1770_driver); +} + +MODULE_DESCRIPTION("BH1770GLC / SFH7770 combined ALS and proximity sensor"); +MODULE_AUTHOR("Samu Onkalo, Nokia Corporation"); +MODULE_LICENSE("GPL v2"); + +module_init(bh1770_init); +module_exit(bh1770_exit); diff --git a/include/linux/i2c/bh1770glc.h b/include/linux/i2c/bh1770glc.h new file mode 100644 index 000000000000..8b5e2df36c72 --- /dev/null +++ b/include/linux/i2c/bh1770glc.h @@ -0,0 +1,53 @@ +/* + * This file is part of the ROHM BH1770GLC / OSRAM SFH7770 sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __BH1770_H__ +#define __BH1770_H__ + +/** + * struct bh1770_platform_data - platform data for bh1770glc driver + * @led_def_curr: IR led driving current. + * @glass_attenuation: Attenuation factor for covering window. + * @setup_resources: Call back for interrupt line setup function + * @release_resources: Call back for interrupte line release function + * + * Example of glass attenuation: 16384 * 385 / 100 means attenuation factor + * of 3.85. i.e. light_above_sensor = light_above_cover_window / 3.85 + */ + +struct bh1770_platform_data { +#define BH1770_LED_5mA 0 +#define BH1770_LED_10mA 1 +#define BH1770_LED_20mA 2 +#define BH1770_LED_50mA 3 +#define BH1770_LED_100mA 4 +#define BH1770_LED_150mA 5 +#define BH1770_LED_200mA 6 + __u8 led_def_curr; +#define BH1770_NEUTRAL_GA 16384 /* 16384 / 16384 = 1 */ + __u32 glass_attenuation; + int (*setup_resources)(void); + int (*release_resources)(void); +}; +#endif From 92b1f84d46b24675493d95a239eea2b07e5f13f8 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:38 -0700 Subject: [PATCH 107/176] drivers/misc: driver for APDS990X ALS and proximity sensors This is a driver for Avago APDS990X combined ALS and proximity sensor. Interface is sysfs based. The driver uses interrupts to provide new data. The driver supports pm_runtime and regulator frameworks. 
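For illustration, a board would typically wire the sensor up through I2C
board info and platform data roughly as below; the bus number, the 0x39
client address, the interrupt GPIO and the ppcount value are example
assumptions, not values taken from this patch:

	#include <linux/gpio.h>
	#include <linux/i2c.h>
	#include <linux/i2c/apds990x.h>

	/* Example IRQ GPIO; purely illustrative */
	#define EXAMPLE_APDS_GPIO	42

	static struct apds990x_platform_data board_apds990x_data = {
		/* .cf left zeroed: the driver falls back to its
		 * uncovered-sensor defaults */
		.pdrive  = APDS_IRLED_CURR_50mA,
		.ppcount = 5,
	};

	static struct i2c_board_info board_i2c2_devs[] __initdata = {
		{
			I2C_BOARD_INFO("apds990x", 0x39),
			.platform_data = &board_apds990x_data,
		},
	};

	/* called from the machine init code */
	static void __init board_apds990x_init(void)
	{
		board_i2c2_devs[0].irq = gpio_to_irq(EXAMPLE_APDS_GPIO);
		i2c_register_board_info(2, board_i2c2_devs,
					ARRAY_SIZE(board_i2c2_devs));
	}
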
See Documentation/misc-devices/apds990x.txt for details Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 11 + drivers/misc/Makefile | 1 + drivers/misc/apds990x.c | 1295 ++++++++++++++++++++++++++++++++++ include/linux/i2c/apds990x.h | 79 +++ 4 files changed, 1386 insertions(+) create mode 100644 drivers/misc/apds990x.c create mode 100644 include/linux/i2c/apds990x.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3b50106abfa5..7362851a2ca8 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -325,6 +325,17 @@ config SENSORS_BH1770 To compile this driver as a module, choose M here: the module will be called bh1770glc. If unsure, say N here. +config SENSORS_APDS990X + tristate "APDS990X combined als and proximity sensors" + depends on I2C + default n + ---help--- + Say Y here if you want to build a driver for Avago APDS990x + combined ambient light and proximity sensor chip. + + To compile this driver as a module, choose M here: the + module will be called apds990x. If unsure, say N here. + config HMC6352 tristate "Honeywell HMC6352 compass" depends on I2C diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 5fed6ab533e2..081b9fca8a06 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SENSORS_BH1780) += bh1780gli.o obj-$(CONFIG_SENSORS_BH1770) += bh1770glc.o +obj-$(CONFIG_SENSORS_APDS990X) += apds990x.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o diff --git a/drivers/misc/apds990x.c b/drivers/misc/apds990x.c new file mode 100644 index 000000000000..200311fea369 --- /dev/null +++ b/drivers/misc/apds990x.c @@ -0,0 +1,1295 @@ +/* + * This file is part of the APDS990x sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Register map */ +#define APDS990X_ENABLE 0x00 /* Enable of states and interrupts */ +#define APDS990X_ATIME 0x01 /* ALS ADC time */ +#define APDS990X_PTIME 0x02 /* Proximity ADC time */ +#define APDS990X_WTIME 0x03 /* Wait time */ +#define APDS990X_AILTL 0x04 /* ALS interrupt low threshold low byte */ +#define APDS990X_AILTH 0x05 /* ALS interrupt low threshold hi byte */ +#define APDS990X_AIHTL 0x06 /* ALS interrupt hi threshold low byte */ +#define APDS990X_AIHTH 0x07 /* ALS interrupt hi threshold hi byte */ +#define APDS990X_PILTL 0x08 /* Proximity interrupt low threshold low byte */ +#define APDS990X_PILTH 0x09 /* Proximity interrupt low threshold hi byte */ +#define APDS990X_PIHTL 0x0a /* Proximity interrupt hi threshold low byte */ +#define APDS990X_PIHTH 0x0b /* Proximity interrupt hi threshold hi byte */ +#define APDS990X_PERS 0x0c /* Interrupt persistence filters */ +#define APDS990X_CONFIG 0x0d /* Configuration */ +#define APDS990X_PPCOUNT 0x0e /* Proximity pulse count */ +#define APDS990X_CONTROL 0x0f /* Gain control register */ +#define APDS990X_REV 0x11 /* Revision Number */ +#define APDS990X_ID 0x12 /* Device ID */ +#define APDS990X_STATUS 0x13 /* Device status */ +#define APDS990X_CDATAL 0x14 /* Clear ADC low data register */ +#define APDS990X_CDATAH 0x15 /* Clear ADC high data register */ +#define APDS990X_IRDATAL 0x16 /* IR ADC low data register */ +#define APDS990X_IRDATAH 0x17 /* IR ADC high data register */ +#define APDS990X_PDATAL 0x18 /* Proximity ADC low data register */ +#define APDS990X_PDATAH 0x19 /* Proximity ADC high data register */ + +/* Control */ +#define APDS990X_MAX_AGAIN 3 + +/* Enable register */ +#define APDS990X_EN_PIEN (0x1 << 5) +#define APDS990X_EN_AIEN (0x1 << 4) +#define APDS990X_EN_WEN (0x1 << 3) +#define APDS990X_EN_PEN (0x1 << 2) +#define APDS990X_EN_AEN (0x1 << 1) +#define APDS990X_EN_PON (0x1 << 0) +#define APDS990X_EN_DISABLE_ALL 0 + +/* Status register */ +#define APDS990X_ST_PINT (0x1 << 5) +#define APDS990X_ST_AINT (0x1 << 4) + +/* I2C access types */ +#define APDS990x_CMD_TYPE_MASK (0x03 << 5) +#define APDS990x_CMD_TYPE_RB (0x00 << 5) /* Repeated byte */ +#define APDS990x_CMD_TYPE_INC (0x01 << 5) /* Auto increment */ +#define APDS990x_CMD_TYPE_SPE (0x03 << 5) /* Special function */ + +#define APDS990x_ADDR_SHIFT 0 +#define APDS990x_CMD 0x80 + +/* Interrupt ack commands */ +#define APDS990X_INT_ACK_ALS 0x6 +#define APDS990X_INT_ACK_PS 0x5 +#define APDS990X_INT_ACK_BOTH 0x7 + +/* ptime */ +#define APDS990X_PTIME_DEFAULT 0xff /* Recommended conversion time 2.7ms*/ + +/* wtime */ +#define APDS990X_WTIME_DEFAULT 0xee /* ~50ms wait time */ + +#define APDS990X_TIME_TO_ADC 1024 /* One timetick as ADC count value */ + +/* Persistence */ +#define APDS990X_APERS_SHIFT 0 +#define APDS990X_PPERS_SHIFT 4 + +/* Supported ID:s */ +#define APDS990X_ID_0 0x0 +#define APDS990X_ID_4 0x4 +#define APDS990X_ID_29 0x29 + +/* pgain and pdiode settings */ +#define APDS_PGAIN_1X 0x0 +#define APDS_PDIODE_IR 0x2 + +#define APDS990X_LUX_OUTPUT_SCALE 10 + +/* Reverse chip factors for threshold calculation */ +struct reverse_factors { + u32 afactor; + int cf1; + int irf1; + int cf2; + int irf2; +}; + +struct apds990x_chip { + 
struct apds990x_platform_data *pdata; + struct i2c_client *client; + struct mutex mutex; /* avoid parallel access */ + struct regulator_bulk_data regs[2]; + wait_queue_head_t wait; + + int prox_en; + bool prox_continuous_mode; + bool lux_wait_fresh_res; + + /* Chip parameters */ + struct apds990x_chip_factors cf; + struct reverse_factors rcf; + u16 atime; /* als integration time */ + u16 arate; /* als reporting rate */ + u16 a_max_result; /* Max possible ADC value with current atime */ + u8 again_meas; /* Gain used in last measurement */ + u8 again_next; /* Next calculated gain */ + u8 pgain; + u8 pdiode; + u8 pdrive; + u8 lux_persistence; + u8 prox_persistence; + + u32 lux_raw; + u32 lux; + u16 lux_clear; + u16 lux_ir; + u16 lux_calib; + u32 lux_thres_hi; + u32 lux_thres_lo; + + u32 prox_thres; + u16 prox_data; + u16 prox_calib; + + char chipname[10]; + u8 revision; +}; + +#define APDS_CALIB_SCALER 8192 +#define APDS_LUX_NEUTRAL_CALIB_VALUE (1 * APDS_CALIB_SCALER) +#define APDS_PROX_NEUTRAL_CALIB_VALUE (1 * APDS_CALIB_SCALER) + +#define APDS_PROX_DEF_THRES 600 +#define APDS_PROX_HYSTERESIS 50 +#define APDS_LUX_DEF_THRES_HI 101 +#define APDS_LUX_DEF_THRES_LO 100 +#define APDS_DEFAULT_PROX_PERS 1 + +#define APDS_TIMEOUT 2000 +#define APDS_STARTUP_DELAY 25000 /* us */ +#define APDS_RANGE 65535 +#define APDS_PROX_RANGE 1023 +#define APDS_LUX_GAIN_LO_LIMIT 100 +#define APDS_LUX_GAIN_LO_LIMIT_STRICT 25 + +#define TIMESTEP 87 /* 2.7ms is about 87 / 32 */ +#define TIME_STEP_SCALER 32 + +#define APDS_LUX_AVERAGING_TIME 50 /* tolerates 50/60Hz ripple */ +#define APDS_LUX_DEFAULT_RATE 200 + +static const u8 again[] = {1, 8, 16, 120}; /* ALS gain steps */ +static const u8 ir_currents[] = {100, 50, 25, 12}; /* IRled currents in mA */ + +/* Following two tables must match i.e 10Hz rate means 1 as persistence value */ +static const u16 arates_hz[] = {10, 5, 2, 1}; +static const u8 apersis[] = {1, 2, 4, 5}; + +/* Regulators */ +static const char reg_vcc[] = "Vdd"; +static const char reg_vled[] = "Vled"; + +static int apds990x_read_byte(struct apds990x_chip *chip, u8 reg, u8 *data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_RB; + + ret = i2c_smbus_read_byte_data(client, reg); + *data = ret; + return (int)ret; +} + +static int apds990x_read_word(struct apds990x_chip *chip, u8 reg, u16 *data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_INC; + + ret = i2c_smbus_read_word_data(client, reg); + *data = ret; + return (int)ret; +} + +static int apds990x_write_byte(struct apds990x_chip *chip, u8 reg, u8 data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_RB; + + ret = i2c_smbus_write_byte_data(client, reg, data); + return (int)ret; +} + +static int apds990x_write_word(struct apds990x_chip *chip, u8 reg, u16 data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_INC; + + ret = i2c_smbus_write_word_data(client, reg, data); + return (int)ret; +} + +static int apds990x_mode_on(struct apds990x_chip *chip) +{ + /* ALS is mandatory, proximity optional */ + u8 reg = APDS990X_EN_AIEN | APDS990X_EN_PON | APDS990X_EN_AEN | + APDS990X_EN_WEN; + + if (chip->prox_en) + reg |= APDS990X_EN_PIEN | APDS990X_EN_PEN; + + return apds990x_write_byte(chip, APDS990X_ENABLE, reg); +} 
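+
+/*
+ * Rough worked example of the math in apds990x_lux_to_threshold() below,
+ * using illustrative numbers only (not data sheet values): with
+ * atime = 50 ms, ALS gain = 8x, neutral calibration and the uncovered
+ * sensor defaults ga = 1966, df = 52 used by the probe code,
+ *
+ *	cpl = (50 * 8 * 4096 * 64) / (1966 * 52) = 1025
+ *
+ * so the threshold is about lux * 1025 / 64, i.e. roughly 16 clear channel
+ * counts per lux, before the IR compensation at the end of the function.
+ */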
+ +static u16 apds990x_lux_to_threshold(struct apds990x_chip *chip, u32 lux) +{ + u32 thres; + u32 cpl; + u32 ir; + + if (lux == 0) + return 0; + else if (lux == APDS_RANGE) + return APDS_RANGE; + + /* + * Reported LUX value is a combination of the IR and CLEAR channel + * values. However, interrupt threshold is only for clear channel. + * This function approximates needed HW threshold value for a given + * LUX value in the current lightning type. + * IR level compared to visible light varies heavily depending on the + * source of the light + * + * Calculate threshold value for the next measurement period. + * Math: threshold = lux * cpl where + * cpl = atime * again / (glass_attenuation * device_factor) + * (count-per-lux) + * + * First remove calibration. Division by four is to avoid overflow + */ + lux = lux * (APDS_CALIB_SCALER / 4) / (chip->lux_calib / 4); + + /* Multiplication by 64 is to increase accuracy */ + cpl = ((u32)chip->atime * (u32)again[chip->again_next] * + APDS_PARAM_SCALE * 64) / (chip->cf.ga * chip->cf.df); + + thres = lux * cpl / 64; + /* + * Convert IR light from the latest result to match with + * new gain step. This helps to adapt with the current + * source of light. + */ + ir = (u32)chip->lux_ir * (u32)again[chip->again_next] / + (u32)again[chip->again_meas]; + + /* + * Compensate count with IR light impact + * IAC1 > IAC2 (see apds990x_get_lux for formulas) + */ + if (chip->lux_clear * APDS_PARAM_SCALE >= + chip->rcf.afactor * chip->lux_ir) + thres = (chip->rcf.cf1 * thres + chip->rcf.irf1 * ir) / + APDS_PARAM_SCALE; + else + thres = (chip->rcf.cf2 * thres + chip->rcf.irf2 * ir) / + APDS_PARAM_SCALE; + + if (thres >= chip->a_max_result) + thres = chip->a_max_result - 1; + return thres; +} + +static inline int apds990x_set_atime(struct apds990x_chip *chip, u32 time_ms) +{ + u8 reg_value; + + chip->atime = time_ms; + /* Formula is specified in the data sheet */ + reg_value = 256 - ((time_ms * TIME_STEP_SCALER) / TIMESTEP); + /* Calculate max ADC value for given integration time */ + chip->a_max_result = (u16)(256 - reg_value) * APDS990X_TIME_TO_ADC; + return apds990x_write_byte(chip, APDS990X_ATIME, reg_value); +} + +/* Called always with mutex locked */ +static int apds990x_refresh_pthres(struct apds990x_chip *chip, int data) +{ + int ret, lo, hi; + + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + if (data < chip->prox_thres) { + lo = 0; + hi = chip->prox_thres; + } else { + lo = chip->prox_thres - APDS_PROX_HYSTERESIS; + if (chip->prox_continuous_mode) + hi = chip->prox_thres; + else + hi = APDS_RANGE; + } + + ret = apds990x_write_word(chip, APDS990X_PILTL, lo); + ret |= apds990x_write_word(chip, APDS990X_PIHTL, hi); + return ret; +} + +/* Called always with mutex locked */ +static int apds990x_refresh_athres(struct apds990x_chip *chip) +{ + int ret; + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + ret = apds990x_write_word(chip, APDS990X_AILTL, + apds990x_lux_to_threshold(chip, chip->lux_thres_lo)); + ret |= apds990x_write_word(chip, APDS990X_AIHTL, + apds990x_lux_to_threshold(chip, chip->lux_thres_hi)); + + return ret; +} + +/* Called always with mutex locked */ +static void apds990x_force_a_refresh(struct apds990x_chip *chip) +{ + /* This will force ALS interrupt after the next measurement. 
*/ + apds990x_write_word(chip, APDS990X_AILTL, APDS_LUX_DEF_THRES_LO); + apds990x_write_word(chip, APDS990X_AIHTL, APDS_LUX_DEF_THRES_HI); +} + +/* Called always with mutex locked */ +static void apds990x_force_p_refresh(struct apds990x_chip *chip) +{ + /* This will force proximity interrupt after the next measurement. */ + apds990x_write_word(chip, APDS990X_PILTL, APDS_PROX_DEF_THRES - 1); + apds990x_write_word(chip, APDS990X_PIHTL, APDS_PROX_DEF_THRES); +} + +/* Called always with mutex locked */ +static int apds990x_calc_again(struct apds990x_chip *chip) +{ + int curr_again = chip->again_meas; + int next_again = chip->again_meas; + int ret = 0; + + /* Calculate suitable als gain */ + if (chip->lux_clear == chip->a_max_result) + next_again -= 2; /* ALS saturated. Decrease gain by 2 steps */ + else if (chip->lux_clear > chip->a_max_result / 2) + next_again--; + else if (chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT_STRICT) + next_again += 2; /* Too dark. Increase gain by 2 steps */ + else if (chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT) + next_again++; + + /* Limit gain to available range */ + if (next_again < 0) + next_again = 0; + else if (next_again > APDS990X_MAX_AGAIN) + next_again = APDS990X_MAX_AGAIN; + + /* Let's check can we trust the measured result */ + if (chip->lux_clear == chip->a_max_result) + /* Result can be totally garbage due to saturation */ + ret = -ERANGE; + else if (next_again != curr_again && + chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT_STRICT) + /* + * Gain is changed and measurement result is very small. + * Result can be totally garbage due to underflow + */ + ret = -ERANGE; + + chip->again_next = next_again; + apds990x_write_byte(chip, APDS990X_CONTROL, + (chip->pdrive << 6) | + (chip->pdiode << 4) | + (chip->pgain << 2) | + (chip->again_next << 0)); + + /* + * Error means bad result -> re-measurement is needed. The forced + * refresh uses fastest possible persistence setting to get result + * as soon as possible. 
+ */ + if (ret < 0) + apds990x_force_a_refresh(chip); + else + apds990x_refresh_athres(chip); + + return ret; +} + +/* Called always with mutex locked */ +static int apds990x_get_lux(struct apds990x_chip *chip, int clear, int ir) +{ + int iac, iac1, iac2; /* IR adjusted counts */ + u32 lpc; /* Lux per count */ + + /* Formulas: + * iac1 = CF1 * CLEAR_CH - IRF1 * IR_CH + * iac2 = CF2 * CLEAR_CH - IRF2 * IR_CH + */ + iac1 = (chip->cf.cf1 * clear - chip->cf.irf1 * ir) / APDS_PARAM_SCALE; + iac2 = (chip->cf.cf2 * clear - chip->cf.irf2 * ir) / APDS_PARAM_SCALE; + + iac = max(iac1, iac2); + iac = max(iac, 0); + + lpc = APDS990X_LUX_OUTPUT_SCALE * (chip->cf.df * chip->cf.ga) / + (u32)(again[chip->again_meas] * (u32)chip->atime); + + return (iac * lpc) / APDS_PARAM_SCALE; +} + +static int apds990x_ack_int(struct apds990x_chip *chip, u8 mode) +{ + struct i2c_client *client = chip->client; + s32 ret; + u8 reg = APDS990x_CMD | APDS990x_CMD_TYPE_SPE; + + switch (mode & (APDS990X_ST_AINT | APDS990X_ST_PINT)) { + case APDS990X_ST_AINT: + reg |= APDS990X_INT_ACK_ALS; + break; + case APDS990X_ST_PINT: + reg |= APDS990X_INT_ACK_PS; + break; + default: + reg |= APDS990X_INT_ACK_BOTH; + break; + } + + ret = i2c_smbus_read_byte_data(client, reg); + return (int)ret; +} + +static irqreturn_t apds990x_irq(int irq, void *data) +{ + struct apds990x_chip *chip = data; + u8 status; + + apds990x_read_byte(chip, APDS990X_STATUS, &status); + apds990x_ack_int(chip, status); + + mutex_lock(&chip->mutex); + if (!pm_runtime_suspended(&chip->client->dev)) { + if (status & APDS990X_ST_AINT) { + apds990x_read_word(chip, APDS990X_CDATAL, + &chip->lux_clear); + apds990x_read_word(chip, APDS990X_IRDATAL, + &chip->lux_ir); + /* Store used gain for calculations */ + chip->again_meas = chip->again_next; + + chip->lux_raw = apds990x_get_lux(chip, + chip->lux_clear, + chip->lux_ir); + + if (apds990x_calc_again(chip) == 0) { + /* Result is valid */ + chip->lux = chip->lux_raw; + chip->lux_wait_fresh_res = false; + wake_up(&chip->wait); + sysfs_notify(&chip->client->dev.kobj, + NULL, "lux0_input"); + } + } + + if ((status & APDS990X_ST_PINT) && chip->prox_en) { + u16 clr_ch; + + apds990x_read_word(chip, APDS990X_CDATAL, &clr_ch); + /* + * If ALS channel is saturated at min gain, + * proximity gives false posivite values. + * Just ignore them. 
+ */ + if (chip->again_meas == 0 && + clr_ch == chip->a_max_result) + chip->prox_data = 0; + else + apds990x_read_word(chip, + APDS990X_PDATAL, + &chip->prox_data); + + apds990x_refresh_pthres(chip, chip->prox_data); + if (chip->prox_data < chip->prox_thres) + chip->prox_data = 0; + else if (!chip->prox_continuous_mode) + chip->prox_data = APDS_PROX_RANGE; + sysfs_notify(&chip->client->dev.kobj, + NULL, "prox0_raw"); + } + } + mutex_unlock(&chip->mutex); + return IRQ_HANDLED; +} + +static int apds990x_configure(struct apds990x_chip *chip) +{ + /* It is recommended to use disabled mode during these operations */ + apds990x_write_byte(chip, APDS990X_ENABLE, APDS990X_EN_DISABLE_ALL); + + /* conversion and wait times for different state machince states */ + apds990x_write_byte(chip, APDS990X_PTIME, APDS990X_PTIME_DEFAULT); + apds990x_write_byte(chip, APDS990X_WTIME, APDS990X_WTIME_DEFAULT); + apds990x_set_atime(chip, APDS_LUX_AVERAGING_TIME); + + apds990x_write_byte(chip, APDS990X_CONFIG, 0); + + /* Persistence levels */ + apds990x_write_byte(chip, APDS990X_PERS, + (chip->lux_persistence << APDS990X_APERS_SHIFT) | + (chip->prox_persistence << APDS990X_PPERS_SHIFT)); + + apds990x_write_byte(chip, APDS990X_PPCOUNT, chip->pdata->ppcount); + + /* Start with relatively small gain */ + chip->again_meas = 1; + chip->again_next = 1; + apds990x_write_byte(chip, APDS990X_CONTROL, + (chip->pdrive << 6) | + (chip->pdiode << 4) | + (chip->pgain << 2) | + (chip->again_next << 0)); + return 0; +} + +static int apds990x_detect(struct apds990x_chip *chip) +{ + struct i2c_client *client = chip->client; + int ret; + u8 id; + + ret = apds990x_read_byte(chip, APDS990X_ID, &id); + if (ret < 0) { + dev_err(&client->dev, "ID read failed\n"); + return ret; + } + + ret = apds990x_read_byte(chip, APDS990X_REV, &chip->revision); + if (ret < 0) { + dev_err(&client->dev, "REV read failed\n"); + return ret; + } + + switch (id) { + case APDS990X_ID_0: + case APDS990X_ID_4: + case APDS990X_ID_29: + snprintf(chip->chipname, sizeof(chip->chipname), "APDS-990x"); + break; + default: + ret = -ENODEV; + break; + } + return ret; +} + +static int apds990x_chip_on(struct apds990x_chip *chip) +{ + int err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (err < 0) + return err; + + usleep_range(APDS_STARTUP_DELAY, 2 * APDS_STARTUP_DELAY); + + /* Refresh all configs in case of regulators were off */ + chip->prox_data = 0; + apds990x_configure(chip); + apds990x_mode_on(chip); + return 0; +} + +static int apds990x_chip_off(struct apds990x_chip *chip) +{ + apds990x_write_byte(chip, APDS990X_ENABLE, APDS990X_EN_DISABLE_ALL); + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); + return 0; +} + +static ssize_t apds990x_lux_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + u32 result; + long timeout; + + if (pm_runtime_suspended(dev)) + return -EIO; + + timeout = wait_event_interruptible_timeout(chip->wait, + !chip->lux_wait_fresh_res, + msecs_to_jiffies(APDS_TIMEOUT)); + if (!timeout) + return -EIO; + + mutex_lock(&chip->mutex); + result = (chip->lux * chip->lux_calib) / APDS_CALIB_SCALER; + if (result > (APDS_RANGE * APDS990X_LUX_OUTPUT_SCALE)) + result = APDS_RANGE * APDS990X_LUX_OUTPUT_SCALE; + + ret = sprintf(buf, "%d.%d\n", + result / APDS990X_LUX_OUTPUT_SCALE, + result % APDS990X_LUX_OUTPUT_SCALE); + mutex_unlock(&chip->mutex); + return ret; +} + +static DEVICE_ATTR(lux0_input, S_IRUGO, apds990x_lux_show, NULL); + 
+static ssize_t apds990x_lux_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_RANGE); +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO, apds990x_lux_range_show, NULL); + +static ssize_t apds990x_lux_calib_format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_CALIB_SCALER); +} + +static DEVICE_ATTR(lux0_calibscale_default, S_IRUGO, + apds990x_lux_calib_format_show, NULL); + +static ssize_t apds990x_lux_calib_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", chip->lux_calib); +} + +static ssize_t apds990x_lux_calib_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (chip->lux_calib > APDS_RANGE) + return -EINVAL; + + chip->lux_calib = value; + + return len; +} + +static DEVICE_ATTR(lux0_calibscale, S_IRUGO | S_IWUSR, apds990x_lux_calib_show, + apds990x_lux_calib_store); + +static ssize_t apds990x_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(arates_hz); i++) + pos += sprintf(buf + pos, "%d ", arates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t apds990x_rate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->arate); +} + +static int apds990x_set_arate(struct apds990x_chip *chip, int rate) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arates_hz); i++) + if (rate >= arates_hz[i]) + break; + + if (i == ARRAY_SIZE(arates_hz)) + return -EINVAL; + + /* Pick up corresponding persistence value */ + chip->lux_persistence = apersis[i]; + chip->arate = arates_hz[i]; + + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* Persistence levels */ + return apds990x_write_byte(chip, APDS990X_PERS, + (chip->lux_persistence << APDS990X_APERS_SHIFT) | + (chip->prox_persistence << APDS990X_PPERS_SHIFT)); +} + +static ssize_t apds990x_rate_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + int ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + ret = apds990x_set_arate(chip, value); + mutex_unlock(&chip->mutex); + + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(lux0_rate_avail, S_IRUGO, apds990x_rate_avail, NULL); + +static DEVICE_ATTR(lux0_rate, S_IRUGO | S_IWUSR, apds990x_rate_show, + apds990x_rate_store); + +static ssize_t apds990x_prox_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t ret; + struct apds990x_chip *chip = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev) || !chip->prox_en) + return -EIO; + + mutex_lock(&chip->mutex); + ret = sprintf(buf, "%d\n", chip->prox_data); + mutex_unlock(&chip->mutex); + return ret; +} + +static DEVICE_ATTR(prox0_raw, S_IRUGO, apds990x_prox_show, NULL); + +static ssize_t apds990x_prox_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_PROX_RANGE); +} + +static DEVICE_ATTR(prox0_sensor_range, S_IRUGO, 
apds990x_prox_range_show, NULL); + +static ssize_t apds990x_prox_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_en); +} + +static ssize_t apds990x_prox_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + + if (!chip->prox_en) + chip->prox_data = 0; + + if (value) + chip->prox_en++; + else if (chip->prox_en > 0) + chip->prox_en--; + + if (!pm_runtime_suspended(dev)) + apds990x_mode_on(chip); + mutex_unlock(&chip->mutex); + return len; +} + +static DEVICE_ATTR(prox0_raw_en, S_IRUGO | S_IWUSR, apds990x_prox_enable_show, + apds990x_prox_enable_store); + +static const char reporting_modes[][9] = {"trigger", "periodic"}; + +static ssize_t apds990x_prox_reporting_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", + reporting_modes[!!chip->prox_continuous_mode]); +} + +static ssize_t apds990x_prox_reporting_mode_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + + if (sysfs_streq(buf, reporting_modes[0])) + chip->prox_continuous_mode = 0; + else if (sysfs_streq(buf, reporting_modes[1])) + chip->prox_continuous_mode = 1; + else + return -EINVAL; + return len; +} + +static DEVICE_ATTR(prox0_reporting_mode, S_IRUGO | S_IWUSR, + apds990x_prox_reporting_mode_show, + apds990x_prox_reporting_mode_store); + +static ssize_t apds990x_prox_reporting_avail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s %s\n", reporting_modes[0], reporting_modes[1]); +} + +static DEVICE_ATTR(prox0_reporting_mode_avail, S_IRUGO | S_IWUSR, + apds990x_prox_reporting_avail_show, NULL); + + +static ssize_t apds990x_lux_thresh_above_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_thres_hi); +} + +static ssize_t apds990x_lux_thresh_below_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_thres_lo); +} + +static ssize_t apds990x_set_lux_thresh(struct apds990x_chip *chip, u32 *target, + const char *buf) +{ + int ret = 0; + unsigned long thresh; + + if (strict_strtoul(buf, 0, &thresh)) + return -EINVAL; + + if (thresh > APDS_RANGE) + return -EINVAL; + + mutex_lock(&chip->mutex); + *target = thresh; + /* + * Don't update values in HW if we are still waiting for + * first interrupt to come after device handle open call. 
+ */ + if (!chip->lux_wait_fresh_res) + apds990x_refresh_athres(chip); + mutex_unlock(&chip->mutex); + return ret; + +} + +static ssize_t apds990x_lux_thresh_above_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + int ret = apds990x_set_lux_thresh(chip, &chip->lux_thres_hi, buf); + if (ret < 0) + return ret; + return len; +} + +static ssize_t apds990x_lux_thresh_below_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + int ret = apds990x_set_lux_thresh(chip, &chip->lux_thres_lo, buf); + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(lux0_thresh_above_value, S_IRUGO | S_IWUSR, + apds990x_lux_thresh_above_show, + apds990x_lux_thresh_above_store); + +static DEVICE_ATTR(lux0_thresh_below_value, S_IRUGO | S_IWUSR, + apds990x_lux_thresh_below_show, + apds990x_lux_thresh_below_store); + +static ssize_t apds990x_prox_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_thres); +} + +static ssize_t apds990x_prox_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if ((value > APDS_RANGE) || (value == 0) || + (value < APDS_PROX_HYSTERESIS)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_thres = value; + + apds990x_force_p_refresh(chip); + mutex_unlock(&chip->mutex); + return len; +} + +static DEVICE_ATTR(prox0_thresh_above_value, S_IRUGO | S_IWUSR, + apds990x_prox_threshold_show, + apds990x_prox_threshold_store); + +static ssize_t apds990x_power_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !pm_runtime_suspended(dev)); + return 0; +} + +static ssize_t apds990x_power_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + if (value) { + pm_runtime_get_sync(dev); + mutex_lock(&chip->mutex); + chip->lux_wait_fresh_res = true; + apds990x_force_a_refresh(chip); + apds990x_force_p_refresh(chip); + mutex_unlock(&chip->mutex); + } else { + if (!pm_runtime_suspended(dev)) + pm_runtime_put(dev); + } + return len; +} + +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, + apds990x_power_state_show, + apds990x_power_state_store); + +static ssize_t apds990x_chip_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s %d\n", chip->chipname, chip->revision); +} + +static DEVICE_ATTR(chip_id, S_IRUGO, apds990x_chip_id_show, NULL); + +static struct attribute *sysfs_attrs_ctrl[] = { + &dev_attr_lux0_calibscale.attr, + &dev_attr_lux0_calibscale_default.attr, + &dev_attr_lux0_input.attr, + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_rate.attr, + &dev_attr_lux0_rate_avail.attr, + &dev_attr_lux0_thresh_above_value.attr, + &dev_attr_lux0_thresh_below_value.attr, + &dev_attr_prox0_raw_en.attr, + &dev_attr_prox0_raw.attr, + &dev_attr_prox0_sensor_range.attr, + &dev_attr_prox0_thresh_above_value.attr, + &dev_attr_prox0_reporting_mode.attr, + 
&dev_attr_prox0_reporting_mode_avail.attr, + &dev_attr_chip_id.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group apds990x_attribute_group[] = { + {.attrs = sysfs_attrs_ctrl }, +}; + +static int __devinit apds990x_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct apds990x_chip *chip; + int err; + + chip = kzalloc(sizeof *chip, GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->client = client; + + init_waitqueue_head(&chip->wait); + mutex_init(&chip->mutex); + chip->pdata = client->dev.platform_data; + + if (chip->pdata == NULL) { + dev_err(&client->dev, "platform data is mandatory\n"); + err = -EINVAL; + goto fail1; + } + + if (chip->pdata->cf.ga == 0) { + /* set uncovered sensor default parameters */ + chip->cf.ga = 1966; /* 0.48 * APDS_PARAM_SCALE */ + chip->cf.cf1 = 4096; /* 1.00 * APDS_PARAM_SCALE */ + chip->cf.irf1 = 9134; /* 2.23 * APDS_PARAM_SCALE */ + chip->cf.cf2 = 2867; /* 0.70 * APDS_PARAM_SCALE */ + chip->cf.irf2 = 5816; /* 1.42 * APDS_PARAM_SCALE */ + chip->cf.df = 52; + } else { + chip->cf = chip->pdata->cf; + } + + /* precalculate inverse chip factors for threshold control */ + chip->rcf.afactor = + (chip->cf.irf1 - chip->cf.irf2) * APDS_PARAM_SCALE / + (chip->cf.cf1 - chip->cf.cf2); + chip->rcf.cf1 = APDS_PARAM_SCALE * APDS_PARAM_SCALE / + chip->cf.cf1; + chip->rcf.irf1 = chip->cf.irf1 * APDS_PARAM_SCALE / + chip->cf.cf1; + chip->rcf.cf2 = APDS_PARAM_SCALE * APDS_PARAM_SCALE / + chip->cf.cf2; + chip->rcf.irf2 = chip->cf.irf2 * APDS_PARAM_SCALE / + chip->cf.cf2; + + /* Set something to start with */ + chip->lux_thres_hi = APDS_LUX_DEF_THRES_HI; + chip->lux_thres_lo = APDS_LUX_DEF_THRES_LO; + chip->lux_calib = APDS_LUX_NEUTRAL_CALIB_VALUE; + + chip->prox_thres = APDS_PROX_DEF_THRES; + chip->pdrive = chip->pdata->pdrive; + chip->pdiode = APDS_PDIODE_IR; + chip->pgain = APDS_PGAIN_1X; + chip->prox_calib = APDS_PROX_NEUTRAL_CALIB_VALUE; + chip->prox_persistence = APDS_DEFAULT_PROX_PERS; + chip->prox_continuous_mode = false; + + chip->regs[0].supply = reg_vcc; + chip->regs[1].supply = reg_vled; + + err = regulator_bulk_get(&client->dev, + ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot get regulators\n"); + goto fail1; + } + + err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot enable regulators\n"); + goto fail2; + } + + usleep_range(APDS_STARTUP_DELAY, 2 * APDS_STARTUP_DELAY); + + err = apds990x_detect(chip); + if (err < 0) { + dev_err(&client->dev, "APDS990X not found\n"); + goto fail3; + } + + pm_runtime_set_active(&client->dev); + + apds990x_configure(chip); + apds990x_set_arate(chip, APDS_LUX_DEFAULT_RATE); + apds990x_mode_on(chip); + + pm_runtime_enable(&client->dev); + + if (chip->pdata->setup_resources) { + err = chip->pdata->setup_resources(); + if (err) { + err = -EINVAL; + goto fail3; + } + } + + err = sysfs_create_group(&chip->client->dev.kobj, + apds990x_attribute_group); + if (err < 0) { + dev_err(&chip->client->dev, "Sysfs registration failed\n"); + goto fail4; + } + + err = request_threaded_irq(client->irq, NULL, + apds990x_irq, + IRQF_TRIGGER_FALLING | IRQF_TRIGGER_LOW | + IRQF_ONESHOT, + "apds990x", chip); + if (err) { + dev_err(&client->dev, "could not get IRQ %d\n", + client->irq); + goto fail5; + } + return err; +fail5: + sysfs_remove_group(&chip->client->dev.kobj, + &apds990x_attribute_group[0]); +fail4: + if (chip->pdata && chip->pdata->release_resources) + 
chip->pdata->release_resources(); +fail3: + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +fail2: + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); +fail1: + kfree(chip); + return err; +} + +static int __devexit apds990x_remove(struct i2c_client *client) +{ + struct apds990x_chip *chip = i2c_get_clientdata(client); + + free_irq(client->irq, chip); + sysfs_remove_group(&chip->client->dev.kobj, + apds990x_attribute_group); + + if (chip->pdata && chip->pdata->release_resources) + chip->pdata->release_resources(); + + if (!pm_runtime_suspended(&client->dev)) + apds990x_chip_off(chip); + + pm_runtime_disable(&client->dev); + pm_runtime_set_suspended(&client->dev); + + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); + + kfree(chip); + return 0; +} + +#ifdef CONFIG_PM +static int apds990x_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_off(chip); + return 0; +} + +static int apds990x_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + /* + * If we were enabled at suspend time, it is expected + * everything works nice and smoothly. Chip_on is enough + */ + apds990x_chip_on(chip); + + return 0; +} +#else +#define apds990x_suspend NULL +#define apds990x_resume NULL +#define apds990x_shutdown NULL +#endif + +#ifdef CONFIG_PM_RUNTIME +static int apds990x_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_off(chip); + return 0; +} + +static int apds990x_runtime_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_on(chip); + return 0; +} + +#endif + +static const struct i2c_device_id apds990x_id[] = { + {"apds990x", 0 }, + {} +}; + +MODULE_DEVICE_TABLE(i2c, apds990x_id); + +static const struct dev_pm_ops apds990x_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(apds990x_suspend, apds990x_resume) + SET_RUNTIME_PM_OPS(apds990x_runtime_suspend, + apds990x_runtime_resume, + NULL) +}; + +static struct i2c_driver apds990x_driver = { + .driver = { + .name = "apds990x", + .owner = THIS_MODULE, + .pm = &apds990x_pm_ops, + }, + .probe = apds990x_probe, + .remove = __devexit_p(apds990x_remove), + .id_table = apds990x_id, +}; + +static int __init apds990x_init(void) +{ + return i2c_add_driver(&apds990x_driver); +} + +static void __exit apds990x_exit(void) +{ + i2c_del_driver(&apds990x_driver); +} + +MODULE_DESCRIPTION("APDS990X combined ALS and proximity sensor"); +MODULE_AUTHOR("Samu Onkalo, Nokia Corporation"); +MODULE_LICENSE("GPL v2"); + +module_init(apds990x_init); +module_exit(apds990x_exit); diff --git a/include/linux/i2c/apds990x.h b/include/linux/i2c/apds990x.h new file mode 100644 index 000000000000..d186fcc5d257 --- /dev/null +++ b/include/linux/i2c/apds990x.h @@ -0,0 +1,79 @@ +/* + * This file is part of the APDS990x sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __APDS990X_H__ +#define __APDS990X_H__ + + +#define APDS_IRLED_CURR_12mA 0x3 +#define APDS_IRLED_CURR_25mA 0x2 +#define APDS_IRLED_CURR_50mA 0x1 +#define APDS_IRLED_CURR_100mA 0x0 + +/** + * struct apds990x_chip_factors - defines effect of the cover window + * @ga: Total glass attenuation + * @cf1: clear channel factor 1 for raw to lux conversion + * @irf1: IR channel factor 1 for raw to lux conversion + * @cf2: clear channel factor 2 for raw to lux conversion + * @irf2: IR channel factor 2 for raw to lux conversion + * @df: device factor for conversion formulas + * + * Structure for tuning ALS calculation to match with environment. + * Values depend on the material above the sensor and the sensor + * itself. If the GA is zero, driver will use uncovered sensor default values + * format: decimal value * APDS_PARAM_SCALE except df which is plain integer. + */ +#define APDS_PARAM_SCALE 4096 +struct apds990x_chip_factors { + int ga; + int cf1; + int irf1; + int cf2; + int irf2; + int df; +}; + +/** + * struct apds990x_platform_data - platform data for apsd990x.c driver + * @cf: chip factor data + * @pddrive: IR-led driving current + * @ppcount: number of IR pulses used for proximity estimation + * @setup_resources: interrupt line setup call back function + * @release_resources: interrupt line release call back function + * + * Proximity detection result depends heavily on correct ppcount, pdrive + * and cover window. + * + */ + +struct apds990x_platform_data { + struct apds990x_chip_factors cf; + u8 pdrive; + u8 ppcount; + int (*setup_resources)(void); + int (*release_resources)(void); +}; + +#endif From 3f0f4a3f2008613c601e97f773dbd80ac400e459 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:39 -0700 Subject: [PATCH 108/176] Documentation: short descriptions for bh1770glc and apds990x drivers Add short documentation for two ALS / proximity chip drivers. Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/misc-devices/apds990x.txt | 111 ++++++++++++++++++++++ Documentation/misc-devices/bh1770glc.txt | 116 +++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 Documentation/misc-devices/apds990x.txt create mode 100644 Documentation/misc-devices/bh1770glc.txt diff --git a/Documentation/misc-devices/apds990x.txt b/Documentation/misc-devices/apds990x.txt new file mode 100644 index 000000000000..d5408cade32f --- /dev/null +++ b/Documentation/misc-devices/apds990x.txt @@ -0,0 +1,111 @@ +Kernel driver apds990x +====================== + +Supported chips: +Avago APDS990X + +Data sheet: +Not freely available + +Author: +Samu Onkalo + +Description +----------- + +APDS990x is a combined ambient light and proximity sensor. ALS and proximity +functionality are highly connected. ALS measurement path must be running +while the proximity functionality is enabled. + +ALS produces raw measurement values for two channels: Clear channel +(infrared + visible light) and IR only. 
However, threshold comparisons happen using the clear channel
+only. The lux value and the corresponding HW threshold level can vary
+considerably depending on the spectrum of the light source.
+
+The driver makes the necessary conversions in both directions so that the
+user deals only with lux values. The lux value is calculated using
+information from both channels. The HW threshold level is calculated from
+the given lux value to match the current type of lighting. Occasionally the
+inaccuracy of this estimate leads to a spurious interrupt, but that is
+harmless.
+
+The ALS has four different gain steps. The driver automatically selects a
+suitable gain step. After each measurement, the reliability of the result is
+estimated and a new measurement is triggered if necessary.
+
+Platform data can provide tuned values for the conversion formulas if such
+values are known. Otherwise plain sensor default values are used.
+
+The proximity side is a little simpler; no complex conversions are needed
+and it produces directly usable values.
+
+The driver controls the chip's operational state using the pm_runtime
+framework. Voltage regulators are controlled based on the chip's
+operational state.
+
+SYSFS
+-----
+
+
+chip_id
+	RO - shows detected chip type and version
+
+power_state
+	RW - enable / disable chip. Uses counting logic
+	     1 enables the chip
+	     0 disables the chip
+lux0_input
+	RO - measured lux value
+	     sysfs_notify called when threshold interrupt occurs
+
+lux0_sensor_range
+	RO - lux0_input max value. In practice never reached, since the
+	     sensor tends to saturate well before that. The real max value
+	     varies depending on the light spectrum etc.
+
+lux0_rate
+	RW - measurement rate in Hz
+
+lux0_rate_avail
+	RO - supported measurement rates
+
+lux0_calibscale
+	RW - calibration value. Set to the neutral value by default.
+	     Output results are multiplied by calibscale / calibscale_default.
+
+lux0_calibscale_default
+	RO - neutral calibration value
+
+lux0_thresh_above_value
+	RW - HI level threshold value. Any result above this value
+	     triggers an interrupt. 65535 (i.e. sensor_range) disables the
+	     above interrupt.
+
+lux0_thresh_below_value
+	RW - LO level threshold value. Any result below this value
+	     triggers an interrupt. 0 disables the below interrupt.
+
+prox0_raw
+	RO - measured proximity value
+	     sysfs_notify called when threshold interrupt occurs
+
+prox0_sensor_range
+	RO - prox0_raw max value (1023)
+
+prox0_raw_en
+	RW - enable / disable proximity - uses counting logic
+	     1 enables the proximity
+	     0 disables the proximity
+
+prox0_reporting_mode
+	RW - trigger / periodic. In "trigger" mode the driver reports one of
+	     two values: 0 or the prox0_sensor_range value. 0 means no
+	     proximity, 1023 means proximity. This causes a minimal number
+	     of interrupts. In "periodic" mode the driver reports all values
+	     above prox0_thresh_above. This causes more interrupts, but it
+	     can give a _rough_ estimate of the distance.
+
+prox0_reporting_mode_avail
+	RO - accepted values for prox0_reporting_mode (trigger, periodic)
+
+prox0_thresh_above_value
+	RW - threshold level which triggers proximity events.
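+
+Usage example
+-------------
+
+As a rough illustration only (this snippet is not part of the driver; the
+sysfs path is an assumption, and the i2c bus number and the 0x39 client
+address are board specific), a userspace program could read the current lux
+value like this:
+
+	/* read lux0_input once */
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char buf[32];
+		ssize_t n;
+		int fd = open("/sys/bus/i2c/devices/2-0039/lux0_input",
+			      O_RDONLY);
+
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		n = read(fd, buf, sizeof(buf) - 1);	/* e.g. "123.4\n" */
+		if (n > 0) {
+			buf[n] = '\0';
+			printf("lux: %s", buf);
+		}
+		close(fd);
+		return 0;
+	}
+
+Writing "1" to the power_state and prox0_raw_en attributes enables the chip
+and the proximity measurement in the same fashion.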
+ALS and proximity parts operates on their own, but they shares common I2C +interface and interrupt logic. In principle they can run on their own, +but ALS side results are used to estimate reliability of the proximity sensor. + +ALS produces 16 bit lux values. The chip contains interrupt logic to produce +low and high threshold interrupts. + +Proximity part contains IR-led driver up to 3 IR leds. The chip measures +amount of reflected IR light and produces proximity result. Resolution is +8 bit. Driver supports only one channel. Driver uses ALS results to estimate +reliability of the proximity results. Thus ALS is always running while +proximity detection is needed. + +Driver uses threshold interrupts to avoid need for polling the values. +Proximity low interrupt doesn't exists in the chip. This is simulated +by using a delayed work. As long as there is proximity threshold above +interrupts the delayed work is pushed forward. So, when proximity level goes +below the threshold value, there is no interrupt and the delayed work will +finally run. This is handled as no proximity indication. + +Chip state is controlled via runtime pm framework when enabled in config. + +Calibscale factor is used to hide differences between the chips. By default +value set to neutral state meaning factor of 1.00. To get proper values, +calibrated source of light is needed as a reference. Calibscale factor is set +so that measurement produces about the expected lux value. + +SYSFS +----- + +chip_id + RO - shows detected chip type and version + +power_state + RW - enable / disable chip. Uses counting logic + 1 enables the chip + 0 disables the chip + +lux0_input + RO - measured lux value + sysfs_notify called when threshold interrupt occurs + +lux0_sensor_range + RO - lux0_input max value + +lux0_rate + RW - measurement rate in Hz + +lux0_rate_avail + RO - supported measurement rates + +lux0_thresh_above_value + RW - HI level threshold value. All results above the value + trigs an interrupt. 65535 (i.e. sensor_range) disables the above + interrupt. + +lux0_thresh_below_value + RW - LO level threshold value. All results below the value + trigs an interrupt. 0 disables the below interrupt. + +lux0_calibscale + RW - calibration value. Set to neutral value by default. + Output results are multiplied with calibscale / calibscale_default + value. + +lux0_calibscale_default + RO - neutral calibration value + +prox0_raw + RO - measured proximity value + sysfs_notify called when threshold interrupt occurs + +prox0_sensor_range + RO - prox0_raw max value + +prox0_raw_en + RW - enable / disable proximity - uses counting logic + 1 enables the proximity + 0 disables the proximity + +prox0_thresh_above_count + RW - number of proximity interrupts needed before triggering the event + +prox0_rate_above + RW - Measurement rate (in Hz) when the level is above threshold + i.e. when proximity on has been reported. + +prox0_rate_below + RW - Measurement rate (in Hz) when the level is below threshold + i.e. when proximity off has been reported. + +prox0_rate_avail + RO - Supported proximity measurement rates in Hz + +prox0_thresh_above0_value + RW - threshold level which trigs proximity events. 
+ Filtered by persistence filter (prox0_thresh_above_count) + +prox0_thresh_above1_value + RW - threshold level which trigs event immediately From 93e2f585c149b5056d5c5eaffcaf747bbe9c3015 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:40 -0700 Subject: [PATCH 109/176] lkdtm: prefix enum constants Prefix cname and ctype constants with CN/CT_. This is especially for the conflict on BUG which causes a build break if arch defines it as a inline function, i.e. MIPS. Signed-off-by: Namhyung Kim Cc: Ankita Garg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/lkdtm.c | 128 +++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 343b5d8ea697..81d7fa4ec0db 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -52,32 +52,32 @@ #define REC_NUM_DEFAULT 10 enum cname { - INVALID, - INT_HARDWARE_ENTRY, - INT_HW_IRQ_EN, - INT_TASKLET_ENTRY, - FS_DEVRW, - MEM_SWAPOUT, - TIMERADD, - SCSI_DISPATCH_CMD, - IDE_CORE_CP, - DIRECT, + CN_INVALID, + CN_INT_HARDWARE_ENTRY, + CN_INT_HW_IRQ_EN, + CN_INT_TASKLET_ENTRY, + CN_FS_DEVRW, + CN_MEM_SWAPOUT, + CN_TIMERADD, + CN_SCSI_DISPATCH_CMD, + CN_IDE_CORE_CP, + CN_DIRECT, }; enum ctype { - NONE, - PANIC, - BUG, - EXCEPTION, - LOOP, - OVERFLOW, - CORRUPT_STACK, - UNALIGNED_LOAD_STORE_WRITE, - OVERWRITE_ALLOCATION, - WRITE_AFTER_FREE, - SOFTLOCKUP, - HARDLOCKUP, - HUNG_TASK, + CT_NONE, + CT_PANIC, + CT_BUG, + CT_EXCEPTION, + CT_LOOP, + CT_OVERFLOW, + CT_CORRUPT_STACK, + CT_UNALIGNED_LOAD_STORE_WRITE, + CT_OVERWRITE_ALLOCATION, + CT_WRITE_AFTER_FREE, + CT_SOFTLOCKUP, + CT_HARDLOCKUP, + CT_HUNG_TASK, }; static char* cp_name[] = { @@ -117,8 +117,8 @@ static char* cpoint_type; static int cpoint_count = DEFAULT_COUNT; static int recur_count = REC_NUM_DEFAULT; -static enum cname cpoint = INVALID; -static enum ctype cptype = NONE; +static enum cname cpoint = CN_INVALID; +static enum ctype cptype = CT_NONE; static int count = DEFAULT_COUNT; module_param(recur_count, int, 0644); @@ -207,12 +207,12 @@ static enum ctype parse_cp_type(const char *what, size_t count) return i + 1; } - return NONE; + return CT_NONE; } static const char *cp_type_to_str(enum ctype type) { - if (type == NONE || type < 0 || type > ARRAY_SIZE(cp_type)) + if (type == CT_NONE || type < 0 || type > ARRAY_SIZE(cp_type)) return "None"; return cp_type[type - 1]; @@ -220,7 +220,7 @@ static const char *cp_type_to_str(enum ctype type) static const char *cp_name_to_str(enum cname name) { - if (name == INVALID || name < 0 || name > ARRAY_SIZE(cp_name)) + if (name == CN_INVALID || name < 0 || name > ARRAY_SIZE(cp_name)) return "INVALID"; return cp_name[name - 1]; @@ -245,7 +245,7 @@ static int lkdtm_parse_commandline(void) return -EINVAL; cptype = parse_cp_type(cpoint_type, strlen(cpoint_type)); - if (cptype == NONE) + if (cptype == CT_NONE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(cp_name); i++) { @@ -274,30 +274,30 @@ static int recursive_loop(int a) static void lkdtm_do_action(enum ctype which) { switch (which) { - case PANIC: + case CT_PANIC: panic("dumptest"); break; - case BUG: + case CT_BUG: BUG(); break; - case EXCEPTION: + case CT_EXCEPTION: *((int *) 0) = 0; break; - case LOOP: + case CT_LOOP: for (;;) ; break; - case OVERFLOW: + case CT_OVERFLOW: (void) recursive_loop(0); break; - case CORRUPT_STACK: { + case CT_CORRUPT_STACK: { volatile u32 data[8]; volatile u32 *p = data; p[12] = 0x12345678; break; } - case UNALIGNED_LOAD_STORE_WRITE: { + case 
CT_UNALIGNED_LOAD_STORE_WRITE: { static u8 data[5] __attribute__((aligned(4))) = {1, 2, 3, 4, 5}; u32 *p; @@ -309,7 +309,7 @@ static void lkdtm_do_action(enum ctype which) *p = val; break; } - case OVERWRITE_ALLOCATION: { + case CT_OVERWRITE_ALLOCATION: { size_t len = 1020; u32 *data = kmalloc(len, GFP_KERNEL); @@ -317,7 +317,7 @@ static void lkdtm_do_action(enum ctype which) kfree(data); break; } - case WRITE_AFTER_FREE: { + case CT_WRITE_AFTER_FREE: { size_t len = 1024; u32 *data = kmalloc(len, GFP_KERNEL); @@ -326,21 +326,21 @@ static void lkdtm_do_action(enum ctype which) memset(data, 0x78, len); break; } - case SOFTLOCKUP: + case CT_SOFTLOCKUP: preempt_disable(); for (;;) cpu_relax(); break; - case HARDLOCKUP: + case CT_HARDLOCKUP: local_irq_disable(); for (;;) cpu_relax(); break; - case HUNG_TASK: + case CT_HUNG_TASK: set_current_state(TASK_UNINTERRUPTIBLE); schedule(); break; - case NONE: + case CT_NONE: default: break; } @@ -363,43 +363,43 @@ static int lkdtm_register_cpoint(enum cname which) { int ret; - cpoint = INVALID; + cpoint = CN_INVALID; if (lkdtm.entry != NULL) unregister_jprobe(&lkdtm); switch (which) { - case DIRECT: + case CN_DIRECT: lkdtm_do_action(cptype); return 0; - case INT_HARDWARE_ENTRY: + case CN_INT_HARDWARE_ENTRY: lkdtm.kp.symbol_name = "do_IRQ"; lkdtm.entry = (kprobe_opcode_t*) jp_do_irq; break; - case INT_HW_IRQ_EN: + case CN_INT_HW_IRQ_EN: lkdtm.kp.symbol_name = "handle_IRQ_event"; lkdtm.entry = (kprobe_opcode_t*) jp_handle_irq_event; break; - case INT_TASKLET_ENTRY: + case CN_INT_TASKLET_ENTRY: lkdtm.kp.symbol_name = "tasklet_action"; lkdtm.entry = (kprobe_opcode_t*) jp_tasklet_action; break; - case FS_DEVRW: + case CN_FS_DEVRW: lkdtm.kp.symbol_name = "ll_rw_block"; lkdtm.entry = (kprobe_opcode_t*) jp_ll_rw_block; break; - case MEM_SWAPOUT: + case CN_MEM_SWAPOUT: lkdtm.kp.symbol_name = "shrink_inactive_list"; lkdtm.entry = (kprobe_opcode_t*) jp_shrink_inactive_list; break; - case TIMERADD: + case CN_TIMERADD: lkdtm.kp.symbol_name = "hrtimer_start"; lkdtm.entry = (kprobe_opcode_t*) jp_hrtimer_start; break; - case SCSI_DISPATCH_CMD: + case CN_SCSI_DISPATCH_CMD: lkdtm.kp.symbol_name = "scsi_dispatch_cmd"; lkdtm.entry = (kprobe_opcode_t*) jp_scsi_dispatch_cmd; break; - case IDE_CORE_CP: + case CN_IDE_CORE_CP: #ifdef CONFIG_IDE lkdtm.kp.symbol_name = "generic_ide_ioctl"; lkdtm.entry = (kprobe_opcode_t*) jp_generic_ide_ioctl; @@ -416,7 +416,7 @@ static int lkdtm_register_cpoint(enum cname which) cpoint = which; if ((ret = register_jprobe(&lkdtm)) < 0) { printk(KERN_INFO "lkdtm: Couldn't register jprobe\n"); - cpoint = INVALID; + cpoint = CN_INVALID; } return ret; @@ -445,7 +445,7 @@ static ssize_t do_register_entry(enum cname which, struct file *f, cptype = parse_cp_type(buf, count); free_page((unsigned long) buf); - if (cptype == NONE) + if (cptype == CT_NONE) return -EINVAL; err = lkdtm_register_cpoint(which); @@ -487,49 +487,49 @@ static int lkdtm_debugfs_open(struct inode *inode, struct file *file) static ssize_t int_hardware_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(INT_HARDWARE_ENTRY, f, buf, count, off); + return do_register_entry(CN_INT_HARDWARE_ENTRY, f, buf, count, off); } static ssize_t int_hw_irq_en(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(INT_HW_IRQ_EN, f, buf, count, off); + return do_register_entry(CN_INT_HW_IRQ_EN, f, buf, count, off); } static ssize_t int_tasklet_entry(struct file *f, const char __user *buf, size_t count, loff_t 
*off) { - return do_register_entry(INT_TASKLET_ENTRY, f, buf, count, off); + return do_register_entry(CN_INT_TASKLET_ENTRY, f, buf, count, off); } static ssize_t fs_devrw_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(FS_DEVRW, f, buf, count, off); + return do_register_entry(CN_FS_DEVRW, f, buf, count, off); } static ssize_t mem_swapout_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(MEM_SWAPOUT, f, buf, count, off); + return do_register_entry(CN_MEM_SWAPOUT, f, buf, count, off); } static ssize_t timeradd_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(TIMERADD, f, buf, count, off); + return do_register_entry(CN_TIMERADD, f, buf, count, off); } static ssize_t scsi_dispatch_cmd_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(SCSI_DISPATCH_CMD, f, buf, count, off); + return do_register_entry(CN_SCSI_DISPATCH_CMD, f, buf, count, off); } static ssize_t ide_core_cp_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(IDE_CORE_CP, f, buf, count, off); + return do_register_entry(CN_IDE_CORE_CP, f, buf, count, off); } /* Special entry to just crash directly. Available without KPROBEs */ @@ -557,7 +557,7 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf, type = parse_cp_type(buf, count); free_page((unsigned long) buf); - if (type == NONE) + if (type == CT_NONE) return -EINVAL; printk(KERN_INFO "lkdtm: Performing direct entry %s\n", @@ -649,7 +649,7 @@ static int __init lkdtm_module_init(void) goto out_err; } - if (cpoint != INVALID && cptype != NONE) { + if (cpoint != CN_INVALID && cptype != CT_NONE) { ret = lkdtm_register_cpoint(cpoint); if (ret < 0) { printk(KERN_INFO "lkdtm: Invalid crash point %d\n", From 2e85c4ddd3f32d3e1da51f4129473399e505ffa3 Mon Sep 17 00:00:00 2001 From: Kalhan Trisal Date: Tue, 26 Oct 2010 14:22:40 -0700 Subject: [PATCH 110/176] drivers/misc/isl29020.c: ambient light sensor The LS driver will read the latest Lux measurement based upon the light brightness and will report the LUX output through sysfs interface. This hardware isn't quite the same as the ISL29003 so has a different driver. [akpm@linux-foundation.org: put PM code under #ifdef CONFIG_PM] Signed-off-by: Kalhan Trisal [Runtime power management support added] Signed-off-by: Arjan van de Ven [Fixes to runtime PM] Signed-off-by: Liu Hong [Cleanups and added checks for I2C errors, reworked the API to match the saner one agreed for other sensors] Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 ++ drivers/misc/Makefile | 1 + drivers/misc/isl29020.c | 248 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 drivers/misc/isl29020.c diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 7362851a2ca8..477a9434b9c0 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -295,6 +295,16 @@ config ISL29003 This driver can also be built as a module. If so, the module will be called isl29003. +config ISL29020 + tristate "Intersil ISL29020 ambient light sensor" + depends on I2C + help + If you say yes here you get support for the Intersil ISL29020 + ambient light sensor. + + This driver can also be built as a module. If so, the module + will be called isl29020. 
+ config SENSORS_TSL2550 tristate "Taos TSL2550 ambient light sensor" depends on I2C && SYSFS diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 081b9fca8a06..2ecdd32d1357 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_HP_ILO) += hpilo.o obj-$(CONFIG_ISL29003) += isl29003.o +obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_EP93XX_PWM) += ep93xx_pwm.o obj-$(CONFIG_DS1682) += ds1682.o diff --git a/drivers/misc/isl29020.c b/drivers/misc/isl29020.c new file mode 100644 index 000000000000..34fe835921c4 --- /dev/null +++ b/drivers/misc/isl29020.c @@ -0,0 +1,248 @@ +/* + * isl29020.c - Intersil ALS Driver + * + * Copyright (C) 2008 Intel Corp + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Data sheet at: http://www.intersil.com/data/fn/fn6505.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(mutex); + +static ssize_t als_sensing_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int val; + + val = i2c_smbus_read_byte_data(client, 0x00); + + if (val < 0) + return val; + return sprintf(buf, "%d000\n", 1 << (2 * (val & 3))); + +} + +static ssize_t als_lux_input_data_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret_val, val; + unsigned long int lux; + int temp; + + pm_runtime_get_sync(dev); + msleep(100); + + mutex_lock(&mutex); + temp = i2c_smbus_read_byte_data(client, 0x02); /* MSB data */ + if (temp < 0) { + pm_runtime_put_sync(dev); + mutex_unlock(&mutex); + return temp; + } + + ret_val = i2c_smbus_read_byte_data(client, 0x01); /* LSB data */ + mutex_unlock(&mutex); + + if (ret_val < 0) { + pm_runtime_put_sync(dev); + return ret_val; + } + + ret_val |= temp << 8; + val = i2c_smbus_read_byte_data(client, 0x00); + pm_runtime_put_sync(dev); + if (val < 0) + return val; + lux = ((((1 << (2 * (val & 3))))*1000) * ret_val) / 65536; + return sprintf(buf, "%ld\n", lux); +} + +static ssize_t als_sensing_range_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned int ret_val; + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + if (val < 1 || val > 64000) + return -EINVAL; + + /* Pick the smallest sensor range that will meet our requirements */ + if (val <= 1000) + val = 1; + else if (val <= 4000) + val = 2; + else if (val <= 16000) + val = 3; + else + val = 4; + + ret_val = 
i2c_smbus_read_byte_data(client, 0x00); + + ret_val &= 0xFC; /*reset the bit before setting them */ + ret_val |= val - 1; + ret_val = i2c_smbus_write_byte_data(client, 0x00, ret_val); + + if (ret_val < 0) + return ret_val; + return count; +} + +static void als_set_power_state(struct i2c_client *client, int enable) +{ + int ret_val; + + ret_val = i2c_smbus_read_byte_data(client, 0x00); + if (ret_val < 0) + return; + + if (enable) + ret_val |= 0x80; + else + ret_val &= 0x7F; + + i2c_smbus_write_byte_data(client, 0x00, ret_val); +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, + als_sensing_range_show, als_sensing_range_store); +static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux_input_data_show, NULL); + +static struct attribute *mid_att_als[] = { + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_input.attr, + NULL +}; + +static struct attribute_group m_als_gr = { + .name = "isl29020", + .attrs = mid_att_als +}; + +static int als_set_default_config(struct i2c_client *client) +{ + int retval; + + retval = i2c_smbus_write_byte_data(client, 0x00, 0xc0); + if (retval < 0) { + dev_err(&client->dev, "default write failed."); + return retval; + } + return 0;; +} + +static int isl29020_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int res; + + res = als_set_default_config(client); + if (res < 0) + return res; + + res = sysfs_create_group(&client->dev.kobj, &m_als_gr); + if (res) { + dev_err(&client->dev, "isl29020: device create file failed\n"); + return res; + } + dev_info(&client->dev, "%s isl29020: ALS chip found\n", client->name); + als_set_power_state(client, 0); + pm_runtime_enable(&client->dev); + return res; +} + +static int isl29020_remove(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + sysfs_remove_group(&client->dev.kobj, &m_als_gr); + kfree(data); + return 0; +} + +static struct i2c_device_id isl29020_id[] = { + { "isl29020", 0 }, + { } +}; + +MODULE_DEVICE_TABLE(i2c, isl29020_id); + +#ifdef CONFIG_PM + +static int isl29020_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + als_set_power_state(client, 0); + return 0; +} + +static int isl29020_runtime_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + als_set_power_state(client, 1); + return 0; +} + +static const struct dev_pm_ops isl29020_pm_ops = { + .runtime_suspend = isl29020_runtime_suspend, + .runtime_resume = isl29020_runtime_resume, +}; + +#define ISL29020_PM_OPS (&isl29020_pm_ops) +#else /* CONFIG_PM */ +#define ISL29020_PM_OPS NULL +#endif /* CONFIG_PM */ + +static struct i2c_driver isl29020_driver = { + .driver = { + .name = "isl29020", + .pm = ISL29020_PM_OPS, + }, + .probe = isl29020_probe, + .remove = isl29020_remove, + .id_table = isl29020_id, +}; + +static int __init sensor_isl29020_init(void) +{ + return i2c_add_driver(&isl29020_driver); +} + +static void __exit sensor_isl29020_exit(void) +{ + i2c_del_driver(&isl29020_driver); +} + +module_init(sensor_isl29020_init); +module_exit(sensor_isl29020_exit); + +MODULE_AUTHOR("Kalhan Trisal Date: Tue, 26 Oct 2010 14:22:41 -0700 Subject: [PATCH 111/176] pca953x: pca953x driver fixes for x86 mrst Our Moorestown platform has two max7315 chips which is covered by pca953x i2c gpio driver. A while ago this driver got updated with nested irq thread support, and it broke the compatibity with "request_irq". For example, the gpio_keys.c driver can not work with this driver now. This patch fixes the issue by switching to generic_handle_irq. 
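For illustration, the dispatch pattern the driver is being switched to looks roughly like the sketch below (the struct and the register-read helper are stand-ins, not the real pca953x code): each pending expander line is handed to the normal interrupt flow with generic_handle_irq(), which is what lets consumers attach handlers to the child interrupts with a plain request_irq().

    #include <linux/bitops.h>
    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/types.h>

    struct expander {                        /* stand-in for struct pca953x_chip */
            int irq_base;
    };

    static u32 read_pending_bits(struct expander *chip)
    {
            return 0;                        /* hypothetical register read */
    }

    static irqreturn_t expander_irq_handler(int irq, void *devid)
    {
            struct expander *chip = devid;
            u32 pending = read_pending_bits(chip);

            while (pending) {
                    unsigned int level = __ffs(pending);

                    generic_handle_irq(chip->irq_base + level);
                    pending &= ~(1u << level);
            }
            return IRQ_HANDLED;
    }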
Also fix the irq_base issue: irq_base == 0 is valid, and a "-1" value should mean invalid. IRQ 0 is not a valid IRQ, irq_base of 0 is valid. Signed-off-by: Alek Du Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/pca953x.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/pca953x.c b/drivers/gpio/pca953x.c index a2b12aa1f2b9..501866662e05 100644 --- a/drivers/gpio/pca953x.c +++ b/drivers/gpio/pca953x.c @@ -345,7 +345,7 @@ static irqreturn_t pca953x_irq_handler(int irq, void *devid) do { level = __ffs(pending); - handle_nested_irq(level + chip->irq_base); + generic_handle_irq(level + chip->irq_base); pending &= ~(1 << level); } while (pending); @@ -360,7 +360,8 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, struct pca953x_platform_data *pdata = client->dev.platform_data; int ret; - if (pdata->irq_base && (id->driver_data & PCA953X_INT)) { + if (pdata->irq_base != -1 + && (id->driver_data & PCA953X_INT)) { int lvl; ret = pca953x_read_reg(chip, PCA953X_INPUT, @@ -383,7 +384,6 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, set_irq_chip_data(irq, chip); set_irq_chip_and_handler(irq, &pca953x_irq_chip, handle_edge_irq); - set_irq_nested_thread(irq, 1); #ifdef CONFIG_ARM set_irq_flags(irq, IRQF_VALID); #else @@ -394,6 +394,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, ret = request_threaded_irq(client->irq, NULL, pca953x_irq_handler, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING | IRQF_ONESHOT, dev_name(&client->dev), chip); if (ret) { @@ -408,13 +409,13 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, return 0; out_failed: - chip->irq_base = 0; + chip->irq_base = -1; return ret; } static void pca953x_irq_teardown(struct pca953x_chip *chip) { - if (chip->irq_base) + if (chip->irq_base != -1) free_irq(chip->client->irq, chip); } #else /* CONFIG_GPIO_PCA953X_IRQ */ @@ -424,7 +425,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, struct i2c_client *client = chip->client; struct pca953x_platform_data *pdata = client->dev.platform_data; - if (pdata->irq_base && (id->driver_data & PCA953X_INT)) + if (pdata->irq_base != -1 && (id->driver_data & PCA953X_INT)) dev_warn(&client->dev, "interrupt support not compiled in\n"); return 0; From 22d96aa59cf120db3584e4c3365554cae77d2441 Mon Sep 17 00:00:00 2001 From: anantha Date: Tue, 26 Oct 2010 14:22:41 -0700 Subject: [PATCH 112/176] drivers/misc/apds9802als.c: ALS drivers for the apds9802als This adds support for the ADPS9802ALS sensor. Cleanup by Alan Cox - move mutexes to cover more things - report I/O errors back to user space - report range and values in LUX Signed-off-by: Anantha Narayanan [The 4K and 64K in the hw spec actually means 4095 (12bit) and 65535 (16bit).] Signed-off-by: Hong Liu [Updated to match the ALS light API interface convention from Samu] Signed-off-by: Alan Cox Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 ++ drivers/misc/Makefile | 1 + drivers/misc/apds9802als.c | 308 +++++++++++++++++++++++++++++++++++++ 3 files changed, 319 insertions(+) create mode 100644 drivers/misc/apds9802als.c diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 477a9434b9c0..6d7665ce6f4a 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -285,6 +285,16 @@ config SGI_GRU_DEBUG This option enables addition debugging code for the SGI GRU driver. If you are unsure, say N. 
+config APDS9802ALS + tristate "Medfield Avago APDS9802 ALS Sensor module" + depends on I2C + help + If you say yes here you get support for the ALS APDS9802 ambient + light sensor. + + This driver can also be built as a module. If so, the module + will be called apds9802als. + config ISL29003 tristate "Intersil ISL29003 ambient light sensor" depends on I2C && SYSFS diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 2ecdd32d1357..4be5c6fc5ef4 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_HP_ILO) += hpilo.o +obj-$(CONFIG_APDS9802ALS) += apds9802als.o obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c new file mode 100644 index 000000000000..fbe49602f396 --- /dev/null +++ b/drivers/misc/apds9802als.c @@ -0,0 +1,308 @@ +/* + * apds9802als.c - apds9802 ALS Driver + * + * Copyright (C) 2009 Intel Corp + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALS_MIN_RANGE_VAL 1 +#define ALS_MAX_RANGE_VAL 2 +#define POWER_STA_ENABLE 1 +#define POWER_STA_DISABLE 0 +#define APDS9802ALS_I2C_ADDR 0x29 + +#define DRIVER_NAME "apds9802als" + +struct als_data { + struct device *hwmon_dev; + bool needresume; + struct mutex mutex; +}; + +static ssize_t als_sensing_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int val; + + val = i2c_smbus_read_byte_data(client, 0x81); + if (val < 0) + return val; + if (val & 1) + return sprintf(buf, "4095\n"); + else + return sprintf(buf, "65535\n"); +} + +static ssize_t als_lux0_input_data_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned int ret_val; + int temp; + + /* Protect against parallel reads */ + mutex_lock(&data->mutex); + temp = i2c_smbus_read_byte_data(client, 0x8C);/*LSB data*/ + if (temp < 0) { + ret_val = temp; + goto failed; + } + ret_val = i2c_smbus_read_byte_data(client, 0x8D);/*MSB data*/ + if (ret_val < 0) + goto failed; + mutex_unlock(&data->mutex); + ret_val = (ret_val << 8) | temp; + return sprintf(buf, "%d\n", ret_val); +failed: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_sensing_range_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned int ret_val; + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val < 4096) + val = 1; + else if (val < 65536) + val = 2; + else + return -ERANGE; + + /* Make sure nobody else reads/modifies/writes 0x81 while we + are active */ + + mutex_lock(&data->mutex); + + ret_val = i2c_smbus_read_byte_data(client, 0x81); + if (ret_val < 0) + goto fail; + + /* Reset the bits before setting them */ + ret_val = ret_val & 0xFA; + + if (val == 1) /* Setting the continous measurement up to 4k LUX */ + ret_val = (ret_val | 0x05); + else /* Setting the continous measurement up to 64k LUX*/ + ret_val = (ret_val | 0x04); + + ret_val = i2c_smbus_write_byte_data(client, 0x81, ret_val); + if (ret_val >= 0) { + /* All OK */ + mutex_unlock(&data->mutex); + return count; + } +fail: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_power_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret_val; + ret_val = i2c_smbus_read_byte_data(client, 0x80); + if (ret_val < 0) + return ret_val; + ret_val = ret_val & 0x01; + return sprintf(buf, "%d\n", ret_val); +} + +static int als_set_power_state(struct i2c_client *client, bool on_off) +{ + int ret_val; + struct als_data *data = i2c_get_clientdata(client); + + mutex_lock(&data->mutex); + ret_val = i2c_smbus_read_byte_data(client, 0x80); + if (ret_val < 0) + goto fail; + if (on_off) + ret_val = ret_val | 0x01; + else + ret_val = ret_val & 0xFE; + ret_val = i2c_smbus_write_byte_data(client, 0x80, ret_val); +fail: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_power_status_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = 
to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned long val; + int ret_val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + if (val == POWER_STA_ENABLE) { + ret_val = als_set_power_state(client, true); + data->needresume = true; + } else if (val == POWER_STA_DISABLE) { + ret_val = als_set_power_state(client, false); + data->needresume = false; + } else + return -EINVAL; + if (ret_val < 0) + return ret_val; + return count; +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, + als_sensing_range_show, als_sensing_range_store); +static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux0_input_data_show, NULL); +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, + als_power_status_show, als_power_status_store); + +static struct attribute *mid_att_als[] = { + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_input.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group m_als_gr = { + .name = "apds9802als", + .attrs = mid_att_als +}; + +static int als_set_default_config(struct i2c_client *client) +{ + int ret_val; + /* Write the command and then switch on */ + ret_val = i2c_smbus_write_byte_data(client, 0x80, 0x01); + if (ret_val < 0) { + dev_err(&client->dev, "failed default switch on write\n"); + return ret_val; + } + /* Continous from 1Lux to 64k Lux */ + ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x04); + if (ret_val < 0) + dev_err(&client->dev, "failed default LUX on write\n"); + return ret_val; +} + +static int apds9802als_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int res; + struct als_data *data; + + data = kzalloc(sizeof(struct als_data), GFP_KERNEL); + if (data == NULL) { + dev_err(&client->dev, "Memory allocation failed\n"); + return -ENOMEM; + } + i2c_set_clientdata(client, data); + res = sysfs_create_group(&client->dev.kobj, &m_als_gr); + if (res) { + dev_err(&client->dev, "device create file failed\n"); + goto als_error1; + } + dev_info(&client->dev, + "%s apds9802als: ALS chip found\n", client->name); + als_set_default_config(client); + data->needresume = true; + mutex_init(&data->mutex); + return res; +als_error1: + i2c_set_clientdata(client, NULL); + kfree(data); + return res; +} + +static int apds9802als_remove(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + sysfs_remove_group(&client->dev.kobj, &m_als_gr); + kfree(data); + return 0; +} + +static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) +{ + als_set_power_state(client, false); + return 0; +} + +static int apds9802als_resume(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + + if (data->needresume == true) + als_set_power_state(client, true); + return 0; +} + +static struct i2c_device_id apds9802als_id[] = { + { DRIVER_NAME, 0 }, + { } +}; + +MODULE_DEVICE_TABLE(i2c, apds9802als_id); + +static struct i2c_driver apds9802als_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + }, + .probe = apds9802als_probe, + .remove = apds9802als_remove, + .suspend = apds9802als_suspend, + .resume = apds9802als_resume, + .id_table = apds9802als_id, +}; + +static int __init sensor_apds9802als_init(void) +{ + return i2c_add_driver(&apds9802als_driver); +} + +static void __exit sensor_apds9802als_exit(void) +{ + i2c_del_driver(&apds9802als_driver); +} +module_init(sensor_apds9802als_init); +module_exit(sensor_apds9802als_exit); + +MODULE_AUTHOR("Anantha Narayanan Date: Tue, 26 Oct 2010 14:22:42 -0700 Subject: [PATCH 
113/176] drivers/misc/apds9802als.c: add runtime PM support Update the driver for the needed runtime power features. Remove the old user controlled power functions. [akpm@linux-foundation.org: put PM code under CONFIG_PM] Signed-off-by: Hong Liu Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/apds9802als.c | 167 +++++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 64 deletions(-) diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c index fbe49602f396..f9b91ba8900c 100644 --- a/drivers/misc/apds9802als.c +++ b/drivers/misc/apds9802als.c @@ -29,19 +29,16 @@ #include #include #include -#include +#include #define ALS_MIN_RANGE_VAL 1 #define ALS_MAX_RANGE_VAL 2 #define POWER_STA_ENABLE 1 #define POWER_STA_DISABLE 0 -#define APDS9802ALS_I2C_ADDR 0x29 #define DRIVER_NAME "apds9802als" struct als_data { - struct device *hwmon_dev; - bool needresume; struct mutex mutex; }; @@ -60,29 +57,64 @@ static ssize_t als_sensing_range_show(struct device *dev, return sprintf(buf, "65535\n"); } +static int als_wait_for_data_ready(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret; + int retry = 10; + + do { + msleep(30); + ret = i2c_smbus_read_byte_data(client, 0x86); + } while (!(ret & 0x80) && retry--); + + if (!retry) { + dev_warn(dev, "timeout waiting for data ready\n"); + return -ETIMEDOUT; + } + + return 0; +} + static ssize_t als_lux0_input_data_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); struct als_data *data = i2c_get_clientdata(client); - unsigned int ret_val; + int ret_val; int temp; /* Protect against parallel reads */ + pm_runtime_get_sync(dev); mutex_lock(&data->mutex); - temp = i2c_smbus_read_byte_data(client, 0x8C);/*LSB data*/ + + /* clear EOC interrupt status */ + i2c_smbus_write_byte(client, 0x40); + /* start measurement */ + temp = i2c_smbus_read_byte_data(client, 0x81); + i2c_smbus_write_byte_data(client, 0x81, temp | 0x08); + + ret_val = als_wait_for_data_ready(dev); + if (ret_val < 0) + goto failed; + + temp = i2c_smbus_read_byte_data(client, 0x8C); /* LSB data */ if (temp < 0) { ret_val = temp; goto failed; } - ret_val = i2c_smbus_read_byte_data(client, 0x8D);/*MSB data*/ + ret_val = i2c_smbus_read_byte_data(client, 0x8D); /* MSB data */ if (ret_val < 0) goto failed; + mutex_unlock(&data->mutex); - ret_val = (ret_val << 8) | temp; - return sprintf(buf, "%d\n", ret_val); + pm_runtime_put_sync(dev); + + temp = (ret_val << 8) | temp; + return sprintf(buf, "%d\n", temp); failed: mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return ret_val; } @@ -104,9 +136,10 @@ static ssize_t als_sensing_range_store(struct device *dev, else return -ERANGE; + pm_runtime_get_sync(dev); + /* Make sure nobody else reads/modifies/writes 0x81 while we are active */ - mutex_lock(&data->mutex); ret_val = i2c_smbus_read_byte_data(client, 0x81); @@ -116,34 +149,25 @@ static ssize_t als_sensing_range_store(struct device *dev, /* Reset the bits before setting them */ ret_val = ret_val & 0xFA; - if (val == 1) /* Setting the continous measurement up to 4k LUX */ - ret_val = (ret_val | 0x05); - else /* Setting the continous measurement up to 64k LUX*/ - ret_val = (ret_val | 0x04); + if (val == 1) /* Setting detection range up to 4k LUX */ + ret_val = (ret_val | 0x01); + else /* Setting detection range up to 64k LUX*/ + ret_val = (ret_val | 0x00); ret_val = i2c_smbus_write_byte_data(client, 0x81, ret_val); + if (ret_val >= 
0) { /* All OK */ mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return count; } fail: mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return ret_val; } -static ssize_t als_power_status_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct i2c_client *client = to_i2c_client(dev); - int ret_val; - ret_val = i2c_smbus_read_byte_data(client, 0x80); - if (ret_val < 0) - return ret_val; - ret_val = ret_val & 0x01; - return sprintf(buf, "%d\n", ret_val); -} - static int als_set_power_state(struct i2c_client *client, bool on_off) { int ret_val; @@ -163,39 +187,13 @@ static int als_set_power_state(struct i2c_client *client, bool on_off) return ret_val; } -static ssize_t als_power_status_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct i2c_client *client = to_i2c_client(dev); - struct als_data *data = i2c_get_clientdata(client); - unsigned long val; - int ret_val; - - if (strict_strtoul(buf, 10, &val)) - return -EINVAL; - if (val == POWER_STA_ENABLE) { - ret_val = als_set_power_state(client, true); - data->needresume = true; - } else if (val == POWER_STA_DISABLE) { - ret_val = als_set_power_state(client, false); - data->needresume = false; - } else - return -EINVAL; - if (ret_val < 0) - return ret_val; - return count; -} - static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, als_sensing_range_show, als_sensing_range_store); static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux0_input_data_show, NULL); -static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, - als_power_status_show, als_power_status_store); static struct attribute *mid_att_als[] = { &dev_attr_lux0_sensor_range.attr, &dev_attr_lux0_input.attr, - &dev_attr_power_state.attr, NULL }; @@ -213,15 +211,21 @@ static int als_set_default_config(struct i2c_client *client) dev_err(&client->dev, "failed default switch on write\n"); return ret_val; } - /* Continous from 1Lux to 64k Lux */ - ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x04); + /* detection range: 1~64K Lux, maunal measurement */ + ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x08); if (ret_val < 0) dev_err(&client->dev, "failed default LUX on write\n"); + + /* We always get 0 for the 1st measurement after system power on, + * so make sure it is finished before user asks for data. 
+ */ + als_wait_for_data_ready(&client->dev); + return ret_val; } -static int apds9802als_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds9802als_probe(struct i2c_client *client, + const struct i2c_device_id *id) { int res; struct als_data *data; @@ -237,11 +241,14 @@ static int apds9802als_probe(struct i2c_client *client, dev_err(&client->dev, "device create file failed\n"); goto als_error1; } - dev_info(&client->dev, - "%s apds9802als: ALS chip found\n", client->name); + dev_info(&client->dev, "ALS chip found\n"); als_set_default_config(client); - data->needresume = true; mutex_init(&data->mutex); + + pm_runtime_enable(&client->dev); + pm_runtime_get(&client->dev); + pm_runtime_put(&client->dev); + return res; als_error1: i2c_set_clientdata(client, NULL); @@ -252,11 +259,14 @@ static int apds9802als_probe(struct i2c_client *client, static int apds9802als_remove(struct i2c_client *client) { struct als_data *data = i2c_get_clientdata(client); + + als_set_power_state(client, false); sysfs_remove_group(&client->dev.kobj, &m_als_gr); kfree(data); return 0; } +#ifdef CONFIG_PM static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) { als_set_power_state(client, false); @@ -265,13 +275,42 @@ static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) static int apds9802als_resume(struct i2c_client *client) { - struct als_data *data = i2c_get_clientdata(client); + als_set_default_config(client); - if (data->needresume == true) - als_set_power_state(client, true); + pm_runtime_get(&client->dev); + pm_runtime_put(&client->dev); return 0; } +static int apds9802als_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + als_set_power_state(client, false); + return 0; +} + +static int apds9802als_runtime_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + als_set_power_state(client, true); + return 0; +} + +static const struct dev_pm_ops apds9802als_pm_ops = { + .runtime_suspend = apds9802als_runtime_suspend, + .runtime_resume = apds9802als_runtime_resume, +}; + +#define APDS9802ALS_PM_OPS (&apds9802als_pm_ops) + +#else /* CONFIG_PM */ +#define apds9802als_suspend NULL +#define apds9802als_resume NULL +#define APDS9802ALS_PM_OPS NULL +#endif /* CONFIG_PM */ + static struct i2c_device_id apds9802als_id[] = { { DRIVER_NAME, 0 }, { } @@ -281,8 +320,8 @@ MODULE_DEVICE_TABLE(i2c, apds9802als_id); static struct i2c_driver apds9802als_driver = { .driver = { - .name = DRIVER_NAME, - .owner = THIS_MODULE, + .name = DRIVER_NAME, + .pm = APDS9802ALS_PM_OPS, }, .probe = apds9802als_probe, .remove = apds9802als_remove, From 562f5e638de4ef451226552fe8dd7847bacea24e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:42 -0700 Subject: [PATCH 114/176] init: mark __user address space on string literals When calling syscall service routines in kernel, some of arguments should be user pointers but were missing __user markup on string literals. Add it. Removes some sparse warnings. 
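For readers unfamiliar with the annotation, here is a minimal standalone sketch of what sparse is checking (the macro definitions mirror roughly what <linux/compiler.h> provides when __CHECKER__ is set; the syscall stub is hypothetical):

    #ifdef __CHECKER__
    # define __user   __attribute__((noderef, address_space(1)))
    # define __force  __attribute__((force))
    #else
    # define __user
    # define __force
    #endif

    static long sys_chdir_stub(const char __user *filename)
    {
            (void)filename;                  /* stand-in for the real sys_chdir() */
            return 0;
    }

    static void early_boot_example(void)
    {
            /*
             * "/root" is an ordinary kernel string literal (address space 0),
             * so passing it directly makes sparse warn about the address-space
             * mismatch.  The __force cast records that crossing the boundary
             * here is intentional, which is what this patch adds at each call.
             */
            sys_chdir_stub((const char __user __force *)"/root");
    }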
Signed-off-by: Namhyung Kim Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 4 ++-- init/do_mounts_md.c | 2 +- init/do_mounts_rd.c | 4 ++-- init/initramfs.c | 5 +++-- init/noinitramfs.c | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 62a47eafa8e9..830aaec9c7d5 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -291,7 +291,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) if (err) return err; - sys_chdir("/root"); + sys_chdir((const char __user __force *)"/root"); ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", current->fs->pwd.mnt->mnt_sb->s_type->name, @@ -488,5 +488,5 @@ void __init prepare_namespace(void) out: devtmpfs_mount("dev"); sys_mount(".", "/", NULL, MS_MOVE, NULL); - sys_chroot("."); + sys_chroot((const char __user __force *)"."); } diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 69aebbf8fd2d..32c4799b8c91 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -283,7 +283,7 @@ static void __init autodetect_raid(void) wait_for_device_probe(); - fd = sys_open("/dev/md0", 0, 0); + fd = sys_open((const char __user __force *) "/dev/md0", 0, 0); if (fd >= 0) { sys_ioctl(fd, RAID_AUTORUN, raid_autopart); sys_close(fd); diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index bf3ef667bf36..6e1ee6987c78 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -168,7 +168,7 @@ int __init rd_load_image(char *from) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif - out_fd = sys_open("/dev/ram", O_RDWR, 0); + out_fd = sys_open((const char __user __force *) "/dev/ram", O_RDWR, 0); if (out_fd < 0) goto out; @@ -267,7 +267,7 @@ int __init rd_load_image(char *from) sys_close(out_fd); out: kfree(buf); - sys_unlink("/dev/ram"); + sys_unlink((const char __user __force *) "/dev/ram"); return res; } diff --git a/init/initramfs.c b/init/initramfs.c index 4b9c20205092..d9c6e782ff53 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -528,7 +528,7 @@ static void __init clean_rootfs(void) struct linux_dirent64 *dirp; int num; - fd = sys_open("/", O_RDONLY, 0); + fd = sys_open((const char __user __force *) "/", O_RDONLY, 0); WARN_ON(fd < 0); if (fd < 0) return; @@ -590,7 +590,8 @@ static int __init populate_rootfs(void) } printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); - fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); + fd = sys_open((const char __user __force *) "/initrd.image", + O_WRONLY|O_CREAT, 0700); if (fd >= 0) { sys_write(fd, (char *)initrd_start, initrd_end - initrd_start); diff --git a/init/noinitramfs.c b/init/noinitramfs.c index f4c1a3a1b8c5..267739d85179 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -29,17 +29,17 @@ static int __init default_rootfs(void) { int err; - err = sys_mkdir("/dev", 0755); + err = sys_mkdir((const char __user __force *) "/dev", 0755); if (err < 0) goto out; - err = sys_mknod((const char __user *) "/dev/console", + err = sys_mknod((const char __user __force *) "/dev/console", S_IFCHR | S_IRUSR | S_IWUSR, new_encode_dev(MKDEV(5, 1))); if (err < 0) goto out; - err = sys_mkdir("/root", 0700); + err = sys_mkdir((const char __user __force *) "/root", 0700); if (err < 0) goto out; From 571428be550fbe37160596995e96ad398873fcbd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:43 -0700 Subject: [PATCH 115/176] kernel/user.c: add lock 
release annotation on free_user() free_user() releases uidhash_lock but was missing annotation. Add it. This removes the following sparse warnings: include/linux/spinlock.h:339:9: warning: context imbalance in 'free_user' - unexpected unlock kernel/user.c:120:6: warning: context imbalance in 'free_uid' - wrong count at exit Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Dhaval Giani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..2c7d8d5914b1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: [PATCH 116/176] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32-bit value: We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. The fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin's problem.
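To make the arithmetic concrete, a small userspace sketch with the numbers from the report (illustrative only; the wrap-around value assumes the usual two's-complement behaviour):

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            int max_files = 1503238553;            /* files_stat.max_files on the 16TB box  */
            long long intended = 2LL * max_files;  /* value the comparison was meant to use */
            int wrapped = (int)(2u * (unsigned int)max_files); /* what a 32-bit int holds   */

            printf("2 * max_files = %lld, as a 32-bit int it becomes %d (INT_MAX = %d)\n",
                   intended, wrapped, INT_MAX);
            return 0;
    }

The intended value, 3,006,477,106, exceeds INT_MAX (2,147,483,647), so the limit check in unix_create1() ends up comparing against a negative number and every socket creation fails.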
Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 17 +++++++---------- include/linux/fs.h | 8 ++++---- kernel/sysctl.c | 6 +++--- net/unix/af_unix.c | 14 +++++++------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11c..c3dee381f1b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ void mark_files_ro(struct super_block *sb) void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f34ff6e5558..b2cdb6bc8287 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,9 +34,9 @@ /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ }; struct inodes_stat_t { @@ -400,7 +400,7 @@ extern void __init inode_init_early(void); extern void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern int get_max_files(void); +extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; diff --git 
a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..694b140852c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); From ca51c5a76345b28c6f1b742f9f5f0a6fc9afd9ca Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: [PATCH 117/176] kernel/stop_machine.c: fix unused variable warning kernel/stop_machine.c: In function `cpu_stopper_thread': kernel/stop_machine.c:265: warning: unused variable `ksym_buf' ksym_buf[] is unused if WARN_ON() is a no-op. 
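As a minimal illustration of the fix that follows, the kernel's __maybe_unused boils down to the gcc attribute shown here; it keeps -Wunused-variable quiet for a buffer that is only referenced when the debug macro is compiled in (sketch only, not the stop_machine.c code):

    #define __maybe_unused __attribute__((unused))   /* what the kernel macro expands to */

    static void stopper_example(void)
    {
            char ksym_buf[128] __maybe_unused;       /* only referenced inside WARN_ON() */

            /*
             * If the WARN_ON() that would consume ksym_buf compiles to a no-op,
             * nothing reads the buffer; the attribute suppresses the warning.
             */
    }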
Signed-off-by: Rakib Mullick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce1..3cb4a9a8ae1c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -262,7 +262,7 @@ static int cpu_stopper_thread(void *data) cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN]; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; __set_current_state(TASK_RUNNING); From 4ce6494dbd8909718840bb88d5a699ef6ce5c212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: [PATCH 118/176] stop_machine: convert cpu notifier to return encapsulate errno value In commit e6bde73b07edeb703d4c89c1daabc09c303de11f ("cpu-hotplug: return better errno on cpu hotplug failure"), the cpu notifier can return an encapsulated errno value. This converts the cpu notifier to return an encapsulated errno value for stop_machine(). Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3cb4a9a8ae1c..2df820b03beb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); @@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) /* start one for the boot cpu */ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, bcpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); From 766f9164193f6dda1497bbf3861060198421fb92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: [PATCH 119/176] kernel: remove PF_FLUSHER PF_FLUSHER is only ever set, not tested, remove it. 
Signed-off-by: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/sched.h | 1 - mm/backing-dev.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b5aae4bd0aca..9e46aec10d1a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -794,7 +794,7 @@ int bdi_writeback_thread(void *data) struct backing_dev_info *bdi = wb->bdi; long pages_written; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); wb->last_active = jiffies; diff --git a/include/linux/sched.h b/include/linux/sched.h index 56154bbb8da9..393ce94e54b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1706,7 +1706,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ #define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ad3c106606b..f2eb27884ffa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* From 5f2365d8c24aec8dbedf49c69b7601c7cfaee2c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Oct 2010 14:22:46 -0700 Subject: [PATCH 120/176] misc devices: do not enable by default Do not enable this Kconfig menu by default since it contains devices not present on the majority of systems. This is becoming a pain and a waste of time especially when doing a bunch of kernel builds on different systems daily and have to answer "make oldconfig" prompts for strange devices. Signed-off-by: Borislav Petkov Acked-by: Mike Frysinger Cc: Michael Hennerich Cc: Andres Salomon Cc: Jean Delvare Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 6d7665ce6f4a..5a74db75f66f 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -4,7 +4,6 @@ menuconfig MISC_DEVICES bool "Misc devices" - default y ---help--- Say Y here to get to see options for device drivers from various different categories. This option alone does not add any kernel code. 
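The next patch moves a misplaced const in a printk array declaration; as a quick reference, the different placements mean the following (plain C, not tied to the kernel sources):

    const char *p1;             /* pointer to const char: *p1 may not be written    */
    char *const p2 = 0;         /* const pointer to char:  p2 may not be reseated   */
    const char *const p3 = 0;   /* both pointer and pointee are const               */
    const char const *p4;       /* duplicate qualifier: still only a pointer to
                                 * const char, which is the mistake fixed in
                                 * kmsg_reasons[] in the patch that follows         */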
From 6c095efd82e8f6a98515426a733110f91cf0a709 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: [PATCH 121/176] printk: fixup declaration of kmsg_reasons Move redundant 'const' after '*' to make pointer itself const Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 2531017795f6..d18933a92819 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1511,7 +1511,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char const *kmsg_reasons[] = { +static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", From 8155c02a44a95562e1ae0999360eb31288d7195a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: [PATCH 122/176] printk: add lock context annotation acquire_console_semaphore_for_printk() releases logbuf_lock but was missing proper annotation. Add it. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/printk.c b/kernel/printk.c index d18933a92819..6ff26151f4ff 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) * released but interrupts still disabled. */ static int acquire_console_semaphore_for_printk(unsigned int cpu) + __releases(&logbuf_lock) { int retval = 0; From 674dff6507d3f9b110219ea125cf5e1213c9acef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:48 -0700 Subject: [PATCH 123/176] printk: change type of 'boot_delay' to int * get_option() takes its 2nd arg as int * so passing boot_delay to it caused following warnings from sparse: kernel/printk.c:223:27: warning: incorrect type in argument 2 (different signedness) kernel/printk.c:223:27: expected int *pint kernel/printk.c:223:27: got unsigned int static [toplevel] * Since boot_delay can't grow more than 10,000 changing it to 'int *' will not produce any problem. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 6ff26151f4ff..b2ebaee8c377 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ +static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) From f5d87d851d76a390d0fab2f77bd1d563d69ee586 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: [PATCH 124/176] printk: declare printk_ratelimit_state in ratelimit.h Adding declaration of printk_ratelimit_state in ratelimit.h removes potential build breakage and following sparse warning: kernel/printk.c:1426:1: warning: symbol 'printk_ratelimit_state' was not declared. Should it be static? 
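The fix below moves the extern declaration into ratelimit.h, which both the defining file and its users include; that is the usual way to silence this class of sparse warning. A minimal sketch of the pattern (the definitions are shown schematically, not copied from the kernel sources):

/* include/linux/ratelimit.h: the single shared declaration */
extern struct ratelimit_state printk_ratelimit_state;

/* kernel/printk.c: the definition; because this file includes the
 * header above, sparse sees a prior declaration and no longer asks
 * whether the symbol should be static. */
#include <linux/ratelimit.h>
struct ratelimit_state printk_ratelimit_state;

/* kernel/sysctl.c: the local "extern struct ratelimit_state ..." line
 * becomes unnecessary; including <linux/ratelimit.h> is enough. */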
[akpm@linux-foundation.org: remove unneeded ifdef] Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ratelimit.h | 2 ++ kernel/sysctl.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 8f69d09a41a5..03ff67b0cdf5 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -36,6 +36,8 @@ static inline void ratelimit_state_init(struct ratelimit_state *rs, rs->begin = 0; } +extern struct ratelimit_state printk_ratelimit_state; + extern int ___ratelimit(struct ratelimit_state *rs, const char *func); #define __ratelimit(state) ___ratelimit(state, __func__) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c2..48d9d689498f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,8 +161,6 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif -extern struct ratelimit_state printk_ratelimit_state; - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); From 77006a0a828249dd69341f960043ee41e7487aa0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: [PATCH 125/176] ratelimit: add comment warning people off printk_ratelimit() printk_ratelimit() was a bad idea - we don't want subsystem A causing ratelimiting of subsystem B's messages. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e9b492b33032..77b04ed037df 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -277,6 +277,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))) __cold; +/* + * Please don't use printk_ratelimit(), because it shares ratelimiting state + * with all other unrelated printk_ratelimit() callsites. Instead use + * printk_ratelimited() or plain old __ratelimit(). + */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, From 5e0579812834ab7fa072db4a15ebdff68d62e2e7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:50 -0700 Subject: [PATCH 126/176] vsprintf.c: use default pointer field size for "(null)" strings It might be nicer to align the output. For instance, ACPI messages sometimes have "(null)" pointers.
$ dmesg | grep "(null)" -A 1 -B 1 [ 0.198733] ACPI: Dynamic OEM Table Load: [ 0.198745] ACPI: SSDT (null) 00239 (v02 PmRef Cpu0Ist 00003000 INTL 20051117) [ 0.199294] ACPI: SSDT 7f596e10 001C7 (v02 PmRef Cpu0Cst 00003001 INTL 20051117) [ 0.200708] ACPI: Dynamic OEM Table Load: [ 0.200721] ACPI: SSDT (null) 001C7 (v02 PmRef Cpu0Cst 00003001 INTL 20051117) [ 0.201950] ACPI: SSDT 7f597f10 000D0 (v02 PmRef Cpu1Ist 00003000 INTL 20051117) [ 0.203386] ACPI: Dynamic OEM Table Load: [ 0.203398] ACPI: SSDT (null) 000D0 (v02 PmRef Cpu1Ist 00003000 INTL 20051117) [ 0.203871] ACPI: SSDT 7f595f10 00083 (v02 PmRef Cpu1Cst 00003000 INTL 20051117) [ 0.205301] ACPI: Dynamic OEM Table Load: [ 0.205315] ACPI: SSDT (null) 00083 (v02 PmRef Cpu1Cst 00003000 INTL 20051117) [akpm@linux-foundation.org: add code comment] Signed-off-by: Joe Perches Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7af9d841c43b..8378c136b6e1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -988,8 +988,15 @@ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { - if (!ptr) + if (!ptr) { + /* + * Print (null) with the same width as a pointer so it makes + * tabular output look nice. + */ + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); return string(buf, end, "(null)", spec); + } switch (*fmt) { case 'F': @@ -1031,7 +1038,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, } spec.flags |= SMALL; if (spec.field_width == -1) { - spec.field_width = 2*sizeof(void *); + spec.field_width = 2 * sizeof(void *); spec.flags |= ZEROPAD; } spec.base = 16; From b903c0b8899b46829a9b80ba55b61079b35940ec Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 26 Oct 2010 14:22:50 -0700 Subject: [PATCH 127/176] lib: fix scnprintf() if @size is == 0 scnprintf() should return 0 if @size is == 0. Update the comment for it, as @size is unsigned. Signed-off-by: Changli Gao Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 8378c136b6e1..c150d3dafff4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1504,7 +1504,7 @@ EXPORT_SYMBOL(snprintf); * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including - * the trailing '\0'. If @size is <= 0 the function returns 0. + * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) @@ -1516,7 +1516,11 @@ int scnprintf(char *buf, size_t size, const char *fmt, ...) i = vsnprintf(buf, size, fmt, args); va_end(args); - return (i >= size) ? (size - 1) : i; + if (likely(i < size)) + return i; + if (size != 0) + return size - 1; + return 0; } EXPORT_SYMBOL(scnprintf); From 63ab52db5ba7f362266cfed03109387ca73e5eb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:51 -0700 Subject: [PATCH 128/176] scripts/get_maintainer.pl: Add --git-blame --rolestats "Authored lines" information When options --git-blame and --rolestats are specified, add the maintainers with the qualifying --git-min-percent amount of lines authored of the complete file. Does not add more authors than specified by --git-max-maintainers. 
For anyone using hg, this option works but is _very_ slow. It's orders of magnitude slower than git slow. The get_maintainer.pl version was incremented to 0.25. This can be used with or without --git. For instance: $ ./scripts/get_maintainer.pl --git-blame --nogit --rolestats -f lib/bitmap.c Paul Jackson (authored lines:406/613=66%,commits:7/20=35%) Akinobu Mita (authored lines:87/613=14%,commits:3/20=15%) Reinette Chatre (authored lines:42/613=7%) Andrew Morton (commits:16/20=80%) Paul Mundt (commits:3/20=15%) Randy Dunlap (commits:2/20=10%) $ ./scripts/get_maintainer.pl --git-blame --git --rolestats -f lib/bitmap.c Andrew Morton (commit_signer:4/5=80%,commits:16/20=80%) Akinobu Mita (commit_signer:2/5=40%,authored lines:87/613=14%,commits:3/20=15%) Jack Steiner (commit_signer:1/5=20%) Ben Hutchings (commit_signer:1/5=20%) Lee Schermerhorn (commit_signer:1/5=20%) Paul Jackson (authored lines:406/613=66%,commits:7/20=35%) Reinette Chatre (authored lines:42/613=7%) Paul Mundt (commits:3/20=15%) Randy Dunlap (commits:2/20=10%) linux-kernel@vger.kernel.org (open list) Signed-off-by: Joe Perches Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 60 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index b2281982f52f..a91ae6318403 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.24'; +my $V = '0.25'; use Getopt::Long qw(:config no_auto_abbrev); @@ -88,6 +88,7 @@ my %VCS_cmds_git = ( "available" => '(which("git") ne "") && (-d ".git")', "find_signers_cmd" => "git log --no-color --since=\$email_git_since -- \$file", "find_commit_signers_cmd" => "git log --no-color -1 \$commit", + "find_commit_author_cmd" => "git log -1 --format=\"%an <%ae>\" \$commit", "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", @@ -101,6 +102,7 @@ my %VCS_cmds_hg = ( "hg log --date=\$email_hg_since" . 
" --template='commit {node}\\n{desc}\\n' -- \$file", "find_commit_signers_cmd" => "hg log --template='{desc}\\n' -r \$commit", + "find_commit_author_cmd" => "hg log -l 1 --template='{author}\\n' -r \$commit", "blame_range_cmd" => "", # not supported "blame_file_cmd" => "hg blame -c \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", @@ -1014,6 +1016,9 @@ sub vcs_find_signers { if (!$email_git_penguin_chiefs) { @lines = grep(!/${penguin_chiefs}/i, @lines); } + + return (0, @lines) if !@lines; + # cut -f2- -d":" s/.*:\s*(.+)\s*/$1/ for (@lines); @@ -1027,6 +1032,28 @@ sub vcs_find_signers { return ($commits, @lines); } +sub vcs_find_author { + my ($cmd) = @_; + my @lines = (); + + @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + + return @lines if !@lines; + +## Reformat email addresses (with names) to avoid badly written signatures + + foreach my $line (@lines) { + my ($name, $address) = parse_email($line); + $line = format_email($name, $address, 1); + } + + return @lines; +} + sub vcs_save_commits { my ($cmd) = @_; my @lines = (); @@ -1084,6 +1111,10 @@ sub vcs_blame { @commits = vcs_save_commits($cmd); } + foreach my $commit (@commits) { + $commit =~ s/^\^//g; + } + return @commits; } @@ -1121,6 +1152,8 @@ sub vcs_assign { @lines = mailmap(@lines); } + return if (@lines <= 0); + @lines = sort(@lines); # uniq -c @@ -1165,14 +1198,17 @@ sub vcs_file_blame { my ($file) = @_; my @signers = (); + my @all_commits = (); my @commits = (); my $total_commits; + my $total_lines; return if (!vcs_exists()); - @commits = vcs_blame($file); - @commits = uniq(@commits); + @all_commits = vcs_blame($file); + @commits = uniq(@all_commits); $total_commits = @commits; + $total_lines = @all_commits; foreach my $commit (@commits) { my $commit_count; @@ -1182,10 +1218,28 @@ sub vcs_file_blame { $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd ($commit_count, @commit_signers) = vcs_find_signers($cmd); + push(@signers, @commit_signers); } if ($from_filename) { + if ($output_rolestats) { + my @blame_signers; + foreach my $commit (@commits) { + my $i; + my $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + my @author = vcs_find_author($cmd); + next if !@author; + my $count = grep(/$commit/, @all_commits); + for ($i = 0; $i < $count ; $i++) { + push(@blame_signers, $author[0]); + } + } + if (@blame_signers) { + vcs_assign("authored lines", $total_lines, @blame_signers); + } + } vcs_assign("commits", $total_commits, @signers); } else { vcs_assign("modified commits", $total_commits, @signers); From 6ffd9485f5c9c0b2d2aea9f904dff08e7088010a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:51 -0700 Subject: [PATCH 129/176] scripts/get_maintainer.pl: use correct indentation Fix an overly indented block. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index a91ae6318403..65c6acf1bb3b 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -420,23 +420,23 @@ foreach my $file (@files) { foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { add_categories($line); - if ($sections) { - my $i; - my $start = find_starting_index($line); - my $end = find_ending_index($line); - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ /^[FX]:/) { ##Restore file patterns - $line =~ s/([^\\])\.([^\*])/$1\?$2/g; - $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? - $line =~ s/\\\./\./g; ##Convert \. to . - $line =~ s/\.\*/\*/g; ##Convert .* to * - } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); + if ($sections) { + my $i; + my $start = find_starting_index($line); + my $end = find_ending_index($line); + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ /^[FX]:/) { ##Restore file patterns + $line =~ s/([^\\])\.([^\*])/$1\?$2/g; + $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? + $line =~ s/\\\./\./g; ##Convert \. to . + $line =~ s/\.\*/\*/g; ##Convert .* to * } - print("\n"); + $line =~ s/^([A-Z]):/$1:\t/g; + print("$line\n"); } + print("\n"); + } } if ($email && $email_git) { From fab9ed12fcd0c182a72509382c3da55c527963e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:52 -0700 Subject: [PATCH 130/176] scripts/get_maintainer.pl: don't search MAINTAINERS for keywords or emails Keyword matching uses K: patterns from MAINTAINERS, so if looking for the MAINTAINERS maintainer, don't search MAINTAINERS for pattern matches. MAINTAINERS also has rather a lot of email addresses and is easily searched using grep "^M:", so skip it. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 65c6acf1bb3b..77f4f2e4cd81 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -304,7 +304,7 @@ foreach my $file (@ARGV) { } if ($from_filename) { push(@files, $file); - if (-f $file && ($keywords || $file_emails)) { + if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; From e3e9d11479737692f797bad1762f71468d577a93 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:53 -0700 Subject: [PATCH 131/176] scripts/get_maintainer.pl: add default --git-fallback, remove default --git Adding commit signers when there is a listed MAINTAINER for a file can make the output list longer than necessary. Change the --git default from on to off. Add a new --git-fallback option (default on) used to add commit signers only when there is no MAINTAINER for a file. git history is used when --git-fallback is enabled and the pattern directory depth is not the same as the file directory depth. For instance: X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner M: Ingo Molnar M: "H. 
Peter Anvin" M: x86@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git S: Maintained F: Documentation/x86/ F: arch/x86/ If using "./scripts/get_maintainer -f arch/x86/lib/atomic64_32.c", the pattern for "arch/x86/" does not match the directory depth of "arch/x86/lib" so the MAINTAINERS entries and git history is used to produce: $ ./scripts/get_maintainer.pl -f --rolestats arch/x86/lib/atomic64_32.c Thomas Gleixner (maintainer:X86 ARCHITECTURE...) Ingo Molnar (maintainer:X86 ARCHITECTURE...) "H. Peter Anvin" (maintainer:X86 ARCHITECTURE...,commit_signer:1/1=100%) x86@kernel.org (maintainer:X86 ARCHITECTURE...) Luca Barbieri (commit_signer:1/1=100%) linux-kernel@vger.kernel.org (open list) Luca Barbieri is added because he signed the only commit to arch/x86/lib/atomic64_32.c during the last year and he meets the other default qualifications. --git-min-percent (default:10) --git-min-signatures (default:1) If current users of ./scripts/get_maintainers.pl have scripts that use --nogit that expect git history to be excluded, those scripts should be updated to include --nogit-fallback or a .get_maintainer.conf file should be created with --nogit-fallback. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 77f4f2e4cd81..f46576949ccb 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -24,9 +24,10 @@ my $email_maintainer = 1; my $email_list = 1; my $email_subscriber_list = 0; my $email_git_penguin_chiefs = 0; -my $email_git = 1; +my $email_git = 0; my $email_git_all_signature_types = 0; my $email_git_blame = 0; +my $email_git_fallback = 1; my $email_git_min_signatures = 1; my $email_git_max_maintainers = 5; my $email_git_min_percent = 5; @@ -138,6 +139,7 @@ if (!GetOptions( 'git!' => \$email_git, 'git-all-signature-types!' => \$email_git_all_signature_types, 'git-blame!' => \$email_git_blame, + 'git-fallback!' => \$email_git_fallback, 'git-chief-penguins!' 
=> \$email_git_penguin_chiefs, 'git-min-signatures=i' => \$email_git_min_signatures, 'git-max-maintainers=i' => \$email_git_max_maintainers, @@ -371,6 +373,7 @@ my @status = (); foreach my $file (@files) { my %hash; + my $exact_pattern_match = 0; my $tvi = find_first_section(); while ($tvi < @typevalue) { my $start = find_starting_index($tvi); @@ -405,6 +408,8 @@ foreach my $file (@files) { my $value_pd = ($value =~ tr@/@@); my $file_pd = ($file =~ tr@/@@); $value_pd++ if (substr($value,-1,1) ne "/"); + $value_pd = -1 if ($value =~ /^\.\*/); + $exact_pattern_match = 1 if ($value_pd >= $file_pd); if ($pattern_depth == 0 || (($file_pd - $value_pd) < $pattern_depth)) { $hash{$tvi} = $value_pd; @@ -439,7 +444,8 @@ foreach my $file (@files) { } } - if ($email && $email_git) { + if ($email && + ($email_git || ($email_git_fallback && !$exact_pattern_match))) { vcs_file_signoffs($file); } @@ -540,6 +546,7 @@ MAINTAINER field selection options: --git => include recent git \*-by: signers --git-all-signature-types => include signers regardless of signature type or use only ${signaturePattern} signers (default: $email_git_all_signature_types) + --git-fallback => use git when no exact MAINTAINERS pattern (default: $email_git_fallback) --git-chief-penguins => include ${penguin_chiefs} --git-min-signatures => number of signatures required (default: $email_git_min_signatures) --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers) From bcde44ed7d2a58733efdf04b5392c027d1348bac Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:53 -0700 Subject: [PATCH 132/176] scripts/get_maintainer.pl: use .get_maintainer.conf from . then $HOME then scripts On Mon, 2010-09-13 at 00:01 -0400, Valdis.Kletnieks@vt.edu wrote: > Any chance of getting that to be ~/.get_maintainer.conf rather than > ./.get_maintainer.conf? I've just gotten bit like the 3rd or 4th time by > "oh but you didn't create that file in *this* tree" > (I usually have a linus git tree, a linux-next tree, and 3-4 -mm trees) Sure. Add a search path for the .conf file. 3 paths are added: . 
customized per-tree configurations $HOME user global configuration when per-tree configs don't exist ./scripts lk defaults to override script Signed-off-by: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index f46576949ccb..e5a400c53bf0 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -110,10 +110,12 @@ my %VCS_cmds_hg = ( "blame_commit_pattern" => "^([0-9a-f]+):" ); -if (-f "${lk_path}.get_maintainer.conf") { +my $conf = which_conf(".get_maintainer.conf"); +if (-f $conf) { my @conf_args; - open(my $conffile, '<', "${lk_path}.get_maintainer.conf") - or warn "$P: Can't open .get_maintainer.conf: $!\n"; + open(my $conffile, '<', "$conf") + or warn "$P: Can't find a readable .get_maintainer.conf file $!\n"; + while (<$conffile>) { my $line = $_; @@ -961,6 +963,18 @@ sub which { return ""; } +sub which_conf { + my ($conf) = @_; + + foreach my $path (split(/:/, ".:$ENV{HOME}:.scripts")) { + if (-e "$path/$conf") { + return "$path/$conf"; + } + } + + return ""; +} + sub mailmap { my (@lines) = @_; my %hash; From dace8e300d6820c2842de750d12b498a743bcfe5 Mon Sep 17 00:00:00 2001 From: Florian Mickler Date: Tue, 26 Oct 2010 14:22:54 -0700 Subject: [PATCH 133/176] scripts/get_maintainer.pl: add interactive mode This is a first version of an interactive mode for scripts/get_maintainer.pl. It allows the user to interact with the script. Each cc candidate can be selected and deselected and a shortlog of authored commits can be displayed for each candidate. The menu is displayed via STDERR, the end result is outputted to STDOUT. This unusual mechanism allows using get_maintainer.pl in interactive mode via git send-email --cc-cmd. 
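The STDERR/STDOUT split matters because a --cc-cmd consumer reads only standard output; everything interactive has to stay on standard error. A minimal sketch of the same menu-on-stderr, result-on-stdout pattern in C (the addresses and menu entries are made up; nothing here is taken from the script itself):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16];

	/* The menu goes to stderr, so a pipeline reading our stdout never sees it. */
	fprintf(stderr, "1: maintainer@example.org\n");
	fprintf(stderr, "2: list@example.org\n");
	fprintf(stderr, "select an entry (blank line to finish): ");

	if (fgets(buf, sizeof(buf), stdin)) {
		buf[strcspn(buf, "\n")] = '\0';
		/* Only the final answer is written to stdout. */
		if (!strcmp(buf, "1"))
			puts("maintainer@example.org");
		else if (!strcmp(buf, "2"))
			puts("list@example.org");
	}
	return 0;
}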
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 146 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 5 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index e5a400c53bf0..1ae8c50f1908 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -33,6 +33,7 @@ my $email_git_max_maintainers = 5; my $email_git_min_percent = 5; my $email_git_since = "1-year-ago"; my $email_hg_since = "-365"; +my $interactive = 0; my $email_remove_duplicates = 1; my $output_multiline = 1; my $output_separator = ", "; @@ -52,6 +53,8 @@ my $help = 0; my $exit = 0; +my %shortlog_buffer; + my @penguin_chief = (); push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org"); #Andrew wants in on most everything - 2009/01/14 @@ -93,7 +96,8 @@ my %VCS_cmds_git = ( "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+) " + "blame_commit_pattern" => "^([0-9a-f]+) ", + "shortlog_cmd" => "git log --no-color --oneline --since=\$email_git_since --author=\"\$email\" -- \$file" ); my %VCS_cmds_hg = ( @@ -107,7 +111,8 @@ my %VCS_cmds_hg = ( "blame_range_cmd" => "", # not supported "blame_file_cmd" => "hg blame -c \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+):" + "blame_commit_pattern" => "^([0-9a-f]+):", + "shortlog_cmd" => "ht log --date=\$email_hg_since" ); my $conf = which_conf(".get_maintainer.conf"); @@ -148,6 +153,7 @@ if (!GetOptions( 'git-min-percent=i' => \$email_git_min_percent, 'git-since=s' => \$email_git_since, 'hg-since=s' => \$email_hg_since, + 'i|interactive!' => \$interactive, 'remove-duplicates!' => \$email_remove_duplicates, 'm!' => \$email_maintainer, 'n!' 
=> \$email_usename, @@ -225,6 +231,8 @@ if ($email_git_all_signature_types) { $signaturePattern = "(.+?)[Bb][Yy]:"; } + + ## Read MAINTAINERS for type/value pairs my @typevalue = (); @@ -450,10 +458,13 @@ foreach my $file (@files) { ($email_git || ($email_git_fallback && !$exact_pattern_match))) { vcs_file_signoffs($file); } - if ($email && $email_git_blame) { vcs_file_blame($file); } + if ($email && $interactive){ + vcs_file_shortlogs($file); + + } } if ($keywords) { @@ -486,9 +497,13 @@ if ($email) { } } + if ($email || $email_list) { my @to = (); if ($email) { + if ($interactive) { + @email_to = @{vcs_interactive_menu(\@email_to)}; + } @to = (@to, @email_to); } if ($email_list) { @@ -501,7 +516,6 @@ if ($scm) { @scm = uniq(@scm); output(@scm); } - if ($status) { @status = uniq(@status); output(@status); @@ -556,6 +570,7 @@ MAINTAINER field selection options: --git-blame => use git blame to find modified commits for patch or file --git-since => git history to use (default: $email_git_since) --hg-since => hg history to use (default: $email_hg_since) + --interactive => display a menu (mostly useful if used with the --git option) --m => include maintainer(s) if any --n => include name 'Full Name ' --l => include list(s) if any @@ -1156,6 +1171,127 @@ sub vcs_exists { return 0; } +sub vcs_interactive_menu { + my $list_ref = shift; + my @list = @$list_ref; + + return if (!vcs_exists()); + + my %selected; + my %shortlog; + my $input; + my $count = 0; + + #select maintainers by default + foreach my $entry (@list){ + my $role = $entry->[1]; + $selected{$count} = ($role =~ /maintainer:|supporter:/); + $count++; + } + + #menu loop + do { + my $count = 0; + foreach my $entry (@list){ + my $email = $entry->[0]; + my $role = $entry->[1]; + if ($selected{$count}){ + print STDERR "* "; + } else { + print STDERR " "; + } + print STDERR "$count: $email,\t\t $role"; + print STDERR "\n"; + if ($shortlog{$count}){ + my $entries_ref = vcs_get_shortlog($email); + foreach my $entry_ref (@{$entries_ref}){ + my $filename = @{$entry_ref}[0]; + my @shortlog = @{@{$entry_ref}[1]}; + print STDERR "\tshortlog for $filename (authored commits: " . @shortlog . 
").\n"; + foreach my $commit (@shortlog){ + print STDERR "\t $commit\n"; + } + print STDERR "\n"; + } + } + $count++; + } + print STDERR "\n"; + print STDERR "Choose whom to cc by entering a commaseperated list of numbers and hitting enter.\n"; + print STDERR "To show a short list of commits, precede the number by a '?',\n"; + print STDERR "A blank line indicates that you are satisfied with your choice.\n"; + $input = ; + chomp($input); + + my @wish = split(/[, ]+/,$input); + foreach my $nr (@wish){ + my $logtoggle = 0; + if ($nr =~ /\?/){ + $nr =~ s/\?//; + $logtoggle = 1; + } + + #skip out of bounds numbers + next unless ($nr <= $count && $nr >= 0); + + if ($logtoggle){ + $shortlog{$nr} = !$shortlog{$nr}; + } else { + $selected{$nr} = !$selected{$nr}; + + #switch shortlog on if an entry get's selected + if ($selected{$nr}){ + $shortlog{$nr}=1; + } + } + }; + } while(length($input) > 0); + + #drop not selected entries + $count = 0; + my @new_emailto; + foreach my $entry (@list){ + if ($selected{$count}){ + push(@new_emailto,$list[$count]); + print STDERR "$count: "; + print STDERR $email_to[$count]->[0]; + print STDERR ",\t\t "; + print STDERR $email_to[$count]->[1]; + print STDERR "\n"; + } + $count++; + } + return \@new_emailto; +} + +sub vcs_get_shortlog { + my $arg = shift; + my ($name, $address) = parse_email($arg); + return $shortlog_buffer{$address}; +} + +sub vcs_file_shortlogs { + my ($file) = @_; + print STDERR "shortlog processing $file:"; + foreach my $entry (@email_to){ + my ($name, $address) = parse_email($entry->[0]); + print STDERR "."; + my $commits_ref = vcs_email_shortlog($address, $file); + push(@{$shortlog_buffer{$address}}, [ $file, $commits_ref ]); + } + print STDERR "\n"; +} + +sub vcs_email_shortlog { + my $email = shift; + my ($file) = @_; + + my $cmd = $VCS_cmds{"shortlog_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables + my @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + return \@lines; +} + sub vcs_assign { my ($role, $divisor, @lines) = @_; @@ -1236,7 +1372,7 @@ sub vcs_file_blame { my @commit_signers = (); my $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd ($commit_count, @commit_signers) = vcs_find_signers($cmd); From 683c6f8fcbcb6de8d07545ba70aff49e50d8bcf2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:55 -0700 Subject: [PATCH 134/176] scripts/get_maintainer.pl: improve --interactive UI o Added searching by git-blame as well as git-history o Added different selection toggles o Added ability to list commits by author or by sign-off-type o Use custom git and hg formats to make searching for subject/author a bit easier. 
o Move inlined section matching and searching git/hg history to new get_maintainer subroutine o Added subroutines save_commits_by_author and save_commits_by_signer o Removed subroutines vcs_get_shortlog and vcs_email_shortlog o Rename camelcase signaturePattern to signature_pattern Update to 0.26 beta3 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 903 ++++++++++++++++++++++++++------------ 1 file changed, 612 insertions(+), 291 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 1ae8c50f1908..f51176039ff5 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.25'; +my $V = '0.26-beta3'; use Getopt::Long qw(:config no_auto_abbrev); @@ -27,6 +27,7 @@ my $email_git_penguin_chiefs = 0; my $email_git = 0; my $email_git_all_signature_types = 0; my $email_git_blame = 0; +my $email_git_blame_signatures = 1; my $email_git_fallback = 1; my $email_git_min_signatures = 1; my $email_git_max_maintainers = 5; @@ -51,9 +52,12 @@ my $pattern_depth = 0; my $version = 0; my $help = 0; +my $vcs_used = 0; + my $exit = 0; -my %shortlog_buffer; +my %commit_author_hash; +my %commit_signer_hash; my @penguin_chief = (); push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org"); @@ -77,7 +81,6 @@ my @signature_tags = (); push(@signature_tags, "Signed-off-by:"); push(@signature_tags, "Reviewed-by:"); push(@signature_tags, "Acked-by:"); -my $signaturePattern = "\(" . join("|", @signature_tags) . "\)"; # rfc822 email address - preloaded methods go here. my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; @@ -90,29 +93,62 @@ my %VCS_cmds; my %VCS_cmds_git = ( "execute_cmd" => \&git_execute_cmd, "available" => '(which("git") ne "") && (-d ".git")', - "find_signers_cmd" => "git log --no-color --since=\$email_git_since -- \$file", - "find_commit_signers_cmd" => "git log --no-color -1 \$commit", - "find_commit_author_cmd" => "git log -1 --format=\"%an <%ae>\" \$commit", + "find_signers_cmd" => + "git log --no-color --since=\$email_git_since " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n' . + '%b%n"' . + " -- \$file", + "find_commit_signers_cmd" => + "git log --no-color " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n' . + '%b%n"' . + " -1 \$commit", + "find_commit_author_cmd" => + "git log --no-color " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n"' . + " -1 \$commit", "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", - "commit_pattern" => "^commit [0-9a-f]{40,40}", + "commit_pattern" => "^GitCommit: ([0-9a-f]{40,40})", "blame_commit_pattern" => "^([0-9a-f]+) ", - "shortlog_cmd" => "git log --no-color --oneline --since=\$email_git_since --author=\"\$email\" -- \$file" + "author_pattern" => "^GitAuthor: (.*)", + "subject_pattern" => "^GitSubject: (.*)", ); my %VCS_cmds_hg = ( "execute_cmd" => \&hg_execute_cmd, "available" => '(which("hg") ne "") && (-d ".hg")', "find_signers_cmd" => - "hg log --date=\$email_hg_since" . - " --template='commit {node}\\n{desc}\\n' -- \$file", - "find_commit_signers_cmd" => "hg log --template='{desc}\\n' -r \$commit", - "find_commit_author_cmd" => "hg log -l 1 --template='{author}\\n' -r \$commit", + "hg log --date=\$email_hg_since " . 
+ "--template='HgCommit: {node}\\n" . + "HgAuthor: {author}\\n" . + "HgSubject: {desc}\\n'" . + " -- \$file", + "find_commit_signers_cmd" => + "hg log " . + "--template='HgSubject: {desc}\\n'" . + " -r \$commit", + "find_commit_author_cmd" => + "hg log " . + "--template='HgCommit: {node}\\n" . + "HgAuthor: {author}\\n" . + "HgSubject: {desc|firstline}\\n'" . + " -r \$commit", "blame_range_cmd" => "", # not supported - "blame_file_cmd" => "hg blame -c \$file", - "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+):", - "shortlog_cmd" => "ht log --date=\$email_hg_since" + "blame_file_cmd" => "hg blame -n \$file", + "commit_pattern" => "^HgCommit: ([0-9a-f]{40,40})", + "blame_commit_pattern" => "^([ 0-9a-f]+):", + "author_pattern" => "^HgAuthor: (.*)", + "subject_pattern" => "^HgSubject: (.*)", ); my $conf = which_conf(".get_maintainer.conf"); @@ -146,6 +182,7 @@ if (!GetOptions( 'git!' => \$email_git, 'git-all-signature-types!' => \$email_git_all_signature_types, 'git-blame!' => \$email_git_blame, + 'git-blame-signatures!' => \$email_git_blame_signatures, 'git-fallback!' => \$email_git_fallback, 'git-chief-penguins!' => \$email_git_penguin_chiefs, 'git-min-signatures=i' => \$email_git_min_signatures, @@ -193,13 +230,9 @@ if (-t STDIN && !@ARGV) { die "$P: missing patchfile or -f file - use --help if necessary\n"; } -if ($output_separator ne ", ") { - $output_multiline = 0; -} - -if ($output_rolestats) { - $output_roles = 1; -} +$output_multiline = 0 if ($output_separator ne ", "); +$output_rolestats = 1 if ($interactive); +$output_roles = 1 if ($output_rolestats); if ($sections) { $email = 0; @@ -227,12 +260,6 @@ if (!top_of_kernel_tree($lk_path)) { . "a linux kernel source tree.\n"; } -if ($email_git_all_signature_types) { - $signaturePattern = "(.+?)[Bb][Yy]:"; -} - - - ## Read MAINTAINERS for type/value pairs my @typevalue = (); @@ -371,151 +398,28 @@ foreach my $file (@ARGV) { @file_emails = uniq(@file_emails); +my %email_hash_name; +my %email_hash_address; my @email_to = (); +my %hash_list_to; my @list_to = (); my @scm = (); my @web = (); my @subsystem = (); my @status = (); +my $signature_pattern; -# Find responsible parties +my @to = get_maintainer(); -foreach my $file (@files) { +@to = merge_email(@to); - my %hash; - my $exact_pattern_match = 0; - my $tvi = find_first_section(); - while ($tvi < @typevalue) { - my $start = find_starting_index($tvi); - my $end = find_ending_index($tvi); - my $exclude = 0; - my $i; - - #Do not match excluded file patterns - - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^(\C):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'X') { - if (file_match_pattern($file, $value)) { - $exclude = 1; - last; - } - } - } - } - - if (!$exclude) { - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^(\C):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'F') { - if (file_match_pattern($file, $value)) { - my $value_pd = ($value =~ tr@/@@); - my $file_pd = ($file =~ tr@/@@); - $value_pd++ if (substr($value,-1,1) ne "/"); - $value_pd = -1 if ($value =~ /^\.\*/); - $exact_pattern_match = 1 if ($value_pd >= $file_pd); - if ($pattern_depth == 0 || - (($file_pd - $value_pd) < $pattern_depth)) { - $hash{$tvi} = $value_pd; - } - } - } - } - } - } - - $tvi = $end + 1; - } - - foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { - add_categories($line); - if ($sections) { - my $i; - my $start = find_starting_index($line); - my $end = 
find_ending_index($line); - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ /^[FX]:/) { ##Restore file patterns - $line =~ s/([^\\])\.([^\*])/$1\?$2/g; - $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? - $line =~ s/\\\./\./g; ##Convert \. to . - $line =~ s/\.\*/\*/g; ##Convert .* to * - } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); - } - print("\n"); - } - } - - if ($email && - ($email_git || ($email_git_fallback && !$exact_pattern_match))) { - vcs_file_signoffs($file); - } - if ($email && $email_git_blame) { - vcs_file_blame($file); - } - if ($email && $interactive){ - vcs_file_shortlogs($file); - - } -} - -if ($keywords) { - @keyword_tvi = sort_and_uniq(@keyword_tvi); - foreach my $line (@keyword_tvi) { - add_categories($line); - } -} - -if ($email) { - foreach my $chief (@penguin_chief) { - if ($chief =~ m/^(.*):(.*)/) { - my $email_address; - - $email_address = format_email($1, $2, $email_usename); - if ($email_git_penguin_chiefs) { - push(@email_to, [$email_address, 'chief penguin']); - } else { - @email_to = grep($_->[0] !~ /${email_address}/, @email_to); - } - } - } - - foreach my $email (@file_emails) { - my ($name, $address) = parse_email($email); - - my $tmp_email = format_email($name, $address, $email_usename); - push_email_address($tmp_email, ''); - add_role($tmp_email, 'in file'); - } -} - - -if ($email || $email_list) { - my @to = (); - if ($email) { - if ($interactive) { - @email_to = @{vcs_interactive_menu(\@email_to)}; - } - @to = (@to, @email_to); - } - if ($email_list) { - @to = (@to, @list_to); - } - output(merge_email(@to)); -} +output(@to) if (@to); if ($scm) { @scm = uniq(@scm); output(@scm); } + if ($status) { @status = uniq(@status); output(@status); @@ -533,6 +437,154 @@ if ($web) { exit($exit); +sub get_maintainer { + %email_hash_name = (); + %email_hash_address = (); + %commit_author_hash = (); + %commit_signer_hash = (); + @email_to = (); + %hash_list_to = (); + @list_to = (); + @scm = (); + @web = (); + @subsystem = (); + @status = (); + + if ($email_git_all_signature_types) { + $signature_pattern = "(.+?)[Bb][Yy]:"; + } else { + $signature_pattern = "\(" . join("|", @signature_tags) . 
"\)"; + } + + # Find responsible parties + + foreach my $file (@files) { + + my %hash; + my $exact_pattern_match = 0; + my $tvi = find_first_section(); + while ($tvi < @typevalue) { + my $start = find_starting_index($tvi); + my $end = find_ending_index($tvi); + my $exclude = 0; + my $i; + + #Do not match excluded file patterns + + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ m/^(\C):\s*(.*)/) { + my $type = $1; + my $value = $2; + if ($type eq 'X') { + if (file_match_pattern($file, $value)) { + $exclude = 1; + last; + } + } + } + } + + if (!$exclude) { + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ m/^(\C):\s*(.*)/) { + my $type = $1; + my $value = $2; + if ($type eq 'F') { + if (file_match_pattern($file, $value)) { + my $value_pd = ($value =~ tr@/@@); + my $file_pd = ($file =~ tr@/@@); + $value_pd++ if (substr($value,-1,1) ne "/"); + $value_pd = -1 if ($value =~ /^\.\*/); + $exact_pattern_match = 1 if ($value_pd >= $file_pd); + if ($pattern_depth == 0 || + (($file_pd - $value_pd) < $pattern_depth)) { + $hash{$tvi} = $value_pd; + } + } + } + } + } + } + $tvi = $end + 1; + } + + foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { + add_categories($line); + if ($sections) { + my $i; + my $start = find_starting_index($line); + my $end = find_ending_index($line); + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ /^[FX]:/) { ##Restore file patterns + $line =~ s/([^\\])\.([^\*])/$1\?$2/g; + $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? + $line =~ s/\\\./\./g; ##Convert \. to . + $line =~ s/\.\*/\*/g; ##Convert .* to * + } + $line =~ s/^([A-Z]):/$1:\t/g; + print("$line\n"); + } + print("\n"); + } + } + + if ($email && ($email_git || + ($email_git_fallback && !$exact_pattern_match))) { + vcs_file_signoffs($file); + } + if ($email && $email_git_blame) { + vcs_file_blame($file); + } + } + + if ($keywords) { + @keyword_tvi = sort_and_uniq(@keyword_tvi); + foreach my $line (@keyword_tvi) { + add_categories($line); + } + } + + if ($email) { + foreach my $chief (@penguin_chief) { + if ($chief =~ m/^(.*):(.*)/) { + my $email_address; + + $email_address = format_email($1, $2, $email_usename); + if ($email_git_penguin_chiefs) { + push(@email_to, [$email_address, 'chief penguin']); + } else { + @email_to = grep($_->[0] !~ /${email_address}/, @email_to); + } + } + } + + foreach my $email (@file_emails) { + my ($name, $address) = parse_email($email); + + my $tmp_email = format_email($name, $address, $email_usename); + push_email_address($tmp_email, ''); + add_role($tmp_email, 'in file'); + } + } + + my @to = (); + if ($email || $email_list) { + if ($email) { + @to = (@to, @email_to); + } + if ($email_list) { + @to = (@to, @list_to); + } + } + + @to = interactive_get_maintainer(\@to) if ($interactive); + + return @to; +} + sub file_match_pattern { my ($file, $pattern) = @_; if (substr($pattern, -1) eq "/") { @@ -561,7 +613,7 @@ MAINTAINER field selection options: --email => print email address(es) if any --git => include recent git \*-by: signers --git-all-signature-types => include signers regardless of signature type - or use only ${signaturePattern} signers (default: $email_git_all_signature_types) + or use only ${signature_pattern} signers (default: $email_git_all_signature_types) --git-fallback => use git when no exact MAINTAINERS pattern (default: $email_git_fallback) --git-chief-penguins => include ${penguin_chiefs} --git-min-signatures => number of signatures required (default: 
$email_git_min_signatures) @@ -847,11 +899,19 @@ sub add_categories { } if ($list_additional =~ m/subscribers-only/) { if ($email_subscriber_list) { - push(@list_to, [$list_address, "subscriber list${list_role}"]); + if (!$hash_list_to{$list_address}) { + $hash_list_to{$list_address} = 1; + push(@list_to, [$list_address, + "subscriber list${list_role}"]); + } } } else { if ($email_list) { - push(@list_to, [$list_address, "open list${list_role}"]); + if (!$hash_list_to{$list_address}) { + $hash_list_to{$list_address} = 1; + push(@list_to, [$list_address, + "open list${list_role}"]); + } } } } elsif ($ptype eq "M") { @@ -882,9 +942,6 @@ sub add_categories { } } -my %email_hash_name; -my %email_hash_address; - sub email_inuse { my ($name, $address) = @_; @@ -1037,10 +1094,31 @@ sub hg_execute_cmd { return @lines; } +sub extract_formatted_signatures { + my (@signature_lines) = @_; + + my @type = @signature_lines; + + s/\s*(.*):.*/$1/ for (@type); + + # cut -f2- -d":" + s/\s*.*:\s*(.+)\s*/$1/ for (@signature_lines); + +## Reformat email addresses (with names) to avoid badly written signatures + + foreach my $signer (@signature_lines) { + my ($name, $address) = parse_email($signer); + $signer = format_email($name, $address, 1); + } + + return (\@type, \@signature_lines); +} + sub vcs_find_signers { my ($cmd) = @_; - my @lines = (); my $commits; + my @lines = (); + my @signatures = (); @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); @@ -1048,24 +1126,20 @@ sub vcs_find_signers { $commits = grep(/$pattern/, @lines); # of commits - @lines = grep(/^[ \t]*${signaturePattern}.*\@.*$/, @lines); + @signatures = grep(/^[ \t]*${signature_pattern}.*\@.*$/, @lines); + + return (0, @signatures) if !@signatures; + + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + if (!$email_git_penguin_chiefs) { - @lines = grep(!/${penguin_chiefs}/i, @lines); + @signatures = grep(!/${penguin_chiefs}/i, @signatures); } - return (0, @lines) if !@lines; + my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); - # cut -f2- -d":" - s/.*:\s*(.+)\s*/$1/ for (@lines); - -## Reformat email addresses (with names) to avoid badly written signatures - - foreach my $line (@lines) { - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, 1); - } - - return ($commits, @lines); + return ($commits, @$signers_ref); } sub vcs_find_author { @@ -1080,14 +1154,20 @@ sub vcs_find_author { return @lines if !@lines; -## Reformat email addresses (with names) to avoid badly written signatures - + my @authors = (); foreach my $line (@lines) { - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, 1); + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $author); + } } - return @lines; + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + + return @authors; } sub vcs_save_commits { @@ -1159,7 +1239,7 @@ sub vcs_exists { %VCS_cmds = %VCS_cmds_git; return 1 if eval $VCS_cmds{"available"}; %VCS_cmds = %VCS_cmds_hg; - return 1 if eval $VCS_cmds{"available"}; + return 2 if eval $VCS_cmds{"available"}; %VCS_cmds = (); if (!$printed_novcs) { warn("$P: No supported VCS found. 
Add --nogit to options?\n"); @@ -1171,125 +1251,309 @@ sub vcs_exists { return 0; } -sub vcs_interactive_menu { - my $list_ref = shift; +sub vcs_is_git { + return $vcs_used == 1; +} + +sub vcs_is_hg { + return $vcs_used == 2; +} + +sub interactive_get_maintainer { + my ($list_ref) = @_; my @list = @$list_ref; - return if (!vcs_exists()); + vcs_exists(); my %selected; - my %shortlog; - my $input; + my %authored; + my %signed; my $count = 0; #select maintainers by default foreach my $entry (@list){ - my $role = $entry->[1]; - $selected{$count} = ($role =~ /maintainer:|supporter:/); - $count++; + my $role = $entry->[1]; + $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/); + $authored{$count} = 0; + $signed{$count} = 0; + $count++; } #menu loop - do { - my $count = 0; - foreach my $entry (@list){ - my $email = $entry->[0]; - my $role = $entry->[1]; - if ($selected{$count}){ - print STDERR "* "; - } else { - print STDERR " "; - } - print STDERR "$count: $email,\t\t $role"; - print STDERR "\n"; - if ($shortlog{$count}){ - my $entries_ref = vcs_get_shortlog($email); - foreach my $entry_ref (@{$entries_ref}){ - my $filename = @{$entry_ref}[0]; - my @shortlog = @{@{$entry_ref}[1]}; - print STDERR "\tshortlog for $filename (authored commits: " . @shortlog . ").\n"; - foreach my $commit (@shortlog){ - print STDERR "\t $commit\n"; + my $done = 0; + my $print_options = 0; + my $redraw = 1; + while (!$done) { + $count = 0; + if ($redraw) { + printf STDERR "\n%1s %2s %-65sauth sign\n", + "*", "#", "email/list and role:stats"; + foreach my $entry (@list) { + my $email = $entry->[0]; + my $role = $entry->[1]; + my $sel = ""; + $sel = "*" if ($selected{$count}); + my $commit_author = $commit_author_hash{$email}; + my $commit_signer = $commit_signer_hash{$email}; + my $authored = 0; + my $signed = 0; + $authored++ for (@{$commit_author}); + $signed++ for (@{$commit_signer}); + printf STDERR "%1s %2d %-65s", $sel, $count + 1, $email; + printf STDERR "%4d %4d", $authored, $signed + if ($authored > 0 || $signed > 0); + printf STDERR "\n %s\n", $role; + if ($authored{$count}) { + my $commit_author = $commit_author_hash{$email}; + foreach my $ref (@{$commit_author}) { + print STDERR " Author: @{$ref}[1]\n"; } - print STDERR "\n"; } + if ($signed{$count}) { + my $commit_signer = $commit_signer_hash{$email}; + foreach my $ref (@{$commit_signer}) { + print STDERR " @{$ref}[2]: @{$ref}[1]\n"; + } + } + + $count++; } - $count++; } - print STDERR "\n"; - print STDERR "Choose whom to cc by entering a commaseperated list of numbers and hitting enter.\n"; - print STDERR "To show a short list of commits, precede the number by a '?',\n"; - print STDERR "A blank line indicates that you are satisfied with your choice.\n"; - $input = ; + my $date_ref = \$email_git_since; + $date_ref = \$email_hg_since if (vcs_is_hg()); + if ($print_options) { + $print_options = 0; + if (vcs_exists()) { + print STDERR +"\nVersion Control options:\n" . +"g use git history [$email_git]\n" . +"gf use git-fallback [$email_git_fallback]\n" . +"b use git blame [$email_git_blame]\n" . +"bs use blame signatures [$email_git_blame_signatures]\n" . +"c# minimum commits [$email_git_min_signatures]\n" . +"%# min percent [$email_git_min_percent]\n" . +"d# history to use [$$date_ref]\n" . +"x# max maintainers [$email_git_max_maintainers]\n" . +"t all signature types [$email_git_all_signature_types]\n"; + } + print STDERR "\nAdditional options:\n" . +"0 toggle all\n" . +"f emails in file [$file_emails]\n" . 
+"k keywords in file [$keywords]\n" . +"r remove duplicates [$email_remove_duplicates]\n" . +"p# pattern match depth [$pattern_depth]\n"; + } + print STDERR +"\n#(toggle), A#(author), S#(signed) *(all), ^(none), O(options), Y(approve): "; + + my $input = ; chomp($input); - my @wish = split(/[, ]+/,$input); - foreach my $nr (@wish){ - my $logtoggle = 0; - if ($nr =~ /\?/){ - $nr =~ s/\?//; - $logtoggle = 1; + $redraw = 1; + my $rerun = 0; + my @wish = split(/[, ]+/, $input); + foreach my $nr (@wish) { + $nr = lc($nr); + my $sel = substr($nr, 0, 1); + my $str = substr($nr, 1); + my $val = 0; + $val = $1 if $str =~ /^(\d+)$/; + + if ($sel eq "y") { + $interactive = 0; + $done = 1; + $output_rolestats = 0; + $output_roles = 0; + last; + } elsif ($nr =~ /^\d+$/ && $nr > 0 && $nr <= $count) { + $selected{$nr - 1} = !$selected{$nr - 1}; + } elsif ($sel eq "*" || $sel eq '^') { + my $toggle = 0; + $toggle = 1 if ($sel eq '*'); + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = $toggle; } - - #skip out of bounds numbers - next unless ($nr <= $count && $nr >= 0); - - if ($logtoggle){ - $shortlog{$nr} = !$shortlog{$nr}; + } elsif ($sel eq "0") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i}; + } + } elsif ($sel eq "a") { + if ($val > 0 && $val <= $count) { + $authored{$val - 1} = !$authored{$val - 1}; + } elsif ($str eq '*' || $str eq '^') { + my $toggle = 0; + $toggle = 1 if ($str eq '*'); + for (my $i = 0; $i < $count; $i++) { + $authored{$i} = $toggle; + } + } + } elsif ($sel eq "s") { + if ($val > 0 && $val <= $count) { + $signed{$val - 1} = !$signed{$val - 1}; + } elsif ($str eq '*' || $str eq '^') { + my $toggle = 0; + $toggle = 1 if ($str eq '*'); + for (my $i = 0; $i < $count; $i++) { + $signed{$i} = $toggle; + } + } + } elsif ($sel eq "o") { + $print_options = 1; + $redraw = 1; + } elsif ($sel eq "g") { + if ($str eq "f") { + bool_invert(\$email_git_fallback); } else { - $selected{$nr} = !$selected{$nr}; - - #switch shortlog on if an entry get's selected - if ($selected{$nr}){ - $shortlog{$nr}=1; - } + bool_invert(\$email_git); } - }; - } while(length($input) > 0); + $rerun = 1; + } elsif ($sel eq "b") { + if ($str eq "s") { + bool_invert(\$email_git_blame_signatures); + } else { + bool_invert(\$email_git_blame); + } + $rerun = 1; + } elsif ($sel eq "c") { + if ($val > 0) { + $email_git_min_signatures = $val; + $rerun = 1; + } + } elsif ($sel eq "x") { + if ($val > 0) { + $email_git_max_maintainers = $val; + $rerun = 1; + } + } elsif ($sel eq "%") { + if ($str ne "" && $val >= 0) { + $email_git_min_percent = $val; + $rerun = 1; + } + } elsif ($sel eq "d") { + if (vcs_is_git()) { + $email_git_since = $str; + } elsif (vcs_is_hg()) { + $email_hg_since = $str; + } + $rerun = 1; + } elsif ($sel eq "t") { + bool_invert(\$email_git_all_signature_types); + $rerun = 1; + } elsif ($sel eq "f") { + bool_invert(\$file_emails); + $rerun = 1; + } elsif ($sel eq "r") { + bool_invert(\$email_remove_duplicates); + $rerun = 1; + } elsif ($sel eq "k") { + bool_invert(\$keywords); + $rerun = 1; + } elsif ($sel eq "p") { + if ($str ne "" && $val >= 0) { + $pattern_depth = $val; + $rerun = 1; + } + } else { + print STDERR "invalid option: '$nr'\n"; + $redraw = 0; + } + } + if ($rerun) { + print STDERR "git-blame can be very slow, please have patience..." 
+ if ($email_git_blame); + goto &get_maintainer; + } + } #drop not selected entries $count = 0; - my @new_emailto; - foreach my $entry (@list){ - if ($selected{$count}){ - push(@new_emailto,$list[$count]); - print STDERR "$count: "; - print STDERR $email_to[$count]->[0]; - print STDERR ",\t\t "; - print STDERR $email_to[$count]->[1]; - print STDERR "\n"; + my @new_emailto = (); + foreach my $entry (@list) { + if ($selected{$count}) { + push(@new_emailto, $list[$count]); } $count++; } - return \@new_emailto; + return @new_emailto; } -sub vcs_get_shortlog { - my $arg = shift; - my ($name, $address) = parse_email($arg); - return $shortlog_buffer{$address}; -} +sub bool_invert { + my ($bool_ref) = @_; -sub vcs_file_shortlogs { - my ($file) = @_; - print STDERR "shortlog processing $file:"; - foreach my $entry (@email_to){ - my ($name, $address) = parse_email($entry->[0]); - print STDERR "."; - my $commits_ref = vcs_email_shortlog($address, $file); - push(@{$shortlog_buffer{$address}}, [ $file, $commits_ref ]); + if ($$bool_ref) { + $$bool_ref = 0; + } else { + $$bool_ref = 1; } - print STDERR "\n"; } -sub vcs_email_shortlog { - my $email = shift; - my ($file) = @_; +sub save_commits_by_author { + my (@lines) = @_; - my $cmd = $VCS_cmds{"shortlog_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables - my @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - return \@lines; + my @authors = (); + my @commits = (); + my @subjects = (); + + foreach my $line (@lines) { + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $author); + } + push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); + push(@subjects, $1) if ($line =~ m/$VCS_cmds{"subject_pattern"}/); + } + + for (my $i = 0; $i < @authors; $i++) { + my $exists = 0; + foreach my $ref(@{$commit_author_hash{$authors[$i]}}) { + if (@{$ref}[0] eq $commits[$i] && + @{$ref}[1] eq $subjects[$i]) { + $exists = 1; + last; + } + } + if (!$exists) { + push(@{$commit_author_hash{$authors[$i]}}, + [ ($commits[$i], $subjects[$i]) ]); + } + } +} + +sub save_commits_by_signer { + my (@lines) = @_; + + my $commit = ""; + my $subject = ""; + + foreach my $line (@lines) { + $commit = $1 if ($line =~ m/$VCS_cmds{"commit_pattern"}/); + $subject = $1 if ($line =~ m/$VCS_cmds{"subject_pattern"}/); + if ($line =~ /^[ \t]*${signature_pattern}.*\@.*$/) { + my @signatures = ($line); + my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); + my @types = @$types_ref; + my @signers = @$signers_ref; + + my $type = $types[0]; + my $signer = $signers[0]; + + my $exists = 0; + foreach my $ref(@{$commit_signer_hash{$signer}}) { + if (@{$ref}[0] eq $commit && + @{$ref}[1] eq $subject && + @{$ref}[2] eq $type) { + $exists = 1; + last; + } + } + if (!$exists) { + push(@{$commit_signer_hash{$signer}}, + [ ($commit, $subject, $type) ]); + } + } + } } sub vcs_assign { @@ -1342,7 +1606,8 @@ sub vcs_file_signoffs { my @signers = (); my $commits; - return if (!vcs_exists()); + $vcs_used = vcs_exists(); + return if (!$vcs_used); my $cmd = $VCS_cmds{"find_signers_cmd"}; $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd @@ -1360,37 +1625,93 @@ sub vcs_file_blame { my $total_commits; my $total_lines; - return if (!vcs_exists()); + $vcs_used = vcs_exists(); + return if (!$vcs_used); @all_commits = vcs_blame($file); @commits = uniq(@all_commits); $total_commits = @commits; $total_lines = @all_commits; - foreach my $commit (@commits) { - my 
$commit_count; - my @commit_signers = (); + if ($email_git_blame_signatures) { + if (vcs_is_hg()) { + my $commit_count; + my @commit_signers = (); + my $commit = join(" -r ", @commits); + my $cmd; - my $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + $cmd = $VCS_cmds{"find_commit_signers_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd - ($commit_count, @commit_signers) = vcs_find_signers($cmd); + ($commit_count, @commit_signers) = vcs_find_signers($cmd); - push(@signers, @commit_signers); + push(@signers, @commit_signers); + } else { + foreach my $commit (@commits) { + my $commit_count; + my @commit_signers = (); + my $cmd; + + $cmd = $VCS_cmds{"find_commit_signers_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + + ($commit_count, @commit_signers) = vcs_find_signers($cmd); + + push(@signers, @commit_signers); + } + } } if ($from_filename) { if ($output_rolestats) { my @blame_signers; - foreach my $commit (@commits) { - my $i; - my $cmd = $VCS_cmds{"find_commit_author_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - my @author = vcs_find_author($cmd); - next if !@author; - my $count = grep(/$commit/, @all_commits); - for ($i = 0; $i < $count ; $i++) { - push(@blame_signers, $author[0]); + if (vcs_is_hg()) {{ # Double brace for last exit + my $commit_count; + my @commit_signers = (); + @commits = uniq(@commits); + @commits = sort(@commits); + my $commit = join(" -r ", @commits); + my $cmd; + + $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + + my @lines = (); + + @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + + last if !@lines; + + my @authors = (); + foreach my $line (@lines) { + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $1); + } + } + + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + + push(@signers, @authors); + }} + else { + foreach my $commit (@commits) { + my $i; + my $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + my @author = vcs_find_author($cmd); + next if !@author; + my $count = grep(/$commit/, @all_commits); + for ($i = 0; $i < $count ; $i++) { + push(@blame_signers, $author[0]); + } } } if (@blame_signers) { From 6ef1c52e122b675acc88a8b016d6477f67988b91 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:56 -0700 Subject: [PATCH 135/176] scripts/get_maintainer.pl: use case insensitive name de-duplication Case insensitive name and email address matching can help reduce duplication when authors don't always use the exact same signature. 
o Add a --interactive per-file exact_match hash so git history can be checked on per-file only when there is no direct maintainer o Make @interactive_to list global so save_commits_by_ can check email names & addresses against this list for duplication o Don't allow --interactive and --sections o rename subroutine get_maintainer to get_maintainers o Added help text option to --interactive menu prompt Update version to 0.26-beta4 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 135 ++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 34 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index f51176039ff5..61d3bb51bddf 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta3'; +my $V = '0.26-beta4'; use Getopt::Long qw(:config no_auto_abbrev); @@ -242,6 +242,7 @@ if ($sections) { $subsystem = 0; $web = 0; $keywords = 0; + $interactive = 0; } else { my $selections = $email + $scm + $status + $subsystem + $web; if ($selections == 0) { @@ -407,13 +408,15 @@ my @scm = (); my @web = (); my @subsystem = (); my @status = (); +my @interactive_to = (); my $signature_pattern; -my @to = get_maintainer(); +my @maintainers = get_maintainers(); -@to = merge_email(@to); - -output(@to) if (@to); +if (@maintainers) { + @maintainers = merge_email(@maintainers); + output(@maintainers); +} if ($scm) { @scm = uniq(@scm); @@ -437,7 +440,7 @@ if ($web) { exit($exit); -sub get_maintainer { +sub get_maintainers { %email_hash_name = (); %email_hash_address = (); %commit_author_hash = (); @@ -449,7 +452,7 @@ sub get_maintainer { @web = (); @subsystem = (); @status = (); - + @interactive_to = (); if ($email_git_all_signature_types) { $signature_pattern = "(.+?)[Bb][Yy]:"; } else { @@ -458,10 +461,11 @@ sub get_maintainer { # Find responsible parties + my %exact_pattern_match_hash; + foreach my $file (@files) { my %hash; - my $exact_pattern_match = 0; my $tvi = find_first_section(); while ($tvi < @typevalue) { my $start = find_starting_index($tvi); @@ -497,7 +501,9 @@ sub get_maintainer { my $file_pd = ($file =~ tr@/@@); $value_pd++ if (substr($value,-1,1) ne "/"); $value_pd = -1 if ($value =~ /^\.\*/); - $exact_pattern_match = 1 if ($value_pd >= $file_pd); + if ($value_pd >= $file_pd) { + $exact_pattern_match_hash{$file} = 1; + } if ($pattern_depth == 0 || (($file_pd - $value_pd) < $pattern_depth)) { $hash{$tvi} = $value_pd; @@ -530,14 +536,6 @@ sub get_maintainer { print("\n"); } } - - if ($email && ($email_git || - ($email_git_fallback && !$exact_pattern_match))) { - vcs_file_signoffs($file); - } - if ($email && $email_git_blame) { - vcs_file_blame($file); - } } if ($keywords) { @@ -547,6 +545,19 @@ sub get_maintainer { } } + @interactive_to = (@email_to, @list_to); + + foreach my $file (@files) { + if ($email && + ($email_git || ($email_git_fallback && + !$exact_pattern_match_hash{$file}))) { + vcs_file_signoffs($file); + } + if ($email && $email_git_blame) { + vcs_file_blame($file); + } + } + if ($email) { foreach my $chief (@penguin_chief) { if ($chief =~ m/^(.*):(.*)/) { @@ -580,7 +591,10 @@ sub get_maintainer { } } - @to = interactive_get_maintainer(\@to) if ($interactive); + if ($interactive) { + @interactive_to = @to; + @to = interactive_get_maintainers(\@interactive_to); + } return @to; } @@ -899,16 +913,16 @@ sub add_categories { } if ($list_additional =~ m/subscribers-only/) { if 
($email_subscriber_list) { - if (!$hash_list_to{$list_address}) { - $hash_list_to{$list_address} = 1; + if (!$hash_list_to{lc($list_address)}) { + $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, "subscriber list${list_role}"]); } } } else { if ($email_list) { - if (!$hash_list_to{$list_address}) { - $hash_list_to{$list_address} = 1; + if (!$hash_list_to{lc($list_address)}) { + $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, "open list${list_role}"]); } @@ -946,8 +960,8 @@ sub email_inuse { my ($name, $address) = @_; return 1 if (($name eq "") && ($address eq "")); - return 1 if (($name ne "") && exists($email_hash_name{$name})); - return 1 if (($address ne "") && exists($email_hash_address{$address})); + return 1 if (($name ne "") && exists($email_hash_name{lc($name)})); + return 1 if (($address ne "") && exists($email_hash_address{lc($address)})); return 0; } @@ -965,8 +979,8 @@ sub push_email_address { push(@email_to, [format_email($name, $address, $email_usename), $role]); } elsif (!email_inuse($name, $address)) { push(@email_to, [format_email($name, $address, $email_usename), $role]); - $email_hash_name{$name}++; - $email_hash_address{$address}++; + $email_hash_name{lc($name)}++; + $email_hash_address{lc($address)}++; } return 1; @@ -1259,7 +1273,7 @@ sub vcs_is_hg { return $vcs_used == 2; } -sub interactive_get_maintainer { +sub interactive_get_maintainers { my ($list_ref) = @_; my @list = @$list_ref; @@ -1269,11 +1283,12 @@ sub interactive_get_maintainer { my %authored; my %signed; my $count = 0; - + my $maintained = 0; #select maintainers by default - foreach my $entry (@list){ + foreach my $entry (@list) { my $role = $entry->[1]; - $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/); + $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/i); + $maintained = 1 if ($role =~ /^(maintainer|supporter)/i); $authored{$count} = 0; $signed{$count} = 0; $count++; @@ -1286,8 +1301,14 @@ sub interactive_get_maintainer { while (!$done) { $count = 0; if ($redraw) { - printf STDERR "\n%1s %2s %-65sauth sign\n", - "*", "#", "email/list and role:stats"; + printf STDERR "\n%1s %2s %-65s", + "*", "#", "email/list and role:stats"; + if ($email_git || + ($email_git_fallback && !$maintained) || + $email_git_blame) { + print STDERR "auth sign"; + } + print STDERR "\n"; foreach my $entry (@list) { my $email = $entry->[0]; my $role = $entry->[1]; @@ -1453,6 +1474,27 @@ sub interactive_get_maintainer { $pattern_depth = $val; $rerun = 1; } + } elsif ($sel eq "h" || $sel eq "?") { + print STDERR <[0]); + if ($email_remove_duplicates && + ((lc($name) eq lc($to_name)) || + (lc($address) eq lc($to_address)))) { + $author = $to->[0]; + $matched = 1; + last; + } + } + $author = format_email($name, $address, 1) if (!$matched); push(@authors, $author); } push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); @@ -1539,6 +1592,20 @@ sub save_commits_by_signer { my $type = $types[0]; my $signer = $signers[0]; + my $matched = 0; + my ($name, $address) = parse_email($signer); + foreach my $to (@interactive_to) { + my ($to_name, $to_address) = parse_email($to->[0]); + if ($email_remove_duplicates && + ((lc($name) eq lc($to_name)) || + (lc($address) eq lc($to_address)))) { + $signer = $to->[0]; + $matched = 1; + last; + } + $signer = format_email($name, $address, 1) if (!$matched); + } + my $exists = 0; foreach my $ref(@{$commit_signer_hash{$signer}}) { if (@{$ref}[0] eq $commit && From 7fa8ff2e0c0f326cdaaa4ae7d00f5d021e43ffa2 Mon Sep 17 
00:00:00 2001 From: Florian Mickler Date: Tue, 26 Oct 2010 14:22:56 -0700 Subject: [PATCH 136/176] scripts/get_maintainer.pl: fix mailmap handling Implement it, like it is described in git-shortlog. Signed-off-by: Florian Mickler Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 147 ++++++++++++++++++++++++++++---------- 1 file changed, 109 insertions(+), 38 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 61d3bb51bddf..faeace4e1fd0 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -295,31 +295,76 @@ while (<$maint>) { } close($maint); -my %mailmap; -if ($email_remove_duplicates) { - open(my $mailmap, '<', "${lk_path}.mailmap") +# +# Read mail address map +# + +my $mailmap = read_mailmap(); + +sub read_mailmap { + my $mailmap = { + names => {}, + addresses => {} + }; + + if (!$email_remove_duplicates) { + return $mailmap; + } + + open(my $mailmap_file, '<', "${lk_path}.mailmap") or warn "$P: Can't open .mailmap: $!\n"; - while (<$mailmap>) { - my $line = $_; - next if ($line =~ m/^\s*#/); - next if ($line =~ m/^\s*$/); + while (<$mailmap_file>) { + s/#.*$//; #strip comments + s/^\s+|\s+$//g; #trim - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, $email_usename); + next if (/^\s*$/); #skip empty lines + #entries have one of the following formats: + # name1 + # + # name1 + # name1 name2 + # (see man git-shortlog) + if (/^(.+)<(.+)>$/) { + my $real_name = $1; + my $address = $2; - next if ($line =~ m/^\s*$/); + $real_name =~ s/\s+$//; + $mailmap->{names}->{$address} = $real_name; - if (exists($mailmap{$name})) { - my $obj = $mailmap{$name}; - push(@$obj, $address); - } else { - my @arr = ($address); - $mailmap{$name} = \@arr; + } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { + my $real_address = $1; + my $wrong_address = $2; + + $mailmap->{addresses}->{$wrong_address} = $real_address; + + } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { + my $real_name= $1; + my $real_address = $2; + my $wrong_address = $3; + + $real_name =~ s/\s+$//; + + $mailmap->{names}->{$wrong_address} = $real_name; + $mailmap->{addresses}->{$wrong_address} = $real_address; + + } elsif (/^(.+)<([^\s]+)>\s*([^\s].*)<([^\s]+)>$/) { + my $real_name = $1; + my $real_address = $2; + my $wrong_name = $3; + my $wrong_address = $4; + + $real_name =~ s/\s+$//; + $wrong_name =~ s/\s+$//; + + $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; + $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; } } - close($mailmap); + close($mailmap_file); + + return $mailmap; } ## use the filenames on the command line or find the filenames in the patchfiles @@ -1061,30 +1106,58 @@ sub which_conf { return ""; } -sub mailmap { - my (@lines) = @_; - my %hash; +sub mailmap_email { + my $line = shift; - foreach my $line (@lines) { my ($name, $address) = parse_email($line); - if (!exists($hash{$name})) { - $hash{$name} = $address; - } elsif ($address ne $hash{$name}) { - $address = $hash{$name}; - $line = format_email($name, $address, $email_usename); - } - if (exists($mailmap{$name})) { - my $obj = $mailmap{$name}; - foreach my $map_address (@$obj) { - if (($map_address eq $address) && - ($map_address ne $hash{$name})) { - $line = format_email($name, $hash{$name}, $email_usename); + my $email = format_email($name, $address, 1); + my $real_name = $name; + my $real_address = $address; + + if (exists $mailmap->{names}->{$email} || exists 
$mailmap->{addresses}->{$email}) { + if (exists $mailmap->{names}->{$email}) { + $real_name = $mailmap->{names}->{$email}; + } + if (exists $mailmap->{addresses}->{$email}) { + $real_address = $mailmap->{addresses}->{$email}; + } + } else { + if (exists $mailmap->{names}->{$address}) { + $real_name = $mailmap->{names}->{$address}; + } + if (exists $mailmap->{addresses}->{$address}) { + $real_address = $mailmap->{addresses}->{$address}; } - } } + return format_email($real_name, $real_address, 1); +} + +sub mailmap { + my (@addresses) = @_; + + my @ret = (); + foreach my $line (@addresses) { + push(@ret, mailmap_email($line), 1); } - return @lines; + merge_by_realname(@ret) if $email_remove_duplicates; + + return @ret; +} + +sub merge_by_realname { + my %address_map; + my (@emails) = @_; + foreach my $email (@emails) { + my ($name, $address) = parse_email($email); + if (!exists $address_map{$name}) { + $address_map{$name} = $address; + } else { + $address = $address_map{$name}; + $email = format_email($name,$address,1); + } + } + } sub git_execute_cmd { @@ -1636,9 +1709,7 @@ sub vcs_assign { $divisor = 1; } - if ($email_remove_duplicates) { - @lines = mailmap(@lines); - } + @lines = mailmap(@lines); return if (@lines <= 0); From 47abc7225761faf28be52b3ac4dc26ffeac7b750 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:57 -0700 Subject: [PATCH 137/176] scripts/get_maintainer.pl: correct indentation in a few places And a miscellaneous conversion of You to you in a help message Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 154 +++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index faeace4e1fd0..0abfdbc5cdff 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -306,7 +306,7 @@ sub read_mailmap { my $mailmap = { names => {}, addresses => {} - }; + }; if (!$email_remove_duplicates) { return $mailmap; @@ -327,39 +327,39 @@ sub read_mailmap { # name1 name2 # (see man git-shortlog) if (/^(.+)<(.+)>$/) { - my $real_name = $1; - my $address = $2; + my $real_name = $1; + my $address = $2; - $real_name =~ s/\s+$//; - $mailmap->{names}->{$address} = $real_name; + $real_name =~ s/\s+$//; + $mailmap->{names}->{$address} = $real_name; } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_address = $1; - my $wrong_address = $2; + my $real_address = $1; + my $wrong_address = $2; - $mailmap->{addresses}->{$wrong_address} = $real_address; + $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_name= $1; - my $real_address = $2; - my $wrong_address = $3; + my $real_name= $1; + my $real_address = $2; + my $wrong_address = $3; - $real_name =~ s/\s+$//; + $real_name =~ s/\s+$//; - $mailmap->{names}->{$wrong_address} = $real_name; - $mailmap->{addresses}->{$wrong_address} = $real_address; + $mailmap->{names}->{$wrong_address} = $real_name; + $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*([^\s].*)<([^\s]+)>$/) { - my $real_name = $1; - my $real_address = $2; - my $wrong_name = $3; - my $wrong_address = $4; + my $real_name = $1; + my $real_address = $2; + my $wrong_name = $3; + my $wrong_address = $4; - $real_name =~ s/\s+$//; - $wrong_name =~ s/\s+$//; + $real_name =~ s/\s+$//; + $wrong_name =~ s/\s+$//; - $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; - 
$mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; + $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; + $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; } } close($mailmap_file); @@ -743,30 +743,30 @@ EOT } sub top_of_kernel_tree { - my ($lk_path) = @_; + my ($lk_path) = @_; - if ($lk_path ne "" && substr($lk_path,length($lk_path)-1,1) ne "/") { - $lk_path .= "/"; - } - if ( (-f "${lk_path}COPYING") - && (-f "${lk_path}CREDITS") - && (-f "${lk_path}Kbuild") - && (-f "${lk_path}MAINTAINERS") - && (-f "${lk_path}Makefile") - && (-f "${lk_path}README") - && (-d "${lk_path}Documentation") - && (-d "${lk_path}arch") - && (-d "${lk_path}include") - && (-d "${lk_path}drivers") - && (-d "${lk_path}fs") - && (-d "${lk_path}init") - && (-d "${lk_path}ipc") - && (-d "${lk_path}kernel") - && (-d "${lk_path}lib") - && (-d "${lk_path}scripts")) { - return 1; - } - return 0; + if ($lk_path ne "" && substr($lk_path,length($lk_path)-1,1) ne "/") { + $lk_path .= "/"; + } + if ( (-f "${lk_path}COPYING") + && (-f "${lk_path}CREDITS") + && (-f "${lk_path}Kbuild") + && (-f "${lk_path}MAINTAINERS") + && (-f "${lk_path}Makefile") + && (-f "${lk_path}README") + && (-d "${lk_path}Documentation") + && (-d "${lk_path}arch") + && (-d "${lk_path}include") + && (-d "${lk_path}drivers") + && (-d "${lk_path}fs") + && (-d "${lk_path}init") + && (-d "${lk_path}ipc") + && (-d "${lk_path}kernel") + && (-d "${lk_path}lib") + && (-d "${lk_path}scripts")) { + return 1; + } + return 0; } sub parse_email { @@ -1107,29 +1107,30 @@ sub which_conf { } sub mailmap_email { - my $line = shift; + my $line = shift; - my ($name, $address) = parse_email($line); - my $email = format_email($name, $address, 1); - my $real_name = $name; - my $real_address = $address; + my ($name, $address) = parse_email($line); + my $email = format_email($name, $address, 1); + my $real_name = $name; + my $real_address = $address; - if (exists $mailmap->{names}->{$email} || exists $mailmap->{addresses}->{$email}) { - if (exists $mailmap->{names}->{$email}) { - $real_name = $mailmap->{names}->{$email}; - } - if (exists $mailmap->{addresses}->{$email}) { - $real_address = $mailmap->{addresses}->{$email}; - } - } else { - if (exists $mailmap->{names}->{$address}) { - $real_name = $mailmap->{names}->{$address}; - } - if (exists $mailmap->{addresses}->{$address}) { - $real_address = $mailmap->{addresses}->{$address}; - } + if (exists $mailmap->{names}->{$email} || + exists $mailmap->{addresses}->{$email}) { + if (exists $mailmap->{names}->{$email}) { + $real_name = $mailmap->{names}->{$email}; } - return format_email($real_name, $real_address, 1); + if (exists $mailmap->{addresses}->{$email}) { + $real_address = $mailmap->{addresses}->{$email}; + } + } else { + if (exists $mailmap->{names}->{$address}) { + $real_name = $mailmap->{names}->{$address}; + } + if (exists $mailmap->{addresses}->{$address}) { + $real_address = $mailmap->{addresses}->{$address}; + } + } + return format_email($real_name, $real_address, 1); } sub mailmap { @@ -1146,18 +1147,17 @@ sub mailmap { } sub merge_by_realname { - my %address_map; - my (@emails) = @_; - foreach my $email (@emails) { - my ($name, $address) = parse_email($email); - if (!exists $address_map{$name}) { - $address_map{$name} = $address; - } else { - $address = $address_map{$name}; - $email = format_email($name,$address,1); - } + my %address_map; + my (@emails) = @_; + foreach my $email (@emails) { + my ($name, $address) = 
parse_email($email); + if (!exists $address_map{$name}) { + $address_map{$name} = $address; + } else { + $address = $address_map{$name}; + $email = format_email($name,$address,1); } - + } } sub git_execute_cmd { @@ -1555,7 +1555,7 @@ commit signers and mailing lists that could be CC'd on a patch. Any *'d entry is selected. -If you have git or hg installed, You can choose to summarize the commit +If you have git or hg installed, you can choose to summarize the commit history of files in the patch. Also, each line of the current file can be matched to its commit author and that commits signers with blame. From b9e2331dd1e0e04f7f2a6f8aa0c05bac2a7f0d7b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:58 -0700 Subject: [PATCH 138/176] scripts/get_maintainer.pl: use mailmap in name deduplication and other updates Use Florian Mickler's mailmap routine to reduce name duplication. o Add subroutine deduplicate_email to centralize code o Add hashes for deduplicate_(name|address)_hash o Remove now unused @interactive_to o Whitespace neatening o Add command line --help text o Add --mailmap command line option control o Interactive changes: - Add toggles for maintainer, git and list selections - Default selection is all - Add mailmap control Update to 0.26-beta5 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 234 ++++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 85 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 0abfdbc5cdff..e822518bc610 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta4'; +my $V = '0.26-beta5'; use Getopt::Long qw(:config no_auto_abbrev); @@ -36,6 +36,7 @@ my $email_git_since = "1-year-ago"; my $email_hg_since = "-365"; my $interactive = 0; my $email_remove_duplicates = 1; +my $email_use_mailmap = 1; my $output_multiline = 1; my $output_separator = ", "; my $output_roles = 0; @@ -192,6 +193,7 @@ if (!GetOptions( 'hg-since=s' => \$email_hg_since, 'i|interactive!' => \$interactive, 'remove-duplicates!' => \$email_remove_duplicates, + 'mailmap!' => \$email_use_mailmap, 'm!' => \$email_maintainer, 'n!' => \$email_usename, 'l!' 
=> \$email_list, @@ -300,17 +302,17 @@ close($maint); # Read mail address map # -my $mailmap = read_mailmap(); +my $mailmap; + +read_mailmap(); sub read_mailmap { - my $mailmap = { + $mailmap = { names => {}, addresses => {} }; - if (!$email_remove_duplicates) { - return $mailmap; - } + return if (!$email_use_mailmap || !(-f "${lk_path}.mailmap")); open(my $mailmap_file, '<', "${lk_path}.mailmap") or warn "$P: Can't open .mailmap: $!\n"; @@ -331,6 +333,7 @@ sub read_mailmap { my $address = $2; $real_name =~ s/\s+$//; + ($real_name, $address) = parse_email("$real_name <$address>"); $mailmap->{names}->{$address} = $real_name; } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { @@ -340,12 +343,13 @@ sub read_mailmap { $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_name= $1; + my $real_name = $1; my $real_address = $2; my $wrong_address = $3; $real_name =~ s/\s+$//; - + ($real_name, $real_address) = + parse_email("$real_name <$real_address>"); $mailmap->{names}->{$wrong_address} = $real_name; $mailmap->{addresses}->{$wrong_address} = $real_address; @@ -356,15 +360,19 @@ sub read_mailmap { my $wrong_address = $4; $real_name =~ s/\s+$//; - $wrong_name =~ s/\s+$//; + ($real_name, $real_address) = + parse_email("$real_name <$real_address>"); - $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; - $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; + $wrong_name =~ s/\s+$//; + ($wrong_name, $wrong_address) = + parse_email("$wrong_name <$wrong_address>"); + + my $wrong_email = format_email($wrong_name, $wrong_address, 1); + $mailmap->{names}->{$wrong_email} = $real_name; + $mailmap->{addresses}->{$wrong_email} = $real_address; } } close($mailmap_file); - - return $mailmap; } ## use the filenames on the command line or find the filenames in the patchfiles @@ -453,7 +461,8 @@ my @scm = (); my @web = (); my @subsystem = (); my @status = (); -my @interactive_to = (); +my %deduplicate_name_hash = (); +my %deduplicate_address_hash = (); my $signature_pattern; my @maintainers = get_maintainers(); @@ -497,7 +506,8 @@ sub get_maintainers { @web = (); @subsystem = (); @status = (); - @interactive_to = (); + %deduplicate_name_hash = (); + %deduplicate_address_hash = (); if ($email_git_all_signature_types) { $signature_pattern = "(.+?)[Bb][Yy]:"; } else { @@ -506,7 +516,7 @@ sub get_maintainers { # Find responsible parties - my %exact_pattern_match_hash; + my %exact_pattern_match_hash = (); foreach my $file (@files) { @@ -590,7 +600,9 @@ sub get_maintainers { } } - @interactive_to = (@email_to, @list_to); + foreach my $email (@email_to, @list_to) { + $email->[0] = deduplicate_email($email->[0]); + } foreach my $file (@files) { if ($email && @@ -637,8 +649,7 @@ sub get_maintainers { } if ($interactive) { - @interactive_to = @to; - @to = interactive_get_maintainers(\@interactive_to); + @to = interactive_get_maintainers(\@to); } return @to; @@ -702,8 +713,9 @@ Output type options: Other options: --pattern-depth => Number of pattern directory traversals (default: 0 (all)) - --keywords => scan patch for keywords (default: 1 (on)) - --sections => print the entire subsystem sections with pattern matches + --keywords => scan patch for keywords (default: $keywords) + --sections => print all of the subsystem sections with pattern matches + --mailmap => use .mailmap file (default: $email_use_mailmap) --version => show version --help => show this help information @@ -1107,7 +1119,7 @@ sub which_conf { } sub 
mailmap_email { - my $line = shift; + my ($line) = @_; my ($name, $address) = parse_email($line); my $email = format_email($name, $address, 1); @@ -1136,26 +1148,25 @@ sub mailmap_email { sub mailmap { my (@addresses) = @_; - my @ret = (); + my @mapped_emails = (); foreach my $line (@addresses) { - push(@ret, mailmap_email($line), 1); + push(@mapped_emails, mailmap_email($line)); } - - merge_by_realname(@ret) if $email_remove_duplicates; - - return @ret; + merge_by_realname(@mapped_emails) if ($email_use_mailmap); + return @mapped_emails; } sub merge_by_realname { my %address_map; my (@emails) = @_; + foreach my $email (@emails) { my ($name, $address) = parse_email($email); - if (!exists $address_map{$name}) { - $address_map{$name} = $address; - } else { + if (exists $address_map{$name}) { $address = $address_map{$name}; - $email = format_email($name,$address,1); + $email = format_email($name, $address, 1); + } else { + $address_map{$name} = $address; } } } @@ -1194,8 +1205,7 @@ sub extract_formatted_signatures { ## Reformat email addresses (with names) to avoid badly written signatures foreach my $signer (@signature_lines) { - my ($name, $address) = parse_email($signer); - $signer = format_email($name, $address, 1); + $signer = deduplicate_email($signer); } return (\@type, \@signature_lines); @@ -1339,6 +1349,7 @@ sub vcs_exists { } sub vcs_is_git { + vcs_exists(); return $vcs_used == 1; } @@ -1357,11 +1368,9 @@ sub interactive_get_maintainers { my %signed; my $count = 0; my $maintained = 0; - #select maintainers by default foreach my $entry (@list) { - my $role = $entry->[1]; - $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/i); - $maintained = 1 if ($role =~ /^(maintainer|supporter)/i); + $maintained = 1 if ($entry->[1] =~ /^(maintainer|supporter)/i); + $selected{$count} = 1; $authored{$count} = 0; $signed{$count} = 0; $count++; @@ -1418,24 +1427,34 @@ sub interactive_get_maintainers { if ($print_options) { $print_options = 0; if (vcs_exists()) { - print STDERR -"\nVersion Control options:\n" . -"g use git history [$email_git]\n" . -"gf use git-fallback [$email_git_fallback]\n" . -"b use git blame [$email_git_blame]\n" . -"bs use blame signatures [$email_git_blame_signatures]\n" . -"c# minimum commits [$email_git_min_signatures]\n" . -"%# min percent [$email_git_min_percent]\n" . -"d# history to use [$$date_ref]\n" . -"x# max maintainers [$email_git_max_maintainers]\n" . 
-"t all signature types [$email_git_all_signature_types]\n"; + print STDERR <[1] =~ /^(maintainer|supporter)/i); + } + } elsif (lc($str) eq "g") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(author|commit|signer)/i); + } + } elsif (lc($str) eq "l") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(open list)/i); + } + } elsif (lc($str) eq "s") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(subscriber list)/i); + } + } } elsif ($sel eq "a") { if ($val > 0 && $val <= $count) { $authored{$val - 1} = !$authored{$val - 1}; @@ -1539,6 +1580,10 @@ sub interactive_get_maintainers { } elsif ($sel eq "r") { bool_invert(\$email_remove_duplicates); $rerun = 1; + } elsif ($sel eq "m") { + bool_invert(\$email_use_mailmap); + read_mailmap(); + $rerun = 1; } elsif ($sel eq "k") { bool_invert(\$keywords); $rerun = 1; @@ -1602,6 +1647,36 @@ sub bool_invert { } } +sub deduplicate_email { + my ($email) = @_; + + my $matched = 0; + my ($name, $address) = parse_email($email); + $email = format_email($name, $address, 1); + $email = mailmap_email($email); + + return $email if (!$email_remove_duplicates); + + ($name, $address) = parse_email($email); + + if ($deduplicate_name_hash{lc($name)}) { + $name = $deduplicate_name_hash{lc($name)}->[0]; + $address = $deduplicate_name_hash{lc($name)}->[1]; + $matched = 1; + } elsif ($deduplicate_address_hash{lc($address)}) { + $name = $deduplicate_address_hash{lc($address)}->[0]; + $address = $deduplicate_address_hash{lc($address)}->[1]; + $matched = 1; + } + if (!$matched) { + $deduplicate_name_hash{lc($name)} = [ $name, $address ]; + $deduplicate_address_hash{lc($address)} = [ $name, $address ]; + } + $email = format_email($name, $address, 1); + $email = mailmap_email($email); + return $email; +} + sub save_commits_by_author { my (@lines) = @_; @@ -1611,20 +1686,8 @@ sub save_commits_by_author { foreach my $line (@lines) { if ($line =~ m/$VCS_cmds{"author_pattern"}/) { - my $matched = 0; my $author = $1; - my ($name, $address) = parse_email($author); - foreach my $to (@interactive_to) { - my ($to_name, $to_address) = parse_email($to->[0]); - if ($email_remove_duplicates && - ((lc($name) eq lc($to_name)) || - (lc($address) eq lc($to_address)))) { - $author = $to->[0]; - $matched = 1; - last; - } - } - $author = format_email($name, $address, 1) if (!$matched); + $author = deduplicate_email($author); push(@authors, $author); } push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); @@ -1665,19 +1728,7 @@ sub save_commits_by_signer { my $type = $types[0]; my $signer = $signers[0]; - my $matched = 0; - my ($name, $address) = parse_email($signer); - foreach my $to (@interactive_to) { - my ($to_name, $to_address) = parse_email($to->[0]); - if ($email_remove_duplicates && - ((lc($name) eq lc($to_name)) || - (lc($address) eq lc($to_address)))) { - $signer = $to->[0]; - $matched = 1; - last; - } - $signer = format_email($name, $address, 1) if (!$matched); - } + $signer = deduplicate_email($signer); my $exists = 0; foreach my $ref(@{$commit_signer_hash{$signer}}) { @@ -1751,6 +1802,11 @@ sub vcs_file_signoffs { $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd ($commits, @signers) = vcs_find_signers($cmd); + + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } + vcs_assign("commit_signer", $commits, @signers); } @@ -1828,9 +1884,8 @@ sub vcs_file_blame { foreach my $line (@lines) { if ($line =~ 
m/$VCS_cmds{"author_pattern"}/) { my $author = $1; - my ($name, $address) = parse_email($author); - $author = format_email($name, $address, 1); - push(@authors, $1); + $author = deduplicate_email($author); + push(@authors, $author); } } @@ -1846,9 +1901,12 @@ sub vcs_file_blame { $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd my @author = vcs_find_author($cmd); next if !@author; + + my $formatted_author = deduplicate_email($author[0]); + my $count = grep(/$commit/, @all_commits); for ($i = 0; $i < $count ; $i++) { - push(@blame_signers, $author[0]); + push(@blame_signers, $formatted_author); } } } @@ -1856,8 +1914,14 @@ sub vcs_file_blame { vcs_assign("authored lines", $total_lines, @blame_signers); } } + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } vcs_assign("commits", $total_commits, @signers); } else { + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } vcs_assign("modified commits", $total_commits, @signers); } } From fae99206769b6bbf8a20ab883726b164945771d7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:58 -0700 Subject: [PATCH 139/176] scripts/get_maintainer.pl: don't deduplicate unnamed addresses ie: mailing lists Fix a defect with the first mailing list address being used for each subsequent mailing list. Updated to 0.26-beta6. Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index e822518bc610..d21ec3a89603 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta5'; +my $V = '0.26-beta6'; use Getopt::Long qw(:config no_auto_abbrev); @@ -1036,7 +1036,7 @@ sub push_email_address { push(@email_to, [format_email($name, $address, $email_usename), $role]); } elsif (!email_inuse($name, $address)) { push(@email_to, [format_email($name, $address, $email_usename), $role]); - $email_hash_name{lc($name)}++; + $email_hash_name{lc($name)}++ if ($name ne ""); $email_hash_address{lc($address)}++; } @@ -1659,7 +1659,7 @@ sub deduplicate_email { ($name, $address) = parse_email($email); - if ($deduplicate_name_hash{lc($name)}) { + if ($name ne "" && $deduplicate_name_hash{lc($name)}) { $name = $deduplicate_name_hash{lc($name)}->[0]; $address = $deduplicate_name_hash{lc($name)}->[1]; $matched = 1; From ec15408206e6b87a11dc1560b2bd758d2ad7ccb1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:59 -0700 Subject: [PATCH 140/176] MAINTAINERS: fix Colibri PXA270 file pattern The original commit 403d29713e0a ("pxa/income: Add Income SBC support") started with the wrong file pattern. 
Signed-off-by: Joe Perches Acked-by: Marek Vasut Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 146b8a068a4e..a8d1f5686cee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -710,8 +710,7 @@ ARM/INCOME PXA270 SUPPORT M: Marek Vasut L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -F: arch/arm/mach-pxa/income.c -F: arch/arm/mach-pxa/include/mach-pxa/income.h +F: arch/arm/mach-pxa/colibri-pxa270-income.c ARM/INTEL IOP32X ARM ARCHITECTURE M: Lennert Buytenhek From 838553c5a151f0d76c5558b85d6a4131c835a20c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:59 -0700 Subject: [PATCH 141/176] MAINTAINERS: merge imote2 and stargate Removed by commit dcd925f95194da4 ("pxa: merge stargate2 and imote2 board files"). Signed-off-by: Joe Perches Acked-by: Jonathan Cameron Acked-by: Eric Miao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a8d1f5686cee..5bb0f896bc24 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -757,13 +757,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ixp4xx/ -ARM/INTEL RESEARCH IMOTE 2 MACHINE SUPPORT -M: Jonathan Cameron -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained -F: arch/arm/mach-pxa/imote2.c - -ARM/INTEL RESEARCH STARGATE 2 MACHINE SUPPORT +ARM/INTEL RESEARCH IMOTE/STARGATE 2 MACHINE SUPPORT M: Jonathan Cameron L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained From 85743532a8ffa826edba3d1ca7d8f0dbf3f2a67d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:00 -0700 Subject: [PATCH 142/176] MAINTAINERS: merge s3c-244x sections Removed by commit 70556b143ae4c ("ARM: S3C24XX: Remove old mach-s3c2442"). Signed-off-by: Joe Perches Acked-by: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5bb0f896bc24..feda2cff24ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -922,25 +922,12 @@ W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2410/ -ARM/S3C2440 ARM ARCHITECTURE +ARM/S3C244x ARM ARCHITECTURE M: Ben Dooks L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2440/ - -ARM/S3C2442 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained -F: arch/arm/mach-s3c2442/ - -ARM/S3C2443 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained F: arch/arm/mach-s3c2443/ ARM/S3C6400 ARM ARCHITECTURE From f996231b808f889df77561c86c01cee3c45e254e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:01 -0700 Subject: [PATCH 143/176] MAINTAINERS: merge s3c6400 and 6410 to 64xx Removed by commit 431107ea5b680a24a ("ARM: S3C64XX: Merge mach-s3c6400 and mach-s3c6410"). 
Signed-off-by: Joe Perches Acked-by: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index feda2cff24ff..abf4436a3741 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -930,19 +930,12 @@ S: Maintained F: arch/arm/mach-s3c2440/ F: arch/arm/mach-s3c2443/ -ARM/S3C6400 ARM ARCHITECTURE +ARM/S3C64xx ARM ARCHITECTURE M: Ben Dooks L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained -F: arch/arm/mach-s3c6400/ - -ARM/S3C6410 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained -F: arch/arm/mach-s3c6410/ +F: arch/arm/mach-s3c64xx/ ARM/S5P ARM ARCHITECTURES M: Kukjin Kim From cefea792137c163529c12090c93b693d29166c60 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:01 -0700 Subject: [PATCH 144/176] MAINTAINERS: remove USB OV511 DRIVER Removed by commit 7373ab3669aec93 ("V4L/DVB: Remove obsolete ov511 driver"). Signed-off-by: Joe Perches Acked-by: Amerigo Wang Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index abf4436a3741..f471adfab417 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6114,13 +6114,6 @@ L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/serial/option.c -USB OV511 DRIVER -M: Mark McClelland -L: linux-usb@vger.kernel.org -W: http://alpha.dyndns.org/ov511/ -S: Maintained -F: drivers/media/video/ov511.* - USB PEGASUS DRIVER M: Petko Manolov L: linux-usb@vger.kernel.org From be8c268a221cbe25025f30e697968cfcd8e78c94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:02 -0700 Subject: [PATCH 145/176] MAINTAINERS: remove USB ZC0301 DRIVER Removed by commit 0d58cef664e01f ("V4L/DVB: Remove obsolete zc0301 v4l driver"). Signed-off-by: Joe Perches Acked-by: Amerigo Wang Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f471adfab417..c67d606e8477 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6274,16 +6274,6 @@ S: Supported F: drivers/usb/host/xhci* F: drivers/usb/host/pci-quirks* -USB ZC0301 DRIVER -M: Luca Risolia -L: linux-usb@vger.kernel.org -L: linux-media@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git -W: http://www.linux-projects.org -S: Maintained -F: Documentation/video4linux/zc0301.txt -F: drivers/media/video/zc0301/ - USB ZD1201 DRIVER L: linux-wireless@vger.kernel.org W: http://linux-lc100020.sourceforge.net From 1fa7e5473cba543b02a396ced9e407f614bb117c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:02 -0700 Subject: [PATCH 146/176] MAINTAINERS: use "T: git" and whitespace trivia Add missing git as a prefix for git repositories in the few places it wasn't already used. Convert a space delimiter to a tab. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c67d606e8477..014b648f99ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -657,7 +657,7 @@ ARM/FARADAY FA526 PORT M: Hans Ulli Kroll L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git://git.berlios.de/gemini-board +T: git git://git.berlios.de/gemini-board F: arch/arm/mm/*-fa* ARM/FOOTBRIDGE ARCHITECTURE @@ -672,7 +672,7 @@ ARM/FREESCALE IMX / MXC ARM ARCHITECTURE M: Sascha Hauer L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git://git.pengutronix.de/git/imx/linux-2.6.git +T: git git://git.pengutronix.de/git/imx/linux-2.6.git F: arch/arm/mach-mx*/ F: arch/arm/plat-mxc/ @@ -3840,7 +3840,7 @@ F: drivers/net/wireless/mwl8k.c MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER M: Nicolas Pitre S: Odd Fixes -F: drivers/mmc/host/mvsdio.* +F: drivers/mmc/host/mvsdio.* MARVELL YUKON / SYSKONNECT DRIVER M: Mirko Lindner @@ -4931,7 +4931,7 @@ RCUTORTURE MODULE M: Josh Triplett M: "Paul E. McKenney" S: Supported -T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/torture.txt F: kernel/rcutorture.c @@ -4956,7 +4956,7 @@ M: Dipankar Sarma M: "Paul E. McKenney" W: http://www.rdrop.com/users/paulmck/rclock/ S: Supported -T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/ F: include/linux/rcu* F: include/linux/srcu* From 66f1991bc2357436498ac990302b6f5bf403d7ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 26 Oct 2010 14:23:03 -0700 Subject: [PATCH 147/176] lib/bitmap.c: use hex_to_bin() Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index ffb78c916ccd..741fae905ae3 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -359,7 +359,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area); #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) -#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** @@ -466,7 +465,7 @@ int __bitmap_parse(const char *buf, unsigned int buflen, if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1)) return -EOVERFLOW; - chunk = (chunk << 4) | unhex(c); + chunk = (chunk << 4) | hex_to_bin(c); ndigits++; totaldigits++; } if (ndigits == 0) From 066a9be6c0124edc9527088231f03c6236be375d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Oct 2010 14:23:03 -0700 Subject: [PATCH 148/176] idr: fix idr_pre_get() locking description Despite the idr_pre_get() kernel-doc, there are some cases where you can call idr_pre_get() from within locked regions. Add a description for such cases. 
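For reference, the allocation pattern that the updated kernel-doc describes looks roughly like the sketch below; my_idr, my_lock and my_ptr are illustrative names only, not part of this patch, and error handling is trimmed:

	int id, err;

	do {
		/* Preallocate outside the lock; GFP_KERNEL may sleep. */
		if (!idr_pre_get(&my_idr, GFP_KERNEL))
			return -ENOMEM;

		spin_lock(&my_lock);
		err = idr_get_new(&my_idr, my_ptr, &id);
		spin_unlock(&my_lock);
	} while (err == -EAGAIN);

	if (err)
		return err;	/* -ENOSPC: the idr is full */
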
See also: http://lkml.org/lkml/2010/9/16/462 [akpm@linux-foundation.org: cleanups, grammatical fixes] Signed-off-by: Naohiro Aota Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index 5e0966be0f7c..e35850d3004a 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -110,9 +110,10 @@ static void idr_mark_full(struct idr_layer **pa, int id) * @idp: idr handle * @gfp_mask: memory allocation flags * - * This function should be called prior to locking and calling the - * idr_get_new* functions. It preallocates enough memory to satisfy - * the worst possible allocation. + * This function should be called prior to calling the idr_get_new* functions. + * It preallocates enough memory to satisfy the worst possible allocation. The + * caller should pass in GFP_KERNEL if possible. This of course requires that + * no spinning locks be held. * * If the system is REALLY out of memory this function returns 0, * otherwise 1. @@ -290,9 +291,11 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) * This is the allocate id function. It should be called with any * required locks. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return -EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. + * + * If the idr is full idr_get_new_above() will return -ENOSPC. * * @id returns a value in the range @starting_id ... 0x7fffffff */ @@ -318,12 +321,11 @@ EXPORT_SYMBOL(idr_get_new_above); * @ptr: pointer you want associated with the id * @id: pointer to the allocated handle * - * This is the allocate id function. It should be called with any - * required locks. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return -EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If the idr is full idr_get_new_above() will return -ENOSPC. * * @id returns a value in the range 0 ... 0x7fffffff */ From e2852ae825dba5ebc159788720baec1a28a57125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 Oct 2010 14:23:05 -0700 Subject: [PATCH 149/176] percpu_counter: add debugobj support All percpu counters are linked to a global list on initialization and removed from it on destruction. The list is walked during CPU up/down. If a percpu counter is freed without being properly destroyed, the system will oops only on the next CPU up/down making it pretty nasty to track down. This patch adds debugobj support for percpu counters so that such problems can be found easily. As percpu counters don't make sense on stack and can't be statically initialized, debugobj support is pretty simple. It's initialized and activated on counter initialization, and deactivatd and destroyed on counter destruction. With this patch applied, the bug fixed by commit 602586a83b719df0fbd94196a1359ed35aeb2df3 (shmem: put_super must percpu_counter_destroy) triggers the following warning on tmpfs unmount and the system won't oops on the next cpu up/down operation. 
------------[ cut here ]------------ WARNING: at lib/debugobjects.c:259 debug_print_object+0x5c/0x70() Hardware name: Bochs ODEBUG: free active (active state 0) object type: percpu_counter Modules linked in: Pid: 3999, comm: umount Not tainted 2.6.36-rc2-work+ #5 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] debug_print_object+0x5c/0x70 [] debug_check_no_obj_freed+0x125/0x210 [] kfree+0xb3/0x2f0 [] shmem_put_super+0x1d/0x30 [] generic_shutdown_super+0x56/0xe0 [] kill_anon_super+0x16/0x60 [] kill_litter_super+0x27/0x30 [] deactivate_locked_super+0x45/0x60 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0x86/0xe0 [] sys_umount+0x6f/0x360 [] system_call_fastpath+0x16/0x1b ---[ end trace cce2a341ba3611a7 ]--- Signed-off-by: Tejun Heo Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 8 ++++++++ lib/percpu_counter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 69a32664c289..0d5c762532a5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -317,6 +317,14 @@ config DEBUG_OBJECTS_RCU_HEAD help Enable this to turn on debugging of RCU list heads (call_rcu() usage). +config DEBUG_OBJECTS_PERCPU_COUNTER + bool "Debug percpu counter objects" + depends on DEBUG_OBJECTS + help + If you say Y here, additional code will be inserted into the + percpu counter routines to track the life time of percpu counter + objects and validate the percpu counter operations. + config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 209448e1d2b9..1d954ea72331 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -8,10 +8,53 @@ #include #include #include +#include static LIST_HEAD(percpu_counters); static DEFINE_MUTEX(percpu_counters_lock); +#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER + +static struct debug_obj_descr percpu_counter_debug_descr; + +static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +{ + struct percpu_counter *fbc = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + percpu_counter_destroy(fbc); + debug_object_free(fbc, &percpu_counter_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr percpu_counter_debug_descr = { + .name = "percpu_counter", + .fixup_free = percpu_counter_fixup_free, +}; + +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ + debug_object_init(fbc, &percpu_counter_debug_descr); + debug_object_activate(fbc, &percpu_counter_debug_descr); +} + +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ + debug_object_deactivate(fbc, &percpu_counter_debug_descr); + debug_object_free(fbc, &percpu_counter_debug_descr); +} + +#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ } +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ } +#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ + void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; @@ -75,6 +118,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, fbc->counters = alloc_percpu(s32); if (!fbc->counters) return -ENOMEM; + + debug_percpu_counter_activate(fbc); + #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); @@ -90,6 +136,8 @@ void 
percpu_counter_destroy(struct percpu_counter *fbc) if (!fbc->counters) return; + debug_percpu_counter_deactivate(fbc); + #ifdef CONFIG_HOTPLUG_CPU mutex_lock(&percpu_counters_lock); list_del(&fbc->list); From 6d411e6c01608cefb7b9ea6712110538a1432f9f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:05 -0700 Subject: [PATCH 150/176] lib/Kconfig.debug: add list_sort debugging switch While hunting a non-existing bug in 'list_sort()', I've improved the 'list_sort_test()' function which tests the 'list_sort()' library call. Although at the end I found a bug in my code, but not in 'list_sort()', I think my clean-ups and improvements are worth merging because they make the test function better. This patch: Make the self-tests selectable via Kconfig rather than by manual enabling of DEBUG_LIST_SORT. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 +++++++++ lib/list_sort.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0d5c762532a5..95bda87a3e84 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -748,6 +748,15 @@ config DEBUG_LIST If unsure, say N. +config TEST_LIST_SORT + bool "Linked list sorting test" + depends on DEBUG_KERNEL + help + Enable this to turn on 'list_sort()' function test. This test is + executed only once during system boot, so affects only boot time. + + If unsure, say N. + config DEBUG_SG bool "Debug SG table operations" depends on DEBUG_KERNEL diff --git a/lib/list_sort.c b/lib/list_sort.c index a7616fa3162e..827794016bfb 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -141,7 +141,7 @@ void list_sort(void *priv, struct list_head *head, } EXPORT_SYMBOL(list_sort); -#ifdef DEBUG_LIST_SORT +#ifdef CONFIG_TEST_LIST_SORT struct debug_el { struct list_head l_h; int value; @@ -214,4 +214,4 @@ static int __init list_sort_test(void) return 0; } module_init(list_sort_test); -#endif +#endif /* CONFIG_TEST_LIST_SORT */ From bb2ab10fa693110cffa7087ffe2749d6e9a27d5f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:06 -0700 Subject: [PATCH 151/176] lib/list_sort: test: use more reasonable printk levels I do not see any reason to use KERN_WARN for normal messages and KERN_EMERG for error messages in the lib_sort testing routine. Let's use more reasonable KERN_NORM and KERN_ERR levels. 
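Concretely, the diff below keeps the informational message at KERN_DEBUG and reports real failures with KERN_ERR, along the lines of:

	printk(KERN_DEBUG "testing list_sort()\n");		/* progress/info */
	printk(KERN_ERR "list_sort() failed to sort!\n");	/* genuine failure */
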
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 827794016bfb..679b3a060e7e 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -166,7 +166,7 @@ static int __init list_sort_test(void) struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); struct list_head *cur; - printk(KERN_WARNING "testing list_sort()\n"); + printk(KERN_DEBUG "testing list_sort()\n"); cur = head; for (i = 0; i < LIST_SORT_TEST_LENGTH; i++) { @@ -189,17 +189,17 @@ static int __init list_sort_test(void) struct debug_el *el = container_of(cur, struct debug_el, l_h); int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { - printk(KERN_EMERG "list_sort() returned " - "a corrupted list!\n"); + printk(KERN_ERR "list_sort() returned " + "a corrupted list!\n"); return 1; } else if (cmp_result > 0) { - printk(KERN_EMERG "list_sort() failed to sort!\n"); + printk(KERN_ERR "list_sort() failed to sort!\n"); return 1; } else if (cmp_result == 0 && el->serial >= container_of(cur->next, struct debug_el, l_h)->serial) { - printk(KERN_EMERG "list_sort() failed to preserve order" - " of equivalent elements!\n"); + printk(KERN_ERR "list_sort() failed to preserve order " + "of equivalent elements!\n"); return 1; } kfree(cur->prev); @@ -207,8 +207,8 @@ static int __init list_sort_test(void) } kfree(cur); if (count != LIST_SORT_TEST_LENGTH) { - printk(KERN_EMERG "list_sort() returned list of" - "different length!\n"); + printk(KERN_ERR "list_sort() returned list of " + "different length!\n"); return 1; } return 0; From eeee9ebb54b76a33a13d2c926ffb018a4aea410f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:06 -0700 Subject: [PATCH 152/176] lib/list_sort: test: use generic random32 Instead of using own pseudo-random generator, use generic linux 'random32()' function. Presumably, this should improve test coverage. At the same time, do the following changes: o Use shorter macro name for test list length o Do not use strange 'l_h' name for 'struct list_head' element, use 'list', because it is traditional name and thus, makes the code more obvious and readable. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 679b3a060e7e..8f3c24415ae5 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -142,42 +142,45 @@ void list_sort(void *priv, struct list_head *head, EXPORT_SYMBOL(list_sort); #ifdef CONFIG_TEST_LIST_SORT + +#include + struct debug_el { - struct list_head l_h; + struct list_head list; int value; unsigned serial; }; static int cmp(void *priv, struct list_head *a, struct list_head *b) { - return container_of(a, struct debug_el, l_h)->value - - container_of(b, struct debug_el, l_h)->value; + return container_of(a, struct debug_el, list)->value + - container_of(b, struct debug_el, list)->value; } /* * The pattern of set bits in the list length determines which cases * are hit in list_sort(). 
*/ -#define LIST_SORT_TEST_LENGTH (512+128+2) /* not including head */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ static int __init list_sort_test(void) { - int i, r = 1, count; + int i, count; struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); struct list_head *cur; printk(KERN_DEBUG "testing list_sort()\n"); cur = head; - for (i = 0; i < LIST_SORT_TEST_LENGTH; i++) { + for (i = 0; i < TEST_LIST_LEN; i++) { struct debug_el *el = kmalloc(sizeof(*el), GFP_KERNEL); BUG_ON(!el); /* force some equivalencies */ - el->value = (r = (r * 725861) % 6599) % (LIST_SORT_TEST_LENGTH/3); + el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; - el->l_h.prev = cur; - cur->next = &el->l_h; + el->list.prev = cur; + cur->next = &el->list; cur = cur->next; } head->prev = cur; @@ -186,7 +189,7 @@ static int __init list_sort_test(void) count = 1; for (cur = head->next; cur->next != head; cur = cur->next) { - struct debug_el *el = container_of(cur, struct debug_el, l_h); + struct debug_el *el = container_of(cur, struct debug_el, list); int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { printk(KERN_ERR "list_sort() returned " @@ -197,7 +200,7 @@ static int __init list_sort_test(void) return 1; } else if (cmp_result == 0 && el->serial >= container_of(cur->next, - struct debug_el, l_h)->serial) { + struct debug_el, list)->serial) { printk(KERN_ERR "list_sort() failed to preserve order " "of equivalent elements!\n"); return 1; @@ -206,7 +209,7 @@ static int __init list_sort_test(void) count++; } kfree(cur); - if (count != LIST_SORT_TEST_LENGTH) { + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort() returned list of " "different length!\n"); return 1; From f3dc0e384248ea6fda0987f909007fa9ab5fb51a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:07 -0700 Subject: [PATCH 153/176] lib/list_sort: test: improve error handling The 'list_sort()' test does not free memory if it fails, and it makes the kernel panic if it cannot allocate memory. This patch fixes the problem. This patch also changes several small things: o use the 'list_add_tail()' helper instead of adding elements manually o introduce a temporary 'el1' variable to avoid an ugly and unreadable "if" statement o make 'head' a stack variable instead of a 'kmalloc()'ed one, which simplifies the code a bit Overall, this is a clean-up patch.
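As a rough sketch of the single-exit cleanup pattern this patch switches to (illustrative names only; 'example_el' and 'example_build_list' are not from the kernel tree): elements go onto a stack-allocated LIST_HEAD with list_add_tail(), any allocation failure jumps to one exit label, and everything that was added is released there with list_for_each_safe().

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct example_el {
        struct list_head list;
        int value;
};

static int example_build_list(int n)
{
        struct example_el *el;
        struct list_head *cur, *tmp;
        int i, err = -ENOMEM;
        LIST_HEAD(head);        /* head lives on the stack, no kmalloc() needed */

        for (i = 0; i < n; i++) {
                el = kmalloc(sizeof(*el), GFP_KERNEL);
                if (!el)
                        goto exit;      /* free whatever was already added */
                el->value = i;
                list_add_tail(&el->list, &head);
        }
        err = 0;
exit:
        /* like the self-test, always release the elements before returning */
        list_for_each_safe(cur, tmp, &head) {
                list_del(cur);
                kfree(container_of(cur, struct example_el, list));
        }
        return err;
}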
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 65 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 8f3c24415ae5..2b99ff80f4be 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -165,56 +165,67 @@ static int cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count; - struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); - struct list_head *cur; + int i, count = 1, err = -EINVAL; + struct debug_el *el; + struct list_head *cur, *tmp; + LIST_HEAD(head); printk(KERN_DEBUG "testing list_sort()\n"); - cur = head; for (i = 0; i < TEST_LIST_LEN; i++) { - struct debug_el *el = kmalloc(sizeof(*el), GFP_KERNEL); - BUG_ON(!el); + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + printk(KERN_ERR "cancel list_sort() testing - cannot " + "allocate memory\n"); + goto exit; + } /* force some equivalencies */ el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; - - el->list.prev = cur; - cur->next = &el->list; - cur = cur->next; + list_add_tail(&el->list, &head); } - head->prev = cur; - list_sort(NULL, head, cmp); + list_sort(NULL, &head, cmp); + + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; - count = 1; - for (cur = head->next; cur->next != head; cur = cur->next) { - struct debug_el *el = container_of(cur, struct debug_el, list); - int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { printk(KERN_ERR "list_sort() returned " "a corrupted list!\n"); - return 1; - } else if (cmp_result > 0) { + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { printk(KERN_ERR "list_sort() failed to sort!\n"); - return 1; - } else if (cmp_result == 0 && - el->serial >= container_of(cur->next, - struct debug_el, list)->serial) { + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { printk(KERN_ERR "list_sort() failed to preserve order " "of equivalent elements!\n"); - return 1; + goto exit; } - kfree(cur->prev); count++; } - kfree(cur); + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort() returned list of " "different length!\n"); - return 1; + goto exit; } - return 0; + + err = 0; +exit: + list_for_each_safe(cur, tmp, &head) { + list_del(cur); + kfree(container_of(cur, struct debug_el, list)); + } + return err; } module_init(list_sort_test); #endif /* CONFIG_TEST_LIST_SORT */ From 014afa943d44f0df8e65bc4bd071c67772277d93 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:08 -0700 Subject: [PATCH 154/176] lib/list_sort: test: unify test messages This patch unifies 'list_sort_test()' messages a bit and makes sure all of them start with the "list_sort_test:" prefix to make it obvious for users where the messages come from. 
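As an aside, and not what this patch does (the patch spells the prefix out in every printk string): the more common kernel idiom for a uniform message prefix is to define pr_fmt() before the includes and use the pr_*() helpers. A minimal hypothetical sketch:

#define pr_fmt(fmt) "list_sort_test: " fmt

#include <linux/printk.h>

static void example_report(int count, int expected)
{
        if (count != expected)
                pr_err("error: bad list length %d, expected %d\n", count, expected);
        else
                pr_debug("list length OK\n");   /* prefix is prepended automatically */
}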
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 2b99ff80f4be..01aff9e80821 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -170,12 +170,12 @@ static int __init list_sort_test(void) struct list_head *cur, *tmp; LIST_HEAD(head); - printk(KERN_DEBUG "testing list_sort()\n"); + printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "cancel list_sort() testing - cannot " + printk(KERN_ERR "list_sort_test: error: cannot " "allocate memory\n"); goto exit; } @@ -192,30 +192,31 @@ static int __init list_sort_test(void) int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort() returned " - "a corrupted list!\n"); + printk(KERN_ERR "list_sort_test: error: list is " + "corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort() failed to sort!\n"); + printk(KERN_ERR "list_sort_test: error: list is not " + "sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort() failed to preserve order " - "of equivalent elements!\n"); + printk(KERN_ERR "list_sort_test: error: order of " + "equivalent elements not preserved\n"); goto exit; } count++; } if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort() returned list of " - "different length!\n"); + printk(KERN_ERR "list_sort_test: error: bad list length %d", + count); goto exit; } From 041b78f232bb87b2de8ca3fed50384bc7dc9c2de Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:08 -0700 Subject: [PATCH 155/176] lib/list_sort: test: check element addresses Improve the 'list_sort()' test and check that: o 'cmp()' is called only for elements which were present in the original list, i.e., the 'a' and 'b' parameters are valid o the resulting (sorted) list consists only of the original elements o "poison" fields are introduced to make sure the data around the 'struct list_head' field is not corrupted. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 83 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 12 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 01aff9e80821..d7325c6b103f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -145,24 +145,66 @@ EXPORT_SYMBOL(list_sort); #include -struct debug_el { - struct list_head list; - int value; - unsigned serial; -}; - -static int cmp(void *priv, struct list_head *a, struct list_head *b) -{ - return container_of(a, struct debug_el, list)->value - - container_of(b, struct debug_el, list)->value; -} - /* * The pattern of set bits in the list length determines which cases * are hit in list_sort().
*/ #define TEST_LIST_LEN (512+128+2) /* not including head */ +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + printk(KERN_ERR "list_sort_test: error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + static int __init list_sort_test(void) { int i, count = 1, err = -EINVAL; @@ -172,6 +214,13 @@ static int __init list_sort_test(void) printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + if (!elts) { + printk(KERN_ERR "list_sort_test: error: cannot allocate " + "memory\n"); + goto exit; + } + for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { @@ -182,6 +231,9 @@ static int __init list_sort_test(void) /* force some equivalencies */ el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; list_add_tail(&el->list, &head); } @@ -211,6 +263,12 @@ static int __init list_sort_test(void) "equivalent elements not preserved\n"); goto exit; } + + if (check(el, el1)) { + printk(KERN_ERR "list_sort_test: error: element check " + "failed\n"); + goto exit; + } count++; } @@ -222,6 +280,7 @@ static int __init list_sort_test(void) err = 0; exit: + kfree(elts); list_for_each_safe(cur, tmp, &head) { list_del(cur); kfree(container_of(cur, struct debug_el, list)); From ea00c30b5b31baa91be29bee966204eccc15e9d3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 26 Oct 2010 14:23:09 -0700 Subject: [PATCH 156/176] percpu_counter: use this_cpu_ptr() instead of per_cpu_ptr() this_cpu_ptr() avoids an array lookup and can use the percpu offset of the local cpu directly. 
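Roughly, the two access patterns being swapped look like this (illustrative helper, not from the kernel tree): per_cpu_ptr() needs an explicit CPU number, hence the get_cpu()/put_cpu() pair, while this_cpu_ptr() resolves through the local CPU's per-cpu offset under a plain preempt_disable()/preempt_enable().

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>

static void example_bump(int __percpu *counters)
{
        int *p;

        /* old pattern: pin the CPU, then index the per-cpu area by number */
        p = per_cpu_ptr(counters, get_cpu());
        (*p)++;
        put_cpu();

        /* new pattern: take the local CPU's copy directly */
        preempt_disable();
        p = this_cpu_ptr(counters);
        (*p)++;
        preempt_enable();
}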
Signed-off-by: Christoph Lameter Cc: Eric Dumazet Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 1d954ea72331..604678d7d06d 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -73,9 +73,9 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; s32 *pcount; - int cpu = get_cpu(); - pcount = per_cpu_ptr(fbc->counters, cpu); + preempt_disable(); + pcount = this_cpu_ptr(fbc->counters); count = *pcount + amount; if (count >= batch || count <= -batch) { spin_lock(&fbc->lock); @@ -85,7 +85,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) } else { *pcount = count; } - put_cpu(); + preempt_enable(); } EXPORT_SYMBOL(__percpu_counter_add); From 5d051decfc27cdf33fbbd2bfca958d0d2c903569 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:23:09 -0700 Subject: [PATCH 157/176] lib/parser: cleanup match_number() Use new variable 'len' to make code more readable. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/parser.c b/lib/parser.c index fb34977246bb..6e89eca5cca0 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -128,12 +128,13 @@ static int match_number(substring_t *s, int *result, int base) char *endp; char *buf; int ret; + size_t len = s->to - s->from; - buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - memcpy(buf, s->from, s->to - s->from); - buf[s->to - s->from] = '\0'; + memcpy(buf, s->from, len); + buf[len] = '\0'; *result = simple_strtol(buf, &endp, base); ret = 0; if (endp == buf) From 658716d19f8f155c67d4677ba68034b8e492dfbe Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 26 Oct 2010 14:23:10 -0700 Subject: [PATCH 158/176] div64_u64(): improve precision on 32bit platforms The current implementation of div64_u64 for 32bit systems returns an approximately correct result when the divisor exceeds 32bits. Since doing 64bit division using 32bit hardware is a long since solved problem we just use one of the existing proven methods. Additionally, add a div64_s64 function to correctly handle doing signed 64bit division. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=616105 Signed-off-by: Brian Behlendorf Signed-off-by: Oleg Nesterov Cc: Ben Woodard Cc: Jeremy Fitzhardinge Cc: Mark Grondona Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 ++++ include/linux/math64.h | 12 ++++++++++ lib/div64.c | 52 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77b04ed037df..450092c1e35f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -173,6 +173,11 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#define abs64(x) ({ \ + s64 __x = (x); \ + (__x < 0) ? 
-__x : __x; \ + }) + #ifdef CONFIG_PROVE_LOCKING void might_fault(void); #else diff --git a/include/linux/math64.h b/include/linux/math64.h index c87f1528703a..23fcdfcba81b 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -35,6 +35,14 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) return dividend / divisor; } +/** + * div64_s64 - signed 64bit divide with 64bit divisor + */ +static inline s64 div64_s64(s64 dividend, s64 divisor) +{ + return dividend / divisor; +} + #elif BITS_PER_LONG == 32 #ifndef div_u64_rem @@ -53,6 +61,10 @@ extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); extern u64 div64_u64(u64 dividend, u64 divisor); #endif +#ifndef div64_s64 +extern s64 div64_s64(s64 dividend, s64 divisor); +#endif + #endif /* BITS_PER_LONG */ /** diff --git a/lib/div64.c b/lib/div64.c index a111eb8de9cf..5b4919191778 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -77,26 +77,58 @@ s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) EXPORT_SYMBOL(div_s64_rem); #endif -/* 64bit divisor, dividend and result. dynamic precision */ +/** + * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + * + * This implementation is a modified version of the algorithm proposed + * by the book 'Hacker's Delight'. The original source and full proof + * can be found here and is available for use without restriction. + * + * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c' + */ #ifndef div64_u64 u64 div64_u64(u64 dividend, u64 divisor) { - u32 high, d; + u32 high = divisor >> 32; + u64 quot; - high = divisor >> 32; - if (high) { - unsigned int shift = fls(high); + if (high == 0) { + quot = div_u64(dividend, divisor); + } else { + int n = 1 + fls(high); + quot = div_u64(dividend >> n, divisor >> n); - d = divisor >> shift; - dividend >>= shift; - } else - d = divisor; + if (quot != 0) + quot--; + if ((dividend - quot * divisor) >= divisor) + quot++; + } - return div_u64(dividend, d); + return quot; } EXPORT_SYMBOL(div64_u64); #endif +/** + * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + */ +#ifndef div64_s64 +s64 div64_s64(s64 dividend, s64 divisor) +{ + s64 quot, t; + + quot = div64_u64(abs64(dividend), abs64(divisor)); + t = (dividend ^ divisor) >> 63; + + return (quot ^ t) - t; +} +EXPORT_SYMBOL(div64_s64); +#endif + #endif /* BITS_PER_LONG == 32 */ /* From 6b4c5bebcebb0a48d29947e9aa749650751a7696 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:11 -0700 Subject: [PATCH 159/176] checkpatch: fix regressions in "fix handling of leading spaces" The patch "checkpatch: fix handling of leading spaces" added checks for leading spaces on lines, but this introduces regressions. Firstly it does not correctly detect when we are in a comment. Secondly it does not allow for preprocessor command spacing. Finally it does not allow for label indentation which is required to be less than one tab. Fix these up: Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2039acdf5122..0a87c7417ada 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1459,10 +1459,13 @@ sub process { } # check for spaces at the beginning of a line. 
- if ($rawline =~ /^\+ / && $rawline !~ /\+ +\*/) { +# Exceptions: +# 1) within comments +# 2) indented preprocessor commands +# 3) hanging labels + if ($rawline =~ /^\+ / && $line !~ /\+ *(?:$;|#|$Ident:)/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; - WARN("please, no space for starting a line, \ - excluding comments\n" . $herevet); + WARN("please, no spaces at the start of a line\n" . $herevet); } # check we are in a valid C source file if not then ignore this hunk From e91b6e263ed6735c766cb14bbe63b9c7bd774526 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:11 -0700 Subject: [PATCH 160/176] checkpatch: types may sit on a line on their own When the following form is used we have a type which fully fills a line. This means that a type may end at the end of line as well as at the following identifier. int ** foo; Reported-by: Daniel Walker Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0a87c7417ada..41f59b1e209b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -843,7 +843,7 @@ sub annotate_values { $av_preprocessor = 0; } - } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\()/) { + } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; From d2c0a23514d8ac4ed10a8e742467cfb72ca3bed8 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:12 -0700 Subject: [PATCH 161/176] checkpatch: suggest cleanpatch and cleanfile when appropriate When we hit types of whitespace which may be fixed by scripts/cleanpatch and scripts/cleanfile suggest their use in our report. Suggested-by: Bartlomiej Zolnierkiewicz Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 41f59b1e209b..32d6a236570d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -103,6 +103,8 @@ for my $key (keys %debug) { die "$@" if ($@); } +my $rpt_cleaners = 0; + if ($terse) { $emacs = 1; $quiet++; @@ -1389,6 +1391,7 @@ sub process { } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("trailing whitespace\n" . $herevet); + $rpt_cleaners = 1; } # check for Kconfig help text having a real description @@ -1450,6 +1453,7 @@ sub process { $rawline =~ /^\+\s* \s*/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("code indent should use tabs where possible\n" . $herevet); + $rpt_cleaners = 1; } # check for space before tabs. @@ -2842,6 +2846,15 @@ sub process { print "\n" if ($quiet == 0); } + if ($quiet == 0) { + # If there were whitespace errors which cleanpatch can fix + # then suggest that. 
+ if ($rpt_cleaners) { + print "NOTE: whitespace errors detected, you may wish to use scripts/cleanpatch or\n"; + print " scripts/cleanfile\n\n"; + } + } + if ($clean == 1 && $quiet == 0) { print "$vname has no obvious style problems and is ready for submission.\n" } From fb2d2c1b5825503d30fb6f2dc328dbe4a47d9794 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:12 -0700 Subject: [PATCH 162/176] checkpatch: ensure we do not collapse bracketed sections into constants When determining if a return () sequence is a function style bracketing we simplify the expression one bracket at a time replacing each with a constant. However this can trigger a false merge with expressions as below: return (foo)0; Prevent this false merging. Reported-by: Hitoshi Mitake Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 32d6a236570d..8d010ac0efe1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2183,15 +2183,16 @@ sub process { my $value = $2; # Flatten any parentheses - $value =~ s/\)\(/\) \(/g; + $value =~ s/\(/ \(/g; + $value =~ s/\)/\) /g; while ($value =~ s/\[[^\{\}]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && $value =~ s/\([^\(\)]*\)/1/) { } - - if ($value =~ /^(?:$Ident|-?$Constant)$/) { +#print "value<$value>\n"; + if ($value =~ /^\s*(?:$Ident|-?$Constant)\s*$/) { ERROR("return is not a function, parentheses are not required\n" . $herecurr); } elsif ($spacing !~ /\s+/) { From 9446ef569c288e683225fec8337a0b2b81e75cc5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:13 -0700 Subject: [PATCH 163/176] checkpatch: handle casts better fixing false categorisation of : as binary The following incantation is triggering categorisation of its colon (:) as a binary form, which it is not: return foo ? (s8)bar : baz; Handle casts differently from types in the categoriser, allowing us to better track (s8)bar as a value and not a declaration. Reported-by: Jean Delvare Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8d010ac0efe1..3cec2990d51e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -845,6 +845,11 @@ sub annotate_values { $av_preprocessor = 0; } + } elsif ($cur =~ /^(\(\s*$Type\s*)\)/) { + print "CAST($1)\n" if ($dbg_values > 1); + push(@av_paren_type, $type); + $type = 'C'; + } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; From 53a3c4487a05b8f26ef72fe434a750a3402c998f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:14 -0700 Subject: [PATCH 164/176] checkpatch: returning errno typically should be negative Add a (strict mode only) test to check for non-negative returns of what appear to be errno values as the majority case these should indeed be negative. 
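The convention the new check enforces, shown on a made-up function ('example_probe' is not a real kernel API): error paths return a negative errno, so a bare positive E-constant in a return statement is almost always a bug.

#include <linux/errno.h>

static int example_probe(void *dev)
{
        if (!dev)
                return -ENODEV; /* correct: negative errno on failure */

        /*
         * "return ENODEV;" would now draw the new checkpatch warning,
         * because callers test for ret < 0.
         */
        return 0;
}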
Suggested-by: Andrew Morton Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3cec2990d51e..bcdb54bd61a0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2204,6 +2204,13 @@ sub process { ERROR("space required before the open parenthesis '('\n" . $herecurr); } } +# Return of what appears to be an errno should normally be -'ve + if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) { + my $name = $1; + if ($name ne 'EOF' && $name ne 'ERROR') { + WARN("return of an errno should typically be -ve (return -$1)\n" . $herecurr); + } + } # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { From 8cf6de7145943caa38c56c61cd83b17687afd900 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:14 -0700 Subject: [PATCH 165/176] checkpatch: add check for space after struct, union, and enum Add spacing checks for struct, union, and enum definitions. Check the spacing after type and before the equals (=) and open brace ({). Based on a patch by Joe Perches. Cc: Joe Perches Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bcdb54bd61a0..983ac1816da0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1905,6 +1905,16 @@ sub process { ERROR("open brace '{' following function declarations go on the next line\n" . $herecurr); } +# missing space after union or struct definition + if ($rawline =~ /^\+\s*(union|struct)\s+$Ident[=\{]/) { + WARN("Missing space after struct or union definition\n" . $herecurr); + } + +# missing space after enum definition + if ($rawline =~ /^\+\s*enum\{/) { + WARN("Missing space after enum definition\n" . $herecurr); + } + # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { From 0c73b4eb7a825a5aff16d8a9701f6c28056de058 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:15 -0700 Subject: [PATCH 166/176] checkpatch: simplify and consolidate "missing space after" checks Commonise the code for missing spaces after struct, union, and enum such that they share the same code. Ensure we cover all the common cases in each case. Check against the sanitised line to ensure we do not report on comments and strings. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 983ac1816da0..cb19f54b4b2a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1905,22 +1905,17 @@ sub process { ERROR("open brace '{' following function declarations go on the next line\n" . $herecurr); } -# missing space after union or struct definition - if ($rawline =~ /^\+\s*(union|struct)\s+$Ident[=\{]/) { - WARN("Missing space after struct or union definition\n" . $herecurr); - } - -# missing space after enum definition - if ($rawline =~ /^\+\s*enum\{/) { - WARN("Missing space after enum definition\n" . $herecurr); - } - # open braces for enum, union and struct go on the same line. 
if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { ERROR("open brace '{' following $1 go on the same line\n" . $hereprev); } +# missing space after union, struct or enum definition + if ($line =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?(?:\s+$Ident)?[=\{]/) { + WARN("missing space after $1 definition\n" . $herecurr); + } + # check for spacing round square brackets; allowed: # 1. with a type on the left -- int [] a; # 2. at the beginning of a line for slice initialisers -- [0...10] = 5, From 9fe287d79b0af983050d24e7916cf3d1f019f553 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:15 -0700 Subject: [PATCH 167/176] checkpatch: ensure kconfig help checks only apply when we are adding help When checking the length of the help we need to be sure we are seeing the whole story before erroring. Firstly we only want to check when adding the help in the first place. Second we need to be sure that we are seeing the end of the entry, nominally when there is no context below or that context shows the start of the next entry. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index cb19f54b4b2a..e44ff91e811d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1400,18 +1400,34 @@ sub process { } # check for Kconfig help text having a real description +# Only applies when adding the entry originally, after that we do not have +# sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /\+?\s*(---)?help(---)?$/) { + $line =~ /\+\s*(?:---)?help(?:---)?$/) { my $length = 0; - for (my $l = $linenr; defined($lines[$l]); $l++) { - my $f = $lines[$l]; + my $cnt = $realcnt; + my $ln = $linenr + 1; + my $f; + my $is_end = 0; + while ($cnt > 0 && defined $lines[$ln - 1]) { + $f = $lines[$ln - 1]; + $cnt-- if ($lines[$ln - 1] !~ /^-/); + $is_end = $lines[$ln - 1] =~ /^\+/; + $ln++; + + next if ($f =~ /^-/); + $f =~ s/^.//; $f =~ s/#.*//; $f =~ s/^\s+//; next if ($f =~ /^$/); - last if ($f =~ /^\s*config\s/); + if ($f =~ /^\s*config\s/) { + $is_end = 1; + last; + } $length++; } - WARN("please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($length < 4); + WARN("please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_end && $length < 4); + #print "is_end<$is_end> length<$length>\n"; } # check we are in a valid source file if not then ignore this hunk From 3bf9a009fccea422bc355414a3bdf5f35fff9f36 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 26 Oct 2010 14:23:16 -0700 Subject: [PATCH 168/176] checkpatch: check for incorrect permissions Throw an error when a source file has been given execute permissions using the mode change line present in git diffs. Also alow the filename matching to use the "diff" line in addition to the "+++" line, since the mode change lines appear before any "+++" lines. 
[apw@canonical.com: simplified filename logic slightly, added tests] Cc: Andy Whitcroft Acked-by: Linus Walleij Signed-off-by: Rabin Vincent Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e44ff91e811d..93fa145671a0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1315,7 +1315,11 @@ sub process { $here = "#$realline: " if ($file); # extract the filename as it passes - if ($line=~/^\+\+\+\s+(\S+)/) { + if ($line =~ /^diff --git.*?(\S+)$/) { + $realfile = $1; + $realfile =~ s@^([^/]*)/@@; + + } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; @@ -1339,6 +1343,14 @@ sub process { $cnt_lines++ if ($realcnt != 0); +# Check for incorrect file permissions + if ($line =~ /^new (file )?mode.*[7531]\d{0,2}$/) { + my $permhere = $here . "FILE: $realfile\n"; + if ($realfile =~ /(Makefile|Kconfig|\.c|\.h|\.S|\.tmpl)$/) { + ERROR("do not set execute permissions for source files\n" . $permhere); + } + } + #check the patch for a signoff: if ($line =~ /^\s*signed-off-by:/i) { # This is a signoff, if ugly, so do not double report. From 03f1df7da5696ddfa6e167b37e0c0ce5aad3de79 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:16 -0700 Subject: [PATCH 169/176] checkpatch: Add additional attribute #defines On Wed, 2010-08-11 at 12:35 -0400, Dave Jones wrote: > I just got this from a patch I merged.. > > ERROR: need consistent spacing around '*' (ctx:WxV) > #121: FILE: arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c:113: > +static struct pcc_cpu __percpu *pcc_cpu_info; > ^ > which doesn't seem right. Perhaps these need to be added to checkpatch. [apw@canonical.com: added tests] Signed-off-by: Joe Perches Signed-off-by: Andy Whitcroft Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93fa145671a0..c151c9e142ad 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -152,6 +152,20 @@ our $Sparse = qr{ # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| + __percpu| + __nocast| + __safe| + __bitwise__| + __packed__| + __packed2__| + __naked| + __maybe_unused| + __always_unused| + __noreturn| + __used| + __cold| + __noclone| + __deprecated| __read_mostly| __kprobes| __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| From 015830be9779aeae7de7060b07a3157a8e41bcb4 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:17 -0700 Subject: [PATCH 170/176] checkpatch: update copyright dates Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c151c9e142ad..53b2eaecba0c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2,7 +2,7 @@ # (c) 2001, Dave Jones. 
(the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007,2008, Andy Whitcroft (new conditions, test suite) -# (c) 2008,2009, Andy Whitcroft +# (c) 2008-2010 Andy Whitcroft # Licensed under the terms of the GNU GPL License version 2 use strict; From 5eaa20b984eb316533b4a098d8de3912e434df6a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:18 -0700 Subject: [PATCH 171/176] checkpatch: clean up structure definition macro handling Handle definitions such as the following correctly; it is not a complex statement: #define PREALLOC(NAME, START, END, FLAGS) { \ .name = (NAME), \ .start = (START), \ .end = (END), \ .flags = (FLAGS) \ }, Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 53b2eaecba0c..d5361e49abad 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2476,8 +2476,8 @@ sub process { \.$Ident\s*=\s*| ^\"|\"$ }x; - #print "REST<$rest> dstat<$dstat>\n"; - if ($rest ne '') { + #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; + if ($rest ne '' && $rest ne ',') { if ($rest !~ /while\s*\(/ && $dstat !~ /$exceptions/) { From 3cbf62df3a8ce61cb1aa20b7dae964058988bfdd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:18 -0700 Subject: [PATCH 172/176] checkpatch: handle EXPORT_SYMBOL for DEVICE_ATTR and similar Handle definitions similar to the one below. The definition macro spits out a symbol with a prefix. Add matching of any identifier prefix: DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_put); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d5361e49abad..8ab45b72b396 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1822,8 +1822,17 @@ sub process { !defined $suppress_export{$realline_next} && ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/ || $lines[$realline_next - 1] =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) { + # Handle definitions which produce identifiers with + # a prefix: + # XXX(foo); + # EXPORT_SYMBOL(something_foo); my $name = $1; - if ($stat !~ /(?: + if ($stat =~ /^.([A-Z_]+)\s*\(\s*($Ident)/ && + $name =~ /^${Ident}_$2/) { +#print "FOO C name<$name>\n"; + $suppress_export{$realline_next} = 1; + + } elsif ($stat !~ /(?: \n.}\s*$| ^.DEFINE_$Ident\(\Q$name\E\)| ^.DECLARE_$Ident\(\Q$name\E\)| From 01464f30a97c5c30bf9633309b27cce055cef8fd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:19 -0700 Subject: [PATCH 173/176] checkpatch: statement/block context analyser should look at sanitised lines When tracking context to find a block or statement we need to use the sanitised lines, else parentheses '(' & ')' and braces '{' & '}' can throw the scanner out. Also fix up a couple of error outputs which include those sanitised lines incorrectly.
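For illustration (hypothetical C input, not taken from the patch): braces and parentheses inside a string literal or a comment look unbalanced to a scanner that counts characters on the raw line, which is exactly why the context tracking has to run on the sanitised lines where string and comment bodies are blanked out.

/* the stray ")}" here is only comment text, not real block structure */
static const char *example_fmt = "looks like an open brace { and paren (";

static int example(int x)
{
        return x;       /* unremarkable once strings and comments are blanked */
}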
Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8ab45b72b396..d086ffe377d5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -691,15 +691,15 @@ sub ctx_block_get { $blk .= $rawlines[$line]; # Handle nested #if/#else. - if ($rawlines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { + if ($lines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, $level); - } elsif ($rawlines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { + } elsif ($lines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { $level = $stack[$#stack - 1]; - } elsif ($rawlines[$line] =~ /^.\s*#\s*endif\b/) { + } elsif ($lines[$line] =~ /^.\s*#\s*endif\b/) { $level = pop(@stack); } - foreach my $c (split(//, $rawlines[$line])) { + foreach my $c (split(//, $lines[$line])) { ##print "C<$c>L<$level><$open$close>O<$off>\n"; if ($off > 0) { $off--; @@ -1652,7 +1652,7 @@ sub process { if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("that open brace { should be on the previous line\n" . - "$here\n$ctx\n$lines[$ctx_ln - 1]\n"); + "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } if ($level == 0 && $pre_ctx !~ /}\s*while\s*\($/ && $ctx =~ /\)\s*\;\s*$/ && @@ -1661,7 +1661,7 @@ sub process { my ($nlength, $nindent) = line_stats($lines[$ctx_ln - 1]); if ($nindent > $indent) { WARN("trailing semicolon indicates no statements, indent implies otherwise\n" . - "$here\n$ctx\n$lines[$ctx_ln - 1]\n"); + "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } } } From 267ad8f42644c2fa4ff6c2e7596d2b02c7397c85 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:19 -0700 Subject: [PATCH 174/176] checkpatch: version 0.31 Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d086ffe377d5..c1e7fb3eab44 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -10,7 +10,7 @@ use strict; my $P = $0; $P =~ s@.*/@@g; -my $V = '0.30'; +my $V = '0.31'; use Getopt::Long qw(:config no_auto_abbrev); From cb710eca6820493add0ddd3d7e8e3ee53f2b6e57 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:20 -0700 Subject: [PATCH 175/176] scripts/checkpatch.pl: add warnings for static char that could be static const char Add warnings for possible missing const uses of static char foo[] = "bar" that could be static const char foo[] = "bar" and static const char *foo[] = {"bar", "baz"} that could be static const char * const foo[] = {"bar", "baz"} Signed-off-by: Joe Perches Cc: Mike Frysinger Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c1e7fb3eab44..2ec5fc6a4046 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1869,6 +1869,18 @@ sub process { $herecurr); } +# check for static const char * arrays. + if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { + WARN("static const char * array should probably be static const char * const\n" . + $herecurr); + } + +# check for static char foo[] = "bar" declarations. 
+ if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { + WARN("static char array declaration should probably be static const char\n" . + $herecurr); + } + # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ && From 93ed0e2d07b25aff4db1d61bfbcd1e82074c0ad5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:21 -0700 Subject: [PATCH 176/176] scripts/checkpatch.pl: add check for declaration of pci_device_id Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2ec5fc6a4046..90b54d4697fd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1881,6 +1881,11 @@ sub process { $herecurr); } +# check for declarations of struct pci_device_id + if ($line =~ /\bstruct\s+pci_device_id\s+\w+\s*\[\s*\]\s*\=\s*\{/) { + WARN("Use DEFINE_PCI_DEVICE_TABLE for struct pci_device_id\n" . $herecurr); + } + # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ &&