From f8f72ad5396987e05a42cf7eff826fb2a15ff148 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 26 Oct 2010 14:21:10 -0700
Subject: [PATCH 001/176] mm: fix return value of scan_lru_pages in memory unplug

scan_lru_pages() returns a pfn, so its type should be "unsigned long", not "int".

Note: I guess this has worked until now because the memory hotplug tester's machine does not have very much memory (physical address < 32bit << PAGE_SHIFT).

Reported-by: KOSAKI Motohiro
Signed-off-by: KAMEZAWA Hiroyuki
Reviewed-by: KOSAKI Motohiro
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4e940a26945..06662c5a3e86 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
  * Scanning pfn is much easier than scanning lru list.
  * Scan pfn from start to end and Find LRU page.
  */
-int scan_lru_pages(unsigned long start, unsigned long end)
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 	struct page *page;

From a75d377686037982cbec320bb770b19fe7be6a5d Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 26 Oct 2010 14:21:10 -0700
Subject: [PATCH 002/176] types.h: move misplaced comment

This comment landed in the wrong place.

Cc: Andi Kleen
Cc: Arnd Bergmann
Cc: David Miller
Cc: Eric Paris
Cc: Jan Engelhardt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/types.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/types.h b/include/linux/types.h
index 357dbc19606f..c2a9eb44f2fa 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -121,15 +121,7 @@ typedef __u64 u_int64_t;
 typedef __s64 int64_t;
 #endif
 
-/*
- * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid
- * common 32/64-bit compat problems.
- * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other
- * architectures) and to 8-byte boundaries on 64-bit architetures. The new
- * aligned_64 type enforces 8-byte alignment so that structs containing
- * aligned_64 values have the same alignment on 32-bit and 64-bit architectures.
- * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
- */
+/* this is a special 64bit data type that is 8-byte aligned */
 #define aligned_u64 __u64 __attribute__((aligned(8)))
 #define aligned_be64 __be64 __attribute__((aligned(8)))
 #define aligned_le64 __le64 __attribute__((aligned(8)))
@@ -186,7 +178,15 @@ typedef __u64 __bitwise __be64;
 typedef __u16 __bitwise __sum16;
 typedef __u32 __bitwise __wsum;
 
-/* this is a special 64bit data type that is 8-byte aligned */
+/*
+ * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid
+ * common 32/64-bit compat problems.
+ * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other
+ * architectures) and to 8-byte boundaries on 64-bit architetures. The new
+ * aligned_64 type enforces 8-byte alignment so that structs containing
+ * aligned_64 values have the same alignment on 32-bit and 64-bit architectures.
+ * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
+ */
 #define __aligned_u64 __u64 __attribute__((aligned(8)))
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))

From b7f50cfa3630b6e079929ffccfd442d65064ee1f Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 26 Oct 2010 14:21:11 -0700
Subject: [PATCH 003/176] mm, page-allocator: do not check the state of a non-existent buddy during free

There is a bug in commit 6dda9d55 ("page allocator: reduce fragmentation in buddy allocator by adding buddies that are merging to the tail of the free lists") that means a buddy at order MAX_ORDER is checked for merging. A page of this order never exists, so at times an effectively random piece of memory is being checked.

Alan Curry has reported that this is causing memory corruption in userspace data on a PPC32 platform (http://lkml.org/lkml/2010/10/9/32). It is not clear why this is happening. It could be a cache coherency problem where pages mapped in both user and kernel space are getting different cache lines due to the bad read from kernel space (http://lkml.org/lkml/2010/10/13/179). It could also be that there are some special registers being io-remapped at the end of the memmap array and that a read has special meaning on them. Compiler bugs have been ruled out because the assembly before and after the patch looks relatively harmless.

This patch fixes the problem by ensuring we are not reading a possibly invalid location of memory. It's not clear why the read causes corruption, but one way or the other it is a buggy read.

Signed-off-by: Mel Gorman
Cc: Corrado Zoccolo
Reported-by: Alan Curry
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: Rik van Riel
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a362c52fdf4..5faf876cfc3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -531,7 +531,7 @@ static inline void __free_one_page(struct page *page,
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
 		combined_idx = __find_combined_index(page_idx, order);
 		higher_page = page + combined_idx - page_idx;

From 482db6df1746c4fa7d64a2441d4cb2610249c679 Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Tue, 26 Oct 2010 14:21:13 -0700
Subject: [PATCH 004/176] um: fix global timer issue when using CONFIG_NO_HZ

This fixes an issue which was introduced by fe2cc53e ("uml: track and make up lost ticks").

timeval_to_ns() returns long long, not int. Because of that, UML's timer did not work properly and caused timer freezes.
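As a rough illustration of the bug class, the standalone userspace sketch below reproduces the truncation; the helper body is only a stand-in for UML's real timeval_to_ns(), what matters is its long long return type:

  #include <stdio.h>
  #include <sys/time.h>

  /* Stand-in with the same shape as UML's helper: a nanosecond count
   * does not fit in a 32-bit int (one second is already 1e9 ns). */
  static long long timeval_to_ns(const struct timeval *tv)
  {
          return (long long)tv->tv_sec * 1000000000LL + tv->tv_usec * 1000LL;
  }

  int main(void)
  {
          struct timeval tv = { .tv_sec = 3, .tv_usec = 0 };
          int truncated = timeval_to_ns(&tv);      /* silently loses the upper bits */
          long long correct = timeval_to_ns(&tv);

          printf("as int:       %d\n", truncated); /* overflowed garbage */
          printf("as long long: %lld\n", correct); /* 3000000000 */
          return 0;
  }

With "remain" and "max" declared as int, disable_timer() is exposed to exactly this kind of silent truncation, which is what the one-line change below avoids.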
Signed-off-by: Richard Weinberger Acked-by: Pekka Enberg Cc: Jeff Dike Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index dec5678fc17f..6e3359d6a839 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -60,7 +60,7 @@ static inline long long timeval_to_ns(const struct timeval *tv) long long disable_timer(void) { struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } }); - int remain, max = UM_NSEC_PER_SEC / UM_HZ; + long long remain, max = UM_NSEC_PER_SEC / UM_HZ; if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0) printk(UM_KERN_ERR "disable_timer - setitimer failed, " From 09358972bff5ce99de496bbba97c85d417b3c054 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 26 Oct 2010 14:21:15 -0700 Subject: [PATCH 005/176] sgi-xp: incoming XPC channel messages can come in after the channel's partition structures have been torn down Under some workloads, some channel messages have been observed being delayed on the sending side past the point where the receiving side has been able to tear down its partition structures. This condition is already detected in xpc_handle_activate_IRQ_uv(), but that information is not given to xpc_handle_activate_mq_msg_uv(). As a result, xpc_handle_activate_mq_msg_uv() assumes the structures still exist and references them, causing a NULL-pointer deref. Signed-off-by: Robin Holt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-xp/xpc_uv.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 1f59ee2226ca..17bbacb1b4b1 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -417,6 +417,7 @@ xpc_process_activate_IRQ_rcvd_uv(void) static void xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, struct xpc_activate_mq_msghdr_uv *msg_hdr, + int part_setup, int *wakeup_hb_checker) { unsigned long irq_flags; @@ -481,6 +482,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: { struct xpc_activate_mq_msg_chctl_closerequest_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_closerequest_uv, hdr); @@ -497,6 +501,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: { struct xpc_activate_mq_msg_chctl_closereply_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_closereply_uv, hdr); @@ -511,6 +518,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: { struct xpc_activate_mq_msg_chctl_openrequest_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_openrequest_uv, hdr); @@ -528,6 +538,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: { struct xpc_activate_mq_msg_chctl_openreply_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, struct xpc_activate_mq_msg_chctl_openreply_uv, hdr); args = &part->remote_openclose_args[msg->ch_number]; @@ -545,6 +558,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV: { struct xpc_activate_mq_msg_chctl_opencomplete_uv *msg; + if (!part_setup) + break; + msg = container_of(msg_hdr, 
struct xpc_activate_mq_msg_chctl_opencomplete_uv, hdr); spin_lock_irqsave(&part->chctl_lock, irq_flags); @@ -621,6 +637,7 @@ xpc_handle_activate_IRQ_uv(int irq, void *dev_id) part_referenced = xpc_part_ref(part); xpc_handle_activate_mq_msg_uv(part, msg_hdr, + part_referenced, &wakeup_hb_checker); if (part_referenced) xpc_part_deref(part); From 6915e04f8847bea16d0890f559694ad8eedd026c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:16 -0700 Subject: [PATCH 006/176] um: remove PAGE_SIZE alignment in linker script causing kernel segfault. The linker script cleanup that I did in commit 5d150a97f93 ("um: Clean up linker script using standard macros.") (2.6.32) accidentally introduced an ALIGN(PAGE_SIZE) when converting to use INIT_TEXT_SECTION; Richard Weinberger reported that this causes the kernel to segfault with CONFIG_STATIC_LINK=y. I'm not certain why this extra alignment is a problem, but it seems likely it is because previously __init_begin = _stext = _text = _sinittext and with the extra ALIGN(PAGE_SIZE), _sinittext becomes different from the rest. So there is likely a bug here where something is assuming that _sinittext is the same as one of those other symbols. But reverting the accidental change fixes the regression, so it seems worth committing that now. Signed-off-by: Tim Abbott Reported-by: Richard Weinberger Cc: Jeff Dike Tested by: Antoine Martin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/uml.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index ec6378550671..9a873d765615 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -22,7 +22,7 @@ SECTIONS _text = .; _stext = .; __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) + INIT_TEXT_SECTION(0) . = ALIGN(PAGE_SIZE); .text : From 1f9fa5216eacf4fdf9d3e4ab57feb8b642f0e78b Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:21:16 -0700 Subject: [PATCH 007/176] drivers/misc/ad525x_dpot.c: fix typo in spi write16 and write24 transfer counts This is a bug fix. Some SPI connected devices using 16/24 bit accesses, previously failed, now work. This typo slipped in after testing, during some restructuring. Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot-spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ad525x_dpot-spi.c b/drivers/misc/ad525x_dpot-spi.c index b8c6df9c8437..6cfcb636577a 100644 --- a/drivers/misc/ad525x_dpot-spi.c +++ b/drivers/misc/ad525x_dpot-spi.c @@ -53,13 +53,13 @@ static int write8(void *client, u8 val) static int write16(void *client, u8 reg, u8 val) { u8 data[2] = {reg, val}; - return spi_write(client, data, 1); + return spi_write(client, data, 2); } static int write24(void *client, u8 reg, u16 val) { u8 data[3] = {reg, val >> 8, val}; - return spi_write(client, data, 1); + return spi_write(client, data, 3); } static int read8(void *client) From de5e2ddf9bb3ce7b643223b9b0718062254f302f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:21:17 -0700 Subject: [PATCH 008/176] ipmi: proper spinlock initialization Unloading ipmi module can trigger following error. 
(if CONFIG_DEBUG_SPINLOCK=y) [ 9633.779590] BUG: spinlock bad magic on CPU#1, rmmod/7170 [ 9633.779606] lock: f41f5414, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 9633.779626] Pid: 7170, comm: rmmod Not tainted 2.6.36-rc7-11474-gb71eb1e-dirty #328 [ 9633.779644] Call Trace: [ 9633.779657] [] ? printk+0x18/0x1c [ 9633.779672] [] spin_bug+0xa3/0xf0 [ 9633.779685] [] do_raw_spin_lock+0x7d/0x160 [ 9633.779702] [] ? release_sysfs_dirent+0x47/0xb0 [ 9633.779718] [] ? sysfs_addrm_finish+0xa8/0xd0 [ 9633.779734] [] _raw_spin_lock_irqsave+0xc/0x20 [ 9633.779752] [] cleanup_one_si+0x6a/0x200 [ipmi_si] [ 9633.779768] [] ? sysfs_hash_and_remove+0x72/0x80 [ 9633.779786] [] ipmi_pnp_remove+0xd/0xf [ipmi_si] [ 9633.779802] [] pnp_device_remove+0x1b/0x40 Fix this by initializing spinlocks in a smi_info_alloc() helper function, right after memory allocation and clearing. Signed-off-by: Eric Dumazet Acked-by: David Miller Cc: Yinghai Lu Acked-by: Corey Minyard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_si_intf.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index e537610d2f09..b293d57d30a7 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1665,6 +1665,17 @@ static int check_hotmod_int_op(const char *curr, const char *option, return 0; } +static struct smi_info *smi_info_alloc(void) +{ + struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (info) { + spin_lock_init(&info->si_lock); + spin_lock_init(&info->msg_lock); + } + return info; +} + static int hotmod_handler(const char *val, struct kernel_param *kp) { char *str = kstrdup(val, GFP_KERNEL); @@ -1779,7 +1790,7 @@ static int hotmod_handler(const char *val, struct kernel_param *kp) } if (op == HM_ADD) { - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { rv = -ENOMEM; goto out; @@ -1844,7 +1855,7 @@ static __devinit void hardcode_find_bmc(void) if (!ports[i] && !addrs[i]) continue; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return; @@ -2027,7 +2038,7 @@ static __devinit int try_init_spmi(struct SPMITable *spmi) return -ENODEV; } - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { printk(KERN_ERR PFX "Could not allocate SI data (3)\n"); return -ENOMEM; @@ -2137,7 +2148,7 @@ static int __devinit ipmi_pnp_probe(struct pnp_dev *dev, if (!acpi_dev) return -ENODEV; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return -ENOMEM; @@ -2318,7 +2329,7 @@ static __devinit void try_init_dmi(struct dmi_ipmi_data *ipmi_data) { struct smi_info *info; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { printk(KERN_ERR PFX "Could not allocate SI data\n"); return; @@ -2425,7 +2436,7 @@ static int __devinit ipmi_pci_probe(struct pci_dev *pdev, int class_type = pdev->class & PCI_ERMC_CLASSCODE_TYPE_MASK; struct smi_info *info; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return -ENOMEM; @@ -2566,7 +2577,7 @@ static int __devinit ipmi_of_probe(struct platform_device *dev, return -EINVAL; } - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) { dev_err(&dev->dev, @@ -3013,7 +3024,7 @@ static __devinit void default_find_bmc(void) if (check_legacy_ioport(ipmi_defaults[i].port)) continue; #endif - info = 
kzalloc(sizeof(*info), GFP_KERNEL); + info = smi_info_alloc(); if (!info) return; @@ -3138,9 +3149,6 @@ static int try_smi_init(struct smi_info *new_smi) goto out_err; } - spin_lock_init(&(new_smi->si_lock)); - spin_lock_init(&(new_smi->msg_lock)); - /* Do low-level detection first. */ if (new_smi->handlers->detect(new_smi->si_sm)) { if (new_smi->addr_source) From 1b627d5771312c92404b66f0a0b16f66036dd2e1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:18 -0700 Subject: [PATCH 009/176] hostfs: fix UML crash: remove f_spare from hostfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 365b1818 ("add f_flags to struct statfs(64)") resized f_spare within struct statfs which caused a UML crash. There is no need to copy f_spare. Signed-off-by: Richard Weinberger Reported-by: Toralf Förster Tested-by: Toralf Förster Cc: Christoph Hellwig Cc: Al Viro Cc: Jeff Dike Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 3 +-- fs/hostfs/hostfs_kern.c | 2 +- fs/hostfs/hostfs_user.c | 9 ++------- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1487ee..bf15a43016b9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5f9ef8..cd7c93917cc7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa06ce2c..8d02683585e0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -364,8 +364,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } From 52c5171214ff3327961d0ce0db7e8d2ce55004fd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:21:19 -0700 Subject: [PATCH 010/176] kfifo: disable __kfifo_must_check_helper() This helper is wrong: it coerces signed values into unsigned ones, so code such as if (kfifo_alloc(...) < 0) { error } will fail to detect the error. So let's disable __kfifo_must_check_helper() for 2.6.36. 
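A standalone userspace sketch of the failure mode described above (the names below are stand-ins, not the kfifo API): once a negative error code passes through a helper that returns unsigned int, a "< 0" check can never fire.

  #include <stdio.h>

  /* Same shape as the disabled helper: the unsigned return type coerces
   * negative error codes into large positive values. */
  static inline unsigned int must_check_helper(unsigned int val)
  {
          return val;
  }

  /* Stand-in for an allocation that fails with -ENOMEM. */
  static int fake_fifo_alloc(void)
  {
          return -12;
  }

  int main(void)
  {
          /* An unsigned value is never < 0, so the error goes unnoticed;
           * many compilers even warn that this test is always false. */
          if (must_check_helper(fake_fifo_alloc()) < 0)
                  printf("error detected\n");
          else
                  printf("error silently missed: got %u\n",
                         must_check_helper(fake_fifo_alloc()));
          return 0;
  }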
Cc: Randy Dunlap Cc: Stefani Seibold Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 62dbee554f60..c238ad2f82ea 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -171,11 +171,8 @@ struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void); } -static inline unsigned int __must_check -__kfifo_must_check_helper(unsigned int val) -{ - return val; -} +/* __kfifo_must_check_helper() is temporarily disabled because it was faulty */ +#define __kfifo_must_check_helper(x) (x) /** * kfifo_initialized - Check if the fifo is initialized From 8474b591faf3bb0a1e08a60d21d6baac498f15e4 Mon Sep 17 00:00:00 2001 From: Masanori ITOH Date: Tue, 26 Oct 2010 14:21:20 -0700 Subject: [PATCH 011/176] percpu: fix list_head init bug in __percpu_counter_init() WARNING: at lib/list_debug.c:26 __list_add+0x3f/0x81() Hardware name: Express5800/B120a [N8400-085] list_add corruption. next->prev should be prev (ffffffff81a7ea00), but was dead000000200200. (next=ffff88080b872d58). Modules linked in: aoe ipt_MASQUERADE iptable_nat nf_nat autofs4 sunrpc bridge 8021q garp stp llc ipv6 cpufreq_ondemand acpi_cpufreq freq_table dm_round_robin dm_multipath kvm_intel kvm uinput lpfc scsi_transport_fc igb ioatdma scsi_tgt i2c_i801 i2c_core dca iTCO_wdt iTCO_vendor_support pcspkr shpchp megaraid_sas [last unloaded: aoe] Pid: 54, comm: events/3 Tainted: G W 2.6.34-vanilla1 #1 Call Trace: [] warn_slowpath_common+0x7c/0x94 [] warn_slowpath_fmt+0x41/0x43 [] __list_add+0x3f/0x81 [] __percpu_counter_init+0x59/0x6b [] bdi_init+0x118/0x17e [] blk_alloc_queue_node+0x79/0x143 [] blk_alloc_queue+0x11/0x13 [] aoeblk_gdalloc+0x8e/0x1c9 [aoe] [] aoecmd_sleepwork+0x25/0xa8 [aoe] [] worker_thread+0x1a9/0x237 [] ? aoecmd_sleepwork+0x0/0xa8 [aoe] [] ? autoremove_wake_function+0x0/0x39 [] ? worker_thread+0x0/0x237 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 It's because there is no initialization code for a list_head contained in the struct backing_dev_info under CONFIG_HOTPLUG_CPU, and the bug comes up when block device drivers calling blk_alloc_queue() are used. In case of me, I got them by using aoe. 
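For background, the warning above comes from the list debugging checks. The userspace miniature below (simplified, not the kernel's list.h) shows the kind of consistency test that trips when a list_head still holds stale or poisoned pointers, and why putting it into a known state with INIT_LIST_HEAD() first, as the one-line fix below does for fbc->list, keeps the check happy.

  #include <stdio.h>

  struct list_head { struct list_head *next, *prev; };

  /* Mirrors the idea of the kernel's LIST_POISON2: a marker address left
   * behind in a deleted node. */
  #define LIST_POISON2 ((struct list_head *)0x00200200UL)

  static void INIT_LIST_HEAD(struct list_head *list)
  {
          list->next = list;
          list->prev = list;
  }

  /* The CONFIG_DEBUG_LIST-style sanity check from lib/list_debug.c,
   * reduced to its core. */
  static int debug_list_add(struct list_head *new, struct list_head *head)
  {
          if (head->next->prev != head) {
                  printf("list_add corruption: next->prev should be %p, was %p\n",
                         (void *)head, (void *)head->next->prev);
                  return -1;
          }
          new->next = head->next;
          new->prev = head;
          head->next->prev = new;
          head->next = new;
          return 0;
  }

  int main(void)
  {
          struct list_head node, entry, head;

          /* Simulate a list left in an inconsistent state: the head points
           * at a node whose ->prev still carries a poison value. */
          node.prev = LIST_POISON2;
          node.next = &head;
          head.next = &node;
          head.prev = &node;
          debug_list_add(&entry, &head);          /* reports corruption */

          INIT_LIST_HEAD(&head);                  /* known-good state */
          if (!debug_list_add(&entry, &head))
                  printf("list_add fine after INIT_LIST_HEAD\n");
          return 0;
  }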
Signed-off-by: Masanori Itoh Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index ec9048e74f44..209448e1d2b9 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -76,6 +76,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, if (!fbc->counters) return -ENOMEM; #ifdef CONFIG_HOTPLUG_CPU + INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); list_add(&fbc->list, &percpu_counters); mutex_unlock(&percpu_counters_lock); From dbec921370ea049eccd38e34e067373fb3bd9b01 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:21 -0700 Subject: [PATCH 012/176] um: fix IRQ flag handling naming Commit df9ee292 ("Fix IRQ flag handling naming") changed the IRQ flag handling naming scheme and broke UML: In file included from arch/um/include/asm/fixmap.h:5, from arch/um/include/shared/um_uaccess.h:10, from arch/um/include/asm/uaccess.h:41, from arch/um/include/asm/thread_info.h:13, from include/linux/thread_info.h:56, from include/linux/preempt.h:9, from include/linux/spinlock.h:50, from include/linux/seqlock.h:29, from include/linux/time.h:8, from include/linux/stat.h:60, from include/linux/module.h:10, from init/main.c:13: arch/um/include/asm/system.h:11:1: warning: "local_save_flags" redefined This patch brings the new scheme to UML and makes it work again. Signed-off-by: Richard Weinberger Acked-by: David Howells Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/asm/system.h | 43 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h index 93af1cf0907d..68a90ecd1450 100644 --- a/arch/um/include/asm/system.h +++ b/arch/um/include/asm/system.h @@ -8,23 +8,38 @@ extern int set_signals(int enable); extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ - (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ - set_signals(flags); } while(0) +static inline unsigned long arch_local_save_flags(void) +{ + return get_signals(); +} -#define local_irq_save(flags) do { local_save_flags(flags); \ - local_irq_disable(); } while(0) +static inline void arch_local_irq_restore(unsigned long flags) +{ + set_signals(flags); +} -#define local_irq_enable() unblock_signals() -#define local_irq_disable() block_signals() +static inline void arch_local_irq_enable(void) +{ + unblock_signals(); +} -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - (flags == 0); \ -}) +static inline void arch_local_irq_disable(void) +{ + block_signals(); +} + +static inline unsigned long arch_local_irq_save(void) +{ + unsigned long flags; + flags = arch_local_save_flags(); + arch_local_irq_disable(); + return flags; +} + +static inline bool arch_irqs_disabled(void) +{ + return arch_local_save_flags() == 0; +} extern void *_switch_to(void *prev, void *next, void *last); #define switch_to(prev, next, last) prev = _switch_to(prev, next, last) From a4f7326da2bfe788b85509ec0c261e64596cdde4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 26 Oct 2010 14:21:21 -0700 Subject: [PATCH 013/176] vmcore: it is not experimental any more We use vmcore in our production kernel for a long time, it is pretty stable now. 
So I don't think we need to mark it as experimental any more. Signed-off-by: WANG Cong Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 50f8f0600f06..6a0068841d96 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -33,8 +33,8 @@ config PROC_KCORE depends on PROC_FS && MMU config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. From 0f4d208f1975f16f269134cee5f44c1f048581da Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Tue, 26 Oct 2010 14:21:22 -0700 Subject: [PATCH 014/176] Documentation/filesystems/proc.txt: improve smaps field documentation Signed-off-by: Matt Mackall Cc: Nikanth Karthikesan Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a6aca8740883..a563b74c7aef 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -374,13 +374,13 @@ Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB -The first of these lines shows the same information as is displayed for the -mapping in /proc/PID/maps. The remaining lines show the size of the mapping, -the amount of the mapping that is currently resident in RAM, the "proportional -set size” (divide each shared page by the number of processes sharing it), the -number of clean and dirty shared pages in the mapping, and the number of clean -and dirty private pages in the mapping. The "Referenced" indicates the amount -of memory currently marked as referenced or accessed. +The first of these lines shows the same information as is displayed for the +mapping in /proc/PID/maps. The remaining lines show the size of the mapping +(size), the amount of the mapping that is currently resident in RAM (RSS), the +process' proportional share of this mapping (PSS), the number of clean and +dirty shared pages in the mapping, and the number of clean and dirty private +pages in the mapping. The "Referenced" indicates the amount of memory +currently marked as referenced or accessed. This file is only present if the CONFIG_MMU kernel configuration option is enabled. From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: [PATCH 015/176] oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. 
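A rough userspace model of the bookkeeping (illustrative only; the real code manipulates mm->oom_disable_count under task_lock(), as the hunks below show): the counter turns "does any thread sharing this mm have OOM_SCORE_ADJ_MIN?" into a single atomic read instead of a tasklist walk.

  #include <stdio.h>
  #include <stdatomic.h>
  #include <stdbool.h>

  /* One counter per shared address space, as in struct mm_struct. */
  struct mm_model {
          atomic_int oom_disable_count;   /* sharers with OOM_SCORE_ADJ_MIN */
  };

  /* Called when a sharer's oom_score_adj moves into or out of OOM_SCORE_ADJ_MIN. */
  static void mark_unkillable(struct mm_model *mm) { atomic_fetch_add(&mm->oom_disable_count, 1); }
  static void mark_killable(struct mm_model *mm)   { atomic_fetch_sub(&mm->oom_disable_count, 1); }

  /* The O(1) question the OOM killer wants answered. */
  static bool mm_is_killable(struct mm_model *mm)
  {
          return atomic_load(&mm->oom_disable_count) == 0;
  }

  int main(void)
  {
          struct mm_model mm;

          atomic_init(&mm.oom_disable_count, 0);
          printf("killable: %d\n", mm_is_killable(&mm));   /* 1 */
          mark_unkillable(&mm);   /* one sharer becomes OOM_SCORE_ADJ_MIN */
          printf("killable: %d\n", mm_is_killable(&mm));   /* 0 */
          mark_killable(&mm);
          printf("killable: %d\n", mm_is_killable(&mm));   /* 1 */
          return 0;
  }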
[rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ fs/proc/base.c | 30 ++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f936858..3aa75b8888a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3fe..6e50c8e65513 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } task->signal->oom_score_adj = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is @@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); return count; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cb57d657ce4d..bb7288a782fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,8 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif + /* How many tasks sharing this mm are OOM_DISABLE */ + atomic_t oom_disable_count; }; /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fde..894179a32ec1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..e87aaaaf5131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } From e18641e19a9204f241f04a5ac700168dcd18de4f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: [PATCH 016/176] oom: avoid killing a task if a thread sharing its mm cannot be killed The oom killer's goal is to kill a memory-hogging task so that it may exit, free its memory, and allow the current context to allocate the memory that triggered it in the first place. Thus, killing a task is pointless if other threads sharing its mm cannot be killed because of its /proc/pid/oom_adj or /proc/pid/oom_score_adj value. This patch checks whether any other thread sharing p->mm has an oom_score_adj of OOM_SCORE_ADJ_MIN. If so, the thread cannot be killed and oom_badness(p) returns 0, meaning it's unkillable. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4029583a1024..4395f371bc7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, return 0; /* - * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't - * need to be executed for something that cannot be killed. 
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } @@ -680,7 +681,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - (current->signal->oom_adj != OOM_DISABLE)) { + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to From 1e99bad0d9c12a4aaa60cd812c84ef152564bcf5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:24 -0700 Subject: [PATCH 017/176] oom: kill all threads sharing oom killed task's mm It's necessary to kill all threads that share an oom killed task's mm if the goal is to lead to future memory freeing. This patch reintroduces the code removed in 8c5cd6f3 (oom: oom_kill doesn't kill vfork parent (or child)) since it is obsoleted. It's now guaranteed that any task passed to oom_kill_task() does not share an mm with any thread that is unkillable. Thus, we're safe to issue a SIGKILL to any thread sharing the same mm. This is especially necessary to solve an mm->mmap_sem livelock issue whereas an oom killed thread must acquire the lock in the exit path while another thread is holding it in the page allocator while trying to allocate memory itself (and will preempt the oom killer since a task was already killed). Since tasks with pending fatal signals are now granted access to memory reserves, the thread holding the lock may quickly allocate and release the lock so that the oom killed task may exit. This mainly is for threads that are cloned with CLONE_VM but not CLONE_THREAD, so they are in a different thread group. Non-NPTL threads exist in the wild and this change is necessary to prevent the livelock in such cases. We care more about preventing the livelock than incurring the additional tasklist in the oom killer when a task has been killed. Systems that are sufficiently large to not want the tasklist scan in the oom killer in the first place already have the option of enabling /proc/sys/vm/oom_kill_allocating_task, which was designed specifically for that purpose. This code had existed in the oom killer for over eight years dating back to the 2.4 kernel. 
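The "CLONE_VM but not CLONE_THREAD" case is easy to reproduce from userspace; a minimal sketch (illustrative only) of how such a sibling task, sharing the mm but living in its own thread group, is created:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static int shared_value;                 /* lives in the shared mm */
  static char child_stack[64 * 1024];

  static int child_fn(void *arg)
  {
          shared_value = 42;               /* parent sees this: same mm */
          printf("child : getpid() = %d (its own thread group)\n", getpid());
          return 0;
  }

  int main(void)
  {
          /* CLONE_VM without CLONE_THREAD: shares the address space, but is
           * not part of the caller's thread group, i.e. the non-NPTL thread
           * case the changelog describes. */
          pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                            CLONE_VM | SIGCHLD, NULL);
          if (pid < 0) {
                  perror("clone");
                  return 1;
          }
          waitpid(pid, NULL, 0);
          printf("parent: getpid() = %d, shared_value = %d\n",
                 getpid(), shared_value);
          return 0;
  }

Killing only one of these two tasks frees no memory, which is why the hunk below signals every task that shares the victim's mm but sits in another thread group.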
[akpm@linux-foundation.org: add nice comment] Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4395f371bc7c..7dcca55ede7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, #define K(x) ((x) << (PAGE_SHIFT-10)) static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { + struct task_struct *q; + struct mm_struct *mm; + p = find_lock_task_mm(p); if (!p) return 1; + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), K(get_mm_counter(p->mm, MM_ANONPAGES)), K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); + /* + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. + */ + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); + force_sig(SIGKILL, q); + } set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); From 723548bff1dde9ab6bdb23f4bb92277c4da49473 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:25 -0700 Subject: [PATCH 018/176] oom: rewrite error handling for oom_adj and oom_score_adj tunables It's better to use proper error handling in oom_adjust_write() and oom_score_adj_write() instead of duplicating the locking order on various exit paths. 
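For reference, a minimal standalone sketch of the pattern being introduced (stand-in helpers, not the procfs code): each failure jumps to the label that undoes exactly what has already been taken, so the unlock/put sequence exists once instead of being repeated on every exit path.

  #include <stdio.h>
  #include <errno.h>

  static int  take_ref(void)   { puts("get ref");  return 0; }
  static void drop_ref(void)   { puts("put ref"); }
  static int  take_lock(void)  { puts("lock");     return 0; }
  static void drop_lock(void)  { puts("unlock"); }

  static int update_value(int new_value)
  {
          int err = 0;

          if (take_ref()) {
                  err = -ESRCH;
                  goto out;
          }
          if (take_lock()) {
                  err = -ESRCH;
                  goto err_ref;
          }
          if (new_value < 0) {
                  err = -EINVAL;
                  goto err_lock;
          }
          printf("value set to %d\n", new_value);

  err_lock:
          drop_lock();           /* success path falls through here too */
  err_ref:
          drop_ref();
  out:
          return err;
  }

  int main(void)
  {
          printf("ok : %d\n", update_value(1));
          printf("bad: %d\n", update_value(-1));
          return 0;
  }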
Suggested-by: Andrew Morton Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 83 +++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 6e50c8e65513..34d11ac31f2e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1023,36 +1023,39 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_adjust); if (err) - return -EINVAL; + goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) - return -EINVAL; + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_adjust != task->signal->oom_adj) { @@ -1080,11 +1083,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - - return count; +out: + return err < 0 ? 
err : count; } static const struct file_operations proc_oom_adjust_operations = { @@ -1125,36 +1131,39 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); if (err) - return -EINVAL; + goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || - oom_score_adj > OOM_SCORE_ADJ_MAX) - return -EINVAL; + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_score_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) @@ -1172,10 +1181,14 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - return count; +out: + return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { From d19d5476f4b9f91d2de92b91588bb118beba6c0d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: [PATCH 019/176] oom: fix locking for oom_adj and oom_score_adj The locking order in oom_adjust_write() and oom_score_adj_write() for task->alloc_lock and task->sighand->siglock is reversed, and lockdep notices that irqs could encounter an ABBA scenario. This fixes the locking order so that we always take task_lock(task) prior to lock_task_sighand(task). 
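A minimal sketch of the rule being enforced (pthread mutexes standing in for task_lock() and lock_task_sighand(); not the kernel code): as long as every path acquires the two locks in the same order, contending paths can block but can never deadlock, because no ABBA cycle is possible.

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; /* task_lock */
  static pthread_mutex_t siglock    = PTHREAD_MUTEX_INITIALIZER; /* siglock   */

  static void oom_adj_write_path(void)
  {
          pthread_mutex_lock(&alloc_lock);        /* always first  */
          pthread_mutex_lock(&siglock);           /* always second */
          puts("oom_adj path: both locks held");
          pthread_mutex_unlock(&siglock);
          pthread_mutex_unlock(&alloc_lock);
  }

  static void oom_score_adj_write_path(void)
  {
          pthread_mutex_lock(&alloc_lock);        /* same order here */
          pthread_mutex_lock(&siglock);
          puts("oom_score_adj path: both locks held");
          pthread_mutex_unlock(&siglock);
          pthread_mutex_unlock(&alloc_lock);
  }

  int main(void)
  {
          oom_adj_write_path();
          oom_score_adj_write_path();
          return 0;
  }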
Signed-off-by: David Rientjes Reported-by: Andrew Morton Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 34d11ac31f2e..53dc8ad40ae6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1042,15 +1042,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_struct; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } task_lock(task); if (!task->mm) { @@ -1058,6 +1049,16 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_task_lock; } + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + if (oom_adjust != task->signal->oom_adj) { if (oom_adjust == OOM_DISABLE) atomic_inc(&task->mm->oom_disable_count); @@ -1083,11 +1084,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; @@ -1150,21 +1150,24 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_struct; - } - if (oom_score_adj < task->signal->oom_score_adj && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } task_lock(task); if (!task->mm) { err = -EINVAL; goto err_task_lock; } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_inc(&task->mm->oom_disable_count); @@ -1181,11 +1184,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: [PATCH 020/176] writeback: remove nonblocking/encountered_congestion references This removes more dead code that was somehow missed by commit 0d99519efef (writeback: remove unused nonblocking and congestion checks). There are no behavior change except for the removal of two entries from one of the ext4 tracing interface. The nonblocking checks in ->writepages are no longer used because the flusher now prefer to block on get_request_wait() than to skip inodes on IO congestion. The latter will lead to more seeky IO. The nonblocking checks in ->writepage are no longer used because it's redundant with the WB_SYNC_NONE check. 
We no long set ->nonblocking in VM page out and page migration, because a) it's effectively redundant with WB_SYNC_NONE in current code b) it's old semantic of "Don't get stuck on request queues" is mis-behavior: that would skip some dirty inodes on congestion and page out others, which is unfair in terms of LRU age. Inspired by Christoph Hellwig. Thanks! Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: David Howells Cc: Sage Weil Cc: Steve French Cc: Chris Mason Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 19 +------------------ fs/buffer.c | 2 +- fs/ceph/addr.c | 9 --------- fs/cifs/file.c | 10 ---------- fs/gfs2/meta_io.c | 2 +- fs/nfs/write.c | 4 +--- fs/reiserfs/inode.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/trace/events/ext4.h | 8 +++++--- include/trace/events/writeback.h | 2 -- mm/migrate.c | 1 - mm/vmscan.c | 1 - 12 files changed, 11 insertions(+), 52 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 722743b152d8..15690bb1d3b5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -438,7 +438,6 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; @@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) - wbc->encountered_congestion = 1; _leave(" = 0"); return 0; @@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; @@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping, wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - break; - } - cond_resched(); } while (index < end && wbc->nr_to_write > 0); @@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - _leave(" = 0 [congest]"); - return 0; - } - if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && - !(wbc->nonblocking && wbc->encountered_congestion)) + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); mapping->writeback_index = next; diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f77..ec21a92e08b4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1706,7 +1706,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * and kswapd activity, but those code paths have their own * higher-level throttling. 
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce3230..e9c874abc9e1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc; pgoff_t index, start, end; @@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping, pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -885,7 +877,6 @@ static int ceph_writepages_start(struct address_space *mapping, rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c81e7b14d53..45af003865d2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned int bytes_to_write; unsigned int bytes_written; struct cifs_sb_info *cifs_sb; @@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping, int scanned = 0; int xid, long_op; - /* - * BB: Is this meaningful for a non-block-device file system? - * If it is, we should test it again after we do I/O - */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - cifs_sb = CIFS_SB(mapping->host->i_sb); /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f3b071f921aa..939739c7b3f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb * activity, but those code paths have their own higher-level * throttling. 
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 605e292501f4..4c14c17a5276 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -290,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - ret = nfs_page_async_flush(pgio, page, - wbc->sync_mode == WB_SYNC_NONE || - wbc->nonblocking != 0); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d66..c1f93896cb53 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2438,7 +2438,7 @@ static int reiserfs_write_full_page(struct page *page, /* from this point on, we know the buffer is mapped to a * real block and not a direct item */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (!trylock_buffer(bh)) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b552f816de15..c9af48fffcd7 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1139,8 +1139,7 @@ xfs_vm_writepage( type = IO_DELAY; flags = BMAPI_ALLOCATE; - if (wbc->sync_mode == WB_SYNC_NONE && - wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) flags |= BMAPI_TRYLOCK; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6bcb00645de4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld " + "range_start %llu range_end %llu " + "for_kupdate %d for_reclaim %d " + "range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, __entry->nonblocking, + __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long) __entry->writeback_index) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index f345f66ae9d1..0bb01ab2e984 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) - __field(int, nonblocking) - __field(int, encountered_congestion) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) diff --git a/mm/migrate.c b/mm/migrate.c index f8c9bccf2520..d917ac3207f5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) 
.nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; diff --git a/mm/vmscan.c b/mm/vmscan.c index b94c9464f262..6cbc1aac23ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,7 +376,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; From f19e77a3dc884510dba740caa6dee126b7d40156 Mon Sep 17 00:00:00 2001 From: zeal Date: Tue, 26 Oct 2010 14:21:27 -0700 Subject: [PATCH 021/176] include/linux/pageblock-flags.h: fix set_pageblock_flags() macro definiton The presently-unused macro was missing one parameter. Signed-off-by: zeal Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e8c06122be36..19ef95d293ae 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -67,7 +67,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, #define get_pageblock_flags(page) \ get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) -#define set_pageblock_flags(page) \ - set_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) +#define set_pageblock_flags(page, flags) \ + set_pageblock_flags_group(page, flags, \ + 0, NR_PAGEBLOCK_BITS-1) #endif /* PAGEBLOCK_FLAGS_H */ From e4455abb50a19562dbfdc51a8424fda9b588bd6d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 26 Oct 2010 14:21:28 -0700 Subject: [PATCH 022/176] mm: only build per-node scan_unevictable functions when NUMA is enabled Non-NUMA systems do never create these files anyway, since they are only created by driver subsystem when NUMA is configured. [akpm@linux-foundation.org: cleanup] Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: KOSAKI Motohiro Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 10 ++++++++++ mm/vmscan.c | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7cdd63366f88..eba53e71d2cc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -271,8 +271,18 @@ extern void scan_mapping_unevictable_pages(struct address_space *); extern unsigned long scan_unevictable_pages; extern int scan_unevictable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#ifdef CONFIG_NUMA extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); +#else +static inline int scan_unevictable_register_node(struct node *node) +{ + return 0; +} +static inline void scan_unevictable_unregister_node(struct node *node) +{ +} +#endif extern int kswapd_run(int nid); extern void kswapd_stop(int nid); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6cbc1aac23ae..f5871ee50000 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2986,6 +2986,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. 
@@ -3032,4 +3033,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif From cf608ac19c95804dc2df43b1f4f9e068aa9034ab Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Oct 2010 14:21:29 -0700 Subject: [PATCH 023/176] mm: compaction: fix COMPACTPAGEFAILED counting Presently update_nr_listpages() doesn't have a role. That's because lists passed is always empty just after calling migrate_pages. The migrate_pages cleans up page list which have failed to migrate before returning by aaa994b3. [PATCH] page migration: handle freeing of pages in migrate_pages() Do not leave pages on the lists passed to migrate_pages(). Seems that we will not need any postprocessing of pages. This will simplify the handling of pages by the callers of migrate_pages(). At that time, we thought we don't need any postprocessing of pages. But the situation is changed. The compaction need to know the number of failed to migrate for COMPACTPAGEFAILED stat This patch makes new rule for caller of migrate_pages to call putback_lru_pages. So caller need to clean up the lists so it has a chance to postprocess the pages. [suggested by Christoph Lameter] Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Andi Kleen Reviewed-by: Mel Gorman Reviewed-by: Wu Fengguang Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + mm/memory_hotplug.c | 2 ++ mm/mempolicy.c | 10 ++++++++-- mm/migrate.c | 12 +++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44a8cefeae6e..124324134ff6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1292,6 +1292,7 @@ static int soft_offline_huge_page(struct page *page, int flags) list_add(&hpage->lru, &pagelist); ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 06662c5a3e86..4821338b4e4b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -731,6 +731,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) + putback_lru_pages(&source); out: return ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5dd8a2..21243b2b7b07 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -931,8 +931,11 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -1147,9 +1150,12 @@ static long do_mbind(unsigned long start, unsigned long len, err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, 0); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index d917ac3207f5..35e454189966 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -883,8 +883,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * 
* The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list. * * Return: Number of pages not migrated or error code. */ @@ -931,8 +932,6 @@ int migrate_pages(struct list_head *from, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); - if (rc) return rc; @@ -1087,9 +1086,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm, } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; From 4b20477f588055fbe87e69435d3c2344d250f0d7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:21:29 -0700 Subject: [PATCH 024/176] memory hotplug: fix notifier's return value check Even if notifier cannot find any pages, it doesn't mean no pages are available...And, if there are no notifiers registered, this condition will be always true and memory hotplug will show -EBUSY. This is a bug but not critical. In most case, a pageblock which will be offlined is MIGRATE_MOVABLE This "notifier" is called only when the pageblock is _not_ MIGRATE_MOVABLE. But if not MIGRATE_MOVABLE, it's common case that memory hotplug will fail. So, no one notice this bug. Signed-off-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Wu Fengguang Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5faf876cfc3a..099790052bfe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5336,7 +5336,7 @@ int set_migratetype_isolate(struct page *page) */ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret || !arg.pages_found) + if (notifier_ret) goto out; for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { From 49ac825587f33afec8841b7fab2eb4db775014e6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:21:30 -0700 Subject: [PATCH 025/176] memory hotplug: unify is_removable and offline detection code Now, sysfs interface of memory hotplug shows whether the section is removable or not. But it checks only migrateype of pages and doesn't check details of cluster of pages. Next, memory hotplug's set_migratetype_isolate() has the same kind of check, too. This patch adds the function __count_unmovable_pages() and makes above 2 checks to use the same logic. Then, is_removable and hotremove code uses the same logic. No changes in the hotremove logic itself. TODO: need to find a way to check RECLAMABLE. But, considering bit, calling shrink_slab() against a range before starting memory hotremove sounds better. If so, this patch's logic doesn't need to be changed. 
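Schematically, the unification amounts to both call sites funnelling into one
predicate, as in the sketch below (simplified; the helper and caller names
follow the diff later in this patch, and isolate_allows() is only an
illustrative stand-in for the check done inside set_migratetype_isolate()):

	/*
	 * Sketch of the shared logic, not the literal diff: the predicate walks
	 * one pageblock and succeeds when at most 'count' pages are neither
	 * free, buddy, nor on the LRU.
	 */
	bool is_pageblock_removable_nolock(struct page *page)
	{
		/* sysfs "removable" check: tolerate no immobile pages at all */
		return __count_immobile_pages(page_zone(page), page, 0);
	}

	static bool isolate_allows(struct zone *zone, struct page *page,
				   struct memory_isolate_notify *arg)
	{
		/* hot-remove path: pages a driver reported via the isolation
		 * notifier may legitimately be immobile, so allow that many */
		return __count_immobile_pages(zone, page, arg->pages_found);
	}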
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Michal Hocko Cc: Wu Fengguang Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++ mm/memory_hotplug.c | 17 +------ mm/page_alloc.c | 87 ++++++++++++++++++++++++++-------- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 864035fb8f8a..4307231bd22f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -70,6 +70,10 @@ extern void online_page(struct page *page); extern int online_pages(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern bool is_pageblock_removable_nolock(struct page *page); +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4821338b4e4b..b0dc65452973 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 099790052bfe..6a683f819439 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5297,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. */ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. 
PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - struct page *curr_page; - unsigned long flags, pfn, iter; - unsigned long immobile = 0; + unsigned long flags, pfn; struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; @@ -5312,11 +5365,6 @@ int set_migratetype_isolate(struct page *page) zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || - zone_idx == ZONE_MOVABLE) { - ret = 0; - goto out; - } pfn = page_to_pfn(page); arg.start_pfn = pfn; @@ -5338,21 +5386,18 @@ int set_migratetype_isolate(struct page *page) notifier_ret = notifier_to_errno(notifier_ret); if (notifier_ret) goto out; - - for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { - if (!pfn_valid_within(pfn)) - continue; - - curr_page = pfn_to_page(iter); - if (!page_count(curr_page) || PageLRU(curr_page)) - continue; - - immobile++; - } - - if (arg.pages_found == immobile) + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) ret = 0; + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + out: if (!ret) { set_pageblock_migratetype(page, MIGRATE_ISOLATE); From 74e3f3c3391d81a959f58a1191a560703a4415b4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Oct 2010 14:21:31 -0700 Subject: [PATCH 026/176] vmscan: prevent background aging of anon page in no swap system Ying Han reported that backing aging of anon pages in no swap system causes unnecessary TLB flush. When I sent a patch(69c8548175), I wanted this patch but Rik pointed out and allowed aging of anon pages to give a chance to promote from inactive to active LRU. It has a two problem. 1) non-swap system Never make sense to age anon pages. 2) swap configured but still doesn't swapon It doesn't make sense to age anon pages until swap-on time. But it's arguable. If we have aged anon pages by swapon, VM have moved anon pages from active to inactive. And in the time swapon by admin, the VM can't reclaim hot pages so we can protect hot pages swapout. But let's think about it. When does swap-on happen? It depends on admin. we can't expect it. Nonetheless, we have done aging of anon pages to protect hot pages swapout. It means we lost run time overhead when below high watermark but gain hot page swap-[in/out] overhead when VM decide swapout. Is it true? Let's think more detail. We don't promote anon pages in case of non-swap system. So even though VM does aging of anon pages, the pages would be in inactive LRU for a long time. It means many of pages in there would mark access bit again. So access bit hot/code separation would be pointless. This patch prevents unnecessary anon pages demotion in not-yet-swapon and non-configured swap system. Even, in non-configuared swap system inactive_anon_is_low can be compiled out. It could make side effect that hot anon pages could swap out when admin does swap on. But I think sooner or later it would be steady state. So it's not a big problem. 
We could lose someting but gain more thing(TLB flush and unnecessary function call to demote anon pages). Signed-off-by: Ying Han Signed-off-by: Minchan Kim Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f5871ee50000..0c33a0997907 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1505,6 +1505,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1530,12 +1531,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1781,7 +1796,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); From 0def08e3acc2c9c934e4671487029aed52202d42 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Oct 2010 14:21:32 -0700 Subject: [PATCH 027/176] mm/mempolicy.c: check return code of check_range Function check_range may return ERR_PTR(...). Check for it. Signed-off-by: Vasiliy Kulikov Acked-by: David Rientjes Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 21243b2b7b07..81a127643aea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -924,12 +924,15 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); From f629d1c9bd0dbc44a6c4f9a4a67d1646c42bfc6f Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:33 -0700 Subject: [PATCH 028/176] mm: add account_page_writeback() To help developers and applications gain visibility into writeback behaviour this patch adds two counters to /proc/vmstat. # grep nr_dirtied /proc/vmstat nr_dirtied 3747 # grep nr_written /proc/vmstat nr_written 3618 These entries allow user apps to understand writeback behaviour over time and learn how it is impacting their performance. Currently there is no way to inspect dirty and writeback speed over time. It's not possible for nr_dirty/nr_writeback. 
These entries are necessary to give visibility into writeback behaviour. We have /proc/diskstats which lets us understand the io in the block layer. We have blktrace for more in depth understanding. We have e2fsprogs and debugsfs to give insight into the file systems behaviour, but we don't offer our users the ability understand what writeback is doing. There is no way to know how active it is over the whole system, if it's falling behind or to quantify it's efforts. With these values exported users can easily see how much data applications are sending through writeback and also at what rates writeback is processing this data. Comparing the rates of change between the two allow developers to see when writeback is not able to keep up with incoming traffic and the rate of dirty memory being sent to the IO back end. This allows folks to understand their io workloads and track kernel issues. Non kernel engineers at Google often use these counters to solve puzzling performance problems. Patch #4 adds a pernode vmstat file with nr_dirtied and nr_written Patch #5 add writeback thresholds to /proc/vmstat Currently these values are in debugfs. But they should be promoted to /proc since they are useful for developers who are writing databases and file servers and are not debugging the kernel. The output is as below: # grep threshold /proc/vmstat nr_pages_dirty_threshold 409111 nr_pages_dirty_background_threshold 818223 This patch: This allows code outside of the mm core to safely manipulate page writeback state and not worry about the other accounting. Not using these routines means that some code will lose track of the accounting and we get bugs. Modify nilfs2 to use interface. Signed-off-by: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Wu Fengguang Cc: KONISHI Ryusuke Cc: Jiro SEKIBA Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 2 +- include/linux/mm.h | 1 + mm/page-writeback.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index d926af626177..687d090cea34 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) kunmap_atomic(kaddr, KM_USER0); if (!TestSetPageWriteback(clone_page)) - inc_zone_page_state(clone_page, NR_WRITEBACK); + account_page_writeback(clone_page); unlock_page(clone_page); return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index a4c66846fb8f..c36297faf7cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -868,6 +868,7 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e3bccac1f025..94159819a651 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1128,6 +1128,17 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. 
+ */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); +} +EXPORT_SYMBOL(account_page_writeback); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1366,7 +1377,7 @@ int test_set_page_writeback(struct page *page) ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } From ea941f0e2a8c02ae876cd73deb4e1557248f258c Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:35 -0700 Subject: [PATCH 029/176] writeback: add nr_dirtied and nr_written to /proc/vmstat To help developers and applications gain visibility into writeback behaviour adding two entries to vm_stat_items and /proc/vmstat. This will allow us to track the "written" and "dirtied" counts. # grep nr_dirtied /proc/vmstat nr_dirtied 3747 # grep nr_written /proc/vmstat nr_written 3618 Signed-off-by: Michael Rubin Reviewed-by: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/page-writeback.c | 2 ++ mm/vmstat.c | 3 +++ 3 files changed, 7 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3984c4eb41fd..c3c17fb675ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -104,6 +104,8 @@ enum zone_stat_item { NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ + NR_DIRTIED, /* page dirtyings since bootup */ + NR_WRITTEN, /* page writings since bootup */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94159819a651..4dd91f7fd39f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1121,6 +1121,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); @@ -1136,6 +1137,7 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e669aaa..44e7ac0fdb66 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -745,6 +745,9 @@ static const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "nr_shmem", + "nr_dirtied", + "nr_written", + #ifdef CONFIG_NUMA "numa_hit", "numa_miss", From 2ac390370aac4aaa49cab17f328b478cbd5b3d8d Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:35 -0700 Subject: [PATCH 030/176] writeback: add /sys/devices/system/node//vmstat For NUMA node systems it is important to have visibility in memory characteristics. Two of the /proc/vmstat values "nr_written" and "nr_dirtied" are added here. 
# cat /sys/devices/system/node/node20/vmstat nr_written 0 nr_dirtied 0 Signed-off-by: Michael Rubin Reviewed-by: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee53558b452f..ce012a9c6201 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,6 +160,18 @@ static ssize_t node_read_numastat(struct sys_device * dev, } static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); +static ssize_t node_read_vmstat(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + int nid = dev->id; + return sprintf(buf, + "nr_written %lu\n" + "nr_dirtied %lu\n", + node_page_state(nid, NR_WRITTEN), + node_page_state(nid, NR_DIRTIED)); +} +static SYSDEV_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); + static ssize_t node_read_distance(struct sys_device * dev, struct sysdev_attribute *attr, char * buf) { @@ -243,6 +255,7 @@ int register_node(struct node *node, int num, struct node *parent) sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); + sysdev_create_file(&node->sysdev, &attr_vmstat); scan_unevictable_register_node(node); @@ -267,6 +280,7 @@ void unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_meminfo); sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); + sysdev_remove_file(&node->sysdev, &attr_vmstat); scan_unevictable_unregister_node(node); hugetlb_unregister_node(node); /* no-op, if memoryless node */ From 79da826aee6a10902ef411bc65864bd02102fa83 Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:36 -0700 Subject: [PATCH 031/176] writeback: report dirty thresholds in /proc/vmstat The kernel already exposes the user desired thresholds in /proc/sys/vm with dirty_background_ratio and background_ratio. But the kernel may alter the number requested without giving the user any indication that is the case. Knowing the actual ratios the kernel is honoring can help app developers understand how their buffered IO will be sent to the disk. 
$ grep threshold /proc/vmstat nr_dirty_threshold 409111 nr_dirty_background_threshold 818223 Signed-off-by: Michael Rubin Cc: Wu Fengguang Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 44e7ac0fdb66..baa4ab387db7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -747,6 +748,8 @@ static const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_NUMA "numa_hit", @@ -907,36 +910,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) From bce54bbfde07e8b300f39dae14756c12a6ceca65 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 Oct 2010 14:21:37 -0700 Subject: [PATCH 032/176] mm: fix typo in mm.h when NODE_NOT_IN_PAGE_FLAGS NODE_NOT_IN_PAGE_FLAGS is defined in mm.h when the node information is not stored in the page flags bitmap. Unfortunately, there's a typo in one of the checks for it. This patch fixes it (s/NODE_NOT_IN_PAGEFLAGS/NODE_NOT_IN_PAGE_FLAGS/). Since this has been around for ages, I doubt it's been causing any serious problems. 
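For context, a minimal illustration of why such a misspelling compiles
silently (illustrative only; the branch bodies are simplified from what mm.h
actually defines): the preprocessor treats an unknown identifier in #ifdef as
simply undefined, so the #else branch is always taken and no diagnostic is
emitted.

	#define NODE_NOT_IN_PAGE_FLAGS		/* the name that is really defined */

	#ifdef NODE_NOT_IN_PAGEFLAGS		/* misspelled test: never true, no warning */
	#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)	/* intended branch, dead code */
	#else
	#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)	/* silently used instead */
	#endif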
Signed-off-by: Will Deacon Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c36297faf7cb..2862009f9573 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -497,8 +497,8 @@ static inline void set_compound_order(struct page *page, unsigned long order) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */ -#ifdef NODE_NOT_IN_PAGEFLAGS +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ SECTIONS_PGOFF : ZONES_PGOFF) From 66d9a986cddbbc2ea5db013e7999c621a956cc47 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Oct 2010 14:21:37 -0700 Subject: [PATCH 033/176] vmscan: delete dead code `priority' cannot be negative here. And the comment is obsolete. Signed-off-by: Shaohua Li Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c33a0997907..28b0521e4bfd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1956,16 +1956,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; - delayacct_freepages_end(); put_mems_allowed(); From e11da5b4fdf01d71d73c21cb92b00595b917d7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:40 -0700 Subject: [PATCH 034/176] tracing, vmscan: add trace events for LRU list shrinking There have been numerous reports of stalls that pointed at the problem being somewhere in the VM. There are multiple roots to the problems which means dealing with any of the root problems in isolation is tricky to justify on their own and they would still need integration testing. This patch series puts together two different patch sets which in combination should tackle some of the root causes of latency problems being reported. Patch 1 adds a tracepoint for shrink_inactive_list. For this series, the most important results is being able to calculate the scanning/reclaim ratio as a measure of the amount of work being done by page reclaim. Patch 2 accounts for time spent in congestion_wait. Patches 3-6 were originally developed by Kosaki Motohiro but reworked for this series. It has been noted that lumpy reclaim is far too aggressive and trashes the system somewhat. As SLUB uses high-order allocations, a large cost incurred by lumpy reclaim will be noticeable. It was also reported during transparent hugepage support testing that lumpy reclaim was trashing the system and these patches should mitigate that problem without disabling lumpy reclaim. Patch 7 adds wait_iff_congested() and replaces some callers of congestion_wait(). wait_iff_congested() only sleeps if there is a BDI that is currently congested. 
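In rough outline, the wait_iff_congested() change just described degenerates
to a cheap reschedule when nothing is congested and only falls back to the
full sleep otherwise. The sketch below is the idea only, not the series'
implementation; nr_congested_bdis is an assumed stand-in for whatever
bookkeeping the real function uses to know a BDI is congested.

	/* Sketch: sleep only if some backing device is actually congested. */
	long wait_iff_congested_sketch(int sync, long timeout)
	{
		/* nr_congested_bdis: assumed counter of currently congested BDIs */
		if (atomic_read(&nr_congested_bdis[sync]) == 0) {
			cond_resched();			/* nothing congested: don't stall */
			return 0;
		}
		return congestion_wait(sync, timeout);	/* genuine congestion: back off */
	}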
Patch 8 notes that any BDI being congested is not necessarily a problem because there could be multiple BDIs of varying speeds and numberous zones. It attempts to track when a zone being reclaimed contains many pages backed by a congested BDI and if so, reclaimers wait on the congestion queue. I ran a number of tests with monitoring on X86, X86-64 and PPC64. Each machine had 3G of RAM and the CPUs were X86: Intel P4 2-core X86-64: AMD Phenom 4-core PPC64: PPC970MP Each used a single disk and the onboard IO controller. Dirty ratio was left at 20. I'm just going to report for X86-64 and PPC64 in a vague attempt to keep this report short. Four kernels were tested each based on v2.6.36-rc4 traceonly-v2r2: Patches 1 and 2 to instrument vmscan reclaims and congestion_wait lowlumpy-v2r3: Patches 1-6 to test if lumpy reclaim is better waitcongest-v2r3: Patches 1-7 to only wait on congestion waitwriteback-v2r4: Patches 1-8 to detect when a zone is congested nocongest-v1r5: Patches 1-3 for testing wait_iff_congestion nodirect-v1r5: Patches 1-10 to disable filesystem writeback for better IO The tests run were as follows kernbench compile-based benchmark. Smoke test performance sysbench OLTP read-only benchmark. Will be re-run in the future as read-write micro-mapped-file-stream This is a micro-benchmark from Johannes Weiner that accesses a large sparse-file through mmap(). It was configured to run in only single-CPU mode but can be indicative of how well page reclaim identifies suitable pages. stress-highalloc Tries to allocate huge pages under heavy load. kernbench, iozone and sysbench did not report any performance regression on any machine. sysbench did pressure the system lightly and there was reclaim activity but there were no difference of major interest between the kernels. X86-64 micro-mapped-file-stream traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 pgalloc_dma 1639.00 ( 0.00%) 667.00 (-145.73%) 1167.00 ( -40.45%) 578.00 (-183.56%) pgalloc_dma32 2842410.00 ( 0.00%) 2842626.00 ( 0.01%) 2843043.00 ( 0.02%) 2843014.00 ( 0.02%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 729.00 ( 0.00%) 85.00 (-757.65%) 609.00 ( -19.70%) 125.00 (-483.20%) pgsteal_dma32 2338721.00 ( 0.00%) 2447354.00 ( 4.44%) 2429536.00 ( 3.74%) 2436772.00 ( 4.02%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 1469.00 ( 0.00%) 532.00 (-176.13%) 1078.00 ( -36.27%) 220.00 (-567.73%) pgscan_kswapd_dma32 4597713.00 ( 0.00%) 4503597.00 ( -2.09%) 4295673.00 ( -7.03%) 3891686.00 ( -18.14%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 71.00 ( 0.00%) 134.00 ( 47.01%) 243.00 ( 70.78%) 352.00 ( 79.83%) pgscan_direct_dma32 305820.00 ( 0.00%) 280204.00 ( -9.14%) 600518.00 ( 49.07%) 957485.00 ( 68.06%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 16296.00 ( 0.00%) 21254.00 ( 23.33%) 18447.00 ( 11.66%) 20067.00 ( 18.79%) allocstall 443.00 ( 0.00%) 273.00 ( -62.27%) 513.00 ( 13.65%) 1568.00 ( 71.75%) These are based on the raw figures taken from /proc/vmstat. It's a rough measure of reclaim activity. Note that allocstall counts are higher because we are entering direct reclaim more often as a result of not sleeping in congestion. In itself, it's not necessarily a bad thing. It's easier to get a view of what happened from the vmscan tracepoint report. 
FTrace Reclaim Statistics: vmscan traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 Direct reclaims 443 273 513 1568 Direct reclaim pages scanned 305968 280402 600825 957933 Direct reclaim pages reclaimed 43503 19005 30327 117191 Direct reclaim write file async I/O 0 0 0 0 Direct reclaim write anon async I/O 0 3 4 12 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 187649 132338 191695 267701 Kswapd wakeups 3 1 4 1 Kswapd pages scanned 4599269 4454162 4296815 3891906 Kswapd pages reclaimed 2295947 2428434 2399818 2319706 Kswapd reclaim write file async I/O 1 0 1 1 Kswapd reclaim write anon async I/O 59 187 41 222 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4.34 2.52 6.63 2.96 Time kswapd awake (seconds) 11.15 10.25 11.01 10.19 Total pages scanned 4905237 4734564 4897640 4849839 Total pages reclaimed 2339450 2447439 2430145 2436897 %age total pages scanned/reclaimed 47.69% 51.69% 49.62% 50.25% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 29.23% 19.02% 38.48% 20.25% Percentage Time kswapd Awake 78.58% 78.85% 76.83% 79.86% What is interesting here for nocongest in particular is that while direct reclaim scans more pages, the overall number of pages scanned remains the same and the ratio of pages scanned to pages reclaimed is more or less the same. In other words, while we are sleeping less, reclaim is not doing more work and as direct reclaim and kswapd is awake for less time, it would appear to be doing less work. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 87 196 64 0 Direct time congest waited 4604ms 4732ms 5420ms 0ms Direct full congest waited 72 145 53 0 Direct number conditional waited 0 0 324 1315 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 20 10 15 7 KSwapd time congest waited 1264ms 536ms 884ms 284ms KSwapd full congest waited 10 4 6 2 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 8 seconds asleep in direct reclaim and no time at all asleep with the patches. MMTests Statistics: duration User/Sys Time Running Test (seconds) 10.51 10.73 10.6 11.66 Total Elapsed Time (seconds) 14.19 13.00 14.33 12.76 Overall, the tests completed faster. It is interesting to note that backing off further when a zone is congested and not just a BDI was more efficient overall. 
PPC64 micro-mapped-file-stream pgalloc_dma 3024660.00 ( 0.00%) 3027185.00 ( 0.08%) 3025845.00 ( 0.04%) 3026281.00 ( 0.05%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 2508073.00 ( 0.00%) 2565351.00 ( 2.23%) 2463577.00 ( -1.81%) 2532263.00 ( 0.96%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 4601307.00 ( 0.00%) 4128076.00 ( -11.46%) 3912317.00 ( -17.61%) 3377165.00 ( -36.25%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 629825.00 ( 0.00%) 971622.00 ( 35.18%) 1063938.00 ( 40.80%) 1711935.00 ( 63.21%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 27776.00 ( 0.00%) 20458.00 ( -35.77%) 18763.00 ( -48.04%) 18157.00 ( -52.98%) allocstall 977.00 ( 0.00%) 2751.00 ( 64.49%) 2098.00 ( 53.43%) 5136.00 ( 80.98%) Similar trends to x86-64. allocstalls are up but it's not necessarily bad. FTrace Reclaim Statistics: vmscan Direct reclaims 977 2709 2098 5136 Direct reclaim pages scanned 629825 963814 1063938 1711935 Direct reclaim pages reclaimed 75550 242538 150904 387647 Direct reclaim write file async I/O 0 0 0 2 Direct reclaim write anon async I/O 0 10 0 4 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 392119 1201712 571935 571921 Kswapd wakeups 3 2 3 3 Kswapd pages scanned 4601307 4128076 3912317 3377165 Kswapd pages reclaimed 2432523 2318797 2312673 2144616 Kswapd reclaim write file async I/O 20 1 1 1 Kswapd reclaim write anon async I/O 57 132 11 121 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 6.19 7.30 13.04 10.88 Time kswapd awake (seconds) 21.73 26.51 25.55 23.90 Total pages scanned 5231132 5091890 4976255 5089100 Total pages reclaimed 2508073 2561335 2463577 2532263 %age total pages scanned/reclaimed 47.95% 50.30% 49.51% 49.76% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 18.89% 20.65% 32.65% 27.65% Percentage Time kswapd Awake 72.39% 80.68% 78.21% 77.40% Again, a similar trend that the congestion_wait changes mean that direct reclaim scans more pages but the overall number of pages scanned while slightly reduced, are very similar. The ratio of scanning/reclaimed remains roughly similar. The downside is that kswapd and direct reclaim was awake longer and for a larger percentage of the overall workload. It's possible there were big differences in the amount of time spent reclaiming slab pages between the different kernels which is plausible considering that the micro tests runs after fsmark and sysbench. Trace Reclaim Statistics: congestion_wait Direct number congest waited 845 1312 104 0 Direct time congest waited 19416ms 26560ms 7544ms 0ms Direct full congest waited 745 1105 72 0 Direct number conditional waited 0 0 1322 2935 Direct time conditional waited 0ms 0ms 12ms 312ms Direct full conditional waited 0 0 0 3 KSwapd number congest waited 39 102 75 63 KSwapd time congest waited 2484ms 6760ms 5756ms 3716ms KSwapd full congest waited 20 48 46 25 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 20 seconds asleep in direct reclaim and only 312ms asleep with the patches. The time kswapd spent congest waited was also reduced by a large factor. 
MMTests Statistics: duration ser/Sys Time Running Test (seconds) 26.58 28.05 26.9 28.47 Total Elapsed Time (seconds) 30.02 32.86 32.67 30.88 With all patches applies, the completion times are very similar. X86-64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 82.00 ( 0.00%) 84.00 ( 2.00%) 85.00 ( 3.00%) 85.00 ( 3.00%) Pass 2 90.00 ( 0.00%) 87.00 (-3.00%) 88.00 (-2.00%) 89.00 (-1.00%) At Rest 92.00 ( 0.00%) 90.00 (-2.00%) 90.00 (-2.00%) 91.00 (-1.00%) Success figures across the board are broadly similar. traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 1045 944 886 887 Direct reclaim pages scanned 135091 119604 109382 101019 Direct reclaim pages reclaimed 88599 47535 47863 46671 Direct reclaim write file async I/O 494 283 465 280 Direct reclaim write anon async I/O 29357 13710 16656 13462 Direct reclaim write file sync I/O 154 2 2 3 Direct reclaim write anon sync I/O 14594 571 509 561 Wake kswapd requests 7491 933 872 892 Kswapd wakeups 814 778 731 780 Kswapd pages scanned 7290822 15341158 11916436 13703442 Kswapd pages reclaimed 3587336 3142496 3094392 3187151 Kswapd reclaim write file async I/O 91975 32317 28022 29628 Kswapd reclaim write anon async I/O 1992022 789307 829745 849769 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4588.93 2467.16 2495.41 2547.07 Time kswapd awake (seconds) 2497.66 1020.16 1098.06 1176.82 Total pages scanned 7425913 15460762 12025818 13804461 Total pages reclaimed 3675935 3190031 3142255 3233822 %age total pages scanned/reclaimed 49.50% 20.63% 26.13% 23.43% %age total pages scanned/written 28.66% 5.41% 7.28% 6.47% %age file pages scanned/written 1.25% 0.21% 0.24% 0.22% Percentage Time Spent Direct Reclaim 57.33% 42.15% 42.41% 42.99% Percentage Time kswapd Awake 43.56% 27.87% 29.76% 31.25% Scanned/reclaimed ratios again look good with big improvements in efficiency. The Scanned/written ratios also look much improved. With a better scanned/written ration, there is an expectation that IO would be more efficient and indeed, the time spent in direct reclaim is much reduced by the full series and kswapd spends a little less time awake. Overall, indications here are that allocations were happening much faster and this can be seen with a graph of the latency figures as the allocations were taking place http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-hydra-mean.ps FTrace Reclaim Statistics: congestion_wait Direct number congest waited 1333 204 169 4 Direct time congest waited 78896ms 8288ms 7260ms 200ms Direct full congest waited 756 92 69 2 Direct number conditional waited 0 0 26 186 Direct time conditional waited 0ms 0ms 0ms 2504ms Direct full conditional waited 0 0 0 25 KSwapd number congest waited 4 395 227 282 KSwapd time congest waited 384ms 25136ms 10508ms 18380ms KSwapd full congest waited 3 232 98 176 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 KSwapd full conditional waited 318 0 312 9 Overall, the time spent speeping is reduced. kswapd is still hitting congestion_wait() but that is because there are callers remaining where it wasn't clear in advance if they should be changed to wait_iff_congested() or not. Overall the sleep imes are reduced though - from 79ish seconds to about 19. 
MMTests Statistics: duration User/Sys Time Running Test (seconds) 3415.43 3386.65 3388.39 3377.5 Total Elapsed Time (seconds) 5733.48 3660.33 3689.41 3765.39 With the full series, the time to complete the tests are reduced by 30% PPC64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 17.00 ( 0.00%) 34.00 (17.00%) 38.00 (21.00%) 43.00 (26.00%) Pass 2 25.00 ( 0.00%) 37.00 (12.00%) 42.00 (17.00%) 46.00 (21.00%) At Rest 49.00 ( 0.00%) 43.00 (-6.00%) 45.00 (-4.00%) 51.00 ( 2.00%) Success rates there are *way* up particularly considering that the 16MB huge pages on PPC64 mean that it's always much harder to allocate them. FTrace Reclaim Statistics: vmscan stress-highalloc stress-highalloc stress-highalloc stress-highalloc traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 499 505 564 509 Direct reclaim pages scanned 223478 41898 51818 45605 Direct reclaim pages reclaimed 137730 21148 27161 23455 Direct reclaim write file async I/O 399 136 162 136 Direct reclaim write anon async I/O 46977 2865 4686 3998 Direct reclaim write file sync I/O 29 0 1 3 Direct reclaim write anon sync I/O 31023 159 237 239 Wake kswapd requests 420 351 360 326 Kswapd wakeups 185 294 249 277 Kswapd pages scanned 15703488 16392500 17821724 17598737 Kswapd pages reclaimed 5808466 2908858 3139386 3145435 Kswapd reclaim write file async I/O 159938 18400 18717 13473 Kswapd reclaim write anon async I/O 3467554 228957 322799 234278 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 9665.35 1707.81 2374.32 1871.23 Time kswapd awake (seconds) 9401.21 1367.86 1951.75 1328.88 Total pages scanned 15926966 16434398 17873542 17644342 Total pages reclaimed 5946196 2930006 3166547 3168890 %age total pages scanned/reclaimed 37.33% 17.83% 17.72% 17.96% %age total pages scanned/written 23.27% 1.52% 1.94% 1.43% %age file pages scanned/written 1.01% 0.11% 0.11% 0.08% Percentage Time Spent Direct Reclaim 44.55% 35.10% 41.42% 36.91% Percentage Time kswapd Awake 86.71% 43.58% 52.67% 41.14% While the scanning rates are slightly up, the scanned/reclaimed and scanned/written figures are much improved. The time spent in direct reclaim and with kswapd are massively reduced, mostly by the lowlumpy patches. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 725 303 126 3 Direct time congest waited 45524ms 9180ms 5936ms 300ms Direct full congest waited 487 190 52 3 Direct number conditional waited 0 0 200 301 Direct time conditional waited 0ms 0ms 0ms 1904ms Direct full conditional waited 0 0 0 19 KSwapd number congest waited 0 2 23 4 KSwapd time congest waited 0ms 200ms 420ms 404ms KSwapd full congest waited 0 2 2 4 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Not as dramatic a story here but the time spent asleep is reduced and we can still see what wait_iff_congested is going to sleep when necessary. MMTests Statistics: duration User/Sys Time Running Test (seconds) 12028.09 3157.17 3357.79 3199.16 Total Elapsed Time (seconds) 10842.07 3138.72 3705.54 3229.85 The time to complete this test goes way down. With the full series, we are allocating over twice the number of huge pages in 30% of the time and there is a corresponding impact on the allocation latency graph available at. 
http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-powyah-mean.ps This patch: Add a trace event for shrink_inactive_list() and updates the sample postprocessing script appropriately. It can be used to determine how many pages were reclaimed and for non-lumpy reclaim where exactly the pages were reclaimed from. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../postprocess/trace-vmscan-postprocess.pl | 39 ++++++++++++----- include/trace/events/vmscan.h | 42 +++++++++++++++++++ mm/vmscan.c | 6 +++ 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 1b55146d1c8d..b3e73ddb1567 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -46,7 +46,7 @@ use constant HIGH_KSWAPD_LATENCY => 20; use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; -use constant HIGH_NR_RECLAIM => 24; +use constant HIGH_NR_RECLAIMED => 24; use constant HIGH_NR_CONTIG_DIRTY => 25; my %perprocesspid; @@ -58,11 +58,13 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); my ($total_direct_latency, $total_kswapd_latency); +my ($total_direct_nr_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); +my ($total_kswapd_nr_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -104,7 +106,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -203,8 +205,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "zid", - "lru", - "nr_scanned", "nr_reclaimed", "priority"); + "nr_scanned", "nr_reclaimed", "priority", + "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, @@ -375,6 +377,16 @@ EVENT_PROCESS: my $nr_contig_dirty = $7; $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty; + } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { + $details = $5; + if 
($details !~ /$regex_lru_shrink_inactive/o) { + print "WARNING: Failed to parse mm_vmscan_lru_shrink_inactive as expected\n"; + print " $details\n"; + print " $regex_lru_shrink_inactive/o\n"; + next; + } + my $nr_reclaimed = $4; + $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $5; if ($details !~ /$regex_writepage/o) { @@ -464,8 +476,8 @@ sub dump_stats { # Print out process activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Time"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Sync-IO", "ASync-IO", "Stalled"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Pages", "Time"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Rclmed", "Sync-IO", "ASync-IO", "Stalled"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) { @@ -475,6 +487,7 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -489,11 +502,12 @@ sub dump_stats { $index++; } - printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8.3f", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8u %8.3f", $process_pid, $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -529,8 +543,8 @@ sub dump_stats { # Print out kswapd activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages", "Pages"); + printf("%-" . $max_strlen . 
"s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Rclmed", "Sync-IO", "ASync-IO"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) { @@ -539,16 +553,18 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; $total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}; - printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8i %8u", $process_pid, $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -579,6 +595,7 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -588,6 +605,7 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -612,6 +630,7 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 370aa5a87322..ecf952192a93 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -10,6 +10,7 @@ #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u +#define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u #define RECLAIM_WB_ASYNC 0x0008u @@ -17,6 +18,7 @@ (flags) ? 
__print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ + {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" @@ -26,6 +28,12 @@ (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) +#define trace_shrink_flags(file, sync) ( \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + ) + TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -269,6 +277,40 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_inactive, + + TP_PROTO(int nid, int zid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + int priority, int reclaim_flags), + + TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, zid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->zid = zid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->priority = priority; + __entry->reclaim_flags = reclaim_flags; + ), + + TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->zid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 28b0521e4bfd..4a6dccb57586 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1358,6 +1358,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); return nr_reclaimed; } From 52bb9198668968506f9d12bf35d7f5d3f094921e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: [PATCH 035/176] writeback: account for time spent congestion_waited There is strong evidence to indicate a lot of time is being spent in congestion_wait(), some of it unnecessarily. This patch adds a tracepoint for congestion_wait to record when congestion_wait() was called, how long the timeout was for and how long it actually slept. 
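To make the measurement concrete: the pattern this change relies on is simply to sample jiffies before the sleep and then report both the requested timeout and the observed delay in microseconds. The sketch below is illustrative only and is not part of the patch (the real change is the congestion_wait() hunk in mm/backing-dev.c further down); the helper name traced_congestion_sleep is invented for the example and the wait-queue setup done by congestion_wait() is omitted for brevity.

#include <linux/jiffies.h>
#include <linux/sched.h>
#include <trace/events/writeback.h>

/* Illustrative sketch: time a sleep and feed the new tracepoint. */
static long traced_congestion_sleep(long timeout)
{
	unsigned long start = jiffies;		/* sample before sleeping */
	long ret = io_schedule_timeout(timeout);

	/* requested timeout and actual delay, both converted to usecs */
	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));
	return ret;
}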
Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 28 ++++++++++++++++++++++++++++ mm/backing-dev.c | 5 +++++ 2 files changed, 33 insertions(+) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0bb01ab2e984..d2b2654606ec 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -151,6 +151,34 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +DECLARE_EVENT_CLASS(writeback_congest_waited_template, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed), + + TP_STRUCT__entry( + __field( unsigned int, usec_timeout ) + __field( unsigned int, usec_delayed ) + ), + + TP_fast_assign( + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + ), + + TP_printk("usec_timeout=%u usec_delayed=%u", + __entry->usec_timeout, + __entry->usec_delayed) +); + +DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d420499a61..55627306abe0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -764,12 +764,17 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); From bc57e00f5e0b2480ef222c775c49552d3a930db7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: [PATCH 036/176] vmscan: synchronous lumpy reclaim should not call congestion_wait() congestion_wait() means "wait until queue congestion is cleared". However, synchronous lumpy reclaim does not need this congestion_wait() as shrink_page_list(PAGEOUT_IO_SYNC) uses wait_on_page_writeback() and it provides the necessary waiting. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Reviewed-by: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4a6dccb57586..399d54e8a82c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1340,8 +1340,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - /* * The attempt at page out may have made some * of the pages active, mark them inactive again. 
From 7d3579e8e61937cbba268ea9b218d006b6d64221 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:42 -0700 Subject: [PATCH 037/176] vmscan: narrow the scenarios in which lumpy reclaim uses synchronous reclaim shrink_page_list() can decide to give up reclaiming a page under a number of conditions such as 1. trylock_page() failure 2. page is unevictable 3. zone reclaim and page is mapped 4. PageWriteback() is true 5. page is swapbacked and swap is full 6. add_to_swap() failure 7. page is dirty and the gfp mask does not include GFP_IO, GFP_FS 8. page is pinned 9. IO queue is congested 10. pageout() started IO, but it has not finished With lumpy reclaim, failures result in entering synchronous lumpy reclaim but this can be unnecessary. In cases (2), (3), (5), (6), (7) and (8), there is no point retrying. This patch causes lumpy reclaim to abort when it is known it will fail. Case (9) is more interesting. The current behavior is: 1. start shrink_page_list(async) 2. find queue_congested() 3. skip the pageout write 4. still start shrink_page_list(sync) 5. wait on a lot of pages 6. find queue_congested() again 7. give up the pageout write again So the synchronous pass is just wasted time. However, simply skipping the page writes is also not good because, for example, a huge page allocation on x86 needs 512 pages, which can be more dirty pages than the queue congestion threshold (~=128). After this patch, pageout() behaves as follows: - If order > PAGE_ALLOC_COSTLY_ORDER, always ignore queue congestion. - If order <= PAGE_ALLOC_COSTLY_ORDER, skip the write and disable lumpy reclaim. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +- mm/vmscan.c | 120 +++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 48 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ecf952192a93..c255fcc587bf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 399d54e8a82c..d9fc2dce93af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,6 +51,12 @@ #define CREATE_TRACE_POINTS #include +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -82,7 +88,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation.
*/ - bool lumpy_reclaim_mode; + enum lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -330,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -366,8 +401,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) { + disable_lumpy_reclaim_mode(sc); return PAGE_KEEP; + } if (clear_page_dirty_for_io(page)) { int res; @@ -393,7 +430,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. 
*/ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -401,7 +439,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sync_writeback)); + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -579,7 +617,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) return PAGEREF_RECLAIM; /* @@ -643,8 +681,7 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -693,10 +730,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -750,14 +790,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -840,6 +883,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -852,6 +896,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1252,7 +1298,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (!sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1297,15 +1343,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - + set_lumpy_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 
+ ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1317,8 +1363,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); /* @@ -1336,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + nr_reclaimed = shrink_page_list(&page_list, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { @@ -1347,7 +1393,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_active = clear_active_flags(&page_list, NULL); count_vm_events(PGDEACTIVATE, nr_active); - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, sc); } local_irq_disable(); @@ -1739,21 +1786,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; -} - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1768,8 +1800,6 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { From 47185052165a4c5de0a461018238375dd982c2ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:43 -0700 Subject: [PATCH 038/176] vmscan: remove dead code in shrink_inactive_list() After synchronous lumpy reclaim, the page_list is guaranteed not to have active pages as page activation in shrink_page_list() disables lumpy reclaim. Remove the dead code. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Cc: Johannes Weiner Cc: Wu Fengguang Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d9fc2dce93af..cec8081bbd95 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1331,7 +1331,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_active; unsigned long nr_anon; unsigned long nr_file; @@ -1386,13 +1385,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again.
- */ - nr_active = clear_active_flags(&page_list, NULL); - count_vm_events(PGDEACTIVATE, nr_active); - set_lumpy_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, sc); } From 08fc468f4eaf6683bae5bdb94743a09d8630cb80 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:44 -0700 Subject: [PATCH 039/176] vmscan: isolate_lru_pages(): stop neighbour search if neighbour cannot be isolated isolate_lru_pages() does not just isolate LRU tail pages, but also isolates neighbour pages of the eviction page. The neighbour search does not stop even if neighbours cannot be isolated which is excessive as the lumpy reclaim will no longer result in a successful higher order allocation. This patch stops the PFN neighbour pages if an isolation fails and moves on to the next block. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cec8081bbd95..130ad0239f52 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1051,7 +1051,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; /* * If we don't have enough swap space, reclaiming of @@ -1059,8 +1059,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * pointless. */ if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - continue; + !PageSwapCache(cursor_page)) + break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); @@ -1071,11 +1071,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_dirty++; scan++; } else { - if (mode == ISOLATE_BOTH && - page_count(cursor_page)) - nr_lumpy_failed++; + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; From 0e093d99763eb4cea09f8ca4f1d01f34e121d10b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: [PATCH 040/176] writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone If congestion_wait() is called with no BDI congested, the caller will sleep for the full timeout and this may be an unnecessary sleep. This patch adds a wait_iff_congested() that checks congestion and only sleeps if a BDI is congested else, it calls cond_resched() to ensure the caller is not hogging the CPU longer than its quota but otherwise will not sleep. This is aimed at reducing some of the major desktop stalls reported during IO. For example, while kswapd is operating, it calls congestion_wait() but it could just have been reclaiming clean page cache pages with no congestion. Without this patch, it would sleep for a full timeout but after this patch, it'll just call schedule() if it has been on the CPU too long. Similar logic applies to direct reclaimers that are not making enough progress. 
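Reduced to its core, the decision described above is: if no BDI is congested, or the zone of interest has not recently seen congestion, just yield; otherwise sleep as congestion_wait() would. The sketch below only illustrates that flow and is not part of the patch (the real wait_iff_congested() in the mm/backing-dev.c hunk further down also computes the remaining timeout and emits a tracepoint); nr_bdi_congested and zone_is_reclaim_congested() are the counter and zone-flag helper this patch introduces.

/* Illustrative sketch of the wait_iff_congested() decision flow. */
static long wait_iff_congested_sketch(struct zone *zone, int sync, long timeout)
{
	/* Nothing congested anywhere, or this zone is not marked congested */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
	    !zone_is_reclaim_congested(zone)) {
		cond_resched();	/* give up the CPU if needed, but do not sleep */
		return timeout;	/* sketch: report "did not sleep" */
	}

	/* Heavy congestion: fall back to a real sleep on the congestion queue */
	return congestion_wait(sync, timeout);
}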
Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/mmzone.h | 8 +++++ include/trace/events/writeback.h | 7 ++++ mm/backing-dev.c | 61 ++++++++++++++++++++++++++++++-- mm/page_alloc.c | 4 +-- mm/vmscan.c | 42 ++++++++++++++++++---- 6 files changed, 112 insertions(+), 12 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 35b00746c712..f1b402a50679 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -285,7 +285,7 @@ enum { void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); - +long wait_iff_congested(struct zone *zone, int sync, long timeout); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c3c17fb675ee..39c24ebe9cfd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -423,6 +423,9 @@ struct zone { typedef enum { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ + ZONE_CONGESTED, /* zone has many dirty pages backed by + * a congested BDI + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) clear_bit(flag, &zone->flags); } +static inline int zone_is_reclaim_congested(const struct zone *zone) +{ + return test_bit(ZONE_CONGESTED, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index d2b2654606ec..89a2b2db4375 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); +DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 55627306abe0..5ad3c106606b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? 
BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout) } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a683f819439..b13bc5e5bd7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -2095,7 +2095,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 130ad0239f52..30fd658bb289 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) { - disable_lumpy_reclaim_mode(sc); + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; - } if (clear_page_dirty_for_io(page)) { int res; @@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static 
unsigned long shrink_page_list(struct list_head *page_list, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); @@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -902,6 +907,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + free_page_list(&free_pages); list_splice(&ret_pages, page_list); @@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_lumpy_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; + + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); + } } out: @@ -2282,6 +2301,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); } } From 4cbec4c8b9fda9ec784086fe7f74cd32a8adda95 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: [PATCH 041/176] writeback: remove the internal 5% low bound on dirty_ratio The dirty_ratio was silently limited in global_dirty_limits() to >= 5%. This is not a user expected behavior. And it's inconsistent with calc_period_shift(), which uses the plain vm_dirty_ratio value. Let's remove the internal bound. At the same time, fix balance_dirty_pages() to work with the dirty_thresh=0 case. 
This allows applications to proceed when dirty+writeback pages are all cleaned. And ">" fits with the name "exceeded" better than ">=" does. Neil thinks it is an aesthetic improvement as well as a functional one :) Signed-off-by: Wu Fengguang Cc: Jan Kara Proposed-by: Con Kolivas Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Neil Brown Reviewed-by: KOSAKI Motohiro Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- mm/page-writeback.c | 16 +++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a1..97d2951bd4d1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -582,7 +582,7 @@ static inline bool over_bground_thresh(void) global_dirty_limits(&background_thresh, &dirty_thresh); return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) >= background_thresh); + global_page_state(NR_UNSTABLE_NFS) > background_thresh); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4dd91f7fd39f..b840afa89761 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; @@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping, * the last resort safeguard. */ dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) - || (nr_reclaimable + nr_writeback >= dirty_thresh); + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; From 2e30244a7cc1ff09013a1238d415b4076406388e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:46 -0700 Subject: [PATCH 042/176] vmscan,tmpfs: treat used once pages on tmpfs as used once When a page has PG_referenced, shrink_page_list() discards it only if it is not dirty. This rule works fine if the backing filesystem is a regular one. PG_dirty is a good signal that the page was used recently because the flusher threads clean pages periodically. In addition, page writeback is costlier than simple page discard. However, when a page is on tmpfs this heuristic doesn't work because flusher threads don't write back tmpfs pages. Consequently tmpfs pages always rotate around the LRU at least twice and add unnecessary LRU churn. Simple tmpfs streaming I/O shouldn't cause large anonymous page swap-out. Remove this unnecessary reclaim bonus of tmpfs pages.
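In code, the resulting heuristic is just one extra condition; the sketch below restates it outside of page_check_references() for clarity (the helper name used_once_policy is invented for the example; the actual one-line change to mm/vmscan.c is in the hunk further down).

/*
 * Sketch: a referenced-but-clean page is an easy reclaim candidate only
 * when the flusher threads could actually have cleaned it, i.e. when it
 * is not swap-backed (tmpfs/shmem pages are never written back by them).
 */
static enum page_references used_once_policy(struct page *page, int referenced_page)
{
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;
	return PAGEREF_RECLAIM;
}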
Signed-off-by: KOSAKI Motohiro Cc: Hugh Dickins Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 30fd658bb289..b8a6fdc21312 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -651,7 +651,7 @@ static enum page_references page_check_references(struct page *page, } /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page) + if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; From 61ecdb801ef2cd28e32442383106d7837d76deac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:47 -0700 Subject: [PATCH 043/176] mm: strictly nested kmap_atomic() Ensure kmap_atomic() usage is strictly nested Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- crypto/async_tx/async_memcpy.c | 2 +- crypto/blkcipher.c | 2 +- drivers/block/loop.c | 4 ++-- include/linux/highmem.h | 4 ++-- kernel/power/snapshot.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c index 0ec1fb69d4ea..518c22bd9562 100644 --- a/crypto/async_tx/async_memcpy.c +++ b/crypto/async_tx/async_memcpy.c @@ -83,8 +83,8 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, memcpy(dest_buf, src_buf, len); - kunmap_atomic(dest_buf, KM_USER0); kunmap_atomic(src_buf, KM_USER1); + kunmap_atomic(dest_buf, KM_USER0); async_tx_sync_epilog(submit); } diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c index 90d26c91f4e9..7a7219266e3c 100644 --- a/crypto/blkcipher.c +++ b/crypto/blkcipher.c @@ -89,9 +89,9 @@ static inline unsigned int blkcipher_done_fast(struct blkcipher_walk *walk, memcpy(walk->dst.virt.addr, walk->page, n); blkcipher_unmap_dst(walk); } else if (!(walk->flags & BLKCIPHER_WALK_PHYS)) { - blkcipher_unmap_src(walk); if (walk->flags & BLKCIPHER_WALK_DIFF) blkcipher_unmap_dst(walk); + blkcipher_unmap_src(walk); } scatterwalk_advance(&walk->in, n); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6c48b3545f84..450c958b514f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -101,8 +101,8 @@ static int transfer_none(struct loop_device *lo, int cmd, else memcpy(raw_buf, loop_buf, size); - kunmap_atomic(raw_buf, KM_USER0); kunmap_atomic(loop_buf, KM_USER1); + kunmap_atomic(raw_buf, KM_USER0); cond_resched(); return 0; } @@ -130,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, for (i = 0; i < size; i++) *out++ = *in++ ^ key[(i & 511) % keysize]; - kunmap_atomic(raw_buf, KM_USER0); kunmap_atomic(loop_buf, KM_USER1); + kunmap_atomic(raw_buf, KM_USER0); cond_resched(); return 0; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e3060ef85b6d..283cd47bb34c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -201,8 +201,8 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_atomic(from, KM_USER0); vto = kmap_atomic(to, KM_USER1); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); + kunmap_atomic(vfrom, KM_USER0); } 
#endif @@ -214,8 +214,8 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_atomic(from, KM_USER0); vto = kmap_atomic(to, KM_USER1); copy_page(vto, vfrom); - kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); + kunmap_atomic(vfrom, KM_USER0); } #endif /* _LINUX_HIGHMEM_H */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ac7eb109f196..9e3581f4619a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) src = kmap_atomic(s_page, KM_USER0); dst = kmap_atomic(d_page, KM_USER1); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel @@ -2273,8 +2273,8 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) memcpy(buf, kaddr1, PAGE_SIZE); memcpy(kaddr1, kaddr2, PAGE_SIZE); memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); kunmap_atomic(kaddr2, KM_USER1); + kunmap_atomic(kaddr1, KM_USER0); } /** From 3e4d3af501cccdc8a8cca41bdbe57d54ad7e7e73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:51 -0700 Subject: [PATCH 044/176] mm: stack based kmap_atomic() Keep the current interface but ignore the KM_type and use a stack based approach. The advantage is that we get rid of crappy code like: #define __KM_PTE \ (in_nmi() ? KM_NMI_PTE : \ in_irq() ? KM_IRQ_PTE : \ KM_PTE0) and in general can stop worrying about what context we're in and what kmap slots might be appropriate for that. The downside is that FRV kmap_atomic() gets more expensive. For now we use a CPP trick suggested by Andrew: #define kmap_atomic(page, args...) __kmap_atomic(page) to avoid having to touch all kmap_atomic() users in a single patch. [ not compiled on: - mn10300: the arch doesn't actually build with highmem to begin with ] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix up drivers/gpu/drm/i915/intel_overlay.c] Acked-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Airlie Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/highmem.h | 6 +- arch/arm/mm/highmem.c | 23 ++++--- arch/frv/include/asm/highmem.h | 25 ++------ arch/frv/mb93090-mb00/pci-dma.c | 4 +- arch/frv/mm/cache-page.c | 8 +-- arch/frv/mm/highmem.c | 50 +++++++++++++++ arch/mips/include/asm/highmem.h | 18 ++---- arch/mips/mm/highmem.c | 50 ++++++++------- arch/mn10300/include/asm/highmem.h | 42 ++++++++----- arch/powerpc/include/asm/highmem.h | 9 ++- arch/powerpc/mm/highmem.c | 35 ++++++----- arch/sparc/include/asm/highmem.h | 4 +- arch/sparc/mm/highmem.c | 48 ++++++++------- arch/tile/include/asm/highmem.h | 10 +-- arch/tile/mm/highmem.c | 85 +++++++------------------- arch/x86/include/asm/highmem.h | 11 ++-- arch/x86/include/asm/iomap.h | 4 +- arch/x86/kernel/crash_dump_32.c | 2 +- arch/x86/mm/highmem_32.c | 83 +++++++++++++------------ arch/x86/mm/iomap_32.c | 42 +++++++------ drivers/gpu/drm/i915/i915_gem.c | 25 ++++---- drivers/gpu/drm/i915/i915_irq.c | 5 +- drivers/gpu/drm/i915/intel_overlay.c | 5 +- drivers/gpu/drm/nouveau/nouveau_bios.c | 8 +-- drivers/gpu/drm/ttm/ttm_bo_util.c | 8 +-- include/linux/highmem.h | 61 +++++++++++------- include/linux/io-mapping.h | 14 ++--- mm/highmem.c | 62 ++----------------- 28 files changed, 371 insertions(+), 376 deletions(-) diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 5aff58126602..1fc684e70ab6 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -35,9 +35,9 @@ extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte); #ifdef CONFIG_HIGHMEM extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); +extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(const void *ptr); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 1fbdb55bfd1b..c00f119babbf 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -36,18 +36,17 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { unsigned int idx; unsigned long vaddr; void *kmap; + int type; pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -61,6 +60,8 @@ void *kmap_atomic(struct page *page, enum km_type type) if (kmap) return kmap; + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -80,14 +81,17 @@ void *kmap_atomic(struct page *page, enum km_type type) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - unsigned int idx = type + KM_TYPE_NR * smp_processor_id(); + int idx, type; if (kvaddr >= (void *)FIXADDR_START) { + type = 
kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM @@ -103,15 +107,16 @@ void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) } pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - unsigned int idx; unsigned long vaddr; + int idx, type; pagefault_disable(); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index cb4c317eaecc..a8d6565d415d 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -112,12 +112,11 @@ extern struct page *kmap_atomic_to_page(void *ptr); (void *) damlr; \ }) -static inline void *kmap_atomic(struct page *page, enum km_type type) +static inline void *kmap_atomic_primary(struct page *page, enum km_type type) { unsigned long paddr; pagefault_disable(); - debug_kmap_atomic(type); paddr = page_to_phys(page); switch (type) { @@ -125,14 +124,6 @@ static inline void *kmap_atomic(struct page *page, enum km_type type) case 1: return __kmap_atomic_primary(1, paddr, 3); case 2: return __kmap_atomic_primary(2, paddr, 4); case 3: return __kmap_atomic_primary(3, paddr, 5); - case 4: return __kmap_atomic_primary(4, paddr, 6); - case 5: return __kmap_atomic_primary(5, paddr, 7); - case 6: return __kmap_atomic_primary(6, paddr, 8); - case 7: return __kmap_atomic_primary(7, paddr, 9); - case 8: return __kmap_atomic_primary(8, paddr, 10); - - case 9 ... 9 + NR_TLB_LINES - 1: - return __kmap_atomic_secondary(type - 9, paddr); default: BUG(); @@ -152,22 +143,13 @@ do { \ asm volatile("tlbpr %0,gr0,#4,#1" : : "r"(vaddr) : "memory"); \ } while(0) -static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +static inline void kunmap_atomic_primary(void *kvaddr, enum km_type type) { switch (type) { case 0: __kunmap_atomic_primary(0, 2); break; case 1: __kunmap_atomic_primary(1, 3); break; case 2: __kunmap_atomic_primary(2, 4); break; case 3: __kunmap_atomic_primary(3, 5); break; - case 4: __kunmap_atomic_primary(4, 6); break; - case 5: __kunmap_atomic_primary(5, 7); break; - case 6: __kunmap_atomic_primary(6, 8); break; - case 7: __kunmap_atomic_primary(7, 9); break; - case 8: __kunmap_atomic_primary(8, 10); break; - - case 9 ... 
9 + NR_TLB_LINES - 1: - __kunmap_atomic_secondary(type - 9, kvaddr); - break; default: BUG(); @@ -175,6 +157,9 @@ static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) pagefault_enable(); } +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 85d110b71cf7..41098a3803a2 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -61,14 +61,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, dampr2 = __get_DAMPR(2); for (i = 0; i < nents; i++) { - vaddr = kmap_atomic(sg_page(&sg[i]), __KM_CACHE); + vaddr = kmap_atomic_primary(sg_page(&sg[i]), __KM_CACHE); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); } - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); __set_IAMPR(2, dampr2); diff --git a/arch/frv/mm/cache-page.c b/arch/frv/mm/cache-page.c index 0261cbe153b5..b24ade27a0f0 100644 --- a/arch/frv/mm/cache-page.c +++ b/arch/frv/mm/cache-page.c @@ -26,11 +26,11 @@ void flush_dcache_page(struct page *page) dampr2 = __get_DAMPR(2); - vaddr = kmap_atomic(page, __KM_CACHE); + vaddr = kmap_atomic_primary(page, __KM_CACHE); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); @@ -54,12 +54,12 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, dampr2 = __get_DAMPR(2); - vaddr = kmap_atomic(page, __KM_CACHE); + vaddr = kmap_atomic_primary(page, __KM_CACHE); start = (start & ~PAGE_MASK) | (unsigned long) vaddr; frv_cache_wback_inv(start, start + len); - kunmap_atomic(vaddr, __KM_CACHE); + kunmap_atomic_primary(vaddr, __KM_CACHE); if (dampr2) { __set_DAMPR(2, dampr2); diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index eadd07658075..61088dcc1594 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -36,3 +36,53 @@ struct page *kmap_atomic_to_page(void *ptr) { return virt_to_page(ptr); } + +void *__kmap_atomic(struct page *page) +{ + unsigned long paddr; + int type; + + pagefault_disable(); + type = kmap_atomic_idx_push(); + paddr = page_to_phys(page); + + switch (type) { + /* + * The first 4 primary maps are reserved for architecture code + */ + case 0: return __kmap_atomic_primary(4, paddr, 6); + case 1: return __kmap_atomic_primary(5, paddr, 7); + case 2: return __kmap_atomic_primary(6, paddr, 8); + case 3: return __kmap_atomic_primary(7, paddr, 9); + case 4: return __kmap_atomic_primary(8, paddr, 10); + + case 5 ... 5 + NR_TLB_LINES - 1: + return __kmap_atomic_secondary(type - 5, paddr); + + default: + BUG(); + return NULL; + } +} +EXPORT_SYMBOL(__kmap_atomic); + +void __kunmap_atomic(void *kvaddr) +{ + int type = kmap_atomic_idx_pop(); + switch (type) { + case 0: __kunmap_atomic_primary(4, 6); break; + case 1: __kunmap_atomic_primary(5, 7); break; + case 2: __kunmap_atomic_primary(6, 8); break; + case 3: __kunmap_atomic_primary(7, 9); break; + case 4: __kunmap_atomic_primary(8, 10); break; + + case 5 ... 
5 + NR_TLB_LINES - 1: + __kunmap_atomic_secondary(type - 5, kvaddr); + break; + + default: + BUG(); + } + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 75753ca73bfd..77e644082a3b 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -45,18 +45,12 @@ extern pte_t *pkmap_page_table; extern void * kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *__kmap(struct page *page); -extern void __kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page, enum km_type type); -extern void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -extern struct page *__kmap_atomic_to_page(void *ptr); - -#define kmap __kmap -#define kunmap __kunmap -#define kmap_atomic __kmap_atomic -#define kunmap_atomic_notypecheck __kunmap_atomic_notypecheck -#define kmap_atomic_to_page __kmap_atomic_to_page +extern void *kmap(struct page *page); +extern void kunmap(struct page *page); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); +extern void *kmap_atomic_pfn(unsigned long pfn); +extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 6a2b1bf9ef11..1e69b1fb4b85 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -9,7 +9,7 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *__kmap(struct page *page) +void *kmap(struct page *page) { void *addr; @@ -21,16 +21,16 @@ void *__kmap(struct page *page) return addr; } -EXPORT_SYMBOL(__kmap); +EXPORT_SYMBOL(kmap); -void __kunmap(struct page *page) +void kunmap(struct page *page) { BUG_ON(in_interrupt()); if (!PageHighMem(page)) return; kunmap_high(page); } -EXPORT_SYMBOL(__kunmap); +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -41,17 +41,17 @@ EXPORT_SYMBOL(__kunmap); * kmaps are appropriate for short, tight code paths only. 
*/ -void *__kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -64,43 +64,47 @@ void *__kmap_atomic(struct page *page, enum km_type type) } EXPORT_SYMBOL(__kmap_atomic); -void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + type = kmap_atomic_idx_pop(); +#ifdef CONFIG_DEBUG_HIGHMEM + { + int idx = type + KM_TYPE_NR * smp_processor_id(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_one(vaddr); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_one(vaddr); + } #endif - pagefault_enable(); } -EXPORT_SYMBOL(__kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); @@ -109,7 +113,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) return (void*) vaddr; } -struct page *__kmap_atomic_to_page(void *ptr) +struct page *kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index b0b187a29b88..f577ba2268ca 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -70,15 +70,16 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -static inline unsigned long kmap_atomic(struct page *page, enum km_type type) +static inline unsigned long __kmap_atomic(struct page *page) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; + pagefault_disable(); if (page < highmem_start_page) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #if HIGHMEM_DEBUG @@ -91,26 +92,35 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type) return vaddr; } -static inline void kunmap_atomic_notypecheck(unsigned long vaddr, enum km_type type) +static inline void __kunmap_atomic(unsigned long vaddr) { -#if HIGHMEM_DEBUG - enum fixed_addresses idx = type + KM_TYPE_NR * smp_processor_id(); + int type; - if (vaddr < FIXADDR_START) /* FIXME */ + if (vaddr < FIXADDR_START) { /* FIXME */ + pagefault_enable(); return; + } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)) - BUG(); + type = kmap_atomic_idx_pop(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(kmap_pte - idx); - __flush_tlb_one(vaddr); +#if HIGHMEM_DEBUG + { + unsigned int idx; + idx = type + KM_TYPE_NR * smp_processor_id(); + + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)) + BUG(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte - idx); + __flush_tlb_one(vaddr); + } #endif + pagefault_enable(); } - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index d10d64a4be38..dbc264010d0b 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -60,9 +60,8 @@ extern pte_t *pkmap_page_table; extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, enum km_type type, - pgprot_t prot); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); +extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void __kunmap_atomic(void *kvaddr); static inline void *kmap(struct page *page) { @@ -80,9 +79,9 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -static inline void *kmap_atomic(struct page *page, enum km_type type) +static inline void *__kmap_atomic(struct page *page) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot(page, kmap_prot); } static inline struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 857d4173f9c6..b0848b462bbc 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -29,17 +29,17 @@ * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - unsigned int idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -52,26 +52,33 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + type = kmap_atomic_idx_pop(); - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned int idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_page(NULL, vaddr); + } #endif pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index ec23b0a87b98..3d7afbb7f4bb 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -70,8 +70,8 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); +extern void *__kmap_atomic(struct page *page); +extern void __kunmap_atomic(void *kvaddr); extern struct page *kmap_atomic_to_page(void *vaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index e139e9cbf5f7..5e50c09b7dce 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -29,17 +29,17 @@ #include #include -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - unsigned long idx; unsigned long vaddr; + long idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -63,44 +63,50 @@ void *kmap_atomic(struct page *page, enum km_type type) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - unsigned long idx = type + KM_TYPE_NR*smp_processor_id(); + int type; if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); return; } - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + type = 
kmap_atomic_idx_pop(); -/* XXX Fix - Anton */ +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned long idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + + /* XXX Fix - Anton */ #if 0 - __flush_cache_one(vaddr); + __flush_cache_one(vaddr); #else - flush_cache_all(); + flush_cache_all(); #endif - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); -/* XXX Fix - Anton */ + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + /* XXX Fix - Anton */ #if 0 - __flush_tlb_one(vaddr); + __flush_tlb_one(vaddr); #else - flush_tlb_all(); + flush_tlb_all(); #endif + } #endif - pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* We may be fed a pagetable here by ptep_to_xxx and others. */ struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h index d155db6fa9bd..e0f7ee186721 100644 --- a/arch/tile/include/asm/highmem.h +++ b/arch/tile/include/asm/highmem.h @@ -60,12 +60,12 @@ void *kmap_fix_kpte(struct page *page, int finished); /* This macro is used only in map_new_virtual() to map "page". */ #define kmap_prot page_to_kpgprot(page) -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); +void *kmap_atomic_prot(struct page *page, pgprot_t prot); void kmap_atomic_fix_kpte(struct page *page, int finished); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 12ab137e7d4f..8ef6595e162c 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -56,50 +56,6 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && - /* type != KM_BIO_DST_IRQ && */ - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * Describe a single atomic mapping of a page on a given cpu at a * given address, and allow it to be linked into a list. 
@@ -240,10 +196,10 @@ void kmap_atomic_fix_kpte(struct page *page, int finished) * When holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pte_t *pte; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ @@ -255,8 +211,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); pte = kmap_get_pte(vaddr); @@ -269,25 +224,31 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { /* PAGE_NONE is a magic value that tells us to check immutability. */ return kmap_atomic_prot(page, type, PAGE_NONE); } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they try to access this pte without - * first remapping it. Keeping stale mappings around is a bad idea. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) { + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { pte_t *pte = kmap_get_pte(vaddr); pte_t pteval = *pte; + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR*smp_processor_id(); + + /* + * Force other mappings to Oops if they try to access this pte + * without first remapping it. Keeping stale mappings around + * is a bad idea. + */ BUG_ON(!pte_present(pteval) && !pte_migrating(pteval)); kmap_atomic_unregister(pte_page(pteval), vaddr); kpte_clear_flush(pte, vaddr); @@ -300,19 +261,19 @@ void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) arch_flush_lazy_mmu_mode(); pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic_notypecheck); +EXPORT_SYMBOL(__kunmap_atomic); /* * This API is supposed to allow us to map memory without a "struct page". * Currently we don't support this, though this may change in the future. 
*/ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic(pfn_to_page(pfn), type); + return kmap_atomic(pfn_to_page(pfn)); } -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - return kmap_atomic_prot(pfn_to_page(pfn), type, prot); + return kmap_atomic_prot(pfn_to_page(pfn), prot); } struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 8caac76ac324..3bd04022fd0c 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c4191b3b7056..363e33eb6ec1 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -27,10 +27,10 @@ #include void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef861..d723e369003c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,56 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) { - return kmap_atomic_prot(page, type, kmap_prot); -} - -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) - kpte_clear_flush(kmap_pte-idx, vaddr); - else { -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); -#endif - } - - pagefault_enable(); + return kmap_atomic_prot(page, kmap_prot); } +EXPORT_SYMBOL(__kmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ + kpte_clear_flush(kmap_pte-idx, vaddr); + } +#ifdef CONFIG_DEBUG_HIGHMEM + else { + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); + } +#endif + + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,12 +111,6 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..75a3d7f24a2c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) } /* - * Map 'pfn' using fixed map 'type' and protections 'prot' + * Map 'pfn' using protections 'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. @@ -86,24 +85,33 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); + } pagefault_enable(); } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 90b1d6753b9d..eb6c473c6d1b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -155,11 +155,11 @@ fast_shmem_read(struct page **pages, char __iomem *vaddr; int unwritten; - vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0); + vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT]); if (vaddr == NULL) return -ENOMEM; unwritten = __copy_to_user_inatomic(data, vaddr + page_offset, length); - kunmap_atomic(vaddr, KM_USER0); + kunmap_atomic(vaddr); if (unwritten) return -EFAULT; @@ -509,10 +509,10 @@ fast_user_write(struct io_mapping *mapping, char *vaddr_atomic; unsigned long unwritten; - vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base, KM_USER0); + vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, user_data, length); - io_mapping_unmap_atomic(vaddr_atomic, KM_USER0); + io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; return 0; @@ -551,11 +551,11 @@ fast_shmem_write(struct page **pages, char __iomem *vaddr; unsigned long unwritten; - vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0); + vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT]); if (vaddr == NULL) return -ENOMEM; unwritten = __copy_from_user_inatomic(vaddr + page_offset, data, length); - kunmap_atomic(vaddr, KM_USER0); + kunmap_atomic(vaddr); if (unwritten) return -EFAULT; @@ -3346,8 +3346,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, reloc_offset = obj_priv->gtt_offset + reloc->offset; reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, (reloc_offset & - ~(PAGE_SIZE - 1)), - KM_USER0); + ~(PAGE_SIZE - 1))); reloc_entry = (uint32_t __iomem *)(reloc_page + (reloc_offset & (PAGE_SIZE - 1))); reloc_val = target_obj_priv->gtt_offset + reloc->delta; @@ -3358,7 +3357,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, readl(reloc_entry), reloc_val); #endif writel(reloc_val, reloc_entry); - io_mapping_unmap_atomic(reloc_page, KM_USER0); + io_mapping_unmap_atomic(reloc_page); /* The updated presumed offset for this entry will be * copied back out to the user. 
@@ -4772,11 +4771,11 @@ void i915_gem_detach_phys_object(struct drm_device *dev, page_count = obj->size / PAGE_SIZE; for (i = 0; i < page_count; i++) { - char *dst = kmap_atomic(obj_priv->pages[i], KM_USER0); + char *dst = kmap_atomic(obj_priv->pages[i]); char *src = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE); memcpy(dst, src, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } drm_clflush_pages(obj_priv->pages, page_count); drm_agp_chipset_flush(dev); @@ -4833,11 +4832,11 @@ i915_gem_attach_phys_object(struct drm_device *dev, page_count = obj->size / PAGE_SIZE; for (i = 0; i < page_count; i++) { - char *src = kmap_atomic(obj_priv->pages[i], KM_USER0); + char *src = kmap_atomic(obj_priv->pages[i]); char *dst = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE); memcpy(dst, src, PAGE_SIZE); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); } i915_gem_object_put_pages(obj); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 744225ebb4b2..b80010f0c4c9 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -456,10 +456,9 @@ i915_error_object_create(struct drm_device *dev, local_irq_save(flags); s = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, - reloc_offset, - KM_IRQ0); + reloc_offset); memcpy_fromio(d, s, PAGE_SIZE); - io_mapping_unmap_atomic(s, KM_IRQ0); + io_mapping_unmap_atomic(s); local_irq_restore(flags); dst->pages[page] = d; diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 1d306a458be6..3264bbd47e65 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -187,8 +187,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over if (OVERLAY_NONPHYSICAL(overlay->dev)) { regs = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, - overlay->reg_bo->gtt_offset, - KM_USER0); + overlay->reg_bo->gtt_offset); if (!regs) { DRM_ERROR("failed to map overlay regs in GTT\n"); @@ -203,7 +202,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over static void intel_overlay_unmap_regs_atomic(struct intel_overlay *overlay) { if (OVERLAY_NONPHYSICAL(overlay->dev)) - io_mapping_unmap_atomic(overlay->virt_addr, KM_USER0); + io_mapping_unmap_atomic(overlay->virt_addr); overlay->virt_addr = NULL; diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 974b0f8ae048..8fa339600fe3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -2167,11 +2167,11 @@ peek_fb(struct drm_device *dev, struct io_mapping *fb, if (off < pci_resource_len(dev->pdev, 1)) { uint8_t __iomem *p = - io_mapping_map_atomic_wc(fb, off & PAGE_MASK, KM_USER0); + io_mapping_map_atomic_wc(fb, off & PAGE_MASK); val = ioread32(p + (off & ~PAGE_MASK)); - io_mapping_unmap_atomic(p, KM_USER0); + io_mapping_unmap_atomic(p); } return val; @@ -2183,12 +2183,12 @@ poke_fb(struct drm_device *dev, struct io_mapping *fb, { if (off < pci_resource_len(dev->pdev, 1)) { uint8_t __iomem *p = - io_mapping_map_atomic_wc(fb, off & PAGE_MASK, KM_USER0); + io_mapping_map_atomic_wc(fb, off & PAGE_MASK); iowrite32(val, p + (off & ~PAGE_MASK)); wmb(); - io_mapping_unmap_atomic(p, KM_USER0); + io_mapping_unmap_atomic(p); } } diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 3451a82adba7..e8a73e65da69 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -170,7 +170,7 @@ 
static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); #ifdef CONFIG_X86 - dst = kmap_atomic_prot(d, KM_USER0, prot); + dst = kmap_atomic_prot(d, prot); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) dst = vmap(&d, 1, 0, prot); @@ -183,7 +183,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, memcpy_fromio(dst, src, PAGE_SIZE); #ifdef CONFIG_X86 - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(dst); @@ -206,7 +206,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); #ifdef CONFIG_X86 - src = kmap_atomic_prot(s, KM_USER0, prot); + src = kmap_atomic_prot(s, prot); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) src = vmap(&s, 1, 0, prot); @@ -219,7 +219,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, memcpy_toio(dst, src, PAGE_SIZE); #ifdef CONFIG_X86 - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); #else if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(src); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 283cd47bb34c..8a85ec109a3a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -28,18 +28,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type); - -#else - -static inline void debug_kmap_atomic(enum km_type type) -{ -} - -#endif - #ifdef CONFIG_HIGHMEM #include @@ -49,6 +37,27 @@ extern unsigned long totalhigh_pages; void kmap_flush_unused(void); +DECLARE_PER_CPU(int, __kmap_atomic_idx); + +static inline int kmap_atomic_idx_push(void) +{ + int idx = __get_cpu_var(__kmap_atomic_idx)++; +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx > KM_TYPE_NR); +#endif + return idx; +} + +static inline int kmap_atomic_idx_pop(void) +{ + int idx = --__get_cpu_var(__kmap_atomic_idx); +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(idx < 0); +#endif + return idx; +} + #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } @@ -66,19 +75,19 @@ static inline void kunmap(struct page *page) { } -static inline void *kmap_atomic(struct page *page, enum km_type idx) +static inline void *__kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, idx, prot) kmap_atomic(page, idx) +#define kmap_atomic_prot(page, prot) __kmap_atomic(page) -static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) +static inline void __kunmap_atomic(void *addr) { pagefault_enable(); } -#define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx)) +#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) #define kmap_flush_unused() do {} while(0) @@ -86,12 +95,20 @@ static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) #endif /* CONFIG_HIGHMEM */ -/* Prevent people trying to call kunmap_atomic() as if it were kunmap() */ -/* kunmap_atomic() should get the return value of kmap_atomic, not the page. */ -#define kunmap_atomic(addr, idx) do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_notypecheck((addr), (idx)); \ - } while (0) +/* + * Make both: kmap_atomic(page, idx) and kmap_atomic(page) work. + */ +#define kmap_atomic(page, args...) 
__kmap_atomic(page) + +/* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +#define kunmap_atomic(addr, args...) \ +do { \ + BUILD_BUG_ON(__same_type((addr), struct page *)); \ + __kunmap_atomic(addr); \ +} while (0) /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7fb592793738..8cdcc2a199ad 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -81,8 +81,7 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset, - int slot) + unsigned long offset) { resource_size_t phys_addr; unsigned long pfn; @@ -90,13 +89,13 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; pfn = (unsigned long) (phys_addr >> PAGE_SHIFT); - return iomap_atomic_prot_pfn(pfn, slot, mapping->prot); + return iomap_atomic_prot_pfn(pfn, mapping->prot); } static inline void -io_mapping_unmap_atomic(void __iomem *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr) { - iounmap_atomic(vaddr, slot); + iounmap_atomic(vaddr); } static inline void __iomem * @@ -137,14 +136,13 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset, - int slot) + unsigned long offset) { return ((char __force __iomem *) mapping) + offset; } static inline void -io_mapping_unmap_atomic(void __iomem *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr) { } diff --git a/mm/highmem.c b/mm/highmem.c index 7a0aa1be4993..781e754a75ac 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -42,6 +42,10 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +DEFINE_PER_CPU(int, __kmap_atomic_idx); +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -422,61 +426,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#ifdef CONFIG_KGDB_KDB - if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { - 
WARN_ON(1); - warn_count--; - } -#endif /* CONFIG_KGDB_KDB */ -} - -#endif From ece0e2b6406a995c371e0311190631ea34ad851a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:52 -0700 Subject: [PATCH 045/176] mm: remove pte_*map_nested() Since we no longer need to provide KM_type, the whole pte_*map_nested() API is now redundant, remove it. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 2 -- arch/arm/include/asm/pgtable.h | 14 ++++++-------- arch/arm/mm/fault-armv.c | 4 ++-- arch/arm/mm/pgd.c | 4 ++-- arch/avr32/include/asm/pgtable.h | 2 -- arch/cris/include/asm/pgtable.h | 2 -- arch/frv/include/asm/pgtable.h | 9 ++------- arch/ia64/include/asm/pgtable.h | 2 -- arch/m32r/include/asm/pgtable.h | 2 -- arch/m68k/include/asm/motorola_pgtable.h | 2 -- arch/m68k/include/asm/sun3_pgtable.h | 2 -- arch/microblaze/include/asm/pgtable.h | 7 ++----- arch/mips/include/asm/pgtable-32.h | 3 --- arch/mips/include/asm/pgtable-64.h | 3 --- arch/mn10300/include/asm/pgtable.h | 2 -- arch/parisc/include/asm/pgtable.h | 2 -- arch/powerpc/include/asm/pgtable-ppc32.h | 8 ++------ arch/powerpc/include/asm/pgtable-ppc64.h | 2 -- arch/s390/include/asm/pgtable.h | 2 -- arch/score/include/asm/pgtable.h | 3 --- arch/sh/include/asm/pgtable_32.h | 3 --- arch/sh/include/asm/pgtable_64.h | 2 -- arch/sparc/include/asm/pgtable_32.h | 3 --- arch/sparc/include/asm/pgtable_64.h | 2 -- arch/tile/include/asm/pgtable.h | 5 ----- arch/um/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_32.h | 14 ++------------ arch/x86/include/asm/pgtable_64.h | 2 -- arch/xtensa/include/asm/pgtable.h | 3 --- mm/memory.c | 4 ++-- mm/mremap.c | 4 ++-- 31 files changed, 22 insertions(+), 99 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 71a243294142..de98a732683d 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -318,9 +318,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) } #define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir),(addr)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) extern pgd_t swapper_pg_dir[1024]; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index a9672e8406a3..b155414192da 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -263,17 +263,15 @@ extern struct page *empty_zero_page; #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) #define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + __pte_index(addr)) -#define pte_offset_map(dir,addr) (__pte_map(dir, KM_PTE0) + __pte_index(addr)) -#define pte_offset_map_nested(dir,addr) (__pte_map(dir, KM_PTE1) + __pte_index(addr)) -#define pte_unmap(pte) __pte_unmap(pte, KM_PTE0) -#define pte_unmap_nested(pte) __pte_unmap(pte, KM_PTE1) +#define pte_offset_map(dir,addr) (__pte_map(dir) + __pte_index(addr)) +#define pte_unmap(pte) __pte_unmap(pte) #ifndef CONFIG_HIGHPTE -#define __pte_map(dir,km) pmd_page_vaddr(*(dir)) -#define __pte_unmap(pte,km) do { } while (0) +#define __pte_map(dir) pmd_page_vaddr(*(dir)) +#define __pte_unmap(pte) do { } 
while (0) #else -#define __pte_map(dir,km) ((pte_t *)kmap_atomic(pmd_page(*(dir)), km) + PTRS_PER_PTE) -#define __pte_unmap(pte,km) kunmap_atomic((pte - PTRS_PER_PTE), km) +#define __pte_map(dir) ((pte_t *)kmap_atomic(pmd_page(*(dir))) + PTRS_PER_PTE) +#define __pte_unmap(pte) kunmap_atomic((pte - PTRS_PER_PTE)) #endif #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 8440d952ba6d..c493d7244d3d 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -89,13 +89,13 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, * open-code the spin-locking. */ ptl = pte_lockptr(vma->vm_mm, pmd); - pte = pte_offset_map_nested(pmd, address); + pte = pte_offset_map(pmd, address); spin_lock(ptl); ret = do_adjust_pte(vma, address, pfn, pte); spin_unlock(ptl); - pte_unmap_nested(pte); + pte_unmap(pte); return ret; } diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index be5f58e153bf..69bbfc6645a6 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -57,9 +57,9 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) goto no_pte; init_pmd = pmd_offset(init_pgd, 0); - init_pte = pte_offset_map_nested(init_pmd, 0); + init_pte = pte_offset_map(init_pmd, 0); set_pte_ext(new_pte, *init_pte, 0); - pte_unmap_nested(init_pte); + pte_unmap(init_pte); pte_unmap(new_pte); } diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h index a9ae30c41e74..6fbfea61f7bb 100644 --- a/arch/avr32/include/asm/pgtable.h +++ b/arch/avr32/include/asm/pgtable.h @@ -319,9 +319,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) struct vm_area_struct; extern void update_mmu_cache(struct vm_area_struct * vma, diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index f63d6fccbc6c..9eaae217b21b 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -248,10 +248,8 @@ static inline pgd_t * pgd_offset(const struct mm_struct *mm, unsigned long addre ((pte_t *) pmd_page_vaddr(*(dir)) + __pte_offset(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_pfn(x) ((unsigned long)(__va((x).pte)) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index c18b0d32e636..6bc241e4b4f8 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -451,17 +451,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + pte_index(address)) 
+#define pte_unmap(pte) kunmap_atomic(pte) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index c3286f42e501..1a97af31ef17 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -406,9 +406,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir,addr) ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel(dir, addr) -#define pte_offset_map_nested(dir,addr) pte_offset_map(dir, addr) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* atomic versions of the some PTE manipulations: */ diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h index e6359c566b50..8a28cfea2729 100644 --- a/arch/m32r/include/asm/pgtable.h +++ b/arch/m32r/include/asm/pgtable.h @@ -332,9 +332,7 @@ static inline void pmd_set(pmd_t * pmdp, pte_t * ptep) ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 2) & 0x1f) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 8e9a8a754dde..45bd3f589bf0 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -221,9 +221,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmdp, unsigned long address) } #define pte_offset_map(pmdp,address) ((pte_t *)__pmd_page(*pmdp) + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) -#define pte_offset_map_nested(pmdp, address) pte_offset_map(pmdp, address) #define pte_unmap(pte) ((void)0) -#define pte_unmap_nested(pte) ((void)0) /* * Allocate and free page tables. The xxx_kernel() versions are diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index f847ec732d62..cf5fad9b5250 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -219,9 +219,7 @@ static inline pte_t pgoff_to_pte(unsigned off) #define pte_offset_kernel(pmd, address) ((pte_t *) __pmd_page(*pmd) + pte_index(address)) /* FIXME: should we bother with kmap() here? */ #define pte_offset_map(pmd, address) ((pte_t *)kmap(pmd_page(*pmd)) + pte_index(address)) -#define pte_offset_map_nested(pmd, address) pte_offset_map(pmd, address) #define pte_unmap(pte) kunmap(pte) -#define pte_unmap_nested(pte) kunmap(pte) /* Macros to (de)construct the fake PTEs representing swap pages. 
*/ #define __swp_type(x) ((x).val & 0x7F) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index d4f421672d3b..cae268c22ba2 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -504,12 +504,9 @@ static inline pmd_t *pmd_offset(pgd_t *dir, unsigned long address) #define pte_offset_kernel(dir, addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(addr)) #define pte_offset_map(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE0) + pte_index(addr)) -#define pte_offset_map_nested(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE1) + pte_index(addr)) + ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) +#define pte_unmap(pte) kunmap_atomic(pte) /* Encode and decode a nonlinear file mapping entry */ #define PTE_FILE_MAX_BITS 29 diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index ae90412556d0..8a153d2fa62a 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -154,10 +154,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) #if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 1be4b0fa30da..f00896087dda 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -257,10 +257,7 @@ static inline pmd_t *pmd_offset(pud_t * pud, unsigned long address) ((pte_t *) pmd_page_vaddr(*(dir)) + __pte_offset(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) /* * Initialize a new pgd / pmd table with invalid pointers. 
diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h index 16d88577f3e0..b049a8bd1577 100644 --- a/arch/mn10300/include/asm/pgtable.h +++ b/arch/mn10300/include/asm/pgtable.h @@ -457,9 +457,7 @@ static inline int set_kernel_exec(unsigned long vaddr, int enable) #define pte_offset_map(dir, address) \ ((pte_t *) page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do {} while (0) -#define pte_unmap_nested(pte) do {} while (0) /* * The MN10300 has external MMU info in the form of a TLB: this is adapted from diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 01c15035e783..865f37a8a881 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -397,9 +397,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(pmd, address) \ ((pte_t *) pmd_page_vaddr(*(pmd)) + pte_index(address)) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index a7db96f2b5c3..47edde8c3556 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -308,12 +308,8 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) #define pte_offset_kernel(dir, addr) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE0) + pte_index(addr)) -#define pte_offset_map_nested(dir, addr) \ - ((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE1) + pte_index(addr)) - -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr)) +#define pte_unmap(pte) kunmap_atomic(pte) /* * Encode and decode a swap entry. 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 49865045d56f..2b09cd522d33 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -193,9 +193,7 @@ (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) -#define pte_unmap_nested(pte) do { } while(0) /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 986dc9476c21..02ace3491c51 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1094,9 +1094,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) #define pte_offset_kernel(pmd, address) pte_offset(pmd,address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* * 31 bit swap entry format: diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h index ccf38f06c57d..2fd469807683 100644 --- a/arch/score/include/asm/pgtable.h +++ b/arch/score/include/asm/pgtable.h @@ -88,10 +88,7 @@ static inline void pmd_clear(pmd_t *pmdp) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) #define pte_unmap(pte) ((void)(pte)) -#define pte_unmap_nested(pte) ((void)(pte)) /* * Bits 9(_PAGE_PRESENT) and 10(_PAGE_FILE)are taken, diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index e172d696e52b..69fdfbf14ea5 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -429,10 +429,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address) - #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #ifdef CONFIG_X2TLB #define pte_ERROR(e) \ diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h index 0ee46776dad6..10a48111226d 100644 --- a/arch/sh/include/asm/pgtable_64.h +++ b/arch/sh/include/asm/pgtable_64.h @@ -84,9 +84,7 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) ((pte_t *) ((pmd_val(*(dir))) & PAGE_MASK) + pte_index((addr))) #define pte_offset_map(dir,addr) pte_offset_kernel(dir, addr) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel(dir, addr) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #ifndef __ASSEMBLY__ #define IOBASE_VADDR 0xff000000 diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 0ece77f47753..303bd4dc8292 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -304,10 +304,7 @@ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) * and 
sun4c is guaranteed to have no highmem anyway. */ #define pte_offset_map(d, a) pte_offset_kernel(d,a) -#define pte_offset_map_nested(d, a) pte_offset_kernel(d,a) - #define pte_unmap(pte) do{}while(0) -#define pte_unmap_nested(pte) do{}while(0) /* Certain architectures need to do special things when pte's * within a page table are directly modified. Thus, the following diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f5b5fa76c02d..f8dddb7045bb 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -652,9 +652,7 @@ static inline int pte_special(pte_t pte) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_kernel pte_index #define pte_offset_map pte_index -#define pte_offset_map_nested pte_index #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* Actual page table PTE updates. */ extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig); diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index b3367379d537..dc4ccdd855bc 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -347,15 +347,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern pte_t *_pte_offset_map(pmd_t *, unsigned long address, enum km_type); #define pte_offset_map(dir, address) \ _pte_offset_map(dir, address, KM_PTE0) -#define pte_offset_map_nested(dir, address) \ - _pte_offset_map(dir, address, KM_PTE1) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) pte_offset_kernel(dir, address) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a non-executable kernel PTE and flush it from the TLB. */ diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index a9f7251b4a8d..41474fb5eee7 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -338,9 +338,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 8abde9ec90bf..0c92113c4cb6 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #endif #if defined(CONFIG_HIGHPTE) -#define __KM_PTE \ - (in_nmi() ? KM_NMI_PTE : \ - in_irq() ? 
KM_IRQ_PTE : \ - KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f96ac9bedf75..f86da20347f2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -127,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 76bf35554117..b03c043ce75b 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -324,10 +324,7 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define pte_offset_kernel(dir,addr) \ ((pte_t*) pmd_page_vaddr(*(dir)) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir),(addr)) - #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) /* diff --git a/mm/memory.c b/mm/memory.c index af82741caaa4..92cc54e94137 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -736,7 +736,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; @@ -767,7 +767,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(orig_src_pte); + pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); diff --git a/mm/mremap.c b/mm/mremap.c index cde56ee51ef7..563fbdd6293a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. 
*/ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); From 7a837d1bb7cb2bceb093ec639068626586a89234 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:53 -0700 Subject: [PATCH 046/176] perf, x86: Fix up kmap_atomic() type Now that the KM_type stuff is history, clean up the compiler warning. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9a..c1e8c7a51164 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; From d65bfacb046f3df8aa11a9cb9b6e448f6171174d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:54 -0700 Subject: [PATCH 047/176] mm: highmem documentation Document outlining some of the highmem issues, started by me, edited by David. Signed-off-by: David Howells Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/highmem.txt | 162 +++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 Documentation/vm/highmem.txt diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.txt new file mode 100644 index 000000000000..4324d24ffacd --- /dev/null +++ b/Documentation/vm/highmem.txt @@ -0,0 +1,162 @@ + + ==================== + HIGH MEMORY HANDLING + ==================== + +By: Peter Zijlstra + +Contents: + + (*) What is high memory? + + (*) Temporary virtual mappings. + + (*) Using kmap_atomic. + + (*) Cost of temporary mappings. + + (*) i386 PAE. + + +==================== +WHAT IS HIGH MEMORY? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. 
At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +========================== +TEMPORARY VIRTUAL MAPPINGS +========================== + +The kernel contains several ways of creating temporary mappings: + + (*) vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. + + (*) kmap(). This permits a short duration mapping of a single page. It needs + global synchronization, but is amortized somewhat. It is also prone to + deadlocks when using in a nested fashion, and so it is not recommended for + new code. + + (*) kmap_atomic(). This permits a very short duration mapping of a single + page. Since the mapping is restricted to the CPU that issued it, it + performs well, but the issuing task is therefore required to stay on that + CPU until it has finished, lest some other task displace its mappings. + + kmap_atomic() may also be used by interrupt contexts, since it is does not + sleep and the caller may not sleep until after kunmap_atomic() is called. + + It may be assumed that k[un]map_atomic() won't fail. + + +================= +USING KMAP_ATOMIC +================= + +When and where to use kmap_atomic() is straightforward. It is used when code +wants to access the contents of a page that might be allocated from high memory +(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two +functions, and they can be used in a manner similar to the following: + + /* Find the page of interest. */ + struct page *page = find_get_page(mapping, offset); + + /* Gain access to the contents of that page. */ + void *vaddr = kmap_atomic(page); + + /* Do something to the contents of that page. */ + memset(vaddr, 0, PAGE_SIZE); + + /* Unmap that page. */ + kunmap_atomic(vaddr); + +Note that the kunmap_atomic() call takes the result of the kmap_atomic() call +not the argument. 
+ +If you need to map two pages because you want to copy from one page to +another you need to keep the kmap_atomic calls strictly nested, like: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +========================== +COST OF TEMPORARY MAPPINGS +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +======== +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + + (*) Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + + (*) you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + + (*) PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. From ea05c8444e451f1cfbf78c68733e717ad7b8602b Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Tue, 26 Oct 2010 14:21:54 -0700 Subject: [PATCH 048/176] mm: add a might_sleep_if() to dma_pool_alloc() Buggy drivers (e.g. fsl_udc) could call dma_pool_alloc from atomic context with GFP_KERNEL. In most instances, the first pool_alloc_page call would succeed and the sleeping functions would never be called. This allowed the buggy drivers to slip through the cracks. Add a might_sleep_if() checking for __GFP_WAIT in flags. Signed-off-by: Dima Zavin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df063706f53..4df2de77e069 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { From 182fea8f48332de085c0ae936605cb72671db9f2 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Tue, 26 Oct 2010 14:21:55 -0700 Subject: [PATCH 049/176] mm: remove alignment padding from anon_vma on (some) 64 bit builds Reorder structure anon_vma to remove alignment padding on 64 builds when (CONFIG_KSM || CONFIG_MIGRATION). This will shrink the size of the anon_vma structure from 40 to 32 bytes & allow more objects per slab in its kmem_cache. 
Under slub the objects in the anon_vma kmem_cache will then be 40 bytes with 102 objects per slab. (On v2.6.36 without this patch,the size is 48 bytes and 85 objects/slab.) Signed-off-by: Richard Kennedy Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 31b2fd75dcba..5c98df68a953 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - spinlock_t lock; /* Serialize access to vma list */ struct anon_vma *root; /* Root of this anon_vma tree */ + spinlock_t lock; /* Serialize access to vma list */ #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) /* From b522c94da5d9cbc73f708be5e530ebc3bbd4a031 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:56 -0700 Subject: [PATCH 050/176] mm: filemap_fault: unique path for locking page Introduce a single location where filemap_fault() locks the desired page. There used to be two such places, depending if the initial find_get_page() was successful or not. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Acked-by: Linus Torvalds Cc: Nick Piggin Reviewed-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e4221..8ed709a83eb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1539,25 +1539,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + lock_page(page); + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. From d065bd810b6deb67d4897a14bfe21f8eb526ba99 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:57 -0700 Subject: [PATCH 051/176] mm: retry page fault when blocking on disk transfer This change reduces mmap_sem hold times that are caused by waiting for disk transfers when accessing file mapped VMAs. It introduces the VM_FAULT_ALLOW_RETRY flag, which indicates that the call site wants mmap_sem to be released if blocking on a pending disk transfer. In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and do_page_fault() will then re-acquire mmap_sem and retry the page fault. It is expected that the retry will hit the same page which will now be cached, and thus it will complete with a low mmap_sem hold time. Tests: - microbenchmark: thread A mmaps a large file and does random read accesses to the mmaped area - achieves about 55 iterations/s. 
Thread B does mmap/munmap in a loop at a separate location - achieves 55 iterations/s before, 15000 iterations/s after. - We are seeing related effects in some applications in house, which show significant performance regressions when running without this change. [akpm@linux-foundation.org: fix warning & crash] Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Acked-by: Linus Torvalds Cc: Nick Piggin Reviewed-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 38 ++++++++++++++++++++++++++------------ include/linux/mm.h | 2 ++ include/linux/pagemap.h | 13 +++++++++++++ mm/filemap.c | 16 +++++++++++++++- mm/memory.c | 10 ++++++++-- 5 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 852b319edbdc..9b2345c9e0c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -956,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct task_struct *tsk; unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); tsk = current; mm = tsk->mm; @@ -1068,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1111,8 +1114,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -1123,21 +1124,34 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. 
*/ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2862009f9573..3bf46655b50a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -723,6 +724,7 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ +#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e12cdc6d79ee..2d1ffe3cf1ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -299,6 +299,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern void __lock_page_nosync(struct page *page); +extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags); extern void unlock_page(struct page *page); static inline void __set_page_locked(struct page *page) @@ -350,6 +352,17 @@ static inline void lock_page_nosync(struct page *page) __lock_page_nosync(page); } +/* + * lock_page_or_retry - Lock the page, unless this would block and the + * caller indicated that it can handle a retry. + */ +static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + might_sleep(); + return trylock_page(page) || __lock_page_or_retry(page, mm, flags); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! diff --git a/mm/filemap.c b/mm/filemap.c index 8ed709a83eb7..33f81252a744 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -1550,7 +1563,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto no_cached_page; } - lock_page(page); + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return ret | VM_FAULT_RETRY; /* Did it get truncated? 
*/ if (unlikely(page->mapping != mapping)) { diff --git a/mm/memory.c b/mm/memory.c index 92cc54e94137..714c4438d887 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2627,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; int exclusive = 0; int ret = 0; @@ -2677,8 +2678,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not @@ -2927,7 +2932,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; if (unlikely(PageHWPoison(vmf.page))) { From 68da336a14e16c2de95e987f3200995b707d7038 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:58 -0700 Subject: [PATCH 052/176] x86: access_error API cleanup access_error() already takes error_code as an argument, so there is no need for an additional write flag. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Nick Piggin Acked-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9b2345c9e0c3..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -919,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1114,7 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } From 0116651c85e671a693dd2f56e95dd651f746c973 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:58 -0700 Subject: [PATCH 053/176] mm: remove temporary variable on generic_file_direct_write() 'end' shadows earlier one and is not necessary at all. Remove it and use 'pos' instead. 
This removes following sparse warnings: mm/filemap.c:2180:24: warning: symbol 'end' shadows an earlier one mm/filemap.c:2132:25: originally declared here Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 33f81252a744..75572b5f2374 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2193,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; From e6223a3b19421e3a8df1352d21fd0d71093f44ae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:59 -0700 Subject: [PATCH 054/176] mm: add casts to/from gfp_t in gfp_to_alloc_flags() This removes following warning from sparse: mm/page_alloc.c:1934:9: warning: restricted gfp_t degrades to integer Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b13bc5e5bd7d..07a654486f75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1932,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1940,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; From 25ca1d6c02fe1c6d90d918867ef670d323725458 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:21:59 -0700 Subject: [PATCH 055/176] mm: wrap get_locked_pte() using __cond_lock() The get_locked_pte() conditionally grabs 'ptl' in case of returning non-NULL. This leads sparse to complain about context imbalance. Rename and wrap it using __cond_lock() to make sparse happy. 
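For illustration only (not part of this patch), a typical caller of the
wrapped helper is unchanged: on success the pte comes back mapped and
locked, and must be released with pte_unmap_unlock():

	spinlock_t *ptl;
	pte_t *pte;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	/* ... examine or update *pte while holding *ptl ... */
	pte_unmap_unlock(pte, ptl);

The __cond_lock() wrapper only tells sparse that *ptl is acquired exactly
when a non-NULL pte is returned; outside __CHECKER__ builds it expands to
the plain call and generates no extra code.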
Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- mm/memory.c | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bf46655b50a..721f451c3029 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1034,7 +1034,15 @@ extern void unregister_shrinker(struct shrinker *); int vma_wants_writenotify(struct vm_area_struct *vma); -extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); +extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl); +static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pte_t *ptep; + __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); + return ptep; +} #ifdef __PAGETABLE_PUD_FOLDED static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, diff --git a/mm/memory.c b/mm/memory.c index 714c4438d887..4ce24a4d5d48 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1591,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); From e6219ec8195efd5640765e657810f262ad9d1a92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:00 -0700 Subject: [PATCH 056/176] mm: add lock release annotation on do_wp_page() The do_wp_page() releases @ptl but was missing proper annotation. Add it. This removes following warnings from sparse: mm/memory.c:2337:9: warning: context imbalance in 'do_wp_page' - unexpected unlock mm/memory.c:3142:19: warning: context imbalance in 'handle_mm_fault' - different lock contexts for basic block Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory.c b/mm/memory.c index 4ce24a4d5d48..01bdf9dbbfac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2108,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; From 1b36ba815bd91f17e31277a44dd5c6b6a5a8d97e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:00 -0700 Subject: [PATCH 057/176] mm: wrap follow_pte() using __cond_lock() The follow_pte() conditionally grabs *@ptlp in case of returning 0. 
Rename and wrap it using __cond_lock() removes following warnings: mm/memory.c:2337:9: warning: context imbalance in 'do_wp_page' - unexpected unlock mm/memory.c:3142:19: warning: context imbalance in 'handle_mm_fault' - different lock contexts for basic block Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 01bdf9dbbfac..861f7982dd54 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3388,6 +3388,17 @@ static int follow_pte(struct mm_struct *mm, unsigned long address, return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping From ea4525b6008fb29553306ec6719f8e6930ac9499 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:01 -0700 Subject: [PATCH 058/176] rmap: annotate lock context change on page_[un]lock_anon_vma() The page_lock_anon_vma() conditionally grabs RCU and anon_vma lock but page_unlock_anon_vma() releases them unconditionally. This leads sparse to complain about context imbalance. Annotate them. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 15 ++++++++++++++- mm/rmap.c | 4 +++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5c98df68a953..07ea89c16761 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -230,7 +230,20 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma(struct page *page); +struct anon_vma *__page_lock_anon_vma(struct page *page); + +static inline struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma; + + __cond_lock(RCU, anon_vma = __page_lock_anon_vma(page)); + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(&anon_vma->root->lock, anon_vma); + + return anon_vma; +} + void page_unlock_anon_vma(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index f5ad996a4a8f..0995a8f68866 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -314,7 +314,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. 
*/ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; @@ -348,6 +348,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) } void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { anon_vma_unlock(anon_vma); rcu_read_unlock(); From e9a81a821d7f9c5d899cc3acdeafbd884c2c48bb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:01 -0700 Subject: [PATCH 059/176] rmap: wrap page_check_address() using __cond_lock() The page_check_address() conditionally grabs *@ptlp in case of returning non-NULL. Rename and wrap it using __cond_lock() removes following warnings from sparse: mm/rmap.c:472:9: warning: context imbalance in 'page_mapped_in_vma' - unexpected unlock mm/rmap.c:524:9: warning: context imbalance in 'page_referenced_one' - unexpected unlock mm/rmap.c:706:9: warning: context imbalance in 'page_mkclean_one' - unexpected unlock mm/rmap.c:1066:9: warning: context imbalance in 'try_to_unmap_one' - unexpected unlock Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 13 ++++++++++++- mm/rmap.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 07ea89c16761..bb83c0da2071 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -205,9 +205,20 @@ int try_to_unmap_one(struct page *, struct vm_area_struct *, /* * Called from mm/filemap_xip.c to unmap empty zero page */ -pte_t *page_check_address(struct page *, struct mm_struct *, +pte_t *__page_check_address(struct page *, struct mm_struct *, unsigned long, spinlock_t **, int); +static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, + unsigned long address, + spinlock_t **ptlp, int sync) +{ + pte_t *ptep; + + __cond_lock(*ptlp, ptep = __page_check_address(page, mm, address, + ptlp, sync)); + return ptep; +} + /* * Used by swapoff to help locate where page is expected in vma. */ diff --git a/mm/rmap.c b/mm/rmap.c index 0995a8f68866..bfeffbddb712 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; From e574b5fd20027b422aa80790f710d695699b4fba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:02 -0700 Subject: [PATCH 060/176] rmap: make anon_vma_chain_free() static Make anon_vma_chain_free() static. It is called only in rmap.c and the corresponding alloc function is already static. 
Signed-off-by: Namhyung Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index bfeffbddb712..1a8bf76bfd03 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void) return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); } -void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } From 170168d0a351c045adc0bee0987e51dfc82890c0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:02 -0700 Subject: [PATCH 061/176] vmalloc: rename temporary variable in __insert_vmap_area() Rename redundant 'tmp' to fix following sparse warnings: mm/vmalloc.c:296:34: warning: symbol 'tmp' shadows an earlier one mm/vmalloc.c:293:24: originally declared here Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9f909622a25e..55c7a47391f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); From e199b5d1fed13f5e8f47a0ee8216f36244dad1f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:03 -0700 Subject: [PATCH 062/176] vmalloc: annotate lock context change on s_start/stop() s_start() and s_stop() grab/release vmlist_lock but were missing proper annotations. Add them. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 55c7a47391f6..f492c774fa7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2350,6 +2350,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -2376,6 +2377,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } From 92c09c041f15fc88b35f8628e07639f52e1fbb38 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:03 -0700 Subject: [PATCH 063/176] mm: declare some external symbols Declare 'bdi_pending_list' and 'tag_pages_for_writeback()' to remove following sparse warnings: mm/backing-dev.c:46:1: warning: symbol 'bdi_pending_list' was not declared. Should it be static? mm/page-writeback.c:825:6: warning: symbol 'tag_pages_for_writeback' was not declared. Should it be static? 
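As an aside (illustrative sketch, not taken from the diff below): the
warning appears because a non-static symbol is defined without any prior
declaration in scope, so the fix is to declare it in a header included by
the defining file, along the lines of:

	/* mm/backing-dev.c */
	LIST_HEAD(bdi_pending_list);

	/* include/linux/backing-dev.h */
	extern struct list_head bdi_pending_list;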
Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 1 + include/linux/writeback.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f1b402a50679..4ce34fa937d4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -111,6 +111,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..c7299d2ace6b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -149,6 +149,8 @@ int write_cache_pages(struct address_space *mapping, int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void set_page_dirty_balance(struct page *page, int page_mkwrite); void writeback_set_ratelimit(void); +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl From 36deb0be314702627aeae1f5737fc84d01dc26c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:04 -0700 Subject: [PATCH 064/176] vmstat: include compaction.h when CONFIG_COMPACTION This removes following warning from sparse: mm/vmstat.c:466:5: warning: symbol 'fragmentation_index' was not declared. Should it be static? [akpm@linux-foundation.org: move the include to top-of-file] Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index baa4ab387db7..cd2e42be7b68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -395,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_COMPACTION + struct contig_page_info { unsigned long free_pages; unsigned long free_blocks_total; From 16b56cf4b8a0fa9acc21bd2ad19839b917999b96 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:04 -0700 Subject: [PATCH 065/176] mm: fix sparse warnings on GFP_ZONE_TABLE/BAD Introduce ___GFP_* masks in order for gfp_t to not be mixed with plain integers which causes a lot of warnings like the following: warning: restricted gfp_t degrades to integer Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 105 ++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 975609cb8548..e8713d55360a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -9,6 +9,32 @@ struct vm_area_struct; +/* Plain integer GFP bitmasks. Do not use this directly. 
*/ +#define ___GFP_DMA 0x01u +#define ___GFP_HIGHMEM 0x02u +#define ___GFP_DMA32 0x04u +#define ___GFP_MOVABLE 0x08u +#define ___GFP_WAIT 0x10u +#define ___GFP_HIGH 0x20u +#define ___GFP_IO 0x40u +#define ___GFP_FS 0x80u +#define ___GFP_COLD 0x100u +#define ___GFP_NOWARN 0x200u +#define ___GFP_REPEAT 0x400u +#define ___GFP_NOFAIL 0x800u +#define ___GFP_NORETRY 0x1000u +#define ___GFP_COMP 0x4000u +#define ___GFP_ZERO 0x8000u +#define ___GFP_NOMEMALLOC 0x10000u +#define ___GFP_HARDWALL 0x20000u +#define ___GFP_THISNODE 0x40000u +#define ___GFP_RECLAIMABLE 0x80000u +#ifdef CONFIG_KMEMCHECK +#define ___GFP_NOTRACK 0x200000u +#else +#define ___GFP_NOTRACK 0 +#endif + /* * GFP bitmasks.. * @@ -18,10 +44,10 @@ struct vm_area_struct; * without the underscores and use them consistently. The definitions here may * be used in bit comparisons. */ -#define __GFP_DMA ((__force gfp_t)0x01u) -#define __GFP_HIGHMEM ((__force gfp_t)0x02u) -#define __GFP_DMA32 ((__force gfp_t)0x04u) -#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define __GFP_DMA ((__force gfp_t)___GFP_DMA) +#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) +#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) +#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning @@ -38,27 +64,22 @@ struct vm_area_struct; * __GFP_MOVABLE: Flag that this page will be movable by the page migration * mechanism or reclaimed */ -#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */ -#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */ -#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */ -#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */ -#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */ -#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */ -#define __GFP_REPEAT ((__force gfp_t)0x400u) /* See above */ -#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* See above */ -#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* See above */ -#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */ -#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ -#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ -#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ -#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ -#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ - -#ifdef CONFIG_KMEMCHECK -#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ -#else -#define __GFP_NOTRACK ((__force gfp_t)0) -#endif +#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */ +#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ +#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ +#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? 
*/ +#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */ +#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */ +#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ +#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ +#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ +#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ +#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */ +#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ +#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ +#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ /* * This may seem redundant, but it's a way of annotating false positives vs. @@ -186,14 +207,14 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) #endif #define GFP_ZONE_TABLE ( \ - (ZONE_NORMAL << 0 * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ - | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ - | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ - | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ - | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \ ) /* @@ -203,20 +224,20 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) * allowed. 
*/ #define GFP_ZONE_BAD ( \ - 1 << (__GFP_DMA | __GFP_HIGHMEM) \ - | 1 << (__GFP_DMA | __GFP_DMA32) \ - | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ - | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ + 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ + | 1 << (___GFP_DMA | ___GFP_DMA32) \ + | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; - int bit = flags & GFP_ZONEMASK; + int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & ((1 << ZONES_SHIFT) - 1); From 74ce002d9aee23031b4967e1dd1c1966ddc60749 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:05 -0700 Subject: [PATCH 066/176] fs/fs-writeback.c: restore lost comment I had to go back to a 2.6.20 tree to work out why we're adding a number-of-inodes into a number-of-pages count. Restore the lost comment. Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 97d2951bd4d1..b5aae4bd0aca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -721,6 +721,10 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; + /* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); From 7bbc0905ea4f7a471a7f79d0bea5d538f5114fc9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:05 -0700 Subject: [PATCH 067/176] mm/memory_hotplug.c: make scan_lru_pages() static Reported-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b0dc65452973..bb63b36c4413 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -646,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; From e1ca7788dec6773b1a2bce51b7141948f2b8bccf Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 26 Oct 2010 14:22:06 -0700 Subject: [PATCH 068/176] mm: add vzalloc() and vzalloc_node() helpers Add vzalloc() and vzalloc_node() to encapsulate the vmalloc-then-memset-zero operation. Use __GFP_ZERO to zero fill the allocated memory. 
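As a sketch of the intent (illustrative, not taken from the diff), callers
can replace the open-coded sequence:

	buf = vmalloc(size);
	if (buf)
		memset(buf, 0, size);

with:

	buf = vzalloc(size);

The memory is released with vfree() in both cases.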
Signed-off-by: Dave Young Cc: Christoph Lameter Acked-by: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 ++ mm/nommu.c | 49 ++++++++++++++++++++++++++++++++++++++++- mm/vmalloc.c | 46 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 63a4fe6d51bd..a03dcf62ca9d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,8 +53,10 @@ static inline void vmalloc_init(void) #endif extern void *vmalloc(unsigned long size); +extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); +extern void *vzalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/mm/nommu.c b/mm/nommu.c index 88ff091eb07a..30b5c20eec15 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -293,11 +293,58 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } -EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
+ */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f492c774fa7b..a3d66b3dc5cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1596,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1607,11 +1614,27 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); +/** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -1653,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif From 66d7dd518ae413a383ab2c6c263cc30617329842 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 26 Oct 2010 14:22:06 -0700 Subject: [PATCH 069/176] /proc/swaps: support polling System management wants to subscribe to changes in swap configuration. Make /proc/swaps pollable like /proc/mounts. 
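A minimal userspace sketch of the intended use (illustrative only,
following the /proc/mounts convention; needs <poll.h>, <fcntl.h> and
<unistd.h>): poll for an exceptional event, then re-read the file from the
start when woken:

	struct pollfd pfd = {
		.fd	= open("/proc/swaps", O_RDONLY),
		.events	= POLLPRI,
	};

	while (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & (POLLERR | POLLPRI)) {
			lseek(pfd.fd, 0, SEEK_SET);
			/* read() and reparse the swap list */
		}
	}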
[akpm@linux-foundation.org: document proc_poll_event] Signed-off-by: Kay Sievers Acked-by: Greg KH Cc: Jonathan Corbet Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fc7bac7db0c..67ddaaf98c74 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1688,6 +1695,25 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { @@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: From 70384dc6dcc6aa76762200262820bdb8b724ecd5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 26 Oct 2010 14:22:07 -0700 Subject: [PATCH 070/176] mm: fix error reporting in move_pages() syscall The vma returned by find_vma does not necessarily include the target address. If this happens the code tries to follow a page outside of any vma and returns ENOENT instead of EFAULT. 
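The semantics being relied on (illustrative restatement): find_vma(mm, addr)
returns the first vma with vm_end > addr, which can lie entirely above addr
when addr falls in an unmapped hole, so a caller that needs addr to actually
be mapped must also check vm_start:

	vma = find_vma(mm, addr);
	if (!vma || addr < vma->vm_start)
		return -EFAULT;		/* addr is in a hole, not a missing page */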
Signed-off-by: Gleb Natapov Acked-by: Christoph Lameter Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 35e454189966..fe5a3c6a5426 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1037,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); @@ -1204,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); From 44e2aa937e698ea95dd86b2a4fabd734ef2c76db Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Tue, 26 Oct 2010 14:22:08 -0700 Subject: [PATCH 071/176] mm/hugetlb.c: add missing spin_lock() to hugetlb_cow() Add missing spin_lock() of the page_table_lock before an error return in hugetlb_cow(). Callers of hugetlb_cow() expect it to be held upon return. Signed-off-by: Dean Nelson Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96991ded82fe..c4a3558589ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2448,8 +2448,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; + } copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); From 572438f9b52236bd8938b1647cc15e027d27ef55 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Oct 2010 14:22:08 -0700 Subject: [PATCH 072/176] mm: fix is_mem_section_removable() page_order BUG_ON check page_order() is called by memory hotplug's user interface to check whether the section is removable or not. (is_mem_section_removable()) It calls page_order() without holding zone->lock. So, even if the caller does if (PageBuddy(page)) ret = page_order(page) ... The caller may hit BUG_ON(). For fixing this, there are 2 choices. 1. add zone->lock. 2. remove BUG_ON(). is_mem_section_removable() is used for some "advice" and doesn't need to be 100% accurate. It can be called from a user program, and we don't want to hold this important lock for long at a user's request. So, this patch removes BUG_ON().
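Spelled out as a sketch (illustrative, not the exact hotplug loop), the caller-side rule the new comment documents is: only read page_private() after a PageBuddy() check, and treat the result as a hint, since without zone->lock the page can be allocated between the two reads:

	if (PageBuddy(page)) {
		/* racy without zone->lock: use the order only as an advisory hint */
		order = page_order(page);
	}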
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..dedb0aff673f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } From f6a3607e5f30dc642bead8cd95c48d47b6b4bfbb Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:09 -0700 Subject: [PATCH 073/176] mm: page_isolation: codeclean fix comment and rm unneeded val init __test_page_isolated_in_pageblock() returns 1 if all pages in the range are isolated, so fix the comment. Variable `pfn' will be initialised in the following loop so remove it. Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd967452..4ae42bb40892 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. From 809c444977adb7313e0612e9e3af4b73ba3f5746 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:10 -0700 Subject: [PATCH 074/176] mm: do_migrate_range: exit loop if not_managed is true If not_managed is true all pages will be putback to lru, so break the loop earlier to skip other pages isolate. Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bb63b36c4413..e4af144ee409 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -696,15 +696,17 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { - /* Becasue we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) - not_managed++; #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); dump_page(page); #endif + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + break; + } } } ret = -EBUSY; From f3ab2636c5c1dd9ab0ff53a46d8354d5769ffdd4 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 26 Oct 2010 14:22:10 -0700 Subject: [PATCH 075/176] mm: do_migrate_range: reduce list_empty() check Simple code for reducing list_empty(&source) check. 
Signed-off-by: Bob Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4af144ee409..9260314a221e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -705,24 +705,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) check this again here. */ if (page_count(page)) { not_managed++; + ret = -EBUSY; break; } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); - if (ret) - putback_lru_pages(&source); - out: return ret; } From a56d5318716d120e040294bb258901ba89fb9c90 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 26 Oct 2010 14:22:11 -0700 Subject: [PATCH 076/176] hpet: unmap unused I/O space When the initialization code in hpet finds a memory resource and does not find an IRQ, it does not unmap the memory resource previously mapped. There are buggy BIOSes which report resources exactly like this and what is worse the memory region bases point to normal RAM. This normally would not matter since the space is not touched. But when PAT is turned on, ioremap causes the page to be uncached and sets this bit in page->flags. Then when the page is about to be used by the allocator, it is reported as: BUG: Bad page state in process md5sum pfn:3ed00 page:ffffea0000dbd800 count:0 mapcount:0 mapping:(null) index:0x0 page flags: 0x20000001000000(uncached) Pid: 7956, comm: md5sum Not tainted 2.6.34-12-desktop #1 Call Trace: [] bad_page+0xb1/0x100 [] prep_new_page+0x1a5/0x1c0 [] get_page_from_freelist+0x3a1/0x640 [] __alloc_pages_nodemask+0x10f/0x6b0 ... In this particular case: 1) HPET returns 3ed00000 as memory region base, but it is not in reserved ranges reported by the BIOS (excerpt): BIOS-e820: 0000000000100000 - 00000000af6cf000 (usable) BIOS-e820: 00000000af6cf000 - 00000000afdcf000 (reserved) 2) there is no IRQ resource reported by HPET method. On the other hand, the Intel HPET specs (1.0a) says (3.2.5.1): _CRS ( // Report 1K of memory consumed by this Timer Block memory range consumed // Optional: only used if BIOS allocates Interrupts [1] IRQs consumed ) [1] For case where Timer Block is configured to consume IRQ0/IRQ8 AND Legacy 8254/Legacy RTC hardware still exists, the device objects associated with 8254 & RTC devices should not report IRQ0/IRQ8 as "consumed resources". So in theory we should check whether if it is the case and use those interrupts instead. Anyway the address reported by the BIOS here is bogus, so non-presence of IRQ doesn't mean the "optional" part in point 2). Since I got no reply previously, fix this by simply unmapping the space when IRQ is not found and memory region was mapped previously. It would be probably more safe to walk the resources again and unmap appropriately depending on type. But as we now use only ioremap for both 2 memory resource types, it is not necessarily needed right now. 
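As a sketch of the pattern only (the names here are illustrative, not the driver's exact variables): whatever was ioremap()ed earlier in the probe path has to be unmapped again on any later error exit, otherwise the mapping (and, with PAT, the uncached page attribute) is leaked:

	base = ioremap(res->start, resource_size(res));
	if (!base)
		return -ENOMEM;

	if (!nirqs) {
		iounmap(base);		/* undo the earlier mapping before failing */
		return -ENODEV;
	}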
Addresses https://bugzilla.novell.com/show_bug.cgi?id=629908 Reported-by: Olaf Hering Signed-off-by: Jiri Slaby Acked-by: Clemens Ladisch Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index a4eee324eb1e..84e6a7e673b7 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -1000,6 +1000,8 @@ static int hpet_acpi_add(struct acpi_device *device) return -ENODEV; if (!data.hd_address || !data.hd_nirqs) { + if (data.hd_address) + iounmap(data.hd_address); printk("%s: no address or irqs in _CRS\n", __func__); return -ENODEV; } From 96e9694df446d1154ec2f4fdba8908588b9cba38 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Tue, 26 Oct 2010 14:22:13 -0700 Subject: [PATCH 077/176] hpet: fix unwanted interrupt due to stale irq status bit Jaswinder Singh Rajput wrote: > By executing Documentation/timers/hpet_example.c > > for polling, I requested for 3 iterations but it seems iteration work > for only 2 as first expired time is always very small. > > # ./hpet_example poll /dev/hpet 10 3 > -hpet: executing poll > hpet_poll: info.hi_flags 0x0 > hpet_poll: expired time = 0x13 > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 > hpet_poll: expired time = 0x1868c > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 > hpet_poll: expired time = 0x18645 > hpet_poll: revents = 0x1 > hpet_poll: data 0x1 Clearing the HPET interrupt enable bit disables interrupt generation but does not disable the timer, so the interrupt status bit will still be set when the timer elapses. If another interrupt arrives before the timer has been correctly programmed (due to some other device on the same interrupt line, or CONFIG_DEBUG_SHIRQ), this results in an extra unwanted interrupt event because the status bit is likely to be set from comparator matches that happened before the device was opened. Therefore, we have to ensure that the interrupt status bit is and stays cleared until we actually program the timer. Signed-off-by: Clemens Ladisch Reported-by: Jaswinder Singh Rajput Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Bob Picco Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 84e6a7e673b7..a2cbb828e92a 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -465,6 +465,21 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp) if (irq) { unsigned long irq_flags; + if (devp->hd_flags & HPET_SHARED_IRQ) { + /* + * To prevent the interrupt handler from seeing an + * unwanted interrupt status bit, program the timer + * so that it will not fire in the near future ... + */ + writel(readl(&timer->hpet_config) & ~Tn_TYPE_CNF_MASK, + &timer->hpet_config); + write_counter(read_counter(&hpet->hpet_mc), + &timer->hpet_compare); + /* ... and clear any left-over status. */ + isr = 1 << (devp - devp->hd_hpets->hp_dev); + writel(isr, &hpet->hpet_isr); + } + sprintf(devp->hd_name, "hpet%d", (int)(devp - hpetp->hp_dev)); irq_flags = devp->hd_flags & HPET_SHARED_IRQ ? 
IRQF_SHARED : IRQF_DISABLED; From 0ca01763a028d0034042a9397534bc1f27848652 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 26 Oct 2010 14:22:13 -0700 Subject: [PATCH 078/176] hpet: fix style problems Fix the following style problems: WARNING: Use #include instead of WARNING: Use #include instead of ERROR: code indent should use tabs where possible ERROR: do not initialise statics to 0 or NULL Signed-off-by: Jaswinder Singh Rajput Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index a2cbb828e92a..fcb5f0d6ce70 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -32,12 +32,12 @@ #include #include #include +#include #include +#include #include -#include #include -#include #include #include @@ -81,13 +81,13 @@ static cycle_t read_hpet(struct clocksource *cs) } static struct clocksource clocksource_hpet = { - .name = "hpet", - .rating = 250, - .read = read_hpet, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be calculated */ - .shift = 10, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be calculated */ + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct clocksource *hpet_clocksource; #endif @@ -826,7 +826,7 @@ int hpet_alloc(struct hpet_data *hdp) struct hpets *hpetp; size_t siz; struct hpet __iomem *hpet; - static struct hpets *last = NULL; + static struct hpets *last; unsigned long period; unsigned long long temp; u32 remainder; From aaaddfe0b3bb449b8734bf29bbd36141076e5277 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 26 Oct 2010 14:22:14 -0700 Subject: [PATCH 079/176] Documentation/timers/hpet_example.c: add supporting info for hpet_example $./hpet_example info /dev/hpet -hpet: executing info hpet_info: hi_irqfreq 0x0 hi_flags 0x0 hi_hpet 0 hi_timer 2 Signed-off-by: Jaswinder Singh Rajput Cc: Clemens Ladisch Cc: "Venkatesh Pallipadi (Venki)" Cc: john stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/timers/hpet_example.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c index 4bfafb7bc4c5..9a3e7012c190 100644 --- a/Documentation/timers/hpet_example.c +++ b/Documentation/timers/hpet_example.c @@ -97,6 +97,33 @@ hpet_open_close(int argc, const char **argv) void hpet_info(int argc, const char **argv) { + struct hpet_info info; + int fd; + + if (argc != 1) { + fprintf(stderr, "hpet_info: device-name\n"); + return; + } + + fd = open(argv[0], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]); + return; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_info: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ", + info.hi_ireqfreq, info.hi_flags); + fprintf(stderr, "hi_hpet %d hi_timer %d\n", + info.hi_hpet, info.hi_timer); + +out: + close(fd); + return; } void From dae512edc6e945e127f0848aa757055265d70aa2 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Oct 2010 14:22:15 -0700 Subject: [PATCH 080/176] drivers/char/hpet.c: fix information leak to userland Structure info is copied to userland with some padding fields unitialized. It leads to leaking of stack memory. 
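The general shape of the fix, sketched with simplified context rather than the exact ioctl code: zero the whole structure first, so that compiler-inserted padding can never carry stack garbage out through the later copy_to_user():

	struct hpet_info info;

	memset(&info, 0, sizeof(info));		/* clears the padding bytes too */
	info.hi_hpet = hpetp->hp_which;		/* then fill only the fields meant to be visible */
	if (copy_to_user(uinfo, &info, sizeof(info)))
		return -EFAULT;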
[akpm@linux-foundation.org: remove now-unneeded zeroing of info->hi_ireqfreq] Signed-off-by: Vasiliy Kulikov Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index fcb5f0d6ce70..55b8667f739f 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -596,11 +596,10 @@ hpet_ioctl_common(struct hpet_dev *devp, int cmd, unsigned long arg, break; case HPET_INFO: { + memset(info, 0, sizeof(*info)); if (devp->hd_ireqfreq) info->hi_ireqfreq = hpet_time_div(hpetp, devp->hd_ireqfreq); - else - info->hi_ireqfreq = 0; info->hi_flags = readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK; info->hi_hpet = hpetp->hp_which; From 947272dd3e959c69ff0fc54e62e44163b729b796 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 26 Oct 2010 14:22:15 -0700 Subject: [PATCH 081/176] alpha: enable ARCH_DMA_ADDR_T_64BIT Signed-off-by: FUJITA Tomonori Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index d04ccd73af45..28f93a6c0fde 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -55,6 +55,9 @@ config ZONE_DMA bool default y +config ARCH_DMA_ADDR_T_64BIT + def_bool y + config NEED_DMA_MAP_STATE def_bool y From 98c532ecbe582586e204688c6cde7e27580cc43f Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 26 Oct 2010 14:22:17 -0700 Subject: [PATCH 082/176] alpha: use single HAE window on T2 core logic (gamma, sable) T2 are the only alpha SMP systems that do HAE switching at runtime, which is fundamentally racy on SMP. This patch limits MMIO space on T2 to HAE0 only, like we did on MCPCIA (rawhide) long ago. This leaves us with only 112 Mb of PCI MMIO (128 Mb HAE aperture minus 16 Mb reserved for EISA), but since linux PCI allocations are reasonably tight, it should be enough for sane hardware configurations. Also, fix a typo in MCPCIA_FROB_MMIO macro which shouldn't call set_hae() if MCPCIA_ONE_HAE_WINDOW is defined. It's more for correctness, as set_hae() is a no-op anyway in that case. Signed-off-by: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/core_mcpcia.h | 2 +- arch/alpha/include/asm/core_t2.h | 54 ++++++++++------------------ arch/alpha/kernel/core_t2.c | 11 ++++-- arch/alpha/kernel/machvec_impl.h | 3 ++ 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/arch/alpha/include/asm/core_mcpcia.h b/arch/alpha/include/asm/core_mcpcia.h index 21ac53383b37..9f67a056b461 100644 --- a/arch/alpha/include/asm/core_mcpcia.h +++ b/arch/alpha/include/asm/core_mcpcia.h @@ -247,7 +247,7 @@ struct el_MCPCIA_uncorrected_frame_mcheck { #define vip volatile int __force * #define vuip volatile unsigned int __force * -#ifdef MCPCIA_ONE_HAE_WINDOW +#ifndef MCPCIA_ONE_HAE_WINDOW #define MCPCIA_FROB_MMIO \ if (__mcpcia_is_mmio(hose)) { \ set_hae(hose & 0xffffffff); \ diff --git a/arch/alpha/include/asm/core_t2.h b/arch/alpha/include/asm/core_t2.h index 471c07292e0b..91b46801b290 100644 --- a/arch/alpha/include/asm/core_t2.h +++ b/arch/alpha/include/asm/core_t2.h @@ -1,6 +1,9 @@ #ifndef __ALPHA_T2__H__ #define __ALPHA_T2__H__ +/* Fit everything into one 128MB HAE window. 
*/ +#define T2_ONE_HAE_WINDOW 1 + #include #include #include @@ -19,7 +22,7 @@ * */ -#define T2_MEM_R1_MASK 0x07ffffff /* Mem sparse region 1 mask is 26 bits */ +#define T2_MEM_R1_MASK 0x07ffffff /* Mem sparse region 1 mask is 27 bits */ /* GAMMA-SABLE is a SABLE with EV5-based CPUs */ /* All LYNX machines, EV4 or EV5, use the GAMMA bias also */ @@ -85,7 +88,9 @@ #define T2_DIR (IDENT_ADDR + GAMMA_BIAS + 0x38e0004a0UL) #define T2_ICE (IDENT_ADDR + GAMMA_BIAS + 0x38e0004c0UL) +#ifndef T2_ONE_HAE_WINDOW #define T2_HAE_ADDRESS T2_HAE_1 +#endif /* T2 CSRs are in the non-cachable primary IO space from 3.8000.0000 to 3.8fff.ffff @@ -429,13 +434,15 @@ extern inline void t2_outl(u32 b, unsigned long addr) * */ +#ifdef T2_ONE_HAE_WINDOW +#define t2_set_hae +#else #define t2_set_hae { \ - msb = addr >> 27; \ + unsigned long msb = addr >> 27; \ addr &= T2_MEM_R1_MASK; \ set_hae(msb); \ } - -extern raw_spinlock_t t2_hae_lock; +#endif /* * NOTE: take T2_DENSE_MEM off in each readX/writeX routine, since @@ -446,28 +453,22 @@ extern raw_spinlock_t t2_hae_lock; __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vip) ((addr << 5) + T2_SPARSE_MEM + 0x00); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return __kernel_extbl(result, addr & 3); } __EXTERN_INLINE u16 t2_readw(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x08); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return __kernel_extwl(result, addr & 3); } @@ -478,59 +479,47 @@ __EXTERN_INLINE u16 t2_readw(const volatile void __iomem *xaddr) __EXTERN_INLINE u32 t2_readl(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long result, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long result; t2_set_hae; result = *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x18); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return result & 0xffffffffUL; } __EXTERN_INLINE u64 t2_readq(const volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long r0, r1, work, msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long r0, r1, work; t2_set_hae; work = (addr << 5) + T2_SPARSE_MEM + 0x18; r0 = *(vuip)(work); r1 = *(vuip)(work + (4 << 5)); - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); return r1 << 32 | r0; } __EXTERN_INLINE void t2_writeb(u8 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, w; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long w; t2_set_hae; w = __kernel_insbl(b, addr & 3); *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x00) = w; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void t2_writew(u16 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, w; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long w; t2_set_hae; w = __kernel_inswl(b, addr & 3); *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x08) = w; - 
raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } /* @@ -540,29 +529,22 @@ __EXTERN_INLINE void t2_writew(u16 b, volatile void __iomem *xaddr) __EXTERN_INLINE void t2_writel(u32 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); t2_set_hae; *(vuip) ((addr << 5) + T2_SPARSE_MEM + 0x18) = b; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void t2_writeq(u64 b, volatile void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr - T2_DENSE_MEM; - unsigned long msb, work; - unsigned long flags; - raw_spin_lock_irqsave(&t2_hae_lock, flags); + unsigned long work; t2_set_hae; work = (addr << 5) + T2_SPARSE_MEM + 0x18; *(vuip)work = b; *(vuip)(work + (4 << 5)) = b >> 32; - raw_spin_unlock_irqrestore(&t2_hae_lock, flags); } __EXTERN_INLINE void __iomem *t2_ioportmap(unsigned long addr) diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c index e6d90568b65d..2f770e994289 100644 --- a/arch/alpha/kernel/core_t2.c +++ b/arch/alpha/kernel/core_t2.c @@ -74,8 +74,6 @@ # define DBG(args) #endif -DEFINE_RAW_SPINLOCK(t2_hae_lock); - static volatile unsigned int t2_mcheck_any_expected; static volatile unsigned int t2_mcheck_last_taken; @@ -406,6 +404,7 @@ void __init t2_init_arch(void) { struct pci_controller *hose; + struct resource *hae_mem; unsigned long temp; unsigned int i; @@ -433,7 +432,13 @@ t2_init_arch(void) */ pci_isa_hose = hose = alloc_pci_controller(); hose->io_space = &ioport_resource; - hose->mem_space = &iomem_resource; + hae_mem = alloc_resource(); + hae_mem->start = 0; + hae_mem->end = T2_MEM_R1_MASK; + hae_mem->name = pci_hae0_name; + if (request_resource(&iomem_resource, hae_mem) < 0) + printk(KERN_ERR "Failed to request HAE_MEM\n"); + hose->mem_space = hae_mem; hose->index = 0; hose->sparse_mem_base = T2_SPARSE_MEM - IDENT_ADDR; diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h index 512685f78097..7fa62488bd16 100644 --- a/arch/alpha/kernel/machvec_impl.h +++ b/arch/alpha/kernel/machvec_impl.h @@ -25,6 +25,9 @@ #ifdef MCPCIA_ONE_HAE_WINDOW #define MCPCIA_HAE_ADDRESS (&alpha_mv.hae_cache) #endif +#ifdef T2_ONE_HAE_WINDOW +#define T2_HAE_ADDRESS (&alpha_mv.hae_cache) +#endif /* Only a few systems don't define IACK_SC, handling all interrupts through the SRM console. But splitting out that one case from IO() below From d911202e3f722c29065942400ea108b7096bdac7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 26 Oct 2010 14:22:18 -0700 Subject: [PATCH 083/176] uml: define CONFIG_NO_DMA I think that it's better to detect DMA misuse at build time rather than calling BUG_ON. Architectures that can't do DMA need to define CONFIG_NO_DMA. Thanks to Sam Ravnborg for explaining how CONFIG_NO_DMA and CONFIG_HAS_DMA work: http://marc.info/?l=linux-kernel&m=128359913825550&w=2 HAS_DMA is defined like this: config HAS_DMA boolean depends on !NO_DMA default y So to set HAS_DMA to true an arch should do: 1) Do not define NO_DMA 2) Define NO_DMA abd set it to 'n' Must archs - including um - used principle 1). In the um case we want to say that we do NOT have any DMA. This can be done in two ways. a) define NO_DMA and set it to 'y' b) redefine HAS_DMA and set it to 'n'. The patch you provided used principle b) where other archs use principle a). So I suggest you should use principle a) for um too. 
Signed-off-by: FUJITA Tomonori Cc: Miklos Szeredi Cc: Jeff Dike Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig.um | 3 + arch/um/defconfig | 1 - arch/um/include/asm/dma-mapping.h | 112 ------------------------------ 3 files changed, 3 insertions(+), 113 deletions(-) delete mode 100644 arch/um/include/asm/dma-mapping.h diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index ec2b8da1aba4..ffe0934a3ccf 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -147,3 +147,6 @@ config KERNEL_STACK_ORDER This option determines the size of UML kernel stacks. They will be 1 << order pages. The default is OK unless you're running Valgrind on UML, in which case, set this to 3. + +config NO_DMA + def_bool y diff --git a/arch/um/defconfig b/arch/um/defconfig index 6bd456f96f90..564f3de65b4a 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -566,7 +566,6 @@ CONFIG_CRC32=m # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_PLIST=y -CONFIG_HAS_DMA=y # # SCSI device support diff --git a/arch/um/include/asm/dma-mapping.h b/arch/um/include/asm/dma-mapping.h deleted file mode 100644 index 1f469e80fdd3..000000000000 --- a/arch/um/include/asm/dma-mapping.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef _ASM_DMA_MAPPING_H -#define _ASM_DMA_MAPPING_H - -#include - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - BUG(); - return(0); -} - -static inline int -dma_set_mask(struct device *dev, u64 dma_mask) -{ - BUG(); - return(0); -} - -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) -{ - BUG(); - return((void *) 0); -} - -static inline void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - BUG(); - return(0); -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG(); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_handle) -{ - BUG(); - return 0; -} - -#endif From aa5fb4dbfd121296ca97c68cf90043a7ea97579d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 26 Oct 
2010 14:22:19 -0700 Subject: [PATCH 084/176] uml: fix CONFIG_STATIC_LINK=y build failure with newer glibc With glibc 2.11 or later that was built with --enable-multi-arch, the UML link fails with undefined references to __rel_iplt_start and similar symbols. In recent binutils, the default linker script defines these symbols (see ld --verbose). Fix the UML linker scripts to match the new defaults for these sections. Signed-off-by: Roland McGrath Cc: Jeff Dike Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/dyn.lds.S | 14 ++++++++++++-- arch/um/kernel/uml.lds.S | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 69268014dd8e..a3cab6d3ae02 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -50,8 +50,18 @@ SECTIONS .rela.got : { *(.rela.got) } .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } - .rel.plt : { *(.rel.plt) } - .rela.plt : { *(.rela.plt) } + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } .init : { KEEP (*(.init)) } =0x90909090 diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 9a873d765615..fbd99402d4d2 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -43,6 +43,23 @@ SECTIONS __syscall_stub_end = .; } + /* + * These are needed even in a static link, even if they wind up being empty. + * Newer glibc needs these __rel{,a}_iplt_{start,end} symbols. + */ + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } + #include "asm/common.lds.S" init.data : { INIT_DATA } From be76d81f99c4c120adcd201a7316e4dd7dbe3c11 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:22:20 -0700 Subject: [PATCH 085/176] um: migrate from __do_IRQ() to generic_handle_irq() This patch removes __do_IRQ() from user mode linux. __do_IRQ is deprecated. Signed-off-by: Richard Weinberger Cc: Jeff Dike Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig.um | 3 +++ arch/um/kernel/irq.c | 15 ++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index ffe0934a3ccf..50d6aa20c353 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -120,6 +120,9 @@ config SMP If you don't know what to do, say N. 
+config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + config NR_CPUS int "Maximum number of CPUs (2-32)" range 2 32 diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index a746e3037a5b..3f0ac9e0c966 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -334,7 +334,7 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); irq_enter(); - __do_IRQ(irq); + generic_handle_irq(irq); irq_exit(); set_irq_regs(old_regs); return 1; @@ -391,17 +391,10 @@ void __init init_IRQ(void) { int i; - irq_desc[TIMER_IRQ].status = IRQ_DISABLED; - irq_desc[TIMER_IRQ].action = NULL; - irq_desc[TIMER_IRQ].depth = 1; - irq_desc[TIMER_IRQ].chip = &SIGVTALRM_irq_type; - enable_irq(TIMER_IRQ); + set_irq_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); + for (i = 1; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - irq_desc[i].chip = &normal_irq_type; - enable_irq(i); + set_irq_chip_and_handler(i, &normal_irq_type, handle_edge_irq); } } From c5c6dd4e2dce3cb4d705d97183bc42b4e33e2606 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:22:20 -0700 Subject: [PATCH 086/176] hostfs: code cleanups Some code cleanups for hostfs. Signed-off-by: Richard Weinberger Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs_user.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 8d02683585e0..d51a98384bc0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out) dir = opendir(path); *err_out = errno; - if (dir == NULL) - return NULL; + return dir; } @@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) if (attrs->ia_valid & HOSTFS_ATTR_MODE) { if (fd >= 0) { if (fchmod(fd, attrs->ia_mode) != 0) - return (-errno); + return -errno; } else if (chmod(file, attrs->ia_mode) != 0) { return -errno; } From f27c85c56b32c42bcc54a43189c1e00fdceb23ec Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Tue, 26 Oct 2010 14:22:21 -0700 Subject: [PATCH 087/176] kernel.h: add {min,max}3 macros Introduce two additional min/max macros to compare three operands. This will save some cycles as well as some bytes on the stack and last but not least more pleasing as macro nesting. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Hagen Paul Pfeifer Cc: Joe Perches Cc: Ingo Molnar Cc: Hartley Sweeten Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Herbert Xu Cc: Roland Dreier Cc: Sean Hefty Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index edef168a0406..8e786a27cfe6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -651,6 +651,24 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; }) +#define min3(x, y, z) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + typeof(z) _min3 = (z); \ + (void) (&_min1 == &_min2); \ + (void) (&_min1 == &_min3); \ + _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ + (_min2 < _min3 ? 
_min2 : _min3); }) + +#define max3(x, y, z) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + typeof(z) _max3 = (z); \ + (void) (&_max1 == &_max2); \ + (void) (&_max1 == &_max3); \ + _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ + (_max2 > _max3 ? _max2 : _max3); }) + /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero * @x: value1 From 732eacc0542d0aa48797f675888b85d6065af837 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Tue, 26 Oct 2010 14:22:23 -0700 Subject: [PATCH 088/176] replace nested max/min macros with {max,min}3 macro Use the new {max,min}3 macros to save some cycles and bytes on the stack. This patch substitutes trivial nested macros with their counterpart. Signed-off-by: Hagen Paul Pfeifer Cc: Joe Perches Cc: Ingo Molnar Cc: Hartley Sweeten Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Herbert Xu Cc: Roland Dreier Cc: Sean Hefty Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ep93xx/clock.c | 3 +-- arch/powerpc/kernel/vio.c | 4 +--- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + drivers/crypto/hifn_795x.c | 4 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 +-- drivers/macintosh/windfarm_pm121.c | 2 +- drivers/video/au1200fb.c | 2 +- mm/slab.c | 2 +- 8 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c index 4566bd1c8660..ef06c66a6f16 100644 --- a/arch/arm/mach-ep93xx/clock.c +++ b/arch/arm/mach-ep93xx/clock.c @@ -358,8 +358,7 @@ static int calc_clk_div(struct clk *clk, unsigned long rate, int i, found = 0, __div = 0, __pdiv = 0; /* Don't exceed the maximum rate */ - max_rate = max(max(clk_pll1.rate / 4, clk_pll2.rate / 4), - clk_xtali.rate / 4); + max_rate = max3(clk_pll1.rate / 4, clk_pll2.rate / 4, clk_xtali.rate / 4); rate = min(rate, max_rate); /* diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index d692989a4318..441d2a722f06 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -238,9 +238,7 @@ static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size) * memory in this pool does not change. 
*/ if (spare_needed && reserve_freed) { - tmp = min(spare_needed, min(reserve_freed, - (viodev->cmo.entitled - - VIO_CMO_MIN_ENT))); + tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT)); vio_cmo.spare += tmp; viodev->cmo.entitled -= tmp; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d03..17ad03366211 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 0eac3da566ba..a84250a5dd51 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -1467,7 +1467,7 @@ static int ablkcipher_add(unsigned int *drestp, struct scatterlist *dst, return -EINVAL; while (size) { - copy = min(drest, min(size, dst->length)); + copy = min3(drest, size, dst->length); size -= copy; drest -= copy; @@ -1729,7 +1729,7 @@ static int ablkcipher_get(void *saddr, unsigned int *srestp, unsigned int offset return -EINVAL; while (size) { - copy = min(srest, min(dst->length, size)); + copy = min3(srest, dst->length, size); daddr = kmap_atomic(sg_page(dst), KM_IRQ0); memcpy(daddr + dst->offset + offset, saddr, copy); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b4b22576f12a..19c5d3b191f4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1409,8 +1409,7 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); - ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, - IPOIB_MIN_QUEUE_SIZE)); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c index 947d4afa25ca..30e6195e19d4 100644 --- a/drivers/macintosh/windfarm_pm121.c +++ b/drivers/macintosh/windfarm_pm121.c @@ -482,7 +482,7 @@ static s32 pm121_correct(s32 new_setpoint, new_min += correction->offset; new_min = (new_min >> 16) + min; - return max(new_setpoint, max(new_min, 0)); + return max3(new_setpoint, new_min, 0); } static s32 pm121_connect(unsigned int control_id, s32 setpoint) diff --git a/drivers/video/au1200fb.c b/drivers/video/au1200fb.c index e77e8e4280fb..4ea187d93768 100644 --- a/drivers/video/au1200fb.c +++ b/drivers/video/au1200fb.c @@ -1079,7 +1079,7 @@ static int au1200fb_fb_check_var(struct fb_var_screeninfo *var, * clock can only be obtain by dividing this value by an even integer. * Fallback to a slower pixel clock if necessary. 
*/ pixclock = max((u32)(PICOS2KHZ(var->pixclock) * 1000), fbi->monspecs.dclkmin); - pixclock = min(pixclock, min(fbi->monspecs.dclkmax, (u32)AU1200_LCD_MAX_CLK/2)); + pixclock = min3(pixclock, fbi->monspecs.dclkmax, (u32)AU1200_LCD_MAX_CLK/2); if (AU1200_LCD_MAX_CLK % pixclock) { int diff = AU1200_LCD_MAX_CLK % pixclock; diff --git a/mm/slab.c b/mm/slab.c index fcae9815d3b3..b1e40dafbab3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; From c925cf0b80cb486b31e1ec0e9f981d75a4b38453 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Tue, 26 Oct 2010 14:22:23 -0700 Subject: [PATCH 089/176] m68k{nommu}/blackfin : remove old assembler-only flags bit definitions Long ago, PT_TRACESYS_OFF and friends were introduced as hard defines to avoid straight constants in assembler parts of linux m68k. They are not used anymore, and were not updated to follow changes in linux kernel. Remove them. When similar constants are needed, they are now generated using asm-offsets.c. Signed-off-by: Philippe De Muyter Acked-by: Mike Frysinger Acked-by: Geert Uytterhoeven Acked-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/entry.h | 8 -------- arch/m68k/include/asm/entry_mm.h | 8 -------- arch/m68k/include/asm/entry_no.h | 10 ---------- 3 files changed, 26 deletions(-) diff --git a/arch/blackfin/include/asm/entry.h b/arch/blackfin/include/asm/entry.h index a6886f6e4819..4104d5783e2c 100644 --- a/arch/blackfin/include/asm/entry.h +++ b/arch/blackfin/include/asm/entry.h @@ -15,14 +15,6 @@ #define LFLUSH_I_AND_D 0x00000808 #define LSIGTRAP 5 -/* process bits for task_struct.flags */ -#define PF_TRACESYS_OFF 3 -#define PF_TRACESYS_BIT 5 -#define PF_PTRACED_OFF 3 -#define PF_PTRACED_BIT 4 -#define PF_DTRACE_OFF 1 -#define PF_DTRACE_BIT 5 - /* * NOTE! The single-stepping code assumes that all interrupt handlers * start by saving SYSCFG on the stack with their first instruction. diff --git a/arch/m68k/include/asm/entry_mm.h b/arch/m68k/include/asm/entry_mm.h index e41fea399bfe..73b8c8fbed9c 100644 --- a/arch/m68k/include/asm/entry_mm.h +++ b/arch/m68k/include/asm/entry_mm.h @@ -50,14 +50,6 @@ LFLUSH_I_AND_D = 0x00000808 -/* process bits for task_struct.ptrace */ -PT_TRACESYS_OFF = 3 -PT_TRACESYS_BIT = 1 -PT_PTRACED_OFF = 3 -PT_PTRACED_BIT = 0 -PT_DTRACE_OFF = 3 -PT_DTRACE_BIT = 2 - #define SAVE_ALL_INT save_all_int #define SAVE_ALL_SYS save_all_sys #define RESTORE_ALL restore_all diff --git a/arch/m68k/include/asm/entry_no.h b/arch/m68k/include/asm/entry_no.h index 80e41492aa2a..26be277394f9 100644 --- a/arch/m68k/include/asm/entry_no.h +++ b/arch/m68k/include/asm/entry_no.h @@ -32,16 +32,6 @@ #ifdef __ASSEMBLY__ -/* process bits for task_struct.flags */ -PF_TRACESYS_OFF = 3 -PF_TRACESYS_BIT = 5 -PF_PTRACED_OFF = 3 -PF_PTRACED_BIT = 4 -PF_DTRACE_OFF = 1 -PF_DTRACE_BIT = 5 - -LENOSYS = 38 - #define SWITCH_STACK_SIZE (6*4+4) /* Includes return address */ /* From a55621f15bc61826969a29e111ba131a55ef45de Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:25 -0700 Subject: [PATCH 090/176] include/linux/kernel.h: add __must_check to strict_strto*() The whole point to using the strict functions is to check the return value. 
If you don't, strict_strto*() will return you uninitialised garbage. Offenders have been observed in the wild. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8e786a27cfe6..e9b492b33032 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -203,10 +203,10 @@ extern unsigned long simple_strtoul(const char *,char **,unsigned int); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -extern int strict_strtoul(const char *, unsigned int, unsigned long *); -extern int strict_strtol(const char *, unsigned int, long *); -extern int strict_strtoull(const char *, unsigned int, unsigned long long *); -extern int strict_strtoll(const char *, unsigned int, long long *); +extern int __must_check strict_strtoul(const char *, unsigned int, unsigned long *); +extern int __must_check strict_strtol(const char *, unsigned int, long *); +extern int __must_check strict_strtoull(const char *, unsigned int, unsigned long long *); +extern int __must_check strict_strtoll(const char *, unsigned int, long long *); extern int sprintf(char * buf, const char * fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int vsprintf(char *buf, const char *, va_list) From b6472776816af1ed52848c93d26e3edb3b17adab Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:26 -0700 Subject: [PATCH 091/176] modules: no need to align .modinfo strings gcc aligns strings as a performance consideration for those cases where strings are being used a lot. Their use is not performance critical, and hence it seems better to save some space. Signed-off-by: Jan Beulich Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 9d2f1837b3d8..112adf8bd47d 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -21,8 +21,8 @@ #define __module_cat(a,b) ___module_cat(a,b) #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ - __used \ - __attribute__((section(".modinfo"),unused)) = __stringify(tag) "=" info + __used __attribute__((section(".modinfo"), unused, aligned(1))) \ + = __stringify(tag) "=" info #else /* !MODULE */ #define __MODULE_INFO(tag, name, info) #endif From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: [PATCH 092/176] use clear_page()/copy_page() in favor of memset()/memcpy() on whole pages After all that's what they are intended for. Signed-off-by: Jan Beulich Cc: Miklos Szeredi Cc: "Eric W. Biederman" Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 2 +- kernel/kexec.c | 2 +- kernel/power/snapshot.c | 14 +++++++------- kernel/power/swap.c | 6 +++--- mm/memory.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cde755cca564..69037118b2c9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -811,7 +811,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); - memset(mapaddr, 0, PAGE_SIZE); + clear_page(mapaddr); kunmap_atomic(mapaddr, KM_USER1); } while (count) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d6730..b55045bc7563 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9e3581f4619a..0dac75ea4456 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,9 +2270,9 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); kunmap_atomic(kaddr1, KM_USER0); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa790399..a0e4a86ccf94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -910,7 +910,7 @@ int swsusp_check(void) hib_resume_bdev = 
open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) diff --git a/mm/memory.c b/mm/memory.c index 861f7982dd54..02e48aa0ed13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2080,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else From b6777c40c79168d938c30b5b7471fbd64bca109c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: [PATCH 093/176] fuse: use clear_highpage() and KM_USER0 instead of KM_USER1 Commit 7909b1c640 ("fuse: don't use atomic kmap") removed KM_USER0 usage from fuse/dev.c. Switch KM_USER1 uses to KM_USER0 for clarity. Also replace open coded clear_highpage(). Signed-off-by: Miklos Szeredi Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69037118b2c9..b98664275f02 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, int err; struct page *page = *pagep; - if (page && zeroing && count < PAGE_SIZE) { - void *mapaddr = kmap_atomic(page, KM_USER1); - clear_page(mapaddr); - kunmap_atomic(mapaddr, KM_USER1); - } + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + while (count) { if (cs->write && cs->pipebufs && page) { return fuse_ref_page(cs, page, offset, count); @@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page, KM_USER1); + void *mapaddr = kmap_atomic(page, KM_USER0); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr, KM_USER1); + kunmap_atomic(mapaddr, KM_USER0); } else offset += fuse_copy_do(cs, NULL, &count); } From 2473238eac95ba6dd2c4ba19cc36aaf01465076b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Oct 2010 14:22:28 -0700 Subject: [PATCH 094/176] ihex: add support for CS:IP/EIP records ihex firmwares can include a jump address for starting execution. Add a -j option which will cause this to be written into the generated file as a record with address zero and data consisting of the address to jump to, allowing drivers to make use of this information. This format is chosen because it most closely follows the original ihex format, though it may make more sense to write a record with length zero and the address stored as the address. The records are not omitted by default since our ihex format does not include record type information and so including additional records may lead to confusion. 
Signed-off-by: Mark Brown Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- firmware/ihex2fw.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/firmware/ihex2fw.c b/firmware/ihex2fw.c index 5a03ba8c8364..ba0cf0b601bb 100644 --- a/firmware/ihex2fw.c +++ b/firmware/ihex2fw.c @@ -55,6 +55,7 @@ static int output_records(int outfd); static int sort_records = 0; static int wide_records = 0; +static int include_jump = 0; static int usage(void) { @@ -63,6 +64,7 @@ static int usage(void) fprintf(stderr, "usage: ihex2fw [] \n"); fprintf(stderr, " -w: wide records (16-bit length)\n"); fprintf(stderr, " -s: sort records by address\n"); + fprintf(stderr, " -j: include records for CS:IP/EIP address\n"); return 1; } @@ -73,7 +75,7 @@ int main(int argc, char **argv) uint8_t *data; int opt; - while ((opt = getopt(argc, argv, "ws")) != -1) { + while ((opt = getopt(argc, argv, "wsj")) != -1) { switch (opt) { case 'w': wide_records = 1; @@ -81,7 +83,9 @@ int main(int argc, char **argv) case 's': sort_records = 1; break; - default: + case 'j': + include_jump = 1; + break; return usage(); } } @@ -128,6 +132,7 @@ static int process_ihex(uint8_t *data, ssize_t size) { struct ihex_binrec *record; uint32_t offset = 0; + uint32_t data32; uint8_t type, crc = 0, crcbyte = 0; int i, j; int line = 1; @@ -223,8 +228,14 @@ static int process_ihex(uint8_t *data, ssize_t size) return -EINVAL; } + memcpy(&data32, &record->data[0], sizeof(data32)); + data32 = htonl(data32); + memcpy(&record->data[0], &data32, sizeof(data32)); + /* These records contain the CS/IP or EIP where execution - * starts. Don't really know what to do with them. */ + * starts. If requested output this as a record. */ + if (include_jump) + file_record(record); goto next_record; default: From cd1c584f380183aca268d454c14b22d8454eac4f Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 26 Oct 2010 14:22:28 -0700 Subject: [PATCH 095/176] fs/direct-io.c: fix truncation error in dio_complete() return Fix up truncation (ssize_t->int). This only matters with >2G reads/writes, which the kernel doesn't permit. Signed-off-by: Edward Shishkin Reviewed-by: Christoph Hellwig Acked-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 48d74c7391d1..85882f6ba5f7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) { ssize_t transferred = 0; From d356c0b680d15e0187de48aa303e541b461ea794 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Oct 2010 14:22:29 -0700 Subject: [PATCH 096/176] vmlinux.lds.h: gather .data..shared_aligned sections in DATA_DATA With the recent change "net: remove time limit in process_backlog()", the softnet_data variable changed from "DEFINE_PER_CPU()" to "DEFINE_PER_CPU_ALIGNED()" which moved it from the .data section to the .data.shared_align section. I'm not saying this patch is wrong, just that is what caused me to notice this larger problem. No one else in the kernel is using this aligned macro variant, so I imagine that's why no one has noticed yet. 
Since .data..shared_align isn't declared in any vmlinux files that I can see, the linker just places it last. This "just works" for most people, but when building a ROM kernel on Blackfin systems, it causes section overlap errors: bfin-uclinux-ld.real: section .init.data [00000000202e06b8 -> 00000000202e48b7] overlaps section .data.shared_aligned [00000000202e06b8 -> 00000000202e0723] I imagine other arches which support the ROM config option and thus do funky placement would see similar issues ... On x86, it is stuck in a dedicated section at the end: [8] .data PROGBITS ffffffff810ec000 2ec0000303a8 00 WA 0 0 4096 [9] .data.shared_alig PROGBITS ffffffff8111c3c0 31c3c00000c8 00 WA 0 0 64 So make sure we include this section in the DATA_DATA macro so that it is placed in the right location. Signed-off-by: Mike Frysinger Cc: Sam Ravnborg Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Alan Jenkins Cc: Greg Ungerer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f4229fb315e1..aaf1105d9d91 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -150,6 +150,7 @@ #define DATA_DATA \ *(.data) \ *(.ref.data) \ + *(.data..shared_aligned) /* percpu related */ \ DEV_KEEP(init.data) \ DEV_KEEP(exit.data) \ CPU_KEEP(init.data) \ From d88262623fb58853ed60fb110141c435de26e3de Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Oct 2010 14:22:30 -0700 Subject: [PATCH 097/176] vmlinux.lds.h: lower init ramfs alignment to 4 The new init ramfs format (cpio based) requires an alignment of 4 (per the documentation and per the source files themselves). As for compressed sources, the decompressors can all deal with unaligned buffers. The cpio source is also found in the __init sections of the kernel, so once they are read and expanded into a tmpfs, the source is freed. That means there is no need to force page alignment here either. This has been used on Blackfin systems for many releases without issue. Signed-off-by: Mike Frysinger Cc: Al Viro Cc: Sam Ravnborg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index aaf1105d9d91..2c0fc10956ba 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -637,7 +637,7 @@ #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ - . = ALIGN(PAGE_SIZE); \ + . = ALIGN(4); \ VMLINUX_SYMBOL(__initramfs_start) = .; \ *(.init.ramfs) \ VMLINUX_SYMBOL(__initramfs_end) = .; From 1df79da85657aecde2ecff052ff0cf9910311078 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:31 -0700 Subject: [PATCH 098/176] fs/buffer.c: remove duplicated assignment to b_private bh->b_private is initialized within init_buffer(), thus this assignment is redundant. 
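For reference, init_buffer() is the trivial helper sketched below (reproduced
from memory, so treat it as illustrative rather than a quotation), and
alloc_page_buffers() calls init_buffer(bh, NULL, NULL) a few lines after the
assignment being deleted, so b_private still ends up NULL exactly once:

    void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
    {
            bh->b_end_io  = handler;
            bh->b_private = private;
    }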
Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index ec21a92e08b4..8d595ab2aed1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ From 4199ca77cc44c418972e8f30a36be7d26d21a0d2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Oct 2010 14:22:32 -0700 Subject: [PATCH 099/176] fs: move exportfs since it is not a networking filesystem Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block since it provides a library function that can be (and is) used by other (non-network) filesystems. This also eliminates a kconfig dependency warning: warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS) Signed-off-by: Randy Dunlap Cc: Dave Chinner Cc: Al Viro Cc: Alex Elder Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 65781de44fc0..b5e582bd769d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -222,9 +225,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL From 99dc829256bb8cfcb1f58b7f118893fdbf608e60 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 26 Oct 2010 14:22:33 -0700 Subject: [PATCH 100/176] procfs: fix numbering in /proc/locks The lock number in /proc/locks (first field) is implemented by a counter (private field of struct seq_file) which is incremented at each call of locks_show() and reset to 1 in locks_start() whatever the offset is. It should be reset according to the actual position in the list. Because of this, the numbering erratically restarts at 1 several times when reading a long /proc/locks file. Moreover, locks_show() can be called twice to print a single line thus skipping a number. The counter should be incremented in locks_next(). And last, pos is a loff_t, which can be bigger than a pointer, so we don't use the pointer as an integer anymore, and allocate a loff_t instead. Signed-off-by: Jerome Marchand Cc: Pavel Emelyanov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 8b2b6ad56a09..4de3a2666810 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2109,7 +2109,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include static void lock_get_status(struct seq_file *f, struct file_lock *fl, - int id, char *pfx) + loff_t id, char *pfx) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2122,7 +2122,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - seq_printf(f, "%d:%s ", id, pfx); + seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? 
"ACCESS" : "POSIX ", @@ -2185,24 +2185,27 @@ static int locks_show(struct seq_file *f, void *v) fl = list_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, (long)f->private, ""); + lock_get_status(f, fl, *((loff_t *)f->private), ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, (long)f->private, " ->"); + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); - f->private++; return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { + loff_t *p = f->private; + lock_flocks(); - f->private = (void *)1; + *p = (*pos + 1); return seq_list_start(&file_lock_list, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { + loff_t *p = f->private; + ++*p; return seq_list_next(v, &file_lock_list, pos); } @@ -2220,14 +2223,14 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open(filp, &locks_seq_operations); + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); } static const struct file_operations proc_locks_operations = { .open = locks_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static int __init proc_locks_init(void) From ca1cab37d91cbe8a8333732540d43cabb54cfa85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:34 -0700 Subject: [PATCH 101/176] workqueues: s/ON_STACK/ONSTACK/ Silly though it is, completions and wait_queue_heads use foo_ONSTACK (COMPLETION_INITIALIZER_ONSTACK, DECLARE_COMPLETION_ONSTACK, __WAIT_QUEUE_HEAD_INIT_ONSTACK and DECLARE_WAIT_QUEUE_HEAD_ONSTACK) so I guess workqueues should do the same thing. s/INIT_WORK_ON_STACK/INIT_WORK_ONSTACK/ s/INIT_DELAYED_WORK_ON_STACK/INIT_DELAYED_WORK_ONSTACK/ Cc: Peter Zijlstra Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index aff0b3c27509..ae03cab4352e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -713,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4a..6c7faecd9e4a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -747,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 0b61792a2780..2129cdb115dc 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -254,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, * Issue the synchronous I/O from a different thread * to avoid generic_make_request recursion. 
*/ - INIT_WORK_ON_STACK(&req.work, do_metadata); + INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 070bb7a88936..0c0771f06bfa 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -190,7 +190,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } __INIT_WORK((_work), (_func), 0); \ } while (0) -#define INIT_WORK_ON_STACK(_work, _func) \ +#define INIT_WORK_ONSTACK(_work, _func) \ do { \ __INIT_WORK((_work), (_func), 1); \ } while (0) @@ -201,9 +201,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } init_timer(&(_work)->timer); \ } while (0) -#define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ +#define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ do { \ - INIT_WORK_ON_STACK(&(_work)->work, (_func)); \ + INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ init_timer_on_stack(&(_work)->timer); \ } while (0) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e5ff2cbaadc2..90db1bd1a978 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * checks and call back into the fixup functions where we * might deadlock. */ - INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); From 10ad5278bbc961c9df8260f3e116d60eaaa3fb18 Mon Sep 17 00:00:00 2001 From: Rahul Ruikar Date: Tue, 26 Oct 2010 14:22:35 -0700 Subject: [PATCH 102/176] drivers/misc/phantom.c: add missing warning messages in phantom_probe() phantom_probe() can fail in many places. Add missing warning messages in pci_enable_device() and pci_request_regions(). Signed-off-by: Rahul Ruikar Cc: Jiri Slaby Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/phantom.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index 4197a3cb26ba..b05db55c8c8e 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -343,8 +343,10 @@ static int __devinit phantom_probe(struct pci_dev *pdev, int retval; retval = pci_enable_device(pdev); - if (retval) + if (retval) { + dev_err(&pdev->dev, "pci_enable_device failed!\n"); goto err; + } minor = phantom_get_free(); if (minor == PHANTOM_MAX_MINORS) { @@ -356,8 +358,10 @@ static int __devinit phantom_probe(struct pci_dev *pdev, phantom_devices[minor] = 1; retval = pci_request_regions(pdev, "phantom"); - if (retval) + if (retval) { + dev_err(&pdev->dev, "pci_request_regions failed!\n"); goto err_null; + } retval = -ENOMEM; pht = kzalloc(sizeof(*pht), GFP_KERNEL); From 5f400cf40fc703673aa791966ffb1c628c1ff45a Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:22:35 -0700 Subject: [PATCH 103/176] drivers/misc/ad525x_dpot.c: fix part name typos in defines There is no runtime effect by this change. It frees up namespace for defines erroneously used. This is required to actually support devices requiring the namespace, added with "drivers/misc/ad525x_dpot.c: new features". All defines touched have the same value defined, after the change. 
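A quick way to convince oneself that the renames are purely textual
(illustrative only, not part of the patch) is a compile-time check against the
values visible in the header diff below; each new name expands to the same
constant as the name it replaces:

    /* could be dropped into any function in ad525x_dpot.c */
    BUILD_BUG_ON(DPOT_AD5282_RDAC_AB  != 0x80);   /* was DPOT_AD5291_RDAC_AB  */
    BUILD_BUG_ON(DPOT_AD5172_3_A0     != 0x08);   /* was DPOT_AD5272_3_A0     */
    BUILD_BUG_ON(DPOT_AD5170_2_3_FUSE != 0x20);   /* was DPOT_AD5270_2_3_FUSE */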
Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot.c | 10 +++++----- drivers/misc/ad525x_dpot.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c index 5e6fa8449e8b..9698c7546911 100644 --- a/drivers/misc/ad525x_dpot.c +++ b/drivers/misc/ad525x_dpot.c @@ -166,7 +166,7 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) case DPOT_UID(AD5280_ID): case DPOT_UID(AD5282_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5291_RDAC_AB; + 0 : DPOT_AD5282_RDAC_AB; return dpot_read_r8d8(dpot, ctrl); case DPOT_UID(AD5170_ID): case DPOT_UID(AD5171_ID): @@ -175,7 +175,7 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) case DPOT_UID(AD5172_ID): case DPOT_UID(AD5173_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5272_3_A0; + 0 : DPOT_AD5172_3_A0; return dpot_read_r8d8(dpot, ctrl); default: if ((reg & DPOT_REG_TOL) || (dpot->max_pos > 256)) @@ -273,7 +273,7 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) case DPOT_UID(AD5280_ID): case DPOT_UID(AD5282_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5291_RDAC_AB; + 0 : DPOT_AD5282_RDAC_AB; return dpot_write_r8d8(dpot, ctrl, value); break; case DPOT_UID(AD5171_ID): @@ -289,12 +289,12 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) case DPOT_UID(AD5172_ID): case DPOT_UID(AD5173_ID): ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? - 0 : DPOT_AD5272_3_A0; + 0 : DPOT_AD5172_3_A0; if (reg & DPOT_ADDR_OTP) { tmp = dpot_read_r8d16(dpot, ctrl); if (tmp >> 14) /* Ready to Program? */ return -EFAULT; - ctrl |= DPOT_AD5270_2_3_FUSE; + ctrl |= DPOT_AD5170_2_3_FUSE; } return dpot_write_r8d8(dpot, ctrl, value); break; diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 78b89fd2e2fd..7609d49efd31 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -166,14 +166,14 @@ enum dpot_devid { #define DPOT_AD5291_RDAC 0x01 #define DPOT_AD5291_READ_RDAC 0x02 -/* AD524x use special commands */ #define DPOT_AD5291_RDAC_AB 0x80 +#define DPOT_AD5282_RDAC_AB 0x80 #define DPOT_AD5273_FUSE 0x80 -#define DPOT_AD5270_2_3_FUSE 0x20 -#define DPOT_AD5270_2_3_OW 0x08 -#define DPOT_AD5272_3_A0 0x08 -#define DPOT_AD5270_2FUSE 0x80 +#define DPOT_AD5170_2_3_FUSE 0x20 +#define DPOT_AD5170_2_3_OW 0x08 +#define DPOT_AD5172_3_A0 0x08 +#define DPOT_AD5170_2FUSE 0x80 struct dpot_data; From a4bd394956f20d0bfc0ca6ecfac2af4150da274a Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 26 Oct 2010 14:22:36 -0700 Subject: [PATCH 104/176] drivers/misc/ad525x_dpot.c: new features Add support for AD5270, AD5271, AD5272, AD5274 digital potentiometers. Add 20-TP feature for AD5291 and AD5292 parts, and update feature list. AD5291 rdac read back must be shifted by two. 
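On the shift: the AD5291 is a 256-position part sharing the 10-bit register
layout of the AD5292/AD5293, so its 8-bit wiper value occupies the upper bits
of the data field.  The driver does this inline with "value << 2" on writes
and "value >> 2" on read-back; an equivalent pair of helpers (illustrative
only, names invented here) would be:

    static inline u16 ad5291_pos_to_reg(u8 pos) { return (u16)pos << 2; }
    static inline u8 ad5291_reg_to_pos(u16 reg) { return reg >> 2; }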
Signed-off-by: Michael Hennerich Cc: Mike Frysinger Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 3 +- drivers/misc/ad525x_dpot-i2c.c | 2 + drivers/misc/ad525x_dpot-spi.c | 2 + drivers/misc/ad525x_dpot.c | 107 ++++++++++++++++++++++++++++++--- drivers/misc/ad525x_dpot.h | 23 ++++++- 5 files changed, 123 insertions(+), 14 deletions(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 1f69743b12ec..3d57adfc8703 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -24,7 +24,8 @@ config AD525X_DPOT AD5260, AD5262, AD5263, AD5290, AD5291, AD5292, AD5293, AD7376, AD8400, AD8402, AD8403, ADN2850, AD5241, AD5242, AD5243, AD5245, AD5246, AD5247, AD5248, AD5280, AD5282, - ADN2860, AD5273, AD5171, AD5170, AD5172, AD5173 + ADN2860, AD5273, AD5171, AD5170, AD5172, AD5173, AD5270, + AD5271, AD5272, AD5274 digital potentiometer chips. See Documentation/misc-devices/ad525x_dpot.txt for the diff --git a/drivers/misc/ad525x_dpot-i2c.c b/drivers/misc/ad525x_dpot-i2c.c index 374352af7979..4ff73c215746 100644 --- a/drivers/misc/ad525x_dpot-i2c.c +++ b/drivers/misc/ad525x_dpot-i2c.c @@ -102,6 +102,8 @@ static const struct i2c_device_id ad_dpot_id[] = { {"ad5170", AD5170_ID}, {"ad5172", AD5172_ID}, {"ad5173", AD5173_ID}, + {"ad5272", AD5272_ID}, + {"ad5274", AD5274_ID}, {} }; MODULE_DEVICE_TABLE(i2c, ad_dpot_id); diff --git a/drivers/misc/ad525x_dpot-spi.c b/drivers/misc/ad525x_dpot-spi.c index 6cfcb636577a..7f9a55afe05d 100644 --- a/drivers/misc/ad525x_dpot-spi.c +++ b/drivers/misc/ad525x_dpot-spi.c @@ -38,6 +38,8 @@ static const struct ad_dpot_id ad_dpot_spi_devlist[] = { {.name = "ad8402", .devid = AD8402_ID}, {.name = "ad8403", .devid = AD8403_ID}, {.name = "adn2850", .devid = ADN2850_ID}, + {.name = "ad5270", .devid = AD5270_ID}, + {.name = "ad5271", .devid = AD5271_ID}, {} }; diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c index 9698c7546911..7cb911028d09 100644 --- a/drivers/misc/ad525x_dpot.c +++ b/drivers/misc/ad525x_dpot.c @@ -29,9 +29,9 @@ * AD5262 2 256 20, 50, 200 * AD5263 4 256 20, 50, 200 * AD5290 1 256 10, 50, 100 - * AD5291 1 256 20 - * AD5292 1 1024 20 - * AD5293 1 1024 20 + * AD5291 1 256 20, 50, 100 (20-TP) + * AD5292 1 1024 20, 50, 100 (20-TP) + * AD5293 1 1024 20, 50, 100 * AD7376 1 128 10, 50, 100, 1M * AD8400 1 256 1, 10, 50, 100 * AD8402 2 256 1, 10, 50, 100 @@ -52,6 +52,10 @@ * AD5170 1 256 2.5, 10, 50, 100 (OTP) * AD5172 2 256 2.5, 10, 50, 100 (OTP) * AD5173 2 256 2.5, 10, 50, 100 (OTP) + * AD5270 1 1024 20, 50, 100 (50-TP) + * AD5271 1 256 20, 50, 100 (50-TP) + * AD5272 1 1024 20, 50, 100 (50-TP) + * AD5274 1 256 20, 50, 100 (50-TP) * * See Documentation/misc-devices/ad525x_dpot.txt for more info. 
* @@ -126,18 +130,38 @@ static inline int dpot_write_r8d16(struct dpot_data *dpot, u8 reg, u16 val) static s32 dpot_read_spi(struct dpot_data *dpot, u8 reg) { unsigned ctrl = 0; + int value; if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) { if (dpot->feat & F_RDACS_WONLY) return dpot->rdac_cache[reg & DPOT_RDAC_MASK]; - if (dpot->uid == DPOT_UID(AD5291_ID) || dpot->uid == DPOT_UID(AD5292_ID) || - dpot->uid == DPOT_UID(AD5293_ID)) - return dpot_read_r8d8(dpot, + dpot->uid == DPOT_UID(AD5293_ID)) { + + value = dpot_read_r8d8(dpot, DPOT_AD5291_READ_RDAC << 2); + if (dpot->uid == DPOT_UID(AD5291_ID)) + value = value >> 2; + + return value; + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == DPOT_UID(AD5271_ID)) { + + value = dpot_read_r8d8(dpot, + DPOT_AD5270_1_2_4_READ_RDAC << 2); + + if (value < 0) + return value; + + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value >> 2; + + return value; + } + ctrl = DPOT_SPI_READ_RDAC; } else if (reg & DPOT_ADDR_EEPROM) { ctrl = DPOT_SPI_READ_EEPROM; @@ -153,6 +177,7 @@ static s32 dpot_read_spi(struct dpot_data *dpot, u8 reg) static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) { + int value; unsigned ctrl = 0; switch (dpot->uid) { case DPOT_UID(AD5246_ID): @@ -177,6 +202,25 @@ static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg) ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ? 0 : DPOT_AD5172_3_A0; return dpot_read_r8d8(dpot, ctrl); + case DPOT_UID(AD5272_ID): + case DPOT_UID(AD5274_ID): + dpot_write_r8d8(dpot, + (DPOT_AD5270_1_2_4_READ_RDAC << 2), 0); + + value = dpot_read_r8d16(dpot, + DPOT_AD5270_1_2_4_RDAC << 2); + + if (value < 0) + return value; + /* + * AD5272/AD5274 returns high byte first, however + * underling smbus expects low byte first. + */ + value = swab16(value); + + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value >> 2; + return value; default: if ((reg & DPOT_REG_TOL) || (dpot->max_pos > 256)) return dpot_read_r8d16(dpot, (reg & 0xF8) | @@ -198,7 +242,7 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) { unsigned val = 0; - if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) { + if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD | DPOT_ADDR_OTP))) { if (dpot->feat & F_RDACS_WONLY) dpot->rdac_cache[reg & DPOT_RDAC_MASK] = value; @@ -219,11 +263,30 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) } else { if (dpot->uid == DPOT_UID(AD5291_ID) || dpot->uid == DPOT_UID(AD5292_ID) || - dpot->uid == DPOT_UID(AD5293_ID)) + dpot->uid == DPOT_UID(AD5293_ID)) { + + dpot_write_r8d8(dpot, DPOT_AD5291_CTRLREG << 2, + DPOT_AD5291_UNLOCK_CMD); + + if (dpot->uid == DPOT_UID(AD5291_ID)) + value = value << 2; + return dpot_write_r8d8(dpot, (DPOT_AD5291_RDAC << 2) | (value >> 8), value & 0xFF); + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == DPOT_UID(AD5271_ID)) { + dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_CTRLREG << 2, + DPOT_AD5270_1_2_4_UNLOCK_CMD); + if (dpot->uid == DPOT_UID(AD5271_ID)) + value = value << 2; + + return dpot_write_r8d8(dpot, + (DPOT_AD5270_1_2_4_RDAC << 2) | + (value >> 8), value & 0xFF); + } val = DPOT_SPI_RDAC | (reg & DPOT_RDAC_MASK); } } else if (reg & DPOT_ADDR_EEPROM) { @@ -243,6 +306,16 @@ static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value) val = DPOT_SPI_INC_ALL; break; } + } else if (reg & DPOT_ADDR_OTP) { + if (dpot->uid == DPOT_UID(AD5291_ID) || + dpot->uid == DPOT_UID(AD5292_ID)) { + return dpot_write_r8d8(dpot, + DPOT_AD5291_STORE_XTPM << 2, 0); + } else if (dpot->uid == DPOT_UID(AD5270_ID) || + dpot->uid == 
DPOT_UID(AD5271_ID)) { + return dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_STORE_XTPM << 2, 0); + } } else BUG(); @@ -303,10 +376,25 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) tmp = dpot_read_r8d16(dpot, tmp); if (tmp >> 14) /* Ready to Program? */ return -EFAULT; - ctrl = DPOT_AD5270_2_3_FUSE; + ctrl = DPOT_AD5170_2_3_FUSE; } return dpot_write_r8d8(dpot, ctrl, value); break; + case DPOT_UID(AD5272_ID): + case DPOT_UID(AD5274_ID): + dpot_write_r8d8(dpot, DPOT_AD5270_1_2_4_CTRLREG << 2, + DPOT_AD5270_1_2_4_UNLOCK_CMD); + + if (reg & DPOT_ADDR_OTP) + return dpot_write_r8d8(dpot, + DPOT_AD5270_1_2_4_STORE_XTPM << 2, 0); + + if (dpot->uid == DPOT_UID(AD5274_ID)) + value = value << 2; + + return dpot_write_r8d8(dpot, (DPOT_AD5270_1_2_4_RDAC << 2) | + (value >> 8), value & 0xFF); + break; default: if (reg & DPOT_ADDR_CMD) return dpot_write_d8(dpot, reg); @@ -320,7 +408,6 @@ static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value) } } - static s32 dpot_write(struct dpot_data *dpot, u8 reg, u16 value) { if (dpot->feat & F_SPI) diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 7609d49efd31..3b9e355489ae 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -93,8 +93,10 @@ enum dpot_devid { BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 8, 23), AD5290_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT, BRDAC0, 8, 24), - AD5291_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 8, 25), - AD5292_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 26), + AD5291_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT | F_CMD_OTP, + BRDAC0, 8, 25), + AD5292_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT | F_CMD_OTP, + BRDAC0, 10, 26), AD5293_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 27), AD7376_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT, BRDAC0, 7, 28), @@ -122,6 +124,12 @@ enum dpot_devid { AD5170_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 8, 45), AD5172_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 46), AD5173_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 47), + AD5270_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP | F_SPI_16BIT, + BRDAC0, 10, 48), + AD5271_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP | F_SPI_16BIT, + BRDAC0, 8, 49), + AD5272_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 10, 50), + AD5274_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 8, 51), }; #define DPOT_RDAC0 0 @@ -165,10 +173,19 @@ enum dpot_devid { /* AD5291/2/3 use special commands */ #define DPOT_AD5291_RDAC 0x01 #define DPOT_AD5291_READ_RDAC 0x02 +#define DPOT_AD5291_STORE_XTPM 0x03 +#define DPOT_AD5291_CTRLREG 0x06 +#define DPOT_AD5291_UNLOCK_CMD 0x03 -#define DPOT_AD5291_RDAC_AB 0x80 +/* AD5270/1/2/4 use special commands */ +#define DPOT_AD5270_1_2_4_RDAC 0x01 +#define DPOT_AD5270_1_2_4_READ_RDAC 0x02 +#define DPOT_AD5270_1_2_4_STORE_XTPM 0x03 +#define DPOT_AD5270_1_2_4_CTRLREG 0x07 +#define DPOT_AD5270_1_2_4_UNLOCK_CMD 0x03 #define DPOT_AD5282_RDAC_AB 0x80 + #define DPOT_AD5273_FUSE 0x80 #define DPOT_AD5170_2_3_FUSE 0x20 #define DPOT_AD5170_2_3_OW 0x08 From 4b068de9ab1c404734fde90ce5d3f4f5b4f0b9d5 Mon Sep 17 00:00:00 2001 From: steven miao Date: Tue, 26 Oct 2010 14:22:37 -0700 Subject: [PATCH 105/176] ad525x_dpot: use correct rdac channel for ad5251/ad5252 The ad5251/ad5252 devices have rdac1 and rdac3, but no rdac0. So make sure we use the right channels so userspace gets correct data and not just garbage. 
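The mask matters because it selects which rdacN sysfs channels the core
registers for the part.  In rough, hypothetical form (field and helper names
approximate; the real loop lives in the ad_dpot probe path):

    for (i = DPOT_RDAC0; i <= DPOT_RDAC3; i++)
            if (dpot->rdacs & (1 << i))
                    create_rdac_files(dev, i);      /* hypothetical helper */

With BRDAC0 advertised, an rdac0 channel is created even though the
ad5251/ad5252 silicon has no wiper 0 behind it, which is where the garbage
readings came from.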
Signed-off-by: steven miao Signed-off-by: Mike Frysinger Cc: Michael Hennerich Cc: Chris Verges Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/ad525x_dpot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h index 3b9e355489ae..a662f5987b68 100644 --- a/drivers/misc/ad525x_dpot.h +++ b/drivers/misc/ad525x_dpot.h @@ -47,9 +47,9 @@ enum dpot_devid { AD5258_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 6, 0), /* I2C */ AD5259_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 8, 1), AD5251_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, - BRDAC0 | BRDAC3, 6, 2), + BRDAC1 | BRDAC3, 6, 2), AD5252_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, - BRDAC0 | BRDAC3, 8, 3), + BRDAC1 | BRDAC3, 8, 3), AD5253_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 6, 4), AD5254_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC, From 190420ab34ab4c077c641893ac19f364cf3606e4 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:37 -0700 Subject: [PATCH 106/176] drivers/misc: driver for bh1770glc / sfh7770 ALS and proximity sensor This is a driver for ROHM BH1770GLC and OSRAM SFH7770 combined ALS and proximity sensor. Interface is sysfs based. The driver uses interrupts to provide new data. The driver supports pm_runtime and regulator frameworks. See Documentation/misc-devices/bh1770glc.txt for details Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 + drivers/misc/Makefile | 1 + drivers/misc/bh1770glc.c | 1413 +++++++++++++++++++++++++++++++++ include/linux/i2c/bh1770glc.h | 53 ++ 4 files changed, 1477 insertions(+) create mode 100644 drivers/misc/bh1770glc.c create mode 100644 include/linux/i2c/bh1770glc.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3d57adfc8703..3b50106abfa5 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -315,6 +315,16 @@ config SENSORS_BH1780 This driver can also be built as a module. If so, the module will be called bh1780gli. +config SENSORS_BH1770 + tristate "BH1770GLC / SFH7770 combined ALS - Proximity sensor" + depends on I2C + ---help--- + Say Y here if you want to build a driver for BH1770GLC (ROHM) or + SFH7770 (Osram) combined ambient light and proximity sensor chip. + + To compile this driver as a module, choose M here: the + module will be called bh1770glc. If unsure, say N here. + config HMC6352 tristate "Honeywell HMC6352 compass" depends on I2C diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 9f2986b4da2f..5fed6ab533e2 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_TIFM_CORE) += tifm_core.o obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SENSORS_BH1780) += bh1780gli.o +obj-$(CONFIG_SENSORS_BH1770) += bh1770glc.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o diff --git a/drivers/misc/bh1770glc.c b/drivers/misc/bh1770glc.c new file mode 100644 index 000000000000..cee632e645e1 --- /dev/null +++ b/drivers/misc/bh1770glc.c @@ -0,0 +1,1413 @@ +/* + * This file is part of the ROHM BH1770GLC / OSRAM SFH7770 sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). 
+ * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BH1770_ALS_CONTROL 0x80 /* ALS operation mode control */ +#define BH1770_PS_CONTROL 0x81 /* PS operation mode control */ +#define BH1770_I_LED 0x82 /* active LED and LED1, LED2 current */ +#define BH1770_I_LED3 0x83 /* LED3 current setting */ +#define BH1770_ALS_PS_MEAS 0x84 /* Forced mode trigger */ +#define BH1770_PS_MEAS_RATE 0x85 /* PS meas. rate at stand alone mode */ +#define BH1770_ALS_MEAS_RATE 0x86 /* ALS meas. rate at stand alone mode */ +#define BH1770_PART_ID 0x8a /* Part number and revision ID */ +#define BH1770_MANUFACT_ID 0x8b /* Manufacturerer ID */ +#define BH1770_ALS_DATA_0 0x8c /* ALS DATA low byte */ +#define BH1770_ALS_DATA_1 0x8d /* ALS DATA high byte */ +#define BH1770_ALS_PS_STATUS 0x8e /* Measurement data and int status */ +#define BH1770_PS_DATA_LED1 0x8f /* PS data from LED1 */ +#define BH1770_PS_DATA_LED2 0x90 /* PS data from LED2 */ +#define BH1770_PS_DATA_LED3 0x91 /* PS data from LED3 */ +#define BH1770_INTERRUPT 0x92 /* Interrupt setting */ +#define BH1770_PS_TH_LED1 0x93 /* PS interrupt threshold for LED1 */ +#define BH1770_PS_TH_LED2 0x94 /* PS interrupt threshold for LED2 */ +#define BH1770_PS_TH_LED3 0x95 /* PS interrupt threshold for LED3 */ +#define BH1770_ALS_TH_UP_0 0x96 /* ALS upper threshold low byte */ +#define BH1770_ALS_TH_UP_1 0x97 /* ALS upper threshold high byte */ +#define BH1770_ALS_TH_LOW_0 0x98 /* ALS lower threshold low byte */ +#define BH1770_ALS_TH_LOW_1 0x99 /* ALS lower threshold high byte */ + +/* MANUFACT_ID */ +#define BH1770_MANUFACT_ROHM 0x01 +#define BH1770_MANUFACT_OSRAM 0x03 + +/* PART_ID */ +#define BH1770_PART 0x90 +#define BH1770_PART_MASK 0xf0 +#define BH1770_REV_MASK 0x0f +#define BH1770_REV_SHIFT 0 +#define BH1770_REV_0 0x00 +#define BH1770_REV_1 0x01 + +/* Operating modes for both */ +#define BH1770_STANDBY 0x00 +#define BH1770_FORCED 0x02 +#define BH1770_STANDALONE 0x03 +#define BH1770_SWRESET (0x01 << 2) + +#define BH1770_PS_TRIG_MEAS (1 << 0) +#define BH1770_ALS_TRIG_MEAS (1 << 1) + +/* Interrupt control */ +#define BH1770_INT_OUTPUT_MODE (1 << 3) /* 0 = latched */ +#define BH1770_INT_POLARITY (1 << 2) /* 1 = active high */ +#define BH1770_INT_ALS_ENA (1 << 1) +#define BH1770_INT_PS_ENA (1 << 0) + +/* Interrupt status */ +#define BH1770_INT_LED1_DATA (1 << 0) +#define BH1770_INT_LED1_INT (1 << 1) +#define BH1770_INT_LED2_DATA (1 << 2) +#define BH1770_INT_LED2_INT (1 << 3) +#define BH1770_INT_LED3_DATA (1 << 4) +#define BH1770_INT_LED3_INT (1 << 5) +#define BH1770_INT_LEDS_INT ((1 << 1) | (1 << 3) | (1 << 5)) +#define BH1770_INT_ALS_DATA (1 << 6) +#define BH1770_INT_ALS_INT (1 << 7) + +/* Led channels */ +#define BH1770_LED1 0x00 + +#define BH1770_DISABLE 0 +#define BH1770_ENABLE 1 +#define 
BH1770_PROX_CHANNELS 1 + +#define BH1770_LUX_DEFAULT_RATE 1 /* Index to lux rate table */ +#define BH1770_PROX_DEFAULT_RATE 1 /* Direct HW value =~ 50Hz */ +#define BH1770_PROX_DEF_RATE_THRESH 6 /* Direct HW value =~ 5 Hz */ +#define BH1770_STARTUP_DELAY 50 +#define BH1770_RESET_TIME 10 +#define BH1770_TIMEOUT 2100 /* Timeout in 2.1 seconds */ + +#define BH1770_LUX_RANGE 65535 +#define BH1770_PROX_RANGE 255 +#define BH1770_COEF_SCALER 1024 +#define BH1770_CALIB_SCALER 8192 +#define BH1770_LUX_NEUTRAL_CALIB_VALUE (1 * BH1770_CALIB_SCALER) +#define BH1770_LUX_DEF_THRES 1000 +#define BH1770_PROX_DEF_THRES 70 +#define BH1770_PROX_DEF_ABS_THRES 100 +#define BH1770_DEFAULT_PERSISTENCE 10 +#define BH1770_PROX_MAX_PERSISTENCE 50 +#define BH1770_LUX_GA_SCALE 16384 +#define BH1770_LUX_CF_SCALE 2048 /* CF ChipFactor */ +#define BH1770_NEUTRAL_CF BH1770_LUX_CF_SCALE +#define BH1770_LUX_CORR_SCALE 4096 + +#define PROX_ABOVE_THRESHOLD 1 +#define PROX_BELOW_THRESHOLD 0 + +#define PROX_IGNORE_LUX_LIMIT 500 + +struct bh1770_chip { + struct bh1770_platform_data *pdata; + char chipname[10]; + u8 revision; + struct i2c_client *client; + struct regulator_bulk_data regs[2]; + struct mutex mutex; /* avoid parallel access */ + wait_queue_head_t wait; + + bool int_mode_prox; + bool int_mode_lux; + struct delayed_work prox_work; + u32 lux_cf; /* Chip specific factor */ + u32 lux_ga; + u32 lux_calib; + int lux_rate_index; + u32 lux_corr; + u16 lux_data_raw; + u16 lux_threshold_hi; + u16 lux_threshold_lo; + u16 lux_thres_hi_onchip; + u16 lux_thres_lo_onchip; + bool lux_wait_result; + + int prox_enable_count; + u16 prox_coef; + u16 prox_const; + int prox_rate; + int prox_rate_threshold; + u8 prox_persistence; + u8 prox_persistence_counter; + u8 prox_data; + u8 prox_threshold; + u8 prox_threshold_hw; + bool prox_force_update; + u8 prox_abs_thres; + u8 prox_led; +}; + +static const char reg_vcc[] = "Vcc"; +static const char reg_vleds[] = "Vleds"; + +/* + * Supported stand alone rates in ms from chip data sheet + * {10, 20, 30, 40, 70, 100, 200, 500, 1000, 2000}; + */ +static const s16 prox_rates_hz[] = {100, 50, 33, 25, 14, 10, 5, 2}; +static const s16 prox_rates_ms[] = {10, 20, 30, 40, 70, 100, 200, 500}; + +/* Supported IR-led currents in mA */ +static const u8 prox_curr_ma[] = {5, 10, 20, 50, 100, 150, 200}; + +/* + * Supported stand alone rates in ms from chip data sheet + * {100, 200, 500, 1000, 2000}; + */ +static const s16 lux_rates_hz[] = {10, 5, 2, 1, 0}; + +/* + * interrupt control functions are called while keeping chip->mutex + * excluding module probe / remove + */ +static inline int bh1770_lux_interrupt_control(struct bh1770_chip *chip, + int lux) +{ + chip->int_mode_lux = lux; + /* Set interrupt modes, interrupt active low, latched */ + return i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, + (lux << 1) | chip->int_mode_prox); +} + +static inline int bh1770_prox_interrupt_control(struct bh1770_chip *chip, + int ps) +{ + chip->int_mode_prox = ps; + return i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, + (chip->int_mode_lux << 1) | (ps << 0)); +} + +/* chip->mutex is always kept here */ +static int bh1770_lux_rate(struct bh1770_chip *chip, int rate_index) +{ + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* Proper proximity response needs fastest lux rate (100ms) */ + if (chip->prox_enable_count) + rate_index = 0; + + return i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_MEAS_RATE, + rate_index); +} + 
+static int bh1770_prox_rate(struct bh1770_chip *chip, int mode) +{ + int rate; + + rate = (mode == PROX_ABOVE_THRESHOLD) ? + chip->prox_rate_threshold : chip->prox_rate; + + return i2c_smbus_write_byte_data(chip->client, + BH1770_PS_MEAS_RATE, + rate); +} + +/* InfraredLED is controlled by the chip during proximity scanning */ +static inline int bh1770_led_cfg(struct bh1770_chip *chip) +{ + /* LED cfg, current for leds 1 and 2 */ + return i2c_smbus_write_byte_data(chip->client, + BH1770_I_LED, + (BH1770_LED1 << 6) | + (BH1770_LED_5mA << 3) | + chip->prox_led); +} + +/* + * Following two functions converts raw ps values from HW to normalized + * values. Purpose is to compensate differences between different sensor + * versions and variants so that result means about the same between + * versions. + */ +static inline u8 bh1770_psraw_to_adjusted(struct bh1770_chip *chip, u8 psraw) +{ + u16 adjusted; + adjusted = (u16)(((u32)(psraw + chip->prox_const) * chip->prox_coef) / + BH1770_COEF_SCALER); + if (adjusted > BH1770_PROX_RANGE) + adjusted = BH1770_PROX_RANGE; + return adjusted; +} + +static inline u8 bh1770_psadjusted_to_raw(struct bh1770_chip *chip, u8 ps) +{ + u16 raw; + + raw = (((u32)ps * BH1770_COEF_SCALER) / chip->prox_coef); + if (raw > chip->prox_const) + raw = raw - chip->prox_const; + else + raw = 0; + return raw; +} + +/* + * Following two functions converts raw lux values from HW to normalized + * values. Purpose is to compensate differences between different sensor + * versions and variants so that result means about the same between + * versions. Chip->mutex is kept when this is called. + */ +static int bh1770_prox_set_threshold(struct bh1770_chip *chip) +{ + u8 tmp = 0; + + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + tmp = bh1770_psadjusted_to_raw(chip, chip->prox_threshold); + chip->prox_threshold_hw = tmp; + + return i2c_smbus_write_byte_data(chip->client, BH1770_PS_TH_LED1, + tmp); +} + +static inline u16 bh1770_lux_raw_to_adjusted(struct bh1770_chip *chip, u16 raw) +{ + u32 lux; + lux = ((u32)raw * chip->lux_corr) / BH1770_LUX_CORR_SCALE; + return min(lux, (u32)BH1770_LUX_RANGE); +} + +static inline u16 bh1770_lux_adjusted_to_raw(struct bh1770_chip *chip, + u16 adjusted) +{ + return (u32)adjusted * BH1770_LUX_CORR_SCALE / chip->lux_corr; +} + +/* chip->mutex is kept when this is called */ +static int bh1770_lux_update_thresholds(struct bh1770_chip *chip, + u16 threshold_hi, u16 threshold_lo) +{ + u8 data[4]; + int ret; + + /* sysfs may call this when the chip is powered off */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* + * Compensate threshold values with the correction factors if not + * set to minimum or maximum. + * Min & max values disables interrupts. 
+ */ + if (threshold_hi != BH1770_LUX_RANGE && threshold_hi != 0) + threshold_hi = bh1770_lux_adjusted_to_raw(chip, threshold_hi); + + if (threshold_lo != BH1770_LUX_RANGE && threshold_lo != 0) + threshold_lo = bh1770_lux_adjusted_to_raw(chip, threshold_lo); + + if (chip->lux_thres_hi_onchip == threshold_hi && + chip->lux_thres_lo_onchip == threshold_lo) + return 0; + + chip->lux_thres_hi_onchip = threshold_hi; + chip->lux_thres_lo_onchip = threshold_lo; + + data[0] = threshold_hi; + data[1] = threshold_hi >> 8; + data[2] = threshold_lo; + data[3] = threshold_lo >> 8; + + ret = i2c_smbus_write_i2c_block_data(chip->client, + BH1770_ALS_TH_UP_0, + ARRAY_SIZE(data), + data); + return ret; +} + +static int bh1770_lux_get_result(struct bh1770_chip *chip) +{ + u16 data; + int ret; + + ret = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_DATA_0); + if (ret < 0) + return ret; + + data = ret & 0xff; + ret = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_DATA_1); + if (ret < 0) + return ret; + + chip->lux_data_raw = data | ((ret & 0xff) << 8); + + return 0; +} + +/* Calculate correction value which contains chip and device specific parts */ +static u32 bh1770_get_corr_value(struct bh1770_chip *chip) +{ + u32 tmp; + /* Impact of glass attenuation correction */ + tmp = (BH1770_LUX_CORR_SCALE * chip->lux_ga) / BH1770_LUX_GA_SCALE; + /* Impact of chip factor correction */ + tmp = (tmp * chip->lux_cf) / BH1770_LUX_CF_SCALE; + /* Impact of Device specific calibration correction */ + tmp = (tmp * chip->lux_calib) / BH1770_CALIB_SCALER; + return tmp; +} + +static int bh1770_lux_read_result(struct bh1770_chip *chip) +{ + bh1770_lux_get_result(chip); + return bh1770_lux_raw_to_adjusted(chip, chip->lux_data_raw); +} + +/* + * Chip on / off functions are called while keeping mutex except probe + * or remove phase + */ +static int bh1770_chip_on(struct bh1770_chip *chip) +{ + int ret = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (ret < 0) + return ret; + + usleep_range(BH1770_STARTUP_DELAY, BH1770_STARTUP_DELAY * 2); + + /* Reset the chip */ + i2c_smbus_write_byte_data(chip->client, BH1770_ALS_CONTROL, + BH1770_SWRESET); + usleep_range(BH1770_RESET_TIME, BH1770_RESET_TIME * 2); + + /* + * ALS is started always since proximity needs als results + * for realibility estimation. + * Let's assume dark until the first ALS measurement is ready. 
+ */ + chip->lux_data_raw = 0; + chip->prox_data = 0; + ret = i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_CONTROL, BH1770_STANDALONE); + + /* Assume reset defaults */ + chip->lux_thres_hi_onchip = BH1770_LUX_RANGE; + chip->lux_thres_lo_onchip = 0; + + return ret; +} + +static void bh1770_chip_off(struct bh1770_chip *chip) +{ + i2c_smbus_write_byte_data(chip->client, + BH1770_INTERRUPT, BH1770_DISABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_ALS_CONTROL, BH1770_STANDBY); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDBY); + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +} + +/* chip->mutex is kept when this is called */ +static int bh1770_prox_mode_control(struct bh1770_chip *chip) +{ + if (chip->prox_enable_count) { + chip->prox_force_update = true; /* Force immediate update */ + + bh1770_lux_rate(chip, chip->lux_rate_index); + bh1770_prox_set_threshold(chip); + bh1770_led_cfg(chip); + bh1770_prox_rate(chip, PROX_BELOW_THRESHOLD); + bh1770_prox_interrupt_control(chip, BH1770_ENABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDALONE); + } else { + chip->prox_data = 0; + bh1770_lux_rate(chip, chip->lux_rate_index); + bh1770_prox_interrupt_control(chip, BH1770_DISABLE); + i2c_smbus_write_byte_data(chip->client, + BH1770_PS_CONTROL, BH1770_STANDBY); + } + return 0; +} + +/* chip->mutex is kept when this is called */ +static int bh1770_prox_read_result(struct bh1770_chip *chip) +{ + int ret; + bool above; + u8 mode; + + ret = i2c_smbus_read_byte_data(chip->client, BH1770_PS_DATA_LED1); + if (ret < 0) + goto out; + + if (ret > chip->prox_threshold_hw) + above = true; + else + above = false; + + /* + * when ALS levels goes above limit, proximity result may be + * false proximity. Thus ignore the result. With real proximity + * there is a shadow causing low als levels. 
+ */ + if (chip->lux_data_raw > PROX_IGNORE_LUX_LIMIT) + ret = 0; + + chip->prox_data = bh1770_psraw_to_adjusted(chip, ret); + + /* Strong proximity level or force mode requires immediate response */ + if (chip->prox_data >= chip->prox_abs_thres || + chip->prox_force_update) + chip->prox_persistence_counter = chip->prox_persistence; + + chip->prox_force_update = false; + + /* Persistence filttering to reduce false proximity events */ + if (likely(above)) { + if (chip->prox_persistence_counter < chip->prox_persistence) { + chip->prox_persistence_counter++; + ret = -ENODATA; + } else { + mode = PROX_ABOVE_THRESHOLD; + ret = 0; + } + } else { + chip->prox_persistence_counter = 0; + mode = PROX_BELOW_THRESHOLD; + chip->prox_data = 0; + ret = 0; + } + + /* Set proximity detection rate based on above or below value */ + if (ret == 0) { + bh1770_prox_rate(chip, mode); + sysfs_notify(&chip->client->dev.kobj, NULL, "prox0_raw"); + } +out: + return ret; +} + +static int bh1770_detect(struct bh1770_chip *chip) +{ + struct i2c_client *client = chip->client; + s32 ret; + u8 manu, part; + + ret = i2c_smbus_read_byte_data(client, BH1770_MANUFACT_ID); + if (ret < 0) + goto error; + manu = (u8)ret; + + ret = i2c_smbus_read_byte_data(client, BH1770_PART_ID); + if (ret < 0) + goto error; + part = (u8)ret; + + chip->revision = (part & BH1770_REV_MASK) >> BH1770_REV_SHIFT; + chip->prox_coef = BH1770_COEF_SCALER; + chip->prox_const = 0; + chip->lux_cf = BH1770_NEUTRAL_CF; + + if ((manu == BH1770_MANUFACT_ROHM) && + ((part & BH1770_PART_MASK) == BH1770_PART)) { + snprintf(chip->chipname, sizeof(chip->chipname), "BH1770GLC"); + return 0; + } + + if ((manu == BH1770_MANUFACT_OSRAM) && + ((part & BH1770_PART_MASK) == BH1770_PART)) { + snprintf(chip->chipname, sizeof(chip->chipname), "SFH7770"); + /* Values selected by comparing different versions */ + chip->prox_coef = 819; /* 0.8 * BH1770_COEF_SCALER */ + chip->prox_const = 40; + return 0; + } + + ret = -ENODEV; +error: + dev_dbg(&client->dev, "BH1770 or SFH7770 not found\n"); + + return ret; +} + +/* + * This work is re-scheduled at every proximity interrupt. + * If this work is running, it means that there hasn't been any + * proximity interrupt in time. Situation is handled as no-proximity. + * It would be nice to have low-threshold interrupt or interrupt + * when measurement and hi-threshold are both 0. But neither of those exists. + * This is a workaroud for missing HW feature. + */ + +static void bh1770_prox_work(struct work_struct *work) +{ + struct bh1770_chip *chip = + container_of(work, struct bh1770_chip, prox_work.work); + + mutex_lock(&chip->mutex); + bh1770_prox_read_result(chip); + mutex_unlock(&chip->mutex); +} + +/* This is threaded irq handler */ +static irqreturn_t bh1770_irq(int irq, void *data) +{ + struct bh1770_chip *chip = data; + int status; + int rate = 0; + + mutex_lock(&chip->mutex); + status = i2c_smbus_read_byte_data(chip->client, BH1770_ALS_PS_STATUS); + + /* Acknowledge interrupt by reading this register */ + i2c_smbus_read_byte_data(chip->client, BH1770_INTERRUPT); + + /* + * Check if there is fresh data available for als. + * If this is the very first data, update thresholds after that. 
+ */ + if (status & BH1770_INT_ALS_DATA) { + bh1770_lux_get_result(chip); + if (unlikely(chip->lux_wait_result)) { + chip->lux_wait_result = false; + wake_up(&chip->wait); + bh1770_lux_update_thresholds(chip, + chip->lux_threshold_hi, + chip->lux_threshold_lo); + } + } + + /* Disable interrupt logic to guarantee acknowledgement */ + i2c_smbus_write_byte_data(chip->client, BH1770_INTERRUPT, + (0 << 1) | (0 << 0)); + + if ((status & BH1770_INT_ALS_INT)) + sysfs_notify(&chip->client->dev.kobj, NULL, "lux0_input"); + + if (chip->int_mode_prox && (status & BH1770_INT_LEDS_INT)) { + rate = prox_rates_ms[chip->prox_rate_threshold]; + bh1770_prox_read_result(chip); + } + + /* Re-enable interrupt logic */ + i2c_smbus_write_byte_data(chip->client, BH1770_INTERRUPT, + (chip->int_mode_lux << 1) | + (chip->int_mode_prox << 0)); + mutex_unlock(&chip->mutex); + + /* + * Can't cancel work while keeping mutex since the work uses the + * same mutex. + */ + if (rate) { + /* + * Simulate missing no-proximity interrupt 50ms after the + * next expected interrupt time. + */ + cancel_delayed_work_sync(&chip->prox_work); + schedule_delayed_work(&chip->prox_work, + msecs_to_jiffies(rate + 50)); + } + return IRQ_HANDLED; +} + +static ssize_t bh1770_power_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + size_t ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + if (value) { + pm_runtime_get_sync(dev); + + ret = bh1770_lux_rate(chip, chip->lux_rate_index); + ret |= bh1770_lux_interrupt_control(chip, BH1770_ENABLE); + + if (ret < 0) { + pm_runtime_put(dev); + goto leave; + } + + /* This causes interrupt after the next measurement cycle */ + bh1770_lux_update_thresholds(chip, BH1770_LUX_DEF_THRES, + BH1770_LUX_DEF_THRES); + /* Inform that we are waiting for a result from ALS */ + chip->lux_wait_result = true; + bh1770_prox_mode_control(chip); + } else if (!pm_runtime_suspended(dev)) { + pm_runtime_put(dev); + } + ret = count; +leave: + mutex_unlock(&chip->mutex); + return ret; +} + +static ssize_t bh1770_power_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !pm_runtime_suspended(dev)); +} + +static ssize_t bh1770_lux_result_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + long timeout; + + if (pm_runtime_suspended(dev)) + return -EIO; /* Chip is not enabled at all */ + + timeout = wait_event_interruptible_timeout(chip->wait, + !chip->lux_wait_result, + msecs_to_jiffies(BH1770_TIMEOUT)); + if (!timeout) + return -EIO; + + mutex_lock(&chip->mutex); + ret = sprintf(buf, "%d\n", bh1770_lux_read_result(chip)); + mutex_unlock(&chip->mutex); + + return ret; +} + +static ssize_t bh1770_lux_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", BH1770_LUX_RANGE); +} + +static ssize_t bh1770_prox_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + /* Assume no proximity. 
Sensor will tell real state soon */ + if (!chip->prox_enable_count) + chip->prox_data = 0; + + if (value) + chip->prox_enable_count++; + else if (chip->prox_enable_count > 0) + chip->prox_enable_count--; + else + goto leave; + + /* Run control only when chip is powered on */ + if (!pm_runtime_suspended(dev)) + bh1770_prox_mode_control(chip); +leave: + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_prox_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t len; + + mutex_lock(&chip->mutex); + len = sprintf(buf, "%d\n", chip->prox_enable_count); + mutex_unlock(&chip->mutex); + return len; +} + +static ssize_t bh1770_prox_result_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + + mutex_lock(&chip->mutex); + if (chip->prox_enable_count && !pm_runtime_suspended(dev)) + ret = sprintf(buf, "%d\n", chip->prox_data); + else + ret = -EIO; + mutex_unlock(&chip->mutex); + return ret; +} + +static ssize_t bh1770_prox_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", BH1770_PROX_RANGE); +} + +static ssize_t bh1770_get_prox_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(prox_rates_hz); i++) + pos += sprintf(buf + pos, "%d ", prox_rates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t bh1770_get_prox_rate_above(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", prox_rates_hz[chip->prox_rate_threshold]); +} + +static ssize_t bh1770_get_prox_rate_below(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", prox_rates_hz[chip->prox_rate]); +} + +static int bh1770_prox_rate_validate(int rate) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prox_rates_hz) - 1; i++) + if (rate >= prox_rates_hz[i]) + break; + return i; +} + +static ssize_t bh1770_set_prox_rate_above(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_rate_threshold = bh1770_prox_rate_validate(value); + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_set_prox_rate_below(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_rate = bh1770_prox_rate_validate(value); + mutex_unlock(&chip->mutex); + return count; +} + +static ssize_t bh1770_get_prox_thres(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_threshold); +} + +static ssize_t bh1770_set_prox_thres(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + int ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + if (value > BH1770_PROX_RANGE) + return -EINVAL; + + 
mutex_lock(&chip->mutex); + chip->prox_threshold = value; + ret = bh1770_prox_set_threshold(chip); + mutex_unlock(&chip->mutex); + if (ret < 0) + return ret; + return count; +} + +static ssize_t bh1770_prox_persistence_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", chip->prox_persistence); +} + +static ssize_t bh1770_prox_persistence_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (value > BH1770_PROX_MAX_PERSISTENCE) + return -EINVAL; + + chip->prox_persistence = value; + + return len; +} + +static ssize_t bh1770_prox_abs_thres_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%u\n", chip->prox_abs_thres); +} + +static ssize_t bh1770_prox_abs_thres_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (value > BH1770_PROX_RANGE) + return -EINVAL; + + chip->prox_abs_thres = value; + + return len; +} + +static ssize_t bh1770_chip_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s rev %d\n", chip->chipname, chip->revision); +} + +static ssize_t bh1770_lux_calib_default_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", BH1770_CALIB_SCALER); +} + +static ssize_t bh1770_lux_calib_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + ssize_t len; + + mutex_lock(&chip->mutex); + len = sprintf(buf, "%u\n", chip->lux_calib); + mutex_unlock(&chip->mutex); + return len; +} + +static ssize_t bh1770_lux_calib_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long value; + u32 old_calib; + u32 new_corr; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + old_calib = chip->lux_calib; + chip->lux_calib = value; + new_corr = bh1770_get_corr_value(chip); + if (new_corr == 0) { + chip->lux_calib = old_calib; + mutex_unlock(&chip->mutex); + return -EINVAL; + } + chip->lux_corr = new_corr; + /* Refresh thresholds on HW after changing correction value */ + bh1770_lux_update_thresholds(chip, chip->lux_threshold_hi, + chip->lux_threshold_lo); + + mutex_unlock(&chip->mutex); + + return len; +} + +static ssize_t bh1770_get_lux_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(lux_rates_hz); i++) + pos += sprintf(buf + pos, "%d ", lux_rates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t bh1770_get_lux_rate(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", lux_rates_hz[chip->lux_rate_index]); +} + +static ssize_t bh1770_set_lux_rate(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + unsigned long rate_hz; + 
int ret, i; + + if (strict_strtoul(buf, 0, &rate_hz)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(lux_rates_hz) - 1; i++) + if (rate_hz >= lux_rates_hz[i]) + break; + + mutex_lock(&chip->mutex); + chip->lux_rate_index = i; + ret = bh1770_lux_rate(chip, i); + mutex_unlock(&chip->mutex); + + if (ret < 0) + return ret; + + return count; +} + +static ssize_t bh1770_get_lux_thresh_above(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_threshold_hi); +} + +static ssize_t bh1770_get_lux_thresh_below(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_threshold_lo); +} + +static ssize_t bh1770_set_lux_thresh(struct bh1770_chip *chip, u16 *target, + const char *buf) +{ + int ret = 0; + unsigned long thresh; + + if (strict_strtoul(buf, 0, &thresh)) + return -EINVAL; + + if (thresh > BH1770_LUX_RANGE) + return -EINVAL; + + mutex_lock(&chip->mutex); + *target = thresh; + /* + * Don't update values in HW if we are still waiting for + * first interrupt to come after device handle open call. + */ + if (!chip->lux_wait_result) + ret = bh1770_lux_update_thresholds(chip, + chip->lux_threshold_hi, + chip->lux_threshold_lo); + mutex_unlock(&chip->mutex); + return ret; + +} + +static ssize_t bh1770_set_lux_thresh_above(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + int ret = bh1770_set_lux_thresh(chip, &chip->lux_threshold_hi, buf); + if (ret < 0) + return ret; + return len; +} + +static ssize_t bh1770_set_lux_thresh_below(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct bh1770_chip *chip = dev_get_drvdata(dev); + int ret = bh1770_set_lux_thresh(chip, &chip->lux_threshold_lo, buf); + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(prox0_raw_en, S_IRUGO | S_IWUSR, bh1770_prox_enable_show, + bh1770_prox_enable_store); +static DEVICE_ATTR(prox0_thresh_above1_value, S_IRUGO | S_IWUSR, + bh1770_prox_abs_thres_show, + bh1770_prox_abs_thres_store); +static DEVICE_ATTR(prox0_thresh_above0_value, S_IRUGO | S_IWUSR, + bh1770_get_prox_thres, + bh1770_set_prox_thres); +static DEVICE_ATTR(prox0_raw, S_IRUGO, bh1770_prox_result_show, NULL); +static DEVICE_ATTR(prox0_sensor_range, S_IRUGO, bh1770_prox_range_show, NULL); +static DEVICE_ATTR(prox0_thresh_above_count, S_IRUGO | S_IWUSR, + bh1770_prox_persistence_show, + bh1770_prox_persistence_store); +static DEVICE_ATTR(prox0_rate_above, S_IRUGO | S_IWUSR, + bh1770_get_prox_rate_above, + bh1770_set_prox_rate_above); +static DEVICE_ATTR(prox0_rate_below, S_IRUGO | S_IWUSR, + bh1770_get_prox_rate_below, + bh1770_set_prox_rate_below); +static DEVICE_ATTR(prox0_rate_avail, S_IRUGO, bh1770_get_prox_rate_avail, NULL); + +static DEVICE_ATTR(lux0_calibscale, S_IRUGO | S_IWUSR, bh1770_lux_calib_show, + bh1770_lux_calib_store); +static DEVICE_ATTR(lux0_calibscale_default, S_IRUGO, + bh1770_lux_calib_default_show, + NULL); +static DEVICE_ATTR(lux0_input, S_IRUGO, bh1770_lux_result_show, NULL); +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO, bh1770_lux_range_show, NULL); +static DEVICE_ATTR(lux0_rate, S_IRUGO | S_IWUSR, bh1770_get_lux_rate, + bh1770_set_lux_rate); +static DEVICE_ATTR(lux0_rate_avail, S_IRUGO, bh1770_get_lux_rate_avail, NULL); +static DEVICE_ATTR(lux0_thresh_above_value, S_IRUGO | S_IWUSR, + 
bh1770_get_lux_thresh_above, + bh1770_set_lux_thresh_above); +static DEVICE_ATTR(lux0_thresh_below_value, S_IRUGO | S_IWUSR, + bh1770_get_lux_thresh_below, + bh1770_set_lux_thresh_below); +static DEVICE_ATTR(chip_id, S_IRUGO, bh1770_chip_id_show, NULL); +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, bh1770_power_state_show, + bh1770_power_state_store); + + +static struct attribute *sysfs_attrs[] = { + &dev_attr_lux0_calibscale.attr, + &dev_attr_lux0_calibscale_default.attr, + &dev_attr_lux0_input.attr, + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_rate.attr, + &dev_attr_lux0_rate_avail.attr, + &dev_attr_lux0_thresh_above_value.attr, + &dev_attr_lux0_thresh_below_value.attr, + &dev_attr_prox0_raw.attr, + &dev_attr_prox0_sensor_range.attr, + &dev_attr_prox0_raw_en.attr, + &dev_attr_prox0_thresh_above_count.attr, + &dev_attr_prox0_rate_above.attr, + &dev_attr_prox0_rate_below.attr, + &dev_attr_prox0_rate_avail.attr, + &dev_attr_prox0_thresh_above0_value.attr, + &dev_attr_prox0_thresh_above1_value.attr, + &dev_attr_chip_id.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group bh1770_attribute_group = { + .attrs = sysfs_attrs +}; + +static int __devinit bh1770_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct bh1770_chip *chip; + int err; + + chip = kzalloc(sizeof *chip, GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->client = client; + + mutex_init(&chip->mutex); + init_waitqueue_head(&chip->wait); + INIT_DELAYED_WORK(&chip->prox_work, bh1770_prox_work); + + if (client->dev.platform_data == NULL) { + dev_err(&client->dev, "platform data is mandatory\n"); + err = -EINVAL; + goto fail1; + } + + chip->pdata = client->dev.platform_data; + chip->lux_calib = BH1770_LUX_NEUTRAL_CALIB_VALUE; + chip->lux_rate_index = BH1770_LUX_DEFAULT_RATE; + chip->lux_threshold_lo = BH1770_LUX_DEF_THRES; + chip->lux_threshold_hi = BH1770_LUX_DEF_THRES; + + if (chip->pdata->glass_attenuation == 0) + chip->lux_ga = BH1770_NEUTRAL_GA; + else + chip->lux_ga = chip->pdata->glass_attenuation; + + chip->prox_threshold = BH1770_PROX_DEF_THRES; + chip->prox_led = chip->pdata->led_def_curr; + chip->prox_abs_thres = BH1770_PROX_DEF_ABS_THRES; + chip->prox_persistence = BH1770_DEFAULT_PERSISTENCE; + chip->prox_rate_threshold = BH1770_PROX_DEF_RATE_THRESH; + chip->prox_rate = BH1770_PROX_DEFAULT_RATE; + chip->prox_data = 0; + + chip->regs[0].supply = reg_vcc; + chip->regs[1].supply = reg_vleds; + + err = regulator_bulk_get(&client->dev, + ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot get regulators\n"); + goto fail1; + } + + err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot enable regulators\n"); + goto fail2; + } + + usleep_range(BH1770_STARTUP_DELAY, BH1770_STARTUP_DELAY * 2); + err = bh1770_detect(chip); + if (err < 0) + goto fail3; + + /* Start chip */ + bh1770_chip_on(chip); + pm_runtime_set_active(&client->dev); + pm_runtime_enable(&client->dev); + + chip->lux_corr = bh1770_get_corr_value(chip); + if (chip->lux_corr == 0) { + dev_err(&client->dev, "Improper correction values\n"); + err = -EINVAL; + goto fail3; + } + + if (chip->pdata->setup_resources) { + err = chip->pdata->setup_resources(); + if (err) { + err = -EINVAL; + goto fail3; + } + } + + err = sysfs_create_group(&chip->client->dev.kobj, + &bh1770_attribute_group); + if (err < 0) { + dev_err(&chip->client->dev, "Sysfs registration failed\n"); + goto 
fail4; + } + + /* + * Chip needs level triggered interrupt to work. However, + * level triggering doesn't work always correctly with power + * management. Select both + */ + err = request_threaded_irq(client->irq, NULL, + bh1770_irq, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT | + IRQF_TRIGGER_LOW, + "bh1770", chip); + if (err) { + dev_err(&client->dev, "could not get IRQ %d\n", + client->irq); + goto fail5; + } + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); + return err; +fail5: + sysfs_remove_group(&chip->client->dev.kobj, + &bh1770_attribute_group); +fail4: + if (chip->pdata->release_resources) + chip->pdata->release_resources(); +fail3: + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +fail2: + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); +fail1: + kfree(chip); + return err; +} + +static int __devexit bh1770_remove(struct i2c_client *client) +{ + struct bh1770_chip *chip = i2c_get_clientdata(client); + + free_irq(client->irq, chip); + + sysfs_remove_group(&chip->client->dev.kobj, + &bh1770_attribute_group); + + if (chip->pdata->release_resources) + chip->pdata->release_resources(); + + cancel_delayed_work_sync(&chip->prox_work); + + if (!pm_runtime_suspended(&client->dev)) + bh1770_chip_off(chip); + + pm_runtime_disable(&client->dev); + pm_runtime_set_suspended(&client->dev); + + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); + kfree(chip); + return 0; +} + +#ifdef CONFIG_PM +static int bh1770_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_off(chip); + + return 0; +} + +static int bh1770_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + int ret = 0; + + bh1770_chip_on(chip); + + if (!pm_runtime_suspended(dev)) { + /* + * If we were enabled at suspend time, it is expected + * everything works nice and smoothly + */ + ret = bh1770_lux_rate(chip, chip->lux_rate_index); + ret |= bh1770_lux_interrupt_control(chip, BH1770_ENABLE); + + /* This causes interrupt after the next measurement cycle */ + bh1770_lux_update_thresholds(chip, BH1770_LUX_DEF_THRES, + BH1770_LUX_DEF_THRES); + /* Inform that we are waiting for a result from ALS */ + chip->lux_wait_result = true; + bh1770_prox_mode_control(chip); + } + return ret; +} + +#else +#define bh1770_suspend NULL +#define bh1770_shutdown NULL +#define bh1770_resume NULL +#endif + +#ifdef CONFIG_PM_RUNTIME +static int bh1770_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_off(chip); + + return 0; +} + +static int bh1770_runtime_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct bh1770_chip *chip = i2c_get_clientdata(client); + + bh1770_chip_on(chip); + + return 0; +} +#endif + +static const struct i2c_device_id bh1770_id[] = { + {"bh1770glc", 0 }, + {"sfh7770", 0 }, + {} +}; + +MODULE_DEVICE_TABLE(i2c, bh1770_id); + +static const struct dev_pm_ops bh1770_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(bh1770_suspend, bh1770_resume) + SET_RUNTIME_PM_OPS(bh1770_runtime_suspend, bh1770_runtime_resume, NULL) +}; + +static struct i2c_driver bh1770_driver = { + .driver = { + .name = "bh1770glc", + .owner = THIS_MODULE, + .pm = &bh1770_pm_ops, + }, + .probe = bh1770_probe, + .remove = 
__devexit_p(bh1770_remove), + .id_table = bh1770_id, +}; + +static int __init bh1770_init(void) +{ + return i2c_add_driver(&bh1770_driver); +} + +static void __exit bh1770_exit(void) +{ + i2c_del_driver(&bh1770_driver); +} + +MODULE_DESCRIPTION("BH1770GLC / SFH7770 combined ALS and proximity sensor"); +MODULE_AUTHOR("Samu Onkalo, Nokia Corporation"); +MODULE_LICENSE("GPL v2"); + +module_init(bh1770_init); +module_exit(bh1770_exit); diff --git a/include/linux/i2c/bh1770glc.h b/include/linux/i2c/bh1770glc.h new file mode 100644 index 000000000000..8b5e2df36c72 --- /dev/null +++ b/include/linux/i2c/bh1770glc.h @@ -0,0 +1,53 @@ +/* + * This file is part of the ROHM BH1770GLC / OSRAM SFH7770 sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __BH1770_H__ +#define __BH1770_H__ + +/** + * struct bh1770_platform_data - platform data for bh1770glc driver + * @led_def_curr: IR led driving current. + * @glass_attenuation: Attenuation factor for covering window. + * @setup_resources: Call back for interrupt line setup function + * @release_resources: Call back for interrupte line release function + * + * Example of glass attenuation: 16384 * 385 / 100 means attenuation factor + * of 3.85. i.e. light_above_sensor = light_above_cover_window / 3.85 + */ + +struct bh1770_platform_data { +#define BH1770_LED_5mA 0 +#define BH1770_LED_10mA 1 +#define BH1770_LED_20mA 2 +#define BH1770_LED_50mA 3 +#define BH1770_LED_100mA 4 +#define BH1770_LED_150mA 5 +#define BH1770_LED_200mA 6 + __u8 led_def_curr; +#define BH1770_NEUTRAL_GA 16384 /* 16384 / 16384 = 1 */ + __u32 glass_attenuation; + int (*setup_resources)(void); + int (*release_resources)(void); +}; +#endif From 92b1f84d46b24675493d95a239eea2b07e5f13f8 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:38 -0700 Subject: [PATCH 107/176] drivers/misc: driver for APDS990X ALS and proximity sensors This is a driver for Avago APDS990X combined ALS and proximity sensor. Interface is sysfs based. The driver uses interrupts to provide new data. The driver supports pm_runtime and regulator frameworks. 
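For illustration, a board would typically wire the sensor up through I2C
board info and platform data roughly as below; the bus number, the 0x39
client address, the interrupt GPIO and the ppcount value are example
assumptions, not values taken from this patch:

	#include <linux/gpio.h>
	#include <linux/i2c.h>
	#include <linux/i2c/apds990x.h>

	/* Example IRQ GPIO; purely illustrative */
	#define EXAMPLE_APDS_GPIO	42

	static struct apds990x_platform_data board_apds990x_data = {
		/* .cf left zeroed: the driver falls back to its
		 * uncovered-sensor defaults */
		.pdrive  = APDS_IRLED_CURR_50mA,
		.ppcount = 5,
	};

	static struct i2c_board_info board_i2c2_devs[] __initdata = {
		{
			I2C_BOARD_INFO("apds990x", 0x39),
			.platform_data = &board_apds990x_data,
		},
	};

	/* called from the machine init code */
	static void __init board_apds990x_init(void)
	{
		board_i2c2_devs[0].irq = gpio_to_irq(EXAMPLE_APDS_GPIO);
		i2c_register_board_info(2, board_i2c2_devs,
					ARRAY_SIZE(board_i2c2_devs));
	}
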
See Documentation/misc-devices/apds990x.txt for details Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 11 + drivers/misc/Makefile | 1 + drivers/misc/apds990x.c | 1295 ++++++++++++++++++++++++++++++++++ include/linux/i2c/apds990x.h | 79 +++ 4 files changed, 1386 insertions(+) create mode 100644 drivers/misc/apds990x.c create mode 100644 include/linux/i2c/apds990x.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3b50106abfa5..7362851a2ca8 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -325,6 +325,17 @@ config SENSORS_BH1770 To compile this driver as a module, choose M here: the module will be called bh1770glc. If unsure, say N here. +config SENSORS_APDS990X + tristate "APDS990X combined als and proximity sensors" + depends on I2C + default n + ---help--- + Say Y here if you want to build a driver for Avago APDS990x + combined ambient light and proximity sensor chip. + + To compile this driver as a module, choose M here: the + module will be called apds990x. If unsure, say N here. + config HMC6352 tristate "Honeywell HMC6352 compass" depends on I2C diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 5fed6ab533e2..081b9fca8a06 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SENSORS_BH1780) += bh1780gli.o obj-$(CONFIG_SENSORS_BH1770) += bh1770glc.o +obj-$(CONFIG_SENSORS_APDS990X) += apds990x.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o diff --git a/drivers/misc/apds990x.c b/drivers/misc/apds990x.c new file mode 100644 index 000000000000..200311fea369 --- /dev/null +++ b/drivers/misc/apds990x.c @@ -0,0 +1,1295 @@ +/* + * This file is part of the APDS990x sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Register map */ +#define APDS990X_ENABLE 0x00 /* Enable of states and interrupts */ +#define APDS990X_ATIME 0x01 /* ALS ADC time */ +#define APDS990X_PTIME 0x02 /* Proximity ADC time */ +#define APDS990X_WTIME 0x03 /* Wait time */ +#define APDS990X_AILTL 0x04 /* ALS interrupt low threshold low byte */ +#define APDS990X_AILTH 0x05 /* ALS interrupt low threshold hi byte */ +#define APDS990X_AIHTL 0x06 /* ALS interrupt hi threshold low byte */ +#define APDS990X_AIHTH 0x07 /* ALS interrupt hi threshold hi byte */ +#define APDS990X_PILTL 0x08 /* Proximity interrupt low threshold low byte */ +#define APDS990X_PILTH 0x09 /* Proximity interrupt low threshold hi byte */ +#define APDS990X_PIHTL 0x0a /* Proximity interrupt hi threshold low byte */ +#define APDS990X_PIHTH 0x0b /* Proximity interrupt hi threshold hi byte */ +#define APDS990X_PERS 0x0c /* Interrupt persistence filters */ +#define APDS990X_CONFIG 0x0d /* Configuration */ +#define APDS990X_PPCOUNT 0x0e /* Proximity pulse count */ +#define APDS990X_CONTROL 0x0f /* Gain control register */ +#define APDS990X_REV 0x11 /* Revision Number */ +#define APDS990X_ID 0x12 /* Device ID */ +#define APDS990X_STATUS 0x13 /* Device status */ +#define APDS990X_CDATAL 0x14 /* Clear ADC low data register */ +#define APDS990X_CDATAH 0x15 /* Clear ADC high data register */ +#define APDS990X_IRDATAL 0x16 /* IR ADC low data register */ +#define APDS990X_IRDATAH 0x17 /* IR ADC high data register */ +#define APDS990X_PDATAL 0x18 /* Proximity ADC low data register */ +#define APDS990X_PDATAH 0x19 /* Proximity ADC high data register */ + +/* Control */ +#define APDS990X_MAX_AGAIN 3 + +/* Enable register */ +#define APDS990X_EN_PIEN (0x1 << 5) +#define APDS990X_EN_AIEN (0x1 << 4) +#define APDS990X_EN_WEN (0x1 << 3) +#define APDS990X_EN_PEN (0x1 << 2) +#define APDS990X_EN_AEN (0x1 << 1) +#define APDS990X_EN_PON (0x1 << 0) +#define APDS990X_EN_DISABLE_ALL 0 + +/* Status register */ +#define APDS990X_ST_PINT (0x1 << 5) +#define APDS990X_ST_AINT (0x1 << 4) + +/* I2C access types */ +#define APDS990x_CMD_TYPE_MASK (0x03 << 5) +#define APDS990x_CMD_TYPE_RB (0x00 << 5) /* Repeated byte */ +#define APDS990x_CMD_TYPE_INC (0x01 << 5) /* Auto increment */ +#define APDS990x_CMD_TYPE_SPE (0x03 << 5) /* Special function */ + +#define APDS990x_ADDR_SHIFT 0 +#define APDS990x_CMD 0x80 + +/* Interrupt ack commands */ +#define APDS990X_INT_ACK_ALS 0x6 +#define APDS990X_INT_ACK_PS 0x5 +#define APDS990X_INT_ACK_BOTH 0x7 + +/* ptime */ +#define APDS990X_PTIME_DEFAULT 0xff /* Recommended conversion time 2.7ms*/ + +/* wtime */ +#define APDS990X_WTIME_DEFAULT 0xee /* ~50ms wait time */ + +#define APDS990X_TIME_TO_ADC 1024 /* One timetick as ADC count value */ + +/* Persistence */ +#define APDS990X_APERS_SHIFT 0 +#define APDS990X_PPERS_SHIFT 4 + +/* Supported ID:s */ +#define APDS990X_ID_0 0x0 +#define APDS990X_ID_4 0x4 +#define APDS990X_ID_29 0x29 + +/* pgain and pdiode settings */ +#define APDS_PGAIN_1X 0x0 +#define APDS_PDIODE_IR 0x2 + +#define APDS990X_LUX_OUTPUT_SCALE 10 + +/* Reverse chip factors for threshold calculation */ +struct reverse_factors { + u32 afactor; + int cf1; + int irf1; + int cf2; + int irf2; +}; + +struct apds990x_chip { + 
struct apds990x_platform_data *pdata; + struct i2c_client *client; + struct mutex mutex; /* avoid parallel access */ + struct regulator_bulk_data regs[2]; + wait_queue_head_t wait; + + int prox_en; + bool prox_continuous_mode; + bool lux_wait_fresh_res; + + /* Chip parameters */ + struct apds990x_chip_factors cf; + struct reverse_factors rcf; + u16 atime; /* als integration time */ + u16 arate; /* als reporting rate */ + u16 a_max_result; /* Max possible ADC value with current atime */ + u8 again_meas; /* Gain used in last measurement */ + u8 again_next; /* Next calculated gain */ + u8 pgain; + u8 pdiode; + u8 pdrive; + u8 lux_persistence; + u8 prox_persistence; + + u32 lux_raw; + u32 lux; + u16 lux_clear; + u16 lux_ir; + u16 lux_calib; + u32 lux_thres_hi; + u32 lux_thres_lo; + + u32 prox_thres; + u16 prox_data; + u16 prox_calib; + + char chipname[10]; + u8 revision; +}; + +#define APDS_CALIB_SCALER 8192 +#define APDS_LUX_NEUTRAL_CALIB_VALUE (1 * APDS_CALIB_SCALER) +#define APDS_PROX_NEUTRAL_CALIB_VALUE (1 * APDS_CALIB_SCALER) + +#define APDS_PROX_DEF_THRES 600 +#define APDS_PROX_HYSTERESIS 50 +#define APDS_LUX_DEF_THRES_HI 101 +#define APDS_LUX_DEF_THRES_LO 100 +#define APDS_DEFAULT_PROX_PERS 1 + +#define APDS_TIMEOUT 2000 +#define APDS_STARTUP_DELAY 25000 /* us */ +#define APDS_RANGE 65535 +#define APDS_PROX_RANGE 1023 +#define APDS_LUX_GAIN_LO_LIMIT 100 +#define APDS_LUX_GAIN_LO_LIMIT_STRICT 25 + +#define TIMESTEP 87 /* 2.7ms is about 87 / 32 */ +#define TIME_STEP_SCALER 32 + +#define APDS_LUX_AVERAGING_TIME 50 /* tolerates 50/60Hz ripple */ +#define APDS_LUX_DEFAULT_RATE 200 + +static const u8 again[] = {1, 8, 16, 120}; /* ALS gain steps */ +static const u8 ir_currents[] = {100, 50, 25, 12}; /* IRled currents in mA */ + +/* Following two tables must match i.e 10Hz rate means 1 as persistence value */ +static const u16 arates_hz[] = {10, 5, 2, 1}; +static const u8 apersis[] = {1, 2, 4, 5}; + +/* Regulators */ +static const char reg_vcc[] = "Vdd"; +static const char reg_vled[] = "Vled"; + +static int apds990x_read_byte(struct apds990x_chip *chip, u8 reg, u8 *data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_RB; + + ret = i2c_smbus_read_byte_data(client, reg); + *data = ret; + return (int)ret; +} + +static int apds990x_read_word(struct apds990x_chip *chip, u8 reg, u16 *data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_INC; + + ret = i2c_smbus_read_word_data(client, reg); + *data = ret; + return (int)ret; +} + +static int apds990x_write_byte(struct apds990x_chip *chip, u8 reg, u8 data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_RB; + + ret = i2c_smbus_write_byte_data(client, reg, data); + return (int)ret; +} + +static int apds990x_write_word(struct apds990x_chip *chip, u8 reg, u16 data) +{ + struct i2c_client *client = chip->client; + s32 ret; + + reg &= ~APDS990x_CMD_TYPE_MASK; + reg |= APDS990x_CMD | APDS990x_CMD_TYPE_INC; + + ret = i2c_smbus_write_word_data(client, reg, data); + return (int)ret; +} + +static int apds990x_mode_on(struct apds990x_chip *chip) +{ + /* ALS is mandatory, proximity optional */ + u8 reg = APDS990X_EN_AIEN | APDS990X_EN_PON | APDS990X_EN_AEN | + APDS990X_EN_WEN; + + if (chip->prox_en) + reg |= APDS990X_EN_PIEN | APDS990X_EN_PEN; + + return apds990x_write_byte(chip, APDS990X_ENABLE, reg); +} 
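+
+/*
+ * Rough worked example of the math in apds990x_lux_to_threshold() below,
+ * using illustrative numbers only (not data sheet values): with
+ * atime = 50 ms, ALS gain = 8x, neutral calibration and the uncovered
+ * sensor defaults ga = 1966, df = 52 used by the probe code,
+ *
+ *	cpl = (50 * 8 * 4096 * 64) / (1966 * 52) = 1025
+ *
+ * so the threshold is about lux * 1025 / 64, i.e. roughly 16 clear channel
+ * counts per lux, before the IR compensation at the end of the function.
+ */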
+ +static u16 apds990x_lux_to_threshold(struct apds990x_chip *chip, u32 lux) +{ + u32 thres; + u32 cpl; + u32 ir; + + if (lux == 0) + return 0; + else if (lux == APDS_RANGE) + return APDS_RANGE; + + /* + * Reported LUX value is a combination of the IR and CLEAR channel + * values. However, interrupt threshold is only for clear channel. + * This function approximates needed HW threshold value for a given + * LUX value in the current lightning type. + * IR level compared to visible light varies heavily depending on the + * source of the light + * + * Calculate threshold value for the next measurement period. + * Math: threshold = lux * cpl where + * cpl = atime * again / (glass_attenuation * device_factor) + * (count-per-lux) + * + * First remove calibration. Division by four is to avoid overflow + */ + lux = lux * (APDS_CALIB_SCALER / 4) / (chip->lux_calib / 4); + + /* Multiplication by 64 is to increase accuracy */ + cpl = ((u32)chip->atime * (u32)again[chip->again_next] * + APDS_PARAM_SCALE * 64) / (chip->cf.ga * chip->cf.df); + + thres = lux * cpl / 64; + /* + * Convert IR light from the latest result to match with + * new gain step. This helps to adapt with the current + * source of light. + */ + ir = (u32)chip->lux_ir * (u32)again[chip->again_next] / + (u32)again[chip->again_meas]; + + /* + * Compensate count with IR light impact + * IAC1 > IAC2 (see apds990x_get_lux for formulas) + */ + if (chip->lux_clear * APDS_PARAM_SCALE >= + chip->rcf.afactor * chip->lux_ir) + thres = (chip->rcf.cf1 * thres + chip->rcf.irf1 * ir) / + APDS_PARAM_SCALE; + else + thres = (chip->rcf.cf2 * thres + chip->rcf.irf2 * ir) / + APDS_PARAM_SCALE; + + if (thres >= chip->a_max_result) + thres = chip->a_max_result - 1; + return thres; +} + +static inline int apds990x_set_atime(struct apds990x_chip *chip, u32 time_ms) +{ + u8 reg_value; + + chip->atime = time_ms; + /* Formula is specified in the data sheet */ + reg_value = 256 - ((time_ms * TIME_STEP_SCALER) / TIMESTEP); + /* Calculate max ADC value for given integration time */ + chip->a_max_result = (u16)(256 - reg_value) * APDS990X_TIME_TO_ADC; + return apds990x_write_byte(chip, APDS990X_ATIME, reg_value); +} + +/* Called always with mutex locked */ +static int apds990x_refresh_pthres(struct apds990x_chip *chip, int data) +{ + int ret, lo, hi; + + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + if (data < chip->prox_thres) { + lo = 0; + hi = chip->prox_thres; + } else { + lo = chip->prox_thres - APDS_PROX_HYSTERESIS; + if (chip->prox_continuous_mode) + hi = chip->prox_thres; + else + hi = APDS_RANGE; + } + + ret = apds990x_write_word(chip, APDS990X_PILTL, lo); + ret |= apds990x_write_word(chip, APDS990X_PIHTL, hi); + return ret; +} + +/* Called always with mutex locked */ +static int apds990x_refresh_athres(struct apds990x_chip *chip) +{ + int ret; + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + ret = apds990x_write_word(chip, APDS990X_AILTL, + apds990x_lux_to_threshold(chip, chip->lux_thres_lo)); + ret |= apds990x_write_word(chip, APDS990X_AIHTL, + apds990x_lux_to_threshold(chip, chip->lux_thres_hi)); + + return ret; +} + +/* Called always with mutex locked */ +static void apds990x_force_a_refresh(struct apds990x_chip *chip) +{ + /* This will force ALS interrupt after the next measurement. 
*/ + apds990x_write_word(chip, APDS990X_AILTL, APDS_LUX_DEF_THRES_LO); + apds990x_write_word(chip, APDS990X_AIHTL, APDS_LUX_DEF_THRES_HI); +} + +/* Called always with mutex locked */ +static void apds990x_force_p_refresh(struct apds990x_chip *chip) +{ + /* This will force proximity interrupt after the next measurement. */ + apds990x_write_word(chip, APDS990X_PILTL, APDS_PROX_DEF_THRES - 1); + apds990x_write_word(chip, APDS990X_PIHTL, APDS_PROX_DEF_THRES); +} + +/* Called always with mutex locked */ +static int apds990x_calc_again(struct apds990x_chip *chip) +{ + int curr_again = chip->again_meas; + int next_again = chip->again_meas; + int ret = 0; + + /* Calculate suitable als gain */ + if (chip->lux_clear == chip->a_max_result) + next_again -= 2; /* ALS saturated. Decrease gain by 2 steps */ + else if (chip->lux_clear > chip->a_max_result / 2) + next_again--; + else if (chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT_STRICT) + next_again += 2; /* Too dark. Increase gain by 2 steps */ + else if (chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT) + next_again++; + + /* Limit gain to available range */ + if (next_again < 0) + next_again = 0; + else if (next_again > APDS990X_MAX_AGAIN) + next_again = APDS990X_MAX_AGAIN; + + /* Let's check can we trust the measured result */ + if (chip->lux_clear == chip->a_max_result) + /* Result can be totally garbage due to saturation */ + ret = -ERANGE; + else if (next_again != curr_again && + chip->lux_clear < APDS_LUX_GAIN_LO_LIMIT_STRICT) + /* + * Gain is changed and measurement result is very small. + * Result can be totally garbage due to underflow + */ + ret = -ERANGE; + + chip->again_next = next_again; + apds990x_write_byte(chip, APDS990X_CONTROL, + (chip->pdrive << 6) | + (chip->pdiode << 4) | + (chip->pgain << 2) | + (chip->again_next << 0)); + + /* + * Error means bad result -> re-measurement is needed. The forced + * refresh uses fastest possible persistence setting to get result + * as soon as possible. 
+ */ + if (ret < 0) + apds990x_force_a_refresh(chip); + else + apds990x_refresh_athres(chip); + + return ret; +} + +/* Called always with mutex locked */ +static int apds990x_get_lux(struct apds990x_chip *chip, int clear, int ir) +{ + int iac, iac1, iac2; /* IR adjusted counts */ + u32 lpc; /* Lux per count */ + + /* Formulas: + * iac1 = CF1 * CLEAR_CH - IRF1 * IR_CH + * iac2 = CF2 * CLEAR_CH - IRF2 * IR_CH + */ + iac1 = (chip->cf.cf1 * clear - chip->cf.irf1 * ir) / APDS_PARAM_SCALE; + iac2 = (chip->cf.cf2 * clear - chip->cf.irf2 * ir) / APDS_PARAM_SCALE; + + iac = max(iac1, iac2); + iac = max(iac, 0); + + lpc = APDS990X_LUX_OUTPUT_SCALE * (chip->cf.df * chip->cf.ga) / + (u32)(again[chip->again_meas] * (u32)chip->atime); + + return (iac * lpc) / APDS_PARAM_SCALE; +} + +static int apds990x_ack_int(struct apds990x_chip *chip, u8 mode) +{ + struct i2c_client *client = chip->client; + s32 ret; + u8 reg = APDS990x_CMD | APDS990x_CMD_TYPE_SPE; + + switch (mode & (APDS990X_ST_AINT | APDS990X_ST_PINT)) { + case APDS990X_ST_AINT: + reg |= APDS990X_INT_ACK_ALS; + break; + case APDS990X_ST_PINT: + reg |= APDS990X_INT_ACK_PS; + break; + default: + reg |= APDS990X_INT_ACK_BOTH; + break; + } + + ret = i2c_smbus_read_byte_data(client, reg); + return (int)ret; +} + +static irqreturn_t apds990x_irq(int irq, void *data) +{ + struct apds990x_chip *chip = data; + u8 status; + + apds990x_read_byte(chip, APDS990X_STATUS, &status); + apds990x_ack_int(chip, status); + + mutex_lock(&chip->mutex); + if (!pm_runtime_suspended(&chip->client->dev)) { + if (status & APDS990X_ST_AINT) { + apds990x_read_word(chip, APDS990X_CDATAL, + &chip->lux_clear); + apds990x_read_word(chip, APDS990X_IRDATAL, + &chip->lux_ir); + /* Store used gain for calculations */ + chip->again_meas = chip->again_next; + + chip->lux_raw = apds990x_get_lux(chip, + chip->lux_clear, + chip->lux_ir); + + if (apds990x_calc_again(chip) == 0) { + /* Result is valid */ + chip->lux = chip->lux_raw; + chip->lux_wait_fresh_res = false; + wake_up(&chip->wait); + sysfs_notify(&chip->client->dev.kobj, + NULL, "lux0_input"); + } + } + + if ((status & APDS990X_ST_PINT) && chip->prox_en) { + u16 clr_ch; + + apds990x_read_word(chip, APDS990X_CDATAL, &clr_ch); + /* + * If ALS channel is saturated at min gain, + * proximity gives false posivite values. + * Just ignore them. 
+ */ + if (chip->again_meas == 0 && + clr_ch == chip->a_max_result) + chip->prox_data = 0; + else + apds990x_read_word(chip, + APDS990X_PDATAL, + &chip->prox_data); + + apds990x_refresh_pthres(chip, chip->prox_data); + if (chip->prox_data < chip->prox_thres) + chip->prox_data = 0; + else if (!chip->prox_continuous_mode) + chip->prox_data = APDS_PROX_RANGE; + sysfs_notify(&chip->client->dev.kobj, + NULL, "prox0_raw"); + } + } + mutex_unlock(&chip->mutex); + return IRQ_HANDLED; +} + +static int apds990x_configure(struct apds990x_chip *chip) +{ + /* It is recommended to use disabled mode during these operations */ + apds990x_write_byte(chip, APDS990X_ENABLE, APDS990X_EN_DISABLE_ALL); + + /* conversion and wait times for different state machince states */ + apds990x_write_byte(chip, APDS990X_PTIME, APDS990X_PTIME_DEFAULT); + apds990x_write_byte(chip, APDS990X_WTIME, APDS990X_WTIME_DEFAULT); + apds990x_set_atime(chip, APDS_LUX_AVERAGING_TIME); + + apds990x_write_byte(chip, APDS990X_CONFIG, 0); + + /* Persistence levels */ + apds990x_write_byte(chip, APDS990X_PERS, + (chip->lux_persistence << APDS990X_APERS_SHIFT) | + (chip->prox_persistence << APDS990X_PPERS_SHIFT)); + + apds990x_write_byte(chip, APDS990X_PPCOUNT, chip->pdata->ppcount); + + /* Start with relatively small gain */ + chip->again_meas = 1; + chip->again_next = 1; + apds990x_write_byte(chip, APDS990X_CONTROL, + (chip->pdrive << 6) | + (chip->pdiode << 4) | + (chip->pgain << 2) | + (chip->again_next << 0)); + return 0; +} + +static int apds990x_detect(struct apds990x_chip *chip) +{ + struct i2c_client *client = chip->client; + int ret; + u8 id; + + ret = apds990x_read_byte(chip, APDS990X_ID, &id); + if (ret < 0) { + dev_err(&client->dev, "ID read failed\n"); + return ret; + } + + ret = apds990x_read_byte(chip, APDS990X_REV, &chip->revision); + if (ret < 0) { + dev_err(&client->dev, "REV read failed\n"); + return ret; + } + + switch (id) { + case APDS990X_ID_0: + case APDS990X_ID_4: + case APDS990X_ID_29: + snprintf(chip->chipname, sizeof(chip->chipname), "APDS-990x"); + break; + default: + ret = -ENODEV; + break; + } + return ret; +} + +static int apds990x_chip_on(struct apds990x_chip *chip) +{ + int err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), + chip->regs); + if (err < 0) + return err; + + usleep_range(APDS_STARTUP_DELAY, 2 * APDS_STARTUP_DELAY); + + /* Refresh all configs in case of regulators were off */ + chip->prox_data = 0; + apds990x_configure(chip); + apds990x_mode_on(chip); + return 0; +} + +static int apds990x_chip_off(struct apds990x_chip *chip) +{ + apds990x_write_byte(chip, APDS990X_ENABLE, APDS990X_EN_DISABLE_ALL); + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); + return 0; +} + +static ssize_t apds990x_lux_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + ssize_t ret; + u32 result; + long timeout; + + if (pm_runtime_suspended(dev)) + return -EIO; + + timeout = wait_event_interruptible_timeout(chip->wait, + !chip->lux_wait_fresh_res, + msecs_to_jiffies(APDS_TIMEOUT)); + if (!timeout) + return -EIO; + + mutex_lock(&chip->mutex); + result = (chip->lux * chip->lux_calib) / APDS_CALIB_SCALER; + if (result > (APDS_RANGE * APDS990X_LUX_OUTPUT_SCALE)) + result = APDS_RANGE * APDS990X_LUX_OUTPUT_SCALE; + + ret = sprintf(buf, "%d.%d\n", + result / APDS990X_LUX_OUTPUT_SCALE, + result % APDS990X_LUX_OUTPUT_SCALE); + mutex_unlock(&chip->mutex); + return ret; +} + +static DEVICE_ATTR(lux0_input, S_IRUGO, apds990x_lux_show, NULL); + 
+static ssize_t apds990x_lux_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_RANGE); +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO, apds990x_lux_range_show, NULL); + +static ssize_t apds990x_lux_calib_format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_CALIB_SCALER); +} + +static DEVICE_ATTR(lux0_calibscale_default, S_IRUGO, + apds990x_lux_calib_format_show, NULL); + +static ssize_t apds990x_lux_calib_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", chip->lux_calib); +} + +static ssize_t apds990x_lux_calib_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if (chip->lux_calib > APDS_RANGE) + return -EINVAL; + + chip->lux_calib = value; + + return len; +} + +static DEVICE_ATTR(lux0_calibscale, S_IRUGO | S_IWUSR, apds990x_lux_calib_show, + apds990x_lux_calib_store); + +static ssize_t apds990x_rate_avail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int i; + int pos = 0; + for (i = 0; i < ARRAY_SIZE(arates_hz); i++) + pos += sprintf(buf + pos, "%d ", arates_hz[i]); + sprintf(buf + pos - 1, "\n"); + return pos; +} + +static ssize_t apds990x_rate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->arate); +} + +static int apds990x_set_arate(struct apds990x_chip *chip, int rate) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arates_hz); i++) + if (rate >= arates_hz[i]) + break; + + if (i == ARRAY_SIZE(arates_hz)) + return -EINVAL; + + /* Pick up corresponding persistence value */ + chip->lux_persistence = apersis[i]; + chip->arate = arates_hz[i]; + + /* If the chip is not in use, don't try to access it */ + if (pm_runtime_suspended(&chip->client->dev)) + return 0; + + /* Persistence levels */ + return apds990x_write_byte(chip, APDS990X_PERS, + (chip->lux_persistence << APDS990X_APERS_SHIFT) | + (chip->prox_persistence << APDS990X_PPERS_SHIFT)); +} + +static ssize_t apds990x_rate_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + int ret; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + ret = apds990x_set_arate(chip, value); + mutex_unlock(&chip->mutex); + + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(lux0_rate_avail, S_IRUGO, apds990x_rate_avail, NULL); + +static DEVICE_ATTR(lux0_rate, S_IRUGO | S_IWUSR, apds990x_rate_show, + apds990x_rate_store); + +static ssize_t apds990x_prox_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t ret; + struct apds990x_chip *chip = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev) || !chip->prox_en) + return -EIO; + + mutex_lock(&chip->mutex); + ret = sprintf(buf, "%d\n", chip->prox_data); + mutex_unlock(&chip->mutex); + return ret; +} + +static DEVICE_ATTR(prox0_raw, S_IRUGO, apds990x_prox_show, NULL); + +static ssize_t apds990x_prox_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", APDS_PROX_RANGE); +} + +static DEVICE_ATTR(prox0_sensor_range, S_IRUGO, 
apds990x_prox_range_show, NULL); + +static ssize_t apds990x_prox_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_en); +} + +static ssize_t apds990x_prox_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + mutex_lock(&chip->mutex); + + if (!chip->prox_en) + chip->prox_data = 0; + + if (value) + chip->prox_en++; + else if (chip->prox_en > 0) + chip->prox_en--; + + if (!pm_runtime_suspended(dev)) + apds990x_mode_on(chip); + mutex_unlock(&chip->mutex); + return len; +} + +static DEVICE_ATTR(prox0_raw_en, S_IRUGO | S_IWUSR, apds990x_prox_enable_show, + apds990x_prox_enable_store); + +static const char reporting_modes[][9] = {"trigger", "periodic"}; + +static ssize_t apds990x_prox_reporting_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", + reporting_modes[!!chip->prox_continuous_mode]); +} + +static ssize_t apds990x_prox_reporting_mode_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + + if (sysfs_streq(buf, reporting_modes[0])) + chip->prox_continuous_mode = 0; + else if (sysfs_streq(buf, reporting_modes[1])) + chip->prox_continuous_mode = 1; + else + return -EINVAL; + return len; +} + +static DEVICE_ATTR(prox0_reporting_mode, S_IRUGO | S_IWUSR, + apds990x_prox_reporting_mode_show, + apds990x_prox_reporting_mode_store); + +static ssize_t apds990x_prox_reporting_avail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s %s\n", reporting_modes[0], reporting_modes[1]); +} + +static DEVICE_ATTR(prox0_reporting_mode_avail, S_IRUGO | S_IWUSR, + apds990x_prox_reporting_avail_show, NULL); + + +static ssize_t apds990x_lux_thresh_above_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_thres_hi); +} + +static ssize_t apds990x_lux_thresh_below_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->lux_thres_lo); +} + +static ssize_t apds990x_set_lux_thresh(struct apds990x_chip *chip, u32 *target, + const char *buf) +{ + int ret = 0; + unsigned long thresh; + + if (strict_strtoul(buf, 0, &thresh)) + return -EINVAL; + + if (thresh > APDS_RANGE) + return -EINVAL; + + mutex_lock(&chip->mutex); + *target = thresh; + /* + * Don't update values in HW if we are still waiting for + * first interrupt to come after device handle open call. 
+ */ + if (!chip->lux_wait_fresh_res) + apds990x_refresh_athres(chip); + mutex_unlock(&chip->mutex); + return ret; + +} + +static ssize_t apds990x_lux_thresh_above_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + int ret = apds990x_set_lux_thresh(chip, &chip->lux_thres_hi, buf); + if (ret < 0) + return ret; + return len; +} + +static ssize_t apds990x_lux_thresh_below_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + int ret = apds990x_set_lux_thresh(chip, &chip->lux_thres_lo, buf); + if (ret < 0) + return ret; + return len; +} + +static DEVICE_ATTR(lux0_thresh_above_value, S_IRUGO | S_IWUSR, + apds990x_lux_thresh_above_show, + apds990x_lux_thresh_above_store); + +static DEVICE_ATTR(lux0_thresh_below_value, S_IRUGO | S_IWUSR, + apds990x_lux_thresh_below_show, + apds990x_lux_thresh_below_store); + +static ssize_t apds990x_prox_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", chip->prox_thres); +} + +static ssize_t apds990x_prox_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + + if ((value > APDS_RANGE) || (value == 0) || + (value < APDS_PROX_HYSTERESIS)) + return -EINVAL; + + mutex_lock(&chip->mutex); + chip->prox_thres = value; + + apds990x_force_p_refresh(chip); + mutex_unlock(&chip->mutex); + return len; +} + +static DEVICE_ATTR(prox0_thresh_above_value, S_IRUGO | S_IWUSR, + apds990x_prox_threshold_show, + apds990x_prox_threshold_store); + +static ssize_t apds990x_power_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !pm_runtime_suspended(dev)); + return 0; +} + +static ssize_t apds990x_power_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + unsigned long value; + + if (strict_strtoul(buf, 0, &value)) + return -EINVAL; + if (value) { + pm_runtime_get_sync(dev); + mutex_lock(&chip->mutex); + chip->lux_wait_fresh_res = true; + apds990x_force_a_refresh(chip); + apds990x_force_p_refresh(chip); + mutex_unlock(&chip->mutex); + } else { + if (!pm_runtime_suspended(dev)) + pm_runtime_put(dev); + } + return len; +} + +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, + apds990x_power_state_show, + apds990x_power_state_store); + +static ssize_t apds990x_chip_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct apds990x_chip *chip = dev_get_drvdata(dev); + return sprintf(buf, "%s %d\n", chip->chipname, chip->revision); +} + +static DEVICE_ATTR(chip_id, S_IRUGO, apds990x_chip_id_show, NULL); + +static struct attribute *sysfs_attrs_ctrl[] = { + &dev_attr_lux0_calibscale.attr, + &dev_attr_lux0_calibscale_default.attr, + &dev_attr_lux0_input.attr, + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_rate.attr, + &dev_attr_lux0_rate_avail.attr, + &dev_attr_lux0_thresh_above_value.attr, + &dev_attr_lux0_thresh_below_value.attr, + &dev_attr_prox0_raw_en.attr, + &dev_attr_prox0_raw.attr, + &dev_attr_prox0_sensor_range.attr, + &dev_attr_prox0_thresh_above_value.attr, + &dev_attr_prox0_reporting_mode.attr, + 
&dev_attr_prox0_reporting_mode_avail.attr, + &dev_attr_chip_id.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group apds990x_attribute_group[] = { + {.attrs = sysfs_attrs_ctrl }, +}; + +static int __devinit apds990x_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct apds990x_chip *chip; + int err; + + chip = kzalloc(sizeof *chip, GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->client = client; + + init_waitqueue_head(&chip->wait); + mutex_init(&chip->mutex); + chip->pdata = client->dev.platform_data; + + if (chip->pdata == NULL) { + dev_err(&client->dev, "platform data is mandatory\n"); + err = -EINVAL; + goto fail1; + } + + if (chip->pdata->cf.ga == 0) { + /* set uncovered sensor default parameters */ + chip->cf.ga = 1966; /* 0.48 * APDS_PARAM_SCALE */ + chip->cf.cf1 = 4096; /* 1.00 * APDS_PARAM_SCALE */ + chip->cf.irf1 = 9134; /* 2.23 * APDS_PARAM_SCALE */ + chip->cf.cf2 = 2867; /* 0.70 * APDS_PARAM_SCALE */ + chip->cf.irf2 = 5816; /* 1.42 * APDS_PARAM_SCALE */ + chip->cf.df = 52; + } else { + chip->cf = chip->pdata->cf; + } + + /* precalculate inverse chip factors for threshold control */ + chip->rcf.afactor = + (chip->cf.irf1 - chip->cf.irf2) * APDS_PARAM_SCALE / + (chip->cf.cf1 - chip->cf.cf2); + chip->rcf.cf1 = APDS_PARAM_SCALE * APDS_PARAM_SCALE / + chip->cf.cf1; + chip->rcf.irf1 = chip->cf.irf1 * APDS_PARAM_SCALE / + chip->cf.cf1; + chip->rcf.cf2 = APDS_PARAM_SCALE * APDS_PARAM_SCALE / + chip->cf.cf2; + chip->rcf.irf2 = chip->cf.irf2 * APDS_PARAM_SCALE / + chip->cf.cf2; + + /* Set something to start with */ + chip->lux_thres_hi = APDS_LUX_DEF_THRES_HI; + chip->lux_thres_lo = APDS_LUX_DEF_THRES_LO; + chip->lux_calib = APDS_LUX_NEUTRAL_CALIB_VALUE; + + chip->prox_thres = APDS_PROX_DEF_THRES; + chip->pdrive = chip->pdata->pdrive; + chip->pdiode = APDS_PDIODE_IR; + chip->pgain = APDS_PGAIN_1X; + chip->prox_calib = APDS_PROX_NEUTRAL_CALIB_VALUE; + chip->prox_persistence = APDS_DEFAULT_PROX_PERS; + chip->prox_continuous_mode = false; + + chip->regs[0].supply = reg_vcc; + chip->regs[1].supply = reg_vled; + + err = regulator_bulk_get(&client->dev, + ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot get regulators\n"); + goto fail1; + } + + err = regulator_bulk_enable(ARRAY_SIZE(chip->regs), chip->regs); + if (err < 0) { + dev_err(&client->dev, "Cannot enable regulators\n"); + goto fail2; + } + + usleep_range(APDS_STARTUP_DELAY, 2 * APDS_STARTUP_DELAY); + + err = apds990x_detect(chip); + if (err < 0) { + dev_err(&client->dev, "APDS990X not found\n"); + goto fail3; + } + + pm_runtime_set_active(&client->dev); + + apds990x_configure(chip); + apds990x_set_arate(chip, APDS_LUX_DEFAULT_RATE); + apds990x_mode_on(chip); + + pm_runtime_enable(&client->dev); + + if (chip->pdata->setup_resources) { + err = chip->pdata->setup_resources(); + if (err) { + err = -EINVAL; + goto fail3; + } + } + + err = sysfs_create_group(&chip->client->dev.kobj, + apds990x_attribute_group); + if (err < 0) { + dev_err(&chip->client->dev, "Sysfs registration failed\n"); + goto fail4; + } + + err = request_threaded_irq(client->irq, NULL, + apds990x_irq, + IRQF_TRIGGER_FALLING | IRQF_TRIGGER_LOW | + IRQF_ONESHOT, + "apds990x", chip); + if (err) { + dev_err(&client->dev, "could not get IRQ %d\n", + client->irq); + goto fail5; + } + return err; +fail5: + sysfs_remove_group(&chip->client->dev.kobj, + &apds990x_attribute_group[0]); +fail4: + if (chip->pdata && chip->pdata->release_resources) + 
chip->pdata->release_resources(); +fail3: + regulator_bulk_disable(ARRAY_SIZE(chip->regs), chip->regs); +fail2: + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); +fail1: + kfree(chip); + return err; +} + +static int __devexit apds990x_remove(struct i2c_client *client) +{ + struct apds990x_chip *chip = i2c_get_clientdata(client); + + free_irq(client->irq, chip); + sysfs_remove_group(&chip->client->dev.kobj, + apds990x_attribute_group); + + if (chip->pdata && chip->pdata->release_resources) + chip->pdata->release_resources(); + + if (!pm_runtime_suspended(&client->dev)) + apds990x_chip_off(chip); + + pm_runtime_disable(&client->dev); + pm_runtime_set_suspended(&client->dev); + + regulator_bulk_free(ARRAY_SIZE(chip->regs), chip->regs); + + kfree(chip); + return 0; +} + +#ifdef CONFIG_PM +static int apds990x_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_off(chip); + return 0; +} + +static int apds990x_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + /* + * If we were enabled at suspend time, it is expected + * everything works nice and smoothly. Chip_on is enough + */ + apds990x_chip_on(chip); + + return 0; +} +#else +#define apds990x_suspend NULL +#define apds990x_resume NULL +#define apds990x_shutdown NULL +#endif + +#ifdef CONFIG_PM_RUNTIME +static int apds990x_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_off(chip); + return 0; +} + +static int apds990x_runtime_resume(struct device *dev) +{ + struct i2c_client *client = container_of(dev, struct i2c_client, dev); + struct apds990x_chip *chip = i2c_get_clientdata(client); + + apds990x_chip_on(chip); + return 0; +} + +#endif + +static const struct i2c_device_id apds990x_id[] = { + {"apds990x", 0 }, + {} +}; + +MODULE_DEVICE_TABLE(i2c, apds990x_id); + +static const struct dev_pm_ops apds990x_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(apds990x_suspend, apds990x_resume) + SET_RUNTIME_PM_OPS(apds990x_runtime_suspend, + apds990x_runtime_resume, + NULL) +}; + +static struct i2c_driver apds990x_driver = { + .driver = { + .name = "apds990x", + .owner = THIS_MODULE, + .pm = &apds990x_pm_ops, + }, + .probe = apds990x_probe, + .remove = __devexit_p(apds990x_remove), + .id_table = apds990x_id, +}; + +static int __init apds990x_init(void) +{ + return i2c_add_driver(&apds990x_driver); +} + +static void __exit apds990x_exit(void) +{ + i2c_del_driver(&apds990x_driver); +} + +MODULE_DESCRIPTION("APDS990X combined ALS and proximity sensor"); +MODULE_AUTHOR("Samu Onkalo, Nokia Corporation"); +MODULE_LICENSE("GPL v2"); + +module_init(apds990x_init); +module_exit(apds990x_exit); diff --git a/include/linux/i2c/apds990x.h b/include/linux/i2c/apds990x.h new file mode 100644 index 000000000000..d186fcc5d257 --- /dev/null +++ b/include/linux/i2c/apds990x.h @@ -0,0 +1,79 @@ +/* + * This file is part of the APDS990x sensor driver. + * Chip is combined proximity and ambient light sensor. + * + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). + * + * Contact: Samu Onkalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __APDS990X_H__ +#define __APDS990X_H__ + + +#define APDS_IRLED_CURR_12mA 0x3 +#define APDS_IRLED_CURR_25mA 0x2 +#define APDS_IRLED_CURR_50mA 0x1 +#define APDS_IRLED_CURR_100mA 0x0 + +/** + * struct apds990x_chip_factors - defines effect of the cover window + * @ga: Total glass attenuation + * @cf1: clear channel factor 1 for raw to lux conversion + * @irf1: IR channel factor 1 for raw to lux conversion + * @cf2: clear channel factor 2 for raw to lux conversion + * @irf2: IR channel factor 2 for raw to lux conversion + * @df: device factor for conversion formulas + * + * Structure for tuning ALS calculation to match with environment. + * Values depend on the material above the sensor and the sensor + * itself. If the GA is zero, driver will use uncovered sensor default values + * format: decimal value * APDS_PARAM_SCALE except df which is plain integer. + */ +#define APDS_PARAM_SCALE 4096 +struct apds990x_chip_factors { + int ga; + int cf1; + int irf1; + int cf2; + int irf2; + int df; +}; + +/** + * struct apds990x_platform_data - platform data for apsd990x.c driver + * @cf: chip factor data + * @pddrive: IR-led driving current + * @ppcount: number of IR pulses used for proximity estimation + * @setup_resources: interrupt line setup call back function + * @release_resources: interrupt line release call back function + * + * Proximity detection result depends heavily on correct ppcount, pdrive + * and cover window. + * + */ + +struct apds990x_platform_data { + struct apds990x_chip_factors cf; + u8 pdrive; + u8 ppcount; + int (*setup_resources)(void); + int (*release_resources)(void); +}; + +#endif From 3f0f4a3f2008613c601e97f773dbd80ac400e459 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Tue, 26 Oct 2010 14:22:39 -0700 Subject: [PATCH 108/176] Documentation: short descriptions for bh1770glc and apds990x drivers Add short documentation for two ALS / proximity chip drivers. Signed-off-by: Samu Onkalo Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/misc-devices/apds990x.txt | 111 ++++++++++++++++++++++ Documentation/misc-devices/bh1770glc.txt | 116 +++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 Documentation/misc-devices/apds990x.txt create mode 100644 Documentation/misc-devices/bh1770glc.txt diff --git a/Documentation/misc-devices/apds990x.txt b/Documentation/misc-devices/apds990x.txt new file mode 100644 index 000000000000..d5408cade32f --- /dev/null +++ b/Documentation/misc-devices/apds990x.txt @@ -0,0 +1,111 @@ +Kernel driver apds990x +====================== + +Supported chips: +Avago APDS990X + +Data sheet: +Not freely available + +Author: +Samu Onkalo + +Description +----------- + +APDS990x is a combined ambient light and proximity sensor. ALS and proximity +functionality are highly connected. ALS measurement path must be running +while the proximity functionality is enabled. + +ALS produces raw measurement values for two channels: Clear channel +(infrared + visible light) and IR only. 
However, threshold comparisons happen using the clear channel
+only. The lux value and the corresponding HW threshold level can vary
+considerably depending on the spectrum of the light source.
+
+The driver makes the necessary conversions in both directions so that the
+user deals only with lux values. The lux value is calculated using
+information from both channels. The HW threshold level is calculated from
+the given lux value to match the current type of lighting. Occasionally the
+inaccuracy of this estimate leads to a spurious interrupt, but that is
+harmless.
+
+The ALS has four different gain steps. The driver automatically selects a
+suitable gain step. After each measurement, the reliability of the result is
+estimated and a new measurement is triggered if necessary.
+
+Platform data can provide tuned values for the conversion formulas if such
+values are known. Otherwise plain sensor default values are used.
+
+The proximity side is a little simpler; no complex conversions are needed
+and it produces directly usable values.
+
+The driver controls the chip's operational state using the pm_runtime
+framework. Voltage regulators are controlled based on the chip's
+operational state.
+
+SYSFS
+-----
+
+
+chip_id
+	RO - shows detected chip type and version
+
+power_state
+	RW - enable / disable chip. Uses counting logic
+	     1 enables the chip
+	     0 disables the chip
+lux0_input
+	RO - measured lux value
+	     sysfs_notify called when threshold interrupt occurs
+
+lux0_sensor_range
+	RO - lux0_input max value. In practice never reached, since the
+	     sensor tends to saturate well before that. The real max value
+	     varies depending on the light spectrum etc.
+
+lux0_rate
+	RW - measurement rate in Hz
+
+lux0_rate_avail
+	RO - supported measurement rates
+
+lux0_calibscale
+	RW - calibration value. Set to the neutral value by default.
+	     Output results are multiplied by calibscale / calibscale_default.
+
+lux0_calibscale_default
+	RO - neutral calibration value
+
+lux0_thresh_above_value
+	RW - HI level threshold value. Any result above this value
+	     triggers an interrupt. 65535 (i.e. sensor_range) disables the
+	     above interrupt.
+
+lux0_thresh_below_value
+	RW - LO level threshold value. Any result below this value
+	     triggers an interrupt. 0 disables the below interrupt.
+
+prox0_raw
+	RO - measured proximity value
+	     sysfs_notify called when threshold interrupt occurs
+
+prox0_sensor_range
+	RO - prox0_raw max value (1023)
+
+prox0_raw_en
+	RW - enable / disable proximity - uses counting logic
+	     1 enables the proximity
+	     0 disables the proximity
+
+prox0_reporting_mode
+	RW - trigger / periodic. In "trigger" mode the driver reports one of
+	     two values: 0 or the prox0_sensor_range value. 0 means no
+	     proximity, 1023 means proximity. This causes a minimal number
+	     of interrupts. In "periodic" mode the driver reports all values
+	     above prox0_thresh_above. This causes more interrupts, but it
+	     can give a _rough_ estimate of the distance.
+
+prox0_reporting_mode_avail
+	RO - accepted values for prox0_reporting_mode (trigger, periodic)
+
+prox0_thresh_above_value
+	RW - threshold level which triggers proximity events.
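+
+Usage example
+-------------
+
+As a rough illustration only (this snippet is not part of the driver; the
+sysfs path is an assumption, and the i2c bus number and the 0x39 client
+address are board specific), a userspace program could read the current lux
+value like this:
+
+	/* read lux0_input once */
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char buf[32];
+		ssize_t n;
+		int fd = open("/sys/bus/i2c/devices/2-0039/lux0_input",
+			      O_RDONLY);
+
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		n = read(fd, buf, sizeof(buf) - 1);	/* e.g. "123.4\n" */
+		if (n > 0) {
+			buf[n] = '\0';
+			printf("lux: %s", buf);
+		}
+		close(fd);
+		return 0;
+	}
+
+Writing "1" to the power_state and prox0_raw_en attributes enables the chip
+and the proximity measurement in the same fashion.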
+ALS and proximity parts operates on their own, but they shares common I2C +interface and interrupt logic. In principle they can run on their own, +but ALS side results are used to estimate reliability of the proximity sensor. + +ALS produces 16 bit lux values. The chip contains interrupt logic to produce +low and high threshold interrupts. + +Proximity part contains IR-led driver up to 3 IR leds. The chip measures +amount of reflected IR light and produces proximity result. Resolution is +8 bit. Driver supports only one channel. Driver uses ALS results to estimate +reliability of the proximity results. Thus ALS is always running while +proximity detection is needed. + +Driver uses threshold interrupts to avoid need for polling the values. +Proximity low interrupt doesn't exists in the chip. This is simulated +by using a delayed work. As long as there is proximity threshold above +interrupts the delayed work is pushed forward. So, when proximity level goes +below the threshold value, there is no interrupt and the delayed work will +finally run. This is handled as no proximity indication. + +Chip state is controlled via runtime pm framework when enabled in config. + +Calibscale factor is used to hide differences between the chips. By default +value set to neutral state meaning factor of 1.00. To get proper values, +calibrated source of light is needed as a reference. Calibscale factor is set +so that measurement produces about the expected lux value. + +SYSFS +----- + +chip_id + RO - shows detected chip type and version + +power_state + RW - enable / disable chip. Uses counting logic + 1 enables the chip + 0 disables the chip + +lux0_input + RO - measured lux value + sysfs_notify called when threshold interrupt occurs + +lux0_sensor_range + RO - lux0_input max value + +lux0_rate + RW - measurement rate in Hz + +lux0_rate_avail + RO - supported measurement rates + +lux0_thresh_above_value + RW - HI level threshold value. All results above the value + trigs an interrupt. 65535 (i.e. sensor_range) disables the above + interrupt. + +lux0_thresh_below_value + RW - LO level threshold value. All results below the value + trigs an interrupt. 0 disables the below interrupt. + +lux0_calibscale + RW - calibration value. Set to neutral value by default. + Output results are multiplied with calibscale / calibscale_default + value. + +lux0_calibscale_default + RO - neutral calibration value + +prox0_raw + RO - measured proximity value + sysfs_notify called when threshold interrupt occurs + +prox0_sensor_range + RO - prox0_raw max value + +prox0_raw_en + RW - enable / disable proximity - uses counting logic + 1 enables the proximity + 0 disables the proximity + +prox0_thresh_above_count + RW - number of proximity interrupts needed before triggering the event + +prox0_rate_above + RW - Measurement rate (in Hz) when the level is above threshold + i.e. when proximity on has been reported. + +prox0_rate_below + RW - Measurement rate (in Hz) when the level is below threshold + i.e. when proximity off has been reported. + +prox0_rate_avail + RO - Supported proximity measurement rates in Hz + +prox0_thresh_above0_value + RW - threshold level which trigs proximity events. 
+ Filtered by persistence filter (prox0_thresh_above_count) + +prox0_thresh_above1_value + RW - threshold level which trigs event immediately From 93e2f585c149b5056d5c5eaffcaf747bbe9c3015 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:40 -0700 Subject: [PATCH 109/176] lkdtm: prefix enum constants Prefix cname and ctype constants with CN/CT_. This is especially for the conflict on BUG which causes a build break if arch defines it as a inline function, i.e. MIPS. Signed-off-by: Namhyung Kim Cc: Ankita Garg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/lkdtm.c | 128 +++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 343b5d8ea697..81d7fa4ec0db 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -52,32 +52,32 @@ #define REC_NUM_DEFAULT 10 enum cname { - INVALID, - INT_HARDWARE_ENTRY, - INT_HW_IRQ_EN, - INT_TASKLET_ENTRY, - FS_DEVRW, - MEM_SWAPOUT, - TIMERADD, - SCSI_DISPATCH_CMD, - IDE_CORE_CP, - DIRECT, + CN_INVALID, + CN_INT_HARDWARE_ENTRY, + CN_INT_HW_IRQ_EN, + CN_INT_TASKLET_ENTRY, + CN_FS_DEVRW, + CN_MEM_SWAPOUT, + CN_TIMERADD, + CN_SCSI_DISPATCH_CMD, + CN_IDE_CORE_CP, + CN_DIRECT, }; enum ctype { - NONE, - PANIC, - BUG, - EXCEPTION, - LOOP, - OVERFLOW, - CORRUPT_STACK, - UNALIGNED_LOAD_STORE_WRITE, - OVERWRITE_ALLOCATION, - WRITE_AFTER_FREE, - SOFTLOCKUP, - HARDLOCKUP, - HUNG_TASK, + CT_NONE, + CT_PANIC, + CT_BUG, + CT_EXCEPTION, + CT_LOOP, + CT_OVERFLOW, + CT_CORRUPT_STACK, + CT_UNALIGNED_LOAD_STORE_WRITE, + CT_OVERWRITE_ALLOCATION, + CT_WRITE_AFTER_FREE, + CT_SOFTLOCKUP, + CT_HARDLOCKUP, + CT_HUNG_TASK, }; static char* cp_name[] = { @@ -117,8 +117,8 @@ static char* cpoint_type; static int cpoint_count = DEFAULT_COUNT; static int recur_count = REC_NUM_DEFAULT; -static enum cname cpoint = INVALID; -static enum ctype cptype = NONE; +static enum cname cpoint = CN_INVALID; +static enum ctype cptype = CT_NONE; static int count = DEFAULT_COUNT; module_param(recur_count, int, 0644); @@ -207,12 +207,12 @@ static enum ctype parse_cp_type(const char *what, size_t count) return i + 1; } - return NONE; + return CT_NONE; } static const char *cp_type_to_str(enum ctype type) { - if (type == NONE || type < 0 || type > ARRAY_SIZE(cp_type)) + if (type == CT_NONE || type < 0 || type > ARRAY_SIZE(cp_type)) return "None"; return cp_type[type - 1]; @@ -220,7 +220,7 @@ static const char *cp_type_to_str(enum ctype type) static const char *cp_name_to_str(enum cname name) { - if (name == INVALID || name < 0 || name > ARRAY_SIZE(cp_name)) + if (name == CN_INVALID || name < 0 || name > ARRAY_SIZE(cp_name)) return "INVALID"; return cp_name[name - 1]; @@ -245,7 +245,7 @@ static int lkdtm_parse_commandline(void) return -EINVAL; cptype = parse_cp_type(cpoint_type, strlen(cpoint_type)); - if (cptype == NONE) + if (cptype == CT_NONE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(cp_name); i++) { @@ -274,30 +274,30 @@ static int recursive_loop(int a) static void lkdtm_do_action(enum ctype which) { switch (which) { - case PANIC: + case CT_PANIC: panic("dumptest"); break; - case BUG: + case CT_BUG: BUG(); break; - case EXCEPTION: + case CT_EXCEPTION: *((int *) 0) = 0; break; - case LOOP: + case CT_LOOP: for (;;) ; break; - case OVERFLOW: + case CT_OVERFLOW: (void) recursive_loop(0); break; - case CORRUPT_STACK: { + case CT_CORRUPT_STACK: { volatile u32 data[8]; volatile u32 *p = data; p[12] = 0x12345678; break; } - case UNALIGNED_LOAD_STORE_WRITE: { + case 
CT_UNALIGNED_LOAD_STORE_WRITE: { static u8 data[5] __attribute__((aligned(4))) = {1, 2, 3, 4, 5}; u32 *p; @@ -309,7 +309,7 @@ static void lkdtm_do_action(enum ctype which) *p = val; break; } - case OVERWRITE_ALLOCATION: { + case CT_OVERWRITE_ALLOCATION: { size_t len = 1020; u32 *data = kmalloc(len, GFP_KERNEL); @@ -317,7 +317,7 @@ static void lkdtm_do_action(enum ctype which) kfree(data); break; } - case WRITE_AFTER_FREE: { + case CT_WRITE_AFTER_FREE: { size_t len = 1024; u32 *data = kmalloc(len, GFP_KERNEL); @@ -326,21 +326,21 @@ static void lkdtm_do_action(enum ctype which) memset(data, 0x78, len); break; } - case SOFTLOCKUP: + case CT_SOFTLOCKUP: preempt_disable(); for (;;) cpu_relax(); break; - case HARDLOCKUP: + case CT_HARDLOCKUP: local_irq_disable(); for (;;) cpu_relax(); break; - case HUNG_TASK: + case CT_HUNG_TASK: set_current_state(TASK_UNINTERRUPTIBLE); schedule(); break; - case NONE: + case CT_NONE: default: break; } @@ -363,43 +363,43 @@ static int lkdtm_register_cpoint(enum cname which) { int ret; - cpoint = INVALID; + cpoint = CN_INVALID; if (lkdtm.entry != NULL) unregister_jprobe(&lkdtm); switch (which) { - case DIRECT: + case CN_DIRECT: lkdtm_do_action(cptype); return 0; - case INT_HARDWARE_ENTRY: + case CN_INT_HARDWARE_ENTRY: lkdtm.kp.symbol_name = "do_IRQ"; lkdtm.entry = (kprobe_opcode_t*) jp_do_irq; break; - case INT_HW_IRQ_EN: + case CN_INT_HW_IRQ_EN: lkdtm.kp.symbol_name = "handle_IRQ_event"; lkdtm.entry = (kprobe_opcode_t*) jp_handle_irq_event; break; - case INT_TASKLET_ENTRY: + case CN_INT_TASKLET_ENTRY: lkdtm.kp.symbol_name = "tasklet_action"; lkdtm.entry = (kprobe_opcode_t*) jp_tasklet_action; break; - case FS_DEVRW: + case CN_FS_DEVRW: lkdtm.kp.symbol_name = "ll_rw_block"; lkdtm.entry = (kprobe_opcode_t*) jp_ll_rw_block; break; - case MEM_SWAPOUT: + case CN_MEM_SWAPOUT: lkdtm.kp.symbol_name = "shrink_inactive_list"; lkdtm.entry = (kprobe_opcode_t*) jp_shrink_inactive_list; break; - case TIMERADD: + case CN_TIMERADD: lkdtm.kp.symbol_name = "hrtimer_start"; lkdtm.entry = (kprobe_opcode_t*) jp_hrtimer_start; break; - case SCSI_DISPATCH_CMD: + case CN_SCSI_DISPATCH_CMD: lkdtm.kp.symbol_name = "scsi_dispatch_cmd"; lkdtm.entry = (kprobe_opcode_t*) jp_scsi_dispatch_cmd; break; - case IDE_CORE_CP: + case CN_IDE_CORE_CP: #ifdef CONFIG_IDE lkdtm.kp.symbol_name = "generic_ide_ioctl"; lkdtm.entry = (kprobe_opcode_t*) jp_generic_ide_ioctl; @@ -416,7 +416,7 @@ static int lkdtm_register_cpoint(enum cname which) cpoint = which; if ((ret = register_jprobe(&lkdtm)) < 0) { printk(KERN_INFO "lkdtm: Couldn't register jprobe\n"); - cpoint = INVALID; + cpoint = CN_INVALID; } return ret; @@ -445,7 +445,7 @@ static ssize_t do_register_entry(enum cname which, struct file *f, cptype = parse_cp_type(buf, count); free_page((unsigned long) buf); - if (cptype == NONE) + if (cptype == CT_NONE) return -EINVAL; err = lkdtm_register_cpoint(which); @@ -487,49 +487,49 @@ static int lkdtm_debugfs_open(struct inode *inode, struct file *file) static ssize_t int_hardware_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(INT_HARDWARE_ENTRY, f, buf, count, off); + return do_register_entry(CN_INT_HARDWARE_ENTRY, f, buf, count, off); } static ssize_t int_hw_irq_en(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(INT_HW_IRQ_EN, f, buf, count, off); + return do_register_entry(CN_INT_HW_IRQ_EN, f, buf, count, off); } static ssize_t int_tasklet_entry(struct file *f, const char __user *buf, size_t count, loff_t 
*off) { - return do_register_entry(INT_TASKLET_ENTRY, f, buf, count, off); + return do_register_entry(CN_INT_TASKLET_ENTRY, f, buf, count, off); } static ssize_t fs_devrw_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(FS_DEVRW, f, buf, count, off); + return do_register_entry(CN_FS_DEVRW, f, buf, count, off); } static ssize_t mem_swapout_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(MEM_SWAPOUT, f, buf, count, off); + return do_register_entry(CN_MEM_SWAPOUT, f, buf, count, off); } static ssize_t timeradd_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(TIMERADD, f, buf, count, off); + return do_register_entry(CN_TIMERADD, f, buf, count, off); } static ssize_t scsi_dispatch_cmd_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(SCSI_DISPATCH_CMD, f, buf, count, off); + return do_register_entry(CN_SCSI_DISPATCH_CMD, f, buf, count, off); } static ssize_t ide_core_cp_entry(struct file *f, const char __user *buf, size_t count, loff_t *off) { - return do_register_entry(IDE_CORE_CP, f, buf, count, off); + return do_register_entry(CN_IDE_CORE_CP, f, buf, count, off); } /* Special entry to just crash directly. Available without KPROBEs */ @@ -557,7 +557,7 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf, type = parse_cp_type(buf, count); free_page((unsigned long) buf); - if (type == NONE) + if (type == CT_NONE) return -EINVAL; printk(KERN_INFO "lkdtm: Performing direct entry %s\n", @@ -649,7 +649,7 @@ static int __init lkdtm_module_init(void) goto out_err; } - if (cpoint != INVALID && cptype != NONE) { + if (cpoint != CN_INVALID && cptype != CT_NONE) { ret = lkdtm_register_cpoint(cpoint); if (ret < 0) { printk(KERN_INFO "lkdtm: Invalid crash point %d\n", From 2e85c4ddd3f32d3e1da51f4129473399e505ffa3 Mon Sep 17 00:00:00 2001 From: Kalhan Trisal Date: Tue, 26 Oct 2010 14:22:40 -0700 Subject: [PATCH 110/176] drivers/misc/isl29020.c: ambient light sensor The LS driver will read the latest Lux measurement based upon the light brightness and will report the LUX output through sysfs interface. This hardware isn't quite the same as the ISL29003 so has a different driver. [akpm@linux-foundation.org: put PM code under #ifdef CONFIG_PM] Signed-off-by: Kalhan Trisal [Runtime power management support added] Signed-off-by: Arjan van de Ven [Fixes to runtime PM] Signed-off-by: Liu Hong [Cleanups and added checks for I2C errors, reworked the API to match the saner one agreed for other sensors] Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 ++ drivers/misc/Makefile | 1 + drivers/misc/isl29020.c | 248 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 drivers/misc/isl29020.c diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 7362851a2ca8..477a9434b9c0 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -295,6 +295,16 @@ config ISL29003 This driver can also be built as a module. If so, the module will be called isl29003. +config ISL29020 + tristate "Intersil ISL29020 ambient light sensor" + depends on I2C + help + If you say yes here you get support for the Intersil ISL29020 + ambient light sensor. + + This driver can also be built as a module. If so, the module + will be called isl29020. 
+ config SENSORS_TSL2550 tristate "Taos TSL2550 ambient light sensor" depends on I2C && SYSFS diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 081b9fca8a06..2ecdd32d1357 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_HP_ILO) += hpilo.o obj-$(CONFIG_ISL29003) += isl29003.o +obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_EP93XX_PWM) += ep93xx_pwm.o obj-$(CONFIG_DS1682) += ds1682.o diff --git a/drivers/misc/isl29020.c b/drivers/misc/isl29020.c new file mode 100644 index 000000000000..34fe835921c4 --- /dev/null +++ b/drivers/misc/isl29020.c @@ -0,0 +1,248 @@ +/* + * isl29020.c - Intersil ALS Driver + * + * Copyright (C) 2008 Intel Corp + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Data sheet at: http://www.intersil.com/data/fn/fn6505.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(mutex); + +static ssize_t als_sensing_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int val; + + val = i2c_smbus_read_byte_data(client, 0x00); + + if (val < 0) + return val; + return sprintf(buf, "%d000\n", 1 << (2 * (val & 3))); + +} + +static ssize_t als_lux_input_data_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret_val, val; + unsigned long int lux; + int temp; + + pm_runtime_get_sync(dev); + msleep(100); + + mutex_lock(&mutex); + temp = i2c_smbus_read_byte_data(client, 0x02); /* MSB data */ + if (temp < 0) { + pm_runtime_put_sync(dev); + mutex_unlock(&mutex); + return temp; + } + + ret_val = i2c_smbus_read_byte_data(client, 0x01); /* LSB data */ + mutex_unlock(&mutex); + + if (ret_val < 0) { + pm_runtime_put_sync(dev); + return ret_val; + } + + ret_val |= temp << 8; + val = i2c_smbus_read_byte_data(client, 0x00); + pm_runtime_put_sync(dev); + if (val < 0) + return val; + lux = ((((1 << (2 * (val & 3))))*1000) * ret_val) / 65536; + return sprintf(buf, "%ld\n", lux); +} + +static ssize_t als_sensing_range_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned int ret_val; + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + if (val < 1 || val > 64000) + return -EINVAL; + + /* Pick the smallest sensor range that will meet our requirements */ + if (val <= 1000) + val = 1; + else if (val <= 4000) + val = 2; + else if (val <= 16000) + val = 3; + else + val = 4; + + ret_val = 
i2c_smbus_read_byte_data(client, 0x00); + + ret_val &= 0xFC; /*reset the bit before setting them */ + ret_val |= val - 1; + ret_val = i2c_smbus_write_byte_data(client, 0x00, ret_val); + + if (ret_val < 0) + return ret_val; + return count; +} + +static void als_set_power_state(struct i2c_client *client, int enable) +{ + int ret_val; + + ret_val = i2c_smbus_read_byte_data(client, 0x00); + if (ret_val < 0) + return; + + if (enable) + ret_val |= 0x80; + else + ret_val &= 0x7F; + + i2c_smbus_write_byte_data(client, 0x00, ret_val); +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, + als_sensing_range_show, als_sensing_range_store); +static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux_input_data_show, NULL); + +static struct attribute *mid_att_als[] = { + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_input.attr, + NULL +}; + +static struct attribute_group m_als_gr = { + .name = "isl29020", + .attrs = mid_att_als +}; + +static int als_set_default_config(struct i2c_client *client) +{ + int retval; + + retval = i2c_smbus_write_byte_data(client, 0x00, 0xc0); + if (retval < 0) { + dev_err(&client->dev, "default write failed."); + return retval; + } + return 0;; +} + +static int isl29020_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int res; + + res = als_set_default_config(client); + if (res < 0) + return res; + + res = sysfs_create_group(&client->dev.kobj, &m_als_gr); + if (res) { + dev_err(&client->dev, "isl29020: device create file failed\n"); + return res; + } + dev_info(&client->dev, "%s isl29020: ALS chip found\n", client->name); + als_set_power_state(client, 0); + pm_runtime_enable(&client->dev); + return res; +} + +static int isl29020_remove(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + sysfs_remove_group(&client->dev.kobj, &m_als_gr); + kfree(data); + return 0; +} + +static struct i2c_device_id isl29020_id[] = { + { "isl29020", 0 }, + { } +}; + +MODULE_DEVICE_TABLE(i2c, isl29020_id); + +#ifdef CONFIG_PM + +static int isl29020_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + als_set_power_state(client, 0); + return 0; +} + +static int isl29020_runtime_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + als_set_power_state(client, 1); + return 0; +} + +static const struct dev_pm_ops isl29020_pm_ops = { + .runtime_suspend = isl29020_runtime_suspend, + .runtime_resume = isl29020_runtime_resume, +}; + +#define ISL29020_PM_OPS (&isl29020_pm_ops) +#else /* CONFIG_PM */ +#define ISL29020_PM_OPS NULL +#endif /* CONFIG_PM */ + +static struct i2c_driver isl29020_driver = { + .driver = { + .name = "isl29020", + .pm = ISL29020_PM_OPS, + }, + .probe = isl29020_probe, + .remove = isl29020_remove, + .id_table = isl29020_id, +}; + +static int __init sensor_isl29020_init(void) +{ + return i2c_add_driver(&isl29020_driver); +} + +static void __exit sensor_isl29020_exit(void) +{ + i2c_del_driver(&isl29020_driver); +} + +module_init(sensor_isl29020_init); +module_exit(sensor_isl29020_exit); + +MODULE_AUTHOR("Kalhan Trisal Date: Tue, 26 Oct 2010 14:22:41 -0700 Subject: [PATCH 111/176] pca953x: pca953x driver fixes for x86 mrst Our Moorestown platform has two max7315 chips which is covered by pca953x i2c gpio driver. A while ago this driver got updated with nested irq thread support, and it broke the compatibity with "request_irq". For example, the gpio_keys.c driver can not work with this driver now. This patch fixes the issue by switching to generic_handle_irq. 
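For illustration, the dispatch pattern the driver is being switched to looks roughly like the sketch below (the struct and the register-read helper are stand-ins, not the real pca953x code): each pending expander line is handed to the normal interrupt flow with generic_handle_irq(), which is what lets consumers attach handlers to the child interrupts with a plain request_irq().

    #include <linux/bitops.h>
    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/types.h>

    struct expander {                        /* stand-in for struct pca953x_chip */
            int irq_base;
    };

    static u32 read_pending_bits(struct expander *chip)
    {
            return 0;                        /* hypothetical register read */
    }

    static irqreturn_t expander_irq_handler(int irq, void *devid)
    {
            struct expander *chip = devid;
            u32 pending = read_pending_bits(chip);

            while (pending) {
                    unsigned int level = __ffs(pending);

                    generic_handle_irq(chip->irq_base + level);
                    pending &= ~(1u << level);
            }
            return IRQ_HANDLED;
    }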
Also fix the irq_base issue: irq_base == 0 is valid, and a "-1" value should mean invalid. IRQ 0 is not a valid IRQ, irq_base of 0 is valid. Signed-off-by: Alek Du Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/pca953x.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/pca953x.c b/drivers/gpio/pca953x.c index a2b12aa1f2b9..501866662e05 100644 --- a/drivers/gpio/pca953x.c +++ b/drivers/gpio/pca953x.c @@ -345,7 +345,7 @@ static irqreturn_t pca953x_irq_handler(int irq, void *devid) do { level = __ffs(pending); - handle_nested_irq(level + chip->irq_base); + generic_handle_irq(level + chip->irq_base); pending &= ~(1 << level); } while (pending); @@ -360,7 +360,8 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, struct pca953x_platform_data *pdata = client->dev.platform_data; int ret; - if (pdata->irq_base && (id->driver_data & PCA953X_INT)) { + if (pdata->irq_base != -1 + && (id->driver_data & PCA953X_INT)) { int lvl; ret = pca953x_read_reg(chip, PCA953X_INPUT, @@ -383,7 +384,6 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, set_irq_chip_data(irq, chip); set_irq_chip_and_handler(irq, &pca953x_irq_chip, handle_edge_irq); - set_irq_nested_thread(irq, 1); #ifdef CONFIG_ARM set_irq_flags(irq, IRQF_VALID); #else @@ -394,6 +394,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, ret = request_threaded_irq(client->irq, NULL, pca953x_irq_handler, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING | IRQF_ONESHOT, dev_name(&client->dev), chip); if (ret) { @@ -408,13 +409,13 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, return 0; out_failed: - chip->irq_base = 0; + chip->irq_base = -1; return ret; } static void pca953x_irq_teardown(struct pca953x_chip *chip) { - if (chip->irq_base) + if (chip->irq_base != -1) free_irq(chip->client->irq, chip); } #else /* CONFIG_GPIO_PCA953X_IRQ */ @@ -424,7 +425,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, struct i2c_client *client = chip->client; struct pca953x_platform_data *pdata = client->dev.platform_data; - if (pdata->irq_base && (id->driver_data & PCA953X_INT)) + if (pdata->irq_base != -1 && (id->driver_data & PCA953X_INT)) dev_warn(&client->dev, "interrupt support not compiled in\n"); return 0; From 22d96aa59cf120db3584e4c3365554cae77d2441 Mon Sep 17 00:00:00 2001 From: anantha Date: Tue, 26 Oct 2010 14:22:41 -0700 Subject: [PATCH 112/176] drivers/misc/apds9802als.c: ALS drivers for the apds9802als This adds support for the ADPS9802ALS sensor. Cleanup by Alan Cox - move mutexes to cover more things - report I/O errors back to user space - report range and values in LUX Signed-off-by: Anantha Narayanan [The 4K and 64K in the hw spec actually means 4095 (12bit) and 65535 (16bit).] Signed-off-by: Hong Liu [Updated to match the ALS light API interface convention from Samu] Signed-off-by: Alan Cox Acked-by: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 10 ++ drivers/misc/Makefile | 1 + drivers/misc/apds9802als.c | 308 +++++++++++++++++++++++++++++++++++++ 3 files changed, 319 insertions(+) create mode 100644 drivers/misc/apds9802als.c diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 477a9434b9c0..6d7665ce6f4a 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -285,6 +285,16 @@ config SGI_GRU_DEBUG This option enables addition debugging code for the SGI GRU driver. If you are unsure, say N. 
+config APDS9802ALS + tristate "Medfield Avago APDS9802 ALS Sensor module" + depends on I2C + help + If you say yes here you get support for the ALS APDS9802 ambient + light sensor. + + This driver can also be built as a module. If so, the module + will be called apds9802als. + config ISL29003 tristate "Intersil ISL29003 ambient light sensor" depends on I2C && SYSFS diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 2ecdd32d1357..4be5c6fc5ef4 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_HP_ILO) += hpilo.o +obj-$(CONFIG_APDS9802ALS) += apds9802als.o obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c new file mode 100644 index 000000000000..fbe49602f396 --- /dev/null +++ b/drivers/misc/apds9802als.c @@ -0,0 +1,308 @@ +/* + * apds9802als.c - apds9802 ALS Driver + * + * Copyright (C) 2009 Intel Corp + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALS_MIN_RANGE_VAL 1 +#define ALS_MAX_RANGE_VAL 2 +#define POWER_STA_ENABLE 1 +#define POWER_STA_DISABLE 0 +#define APDS9802ALS_I2C_ADDR 0x29 + +#define DRIVER_NAME "apds9802als" + +struct als_data { + struct device *hwmon_dev; + bool needresume; + struct mutex mutex; +}; + +static ssize_t als_sensing_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int val; + + val = i2c_smbus_read_byte_data(client, 0x81); + if (val < 0) + return val; + if (val & 1) + return sprintf(buf, "4095\n"); + else + return sprintf(buf, "65535\n"); +} + +static ssize_t als_lux0_input_data_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned int ret_val; + int temp; + + /* Protect against parallel reads */ + mutex_lock(&data->mutex); + temp = i2c_smbus_read_byte_data(client, 0x8C);/*LSB data*/ + if (temp < 0) { + ret_val = temp; + goto failed; + } + ret_val = i2c_smbus_read_byte_data(client, 0x8D);/*MSB data*/ + if (ret_val < 0) + goto failed; + mutex_unlock(&data->mutex); + ret_val = (ret_val << 8) | temp; + return sprintf(buf, "%d\n", ret_val); +failed: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_sensing_range_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned int ret_val; + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val < 4096) + val = 1; + else if (val < 65536) + val = 2; + else + return -ERANGE; + + /* Make sure nobody else reads/modifies/writes 0x81 while we + are active */ + + mutex_lock(&data->mutex); + + ret_val = i2c_smbus_read_byte_data(client, 0x81); + if (ret_val < 0) + goto fail; + + /* Reset the bits before setting them */ + ret_val = ret_val & 0xFA; + + if (val == 1) /* Setting the continous measurement up to 4k LUX */ + ret_val = (ret_val | 0x05); + else /* Setting the continous measurement up to 64k LUX*/ + ret_val = (ret_val | 0x04); + + ret_val = i2c_smbus_write_byte_data(client, 0x81, ret_val); + if (ret_val >= 0) { + /* All OK */ + mutex_unlock(&data->mutex); + return count; + } +fail: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_power_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret_val; + ret_val = i2c_smbus_read_byte_data(client, 0x80); + if (ret_val < 0) + return ret_val; + ret_val = ret_val & 0x01; + return sprintf(buf, "%d\n", ret_val); +} + +static int als_set_power_state(struct i2c_client *client, bool on_off) +{ + int ret_val; + struct als_data *data = i2c_get_clientdata(client); + + mutex_lock(&data->mutex); + ret_val = i2c_smbus_read_byte_data(client, 0x80); + if (ret_val < 0) + goto fail; + if (on_off) + ret_val = ret_val | 0x01; + else + ret_val = ret_val & 0xFE; + ret_val = i2c_smbus_write_byte_data(client, 0x80, ret_val); +fail: + mutex_unlock(&data->mutex); + return ret_val; +} + +static ssize_t als_power_status_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = 
to_i2c_client(dev); + struct als_data *data = i2c_get_clientdata(client); + unsigned long val; + int ret_val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + if (val == POWER_STA_ENABLE) { + ret_val = als_set_power_state(client, true); + data->needresume = true; + } else if (val == POWER_STA_DISABLE) { + ret_val = als_set_power_state(client, false); + data->needresume = false; + } else + return -EINVAL; + if (ret_val < 0) + return ret_val; + return count; +} + +static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, + als_sensing_range_show, als_sensing_range_store); +static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux0_input_data_show, NULL); +static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, + als_power_status_show, als_power_status_store); + +static struct attribute *mid_att_als[] = { + &dev_attr_lux0_sensor_range.attr, + &dev_attr_lux0_input.attr, + &dev_attr_power_state.attr, + NULL +}; + +static struct attribute_group m_als_gr = { + .name = "apds9802als", + .attrs = mid_att_als +}; + +static int als_set_default_config(struct i2c_client *client) +{ + int ret_val; + /* Write the command and then switch on */ + ret_val = i2c_smbus_write_byte_data(client, 0x80, 0x01); + if (ret_val < 0) { + dev_err(&client->dev, "failed default switch on write\n"); + return ret_val; + } + /* Continous from 1Lux to 64k Lux */ + ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x04); + if (ret_val < 0) + dev_err(&client->dev, "failed default LUX on write\n"); + return ret_val; +} + +static int apds9802als_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int res; + struct als_data *data; + + data = kzalloc(sizeof(struct als_data), GFP_KERNEL); + if (data == NULL) { + dev_err(&client->dev, "Memory allocation failed\n"); + return -ENOMEM; + } + i2c_set_clientdata(client, data); + res = sysfs_create_group(&client->dev.kobj, &m_als_gr); + if (res) { + dev_err(&client->dev, "device create file failed\n"); + goto als_error1; + } + dev_info(&client->dev, + "%s apds9802als: ALS chip found\n", client->name); + als_set_default_config(client); + data->needresume = true; + mutex_init(&data->mutex); + return res; +als_error1: + i2c_set_clientdata(client, NULL); + kfree(data); + return res; +} + +static int apds9802als_remove(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + sysfs_remove_group(&client->dev.kobj, &m_als_gr); + kfree(data); + return 0; +} + +static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) +{ + als_set_power_state(client, false); + return 0; +} + +static int apds9802als_resume(struct i2c_client *client) +{ + struct als_data *data = i2c_get_clientdata(client); + + if (data->needresume == true) + als_set_power_state(client, true); + return 0; +} + +static struct i2c_device_id apds9802als_id[] = { + { DRIVER_NAME, 0 }, + { } +}; + +MODULE_DEVICE_TABLE(i2c, apds9802als_id); + +static struct i2c_driver apds9802als_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + }, + .probe = apds9802als_probe, + .remove = apds9802als_remove, + .suspend = apds9802als_suspend, + .resume = apds9802als_resume, + .id_table = apds9802als_id, +}; + +static int __init sensor_apds9802als_init(void) +{ + return i2c_add_driver(&apds9802als_driver); +} + +static void __exit sensor_apds9802als_exit(void) +{ + i2c_del_driver(&apds9802als_driver); +} +module_init(sensor_apds9802als_init); +module_exit(sensor_apds9802als_exit); + +MODULE_AUTHOR("Anantha Narayanan Date: Tue, 26 Oct 2010 14:22:42 -0700 Subject: [PATCH 
113/176] drivers/misc/apds9802als.c: add runtime PM support Update the driver for the needed runtime power features. Remove the old user controlled power functions. [akpm@linux-foundation.org: put PM code under CONFIG_PM] Signed-off-by: Hong Liu Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/apds9802als.c | 167 +++++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 64 deletions(-) diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c index fbe49602f396..f9b91ba8900c 100644 --- a/drivers/misc/apds9802als.c +++ b/drivers/misc/apds9802als.c @@ -29,19 +29,16 @@ #include #include #include -#include +#include #define ALS_MIN_RANGE_VAL 1 #define ALS_MAX_RANGE_VAL 2 #define POWER_STA_ENABLE 1 #define POWER_STA_DISABLE 0 -#define APDS9802ALS_I2C_ADDR 0x29 #define DRIVER_NAME "apds9802als" struct als_data { - struct device *hwmon_dev; - bool needresume; struct mutex mutex; }; @@ -60,29 +57,64 @@ static ssize_t als_sensing_range_show(struct device *dev, return sprintf(buf, "65535\n"); } +static int als_wait_for_data_ready(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + int ret; + int retry = 10; + + do { + msleep(30); + ret = i2c_smbus_read_byte_data(client, 0x86); + } while (!(ret & 0x80) && retry--); + + if (!retry) { + dev_warn(dev, "timeout waiting for data ready\n"); + return -ETIMEDOUT; + } + + return 0; +} + static ssize_t als_lux0_input_data_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); struct als_data *data = i2c_get_clientdata(client); - unsigned int ret_val; + int ret_val; int temp; /* Protect against parallel reads */ + pm_runtime_get_sync(dev); mutex_lock(&data->mutex); - temp = i2c_smbus_read_byte_data(client, 0x8C);/*LSB data*/ + + /* clear EOC interrupt status */ + i2c_smbus_write_byte(client, 0x40); + /* start measurement */ + temp = i2c_smbus_read_byte_data(client, 0x81); + i2c_smbus_write_byte_data(client, 0x81, temp | 0x08); + + ret_val = als_wait_for_data_ready(dev); + if (ret_val < 0) + goto failed; + + temp = i2c_smbus_read_byte_data(client, 0x8C); /* LSB data */ if (temp < 0) { ret_val = temp; goto failed; } - ret_val = i2c_smbus_read_byte_data(client, 0x8D);/*MSB data*/ + ret_val = i2c_smbus_read_byte_data(client, 0x8D); /* MSB data */ if (ret_val < 0) goto failed; + mutex_unlock(&data->mutex); - ret_val = (ret_val << 8) | temp; - return sprintf(buf, "%d\n", ret_val); + pm_runtime_put_sync(dev); + + temp = (ret_val << 8) | temp; + return sprintf(buf, "%d\n", temp); failed: mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return ret_val; } @@ -104,9 +136,10 @@ static ssize_t als_sensing_range_store(struct device *dev, else return -ERANGE; + pm_runtime_get_sync(dev); + /* Make sure nobody else reads/modifies/writes 0x81 while we are active */ - mutex_lock(&data->mutex); ret_val = i2c_smbus_read_byte_data(client, 0x81); @@ -116,34 +149,25 @@ static ssize_t als_sensing_range_store(struct device *dev, /* Reset the bits before setting them */ ret_val = ret_val & 0xFA; - if (val == 1) /* Setting the continous measurement up to 4k LUX */ - ret_val = (ret_val | 0x05); - else /* Setting the continous measurement up to 64k LUX*/ - ret_val = (ret_val | 0x04); + if (val == 1) /* Setting detection range up to 4k LUX */ + ret_val = (ret_val | 0x01); + else /* Setting detection range up to 64k LUX*/ + ret_val = (ret_val | 0x00); ret_val = i2c_smbus_write_byte_data(client, 0x81, ret_val); + if (ret_val >= 
0) { /* All OK */ mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return count; } fail: mutex_unlock(&data->mutex); + pm_runtime_put_sync(dev); return ret_val; } -static ssize_t als_power_status_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct i2c_client *client = to_i2c_client(dev); - int ret_val; - ret_val = i2c_smbus_read_byte_data(client, 0x80); - if (ret_val < 0) - return ret_val; - ret_val = ret_val & 0x01; - return sprintf(buf, "%d\n", ret_val); -} - static int als_set_power_state(struct i2c_client *client, bool on_off) { int ret_val; @@ -163,39 +187,13 @@ static int als_set_power_state(struct i2c_client *client, bool on_off) return ret_val; } -static ssize_t als_power_status_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct i2c_client *client = to_i2c_client(dev); - struct als_data *data = i2c_get_clientdata(client); - unsigned long val; - int ret_val; - - if (strict_strtoul(buf, 10, &val)) - return -EINVAL; - if (val == POWER_STA_ENABLE) { - ret_val = als_set_power_state(client, true); - data->needresume = true; - } else if (val == POWER_STA_DISABLE) { - ret_val = als_set_power_state(client, false); - data->needresume = false; - } else - return -EINVAL; - if (ret_val < 0) - return ret_val; - return count; -} - static DEVICE_ATTR(lux0_sensor_range, S_IRUGO | S_IWUSR, als_sensing_range_show, als_sensing_range_store); static DEVICE_ATTR(lux0_input, S_IRUGO, als_lux0_input_data_show, NULL); -static DEVICE_ATTR(power_state, S_IRUGO | S_IWUSR, - als_power_status_show, als_power_status_store); static struct attribute *mid_att_als[] = { &dev_attr_lux0_sensor_range.attr, &dev_attr_lux0_input.attr, - &dev_attr_power_state.attr, NULL }; @@ -213,15 +211,21 @@ static int als_set_default_config(struct i2c_client *client) dev_err(&client->dev, "failed default switch on write\n"); return ret_val; } - /* Continous from 1Lux to 64k Lux */ - ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x04); + /* detection range: 1~64K Lux, maunal measurement */ + ret_val = i2c_smbus_write_byte_data(client, 0x81, 0x08); if (ret_val < 0) dev_err(&client->dev, "failed default LUX on write\n"); + + /* We always get 0 for the 1st measurement after system power on, + * so make sure it is finished before user asks for data. 
+ */ + als_wait_for_data_ready(&client->dev); + return ret_val; } -static int apds9802als_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds9802als_probe(struct i2c_client *client, + const struct i2c_device_id *id) { int res; struct als_data *data; @@ -237,11 +241,14 @@ static int apds9802als_probe(struct i2c_client *client, dev_err(&client->dev, "device create file failed\n"); goto als_error1; } - dev_info(&client->dev, - "%s apds9802als: ALS chip found\n", client->name); + dev_info(&client->dev, "ALS chip found\n"); als_set_default_config(client); - data->needresume = true; mutex_init(&data->mutex); + + pm_runtime_enable(&client->dev); + pm_runtime_get(&client->dev); + pm_runtime_put(&client->dev); + return res; als_error1: i2c_set_clientdata(client, NULL); @@ -252,11 +259,14 @@ static int apds9802als_probe(struct i2c_client *client, static int apds9802als_remove(struct i2c_client *client) { struct als_data *data = i2c_get_clientdata(client); + + als_set_power_state(client, false); sysfs_remove_group(&client->dev.kobj, &m_als_gr); kfree(data); return 0; } +#ifdef CONFIG_PM static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) { als_set_power_state(client, false); @@ -265,13 +275,42 @@ static int apds9802als_suspend(struct i2c_client *client, pm_message_t mesg) static int apds9802als_resume(struct i2c_client *client) { - struct als_data *data = i2c_get_clientdata(client); + als_set_default_config(client); - if (data->needresume == true) - als_set_power_state(client, true); + pm_runtime_get(&client->dev); + pm_runtime_put(&client->dev); return 0; } +static int apds9802als_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + als_set_power_state(client, false); + return 0; +} + +static int apds9802als_runtime_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + als_set_power_state(client, true); + return 0; +} + +static const struct dev_pm_ops apds9802als_pm_ops = { + .runtime_suspend = apds9802als_runtime_suspend, + .runtime_resume = apds9802als_runtime_resume, +}; + +#define APDS9802ALS_PM_OPS (&apds9802als_pm_ops) + +#else /* CONFIG_PM */ +#define apds9802als_suspend NULL +#define apds9802als_resume NULL +#define APDS9802ALS_PM_OPS NULL +#endif /* CONFIG_PM */ + static struct i2c_device_id apds9802als_id[] = { { DRIVER_NAME, 0 }, { } @@ -281,8 +320,8 @@ MODULE_DEVICE_TABLE(i2c, apds9802als_id); static struct i2c_driver apds9802als_driver = { .driver = { - .name = DRIVER_NAME, - .owner = THIS_MODULE, + .name = DRIVER_NAME, + .pm = APDS9802ALS_PM_OPS, }, .probe = apds9802als_probe, .remove = apds9802als_remove, From 562f5e638de4ef451226552fe8dd7847bacea24e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:42 -0700 Subject: [PATCH 114/176] init: mark __user address space on string literals When calling syscall service routines in kernel, some of arguments should be user pointers but were missing __user markup on string literals. Add it. Removes some sparse warnings. 
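For readers unfamiliar with the annotation, here is a minimal standalone sketch of what sparse is checking (the macro definitions mirror roughly what <linux/compiler.h> provides when __CHECKER__ is set; the syscall stub is hypothetical):

    #ifdef __CHECKER__
    # define __user   __attribute__((noderef, address_space(1)))
    # define __force  __attribute__((force))
    #else
    # define __user
    # define __force
    #endif

    static long sys_chdir_stub(const char __user *filename)
    {
            (void)filename;                  /* stand-in for the real sys_chdir() */
            return 0;
    }

    static void early_boot_example(void)
    {
            /*
             * "/root" is an ordinary kernel string literal (address space 0),
             * so passing it directly makes sparse warn about the address-space
             * mismatch.  The __force cast records that crossing the boundary
             * here is intentional, which is what this patch adds at each call.
             */
            sys_chdir_stub((const char __user __force *)"/root");
    }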
Signed-off-by: Namhyung Kim Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 4 ++-- init/do_mounts_md.c | 2 +- init/do_mounts_rd.c | 4 ++-- init/initramfs.c | 5 +++-- init/noinitramfs.c | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 62a47eafa8e9..830aaec9c7d5 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -291,7 +291,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) if (err) return err; - sys_chdir("/root"); + sys_chdir((const char __user __force *)"/root"); ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", current->fs->pwd.mnt->mnt_sb->s_type->name, @@ -488,5 +488,5 @@ void __init prepare_namespace(void) out: devtmpfs_mount("dev"); sys_mount(".", "/", NULL, MS_MOVE, NULL); - sys_chroot("."); + sys_chroot((const char __user __force *)"."); } diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 69aebbf8fd2d..32c4799b8c91 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -283,7 +283,7 @@ static void __init autodetect_raid(void) wait_for_device_probe(); - fd = sys_open("/dev/md0", 0, 0); + fd = sys_open((const char __user __force *) "/dev/md0", 0, 0); if (fd >= 0) { sys_ioctl(fd, RAID_AUTORUN, raid_autopart); sys_close(fd); diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index bf3ef667bf36..6e1ee6987c78 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -168,7 +168,7 @@ int __init rd_load_image(char *from) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif - out_fd = sys_open("/dev/ram", O_RDWR, 0); + out_fd = sys_open((const char __user __force *) "/dev/ram", O_RDWR, 0); if (out_fd < 0) goto out; @@ -267,7 +267,7 @@ int __init rd_load_image(char *from) sys_close(out_fd); out: kfree(buf); - sys_unlink("/dev/ram"); + sys_unlink((const char __user __force *) "/dev/ram"); return res; } diff --git a/init/initramfs.c b/init/initramfs.c index 4b9c20205092..d9c6e782ff53 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -528,7 +528,7 @@ static void __init clean_rootfs(void) struct linux_dirent64 *dirp; int num; - fd = sys_open("/", O_RDONLY, 0); + fd = sys_open((const char __user __force *) "/", O_RDONLY, 0); WARN_ON(fd < 0); if (fd < 0) return; @@ -590,7 +590,8 @@ static int __init populate_rootfs(void) } printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); - fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); + fd = sys_open((const char __user __force *) "/initrd.image", + O_WRONLY|O_CREAT, 0700); if (fd >= 0) { sys_write(fd, (char *)initrd_start, initrd_end - initrd_start); diff --git a/init/noinitramfs.c b/init/noinitramfs.c index f4c1a3a1b8c5..267739d85179 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -29,17 +29,17 @@ static int __init default_rootfs(void) { int err; - err = sys_mkdir("/dev", 0755); + err = sys_mkdir((const char __user __force *) "/dev", 0755); if (err < 0) goto out; - err = sys_mknod((const char __user *) "/dev/console", + err = sys_mknod((const char __user __force *) "/dev/console", S_IFCHR | S_IRUSR | S_IWUSR, new_encode_dev(MKDEV(5, 1))); if (err < 0) goto out; - err = sys_mkdir("/root", 0700); + err = sys_mkdir((const char __user __force *) "/root", 0700); if (err < 0) goto out; From 571428be550fbe37160596995e96ad398873fcbd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:43 -0700 Subject: [PATCH 115/176] kernel/user.c: add lock 
release annotation on free_user() free_user() releases uidhash_lock but was missing annotation. Add it. This removes the following sparse warnings: include/linux/spinlock.h:339:9: warning: context imbalance in 'free_user' - unexpected unlock kernel/user.c:120:6: warning: context imbalance in 'free_uid' - wrong count at exit Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Dhaval Giani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..2c7d8d5914b1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: [PATCH 116/176] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32-bit value: We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. The fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin's problem.
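To make the arithmetic concrete, a small userspace sketch with the numbers from the report (illustrative only; the wrap-around value assumes the usual two's-complement behaviour):

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            int max_files = 1503238553;            /* files_stat.max_files on the 16TB box  */
            long long intended = 2LL * max_files;  /* value the comparison was meant to use */
            int wrapped = (int)(2u * (unsigned int)max_files); /* what a 32-bit int holds   */

            printf("2 * max_files = %lld, as a 32-bit int it becomes %d (INT_MAX = %d)\n",
                   intended, wrapped, INT_MAX);
            return 0;
    }

The intended value, 3,006,477,106, exceeds INT_MAX (2,147,483,647), so the limit check in unix_create1() ends up comparing against a negative number and every socket creation fails.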
Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 17 +++++++---------- include/linux/fs.h | 8 ++++---- kernel/sysctl.c | 6 +++--- net/unix/af_unix.c | 14 +++++++------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11c..c3dee381f1b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ void mark_files_ro(struct super_block *sb) void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f34ff6e5558..b2cdb6bc8287 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,9 +34,9 @@ /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ }; struct inodes_stat_t { @@ -400,7 +400,7 @@ extern void __init inode_init_early(void); extern void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern int get_max_files(void); +extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; diff --git 
a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..694b140852c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); From ca51c5a76345b28c6f1b742f9f5f0a6fc9afd9ca Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: [PATCH 117/176] kernel/stop_machine.c: fix unused variable warning kernel/stop_machine.c: In function `cpu_stopper_thread': kernel/stop_machine.c:265: warning: unused variable `ksym_buf' ksym_buf[] is unused if WARN_ON() is a no-op. 
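As a minimal illustration of the fix that follows, the kernel's __maybe_unused boils down to the gcc attribute shown here; it keeps -Wunused-variable quiet for a buffer that is only referenced when the debug macro is compiled in (sketch only, not the stop_machine.c code):

    #define __maybe_unused __attribute__((unused))   /* what the kernel macro expands to */

    static void stopper_example(void)
    {
            char ksym_buf[128] __maybe_unused;       /* only referenced inside WARN_ON() */

            /*
             * If the WARN_ON() that would consume ksym_buf compiles to a no-op,
             * nothing reads the buffer; the attribute suppresses the warning.
             */
    }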
Signed-off-by: Rakib Mullick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce1..3cb4a9a8ae1c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -262,7 +262,7 @@ static int cpu_stopper_thread(void *data) cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN]; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; __set_current_state(TASK_RUNNING); From 4ce6494dbd8909718840bb88d5a699ef6ce5c212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: [PATCH 118/176] stop_machine: convert cpu notifier to return encapsulate errno value In commit e6bde73b07edeb703d4c89c1daabc09c303de11f ("cpu-hotplug: return better errno on cpu hotplug failure"), the cpu notifier can return an encapsulated errno value. This converts the cpu notifier to return an encapsulated errno value for stop_machine(). Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3cb4a9a8ae1c..2df820b03beb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); @@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) /* start one for the boot cpu */ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, bcpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); From 766f9164193f6dda1497bbf3861060198421fb92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: [PATCH 119/176] kernel: remove PF_FLUSHER PF_FLUSHER is only ever set, not tested, remove it. 
Signed-off-by: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/sched.h | 1 - mm/backing-dev.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b5aae4bd0aca..9e46aec10d1a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -794,7 +794,7 @@ int bdi_writeback_thread(void *data) struct backing_dev_info *bdi = wb->bdi; long pages_written; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); wb->last_active = jiffies; diff --git a/include/linux/sched.h b/include/linux/sched.h index 56154bbb8da9..393ce94e54b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1706,7 +1706,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ #define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ad3c106606b..f2eb27884ffa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* From 5f2365d8c24aec8dbedf49c69b7601c7cfaee2c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Oct 2010 14:22:46 -0700 Subject: [PATCH 120/176] misc devices: do not enable by default Do not enable this Kconfig menu by default since it contains devices not present on the majority of systems. This is becoming a pain and a waste of time especially when doing a bunch of kernel builds on different systems daily and have to answer "make oldconfig" prompts for strange devices. Signed-off-by: Borislav Petkov Acked-by: Mike Frysinger Cc: Michael Hennerich Cc: Andres Salomon Cc: Jean Delvare Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 6d7665ce6f4a..5a74db75f66f 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -4,7 +4,6 @@ menuconfig MISC_DEVICES bool "Misc devices" - default y ---help--- Say Y here to get to see options for device drivers from various different categories. This option alone does not add any kernel code. 
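The next patch moves a misplaced const in a printk array declaration; as a quick reference, the different placements mean the following (plain C, not tied to the kernel sources):

    const char *p1;             /* pointer to const char: *p1 may not be written    */
    char *const p2 = 0;         /* const pointer to char:  p2 may not be reseated   */
    const char *const p3 = 0;   /* both pointer and pointee are const               */
    const char const *p4;       /* duplicate qualifier: still only a pointer to
                                 * const char, which is the mistake fixed in
                                 * kmsg_reasons[] in the patch that follows         */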
From 6c095efd82e8f6a98515426a733110f91cf0a709 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: [PATCH 121/176] printk: fixup declaration of kmsg_reasons Move redundant 'const' after '*' to make pointer itself const Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 2531017795f6..d18933a92819 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1511,7 +1511,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char const *kmsg_reasons[] = { +static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", From 8155c02a44a95562e1ae0999360eb31288d7195a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: [PATCH 122/176] printk: add lock context annotation acquire_console_semaphore_for_printk() releases logbuf_lock but was missing proper annotation. Add it. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/printk.c b/kernel/printk.c index d18933a92819..6ff26151f4ff 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) * released but interrupts still disabled. */ static int acquire_console_semaphore_for_printk(unsigned int cpu) + __releases(&logbuf_lock) { int retval = 0; From 674dff6507d3f9b110219ea125cf5e1213c9acef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:48 -0700 Subject: [PATCH 123/176] printk: change type of 'boot_delay' to int * get_option() takes its 2nd arg as int * so passing boot_delay to it caused following warnings from sparse: kernel/printk.c:223:27: warning: incorrect type in argument 2 (different signedness) kernel/printk.c:223:27: expected int *pint kernel/printk.c:223:27: got unsigned int static [toplevel] * Since boot_delay can't grow more than 10,000 changing it to 'int *' will not produce any problem. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 6ff26151f4ff..b2ebaee8c377 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ +static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) From f5d87d851d76a390d0fab2f77bd1d563d69ee586 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: [PATCH 124/176] printk: declare printk_ratelimit_state in ratelimit.h Adding declaration of printk_ratelimit_state in ratelimit.h removes potential build breakage and following sparse warning: kernel/printk.c:1426:1: warning: symbol 'printk_ratelimit_state' was not declared. Should it be static? 
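The fix below moves the extern declaration into ratelimit.h, which both the defining file and its users include; that is the usual way to silence this class of sparse warning. A minimal sketch of the pattern (the definitions are shown schematically, not copied from the kernel sources):

/* include/linux/ratelimit.h: the single shared declaration */
extern struct ratelimit_state printk_ratelimit_state;

/* kernel/printk.c: the definition; because this file includes the
 * header above, sparse sees a prior declaration and no longer asks
 * whether the symbol should be static. */
#include <linux/ratelimit.h>
struct ratelimit_state printk_ratelimit_state;

/* kernel/sysctl.c: the local "extern struct ratelimit_state ..." line
 * becomes unnecessary; including <linux/ratelimit.h> is enough. */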
[akpm@linux-foundation.org: remove unneeded ifdef] Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ratelimit.h | 2 ++ kernel/sysctl.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 8f69d09a41a5..03ff67b0cdf5 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -36,6 +36,8 @@ static inline void ratelimit_state_init(struct ratelimit_state *rs, rs->begin = 0; } +extern struct ratelimit_state printk_ratelimit_state; + extern int ___ratelimit(struct ratelimit_state *rs, const char *func); #define __ratelimit(state) ___ratelimit(state, __func__) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c2..48d9d689498f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,8 +161,6 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif -extern struct ratelimit_state printk_ratelimit_state; - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); From 77006a0a828249dd69341f960043ee41e7487aa0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: [PATCH 125/176] ratelimit: add comment warning people off printk_ratelimit() printk_ratelimit() was a bad idea - we don't want subsystem A causing ratelimiting of subsystem B's messages. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e9b492b33032..77b04ed037df 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -277,6 +277,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))) __cold; +/* + * Please don't use printk_ratelimit(), because it shares ratelimiting state + * with all other unrelated printk_ratelimit() callsites. Instead use + * printk_ratelimited() or plain old __ratelimit(). + */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, From 5e0579812834ab7fa072db4a15ebdff68d62e2e7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:50 -0700 Subject: [PATCH 126/176] vsprintf.c: use default pointer field size for "(null)" strings It might be nicer to align the output. For instance, ACPI messages sometimes have "(null)" pointers.
$ dmesg | grep "(null)" -A 1 -B 1 [ 0.198733] ACPI: Dynamic OEM Table Load: [ 0.198745] ACPI: SSDT (null) 00239 (v02 PmRef Cpu0Ist 00003000 INTL 20051117) [ 0.199294] ACPI: SSDT 7f596e10 001C7 (v02 PmRef Cpu0Cst 00003001 INTL 20051117) [ 0.200708] ACPI: Dynamic OEM Table Load: [ 0.200721] ACPI: SSDT (null) 001C7 (v02 PmRef Cpu0Cst 00003001 INTL 20051117) [ 0.201950] ACPI: SSDT 7f597f10 000D0 (v02 PmRef Cpu1Ist 00003000 INTL 20051117) [ 0.203386] ACPI: Dynamic OEM Table Load: [ 0.203398] ACPI: SSDT (null) 000D0 (v02 PmRef Cpu1Ist 00003000 INTL 20051117) [ 0.203871] ACPI: SSDT 7f595f10 00083 (v02 PmRef Cpu1Cst 00003000 INTL 20051117) [ 0.205301] ACPI: Dynamic OEM Table Load: [ 0.205315] ACPI: SSDT (null) 00083 (v02 PmRef Cpu1Cst 00003000 INTL 20051117) [akpm@linux-foundation.org: add code comment] Signed-off-by: Joe Perches Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7af9d841c43b..8378c136b6e1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -988,8 +988,15 @@ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { - if (!ptr) + if (!ptr) { + /* + * Print (null) with the same width as a pointer so it makes + * tabular output look nice. + */ + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); return string(buf, end, "(null)", spec); + } switch (*fmt) { case 'F': @@ -1031,7 +1038,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, } spec.flags |= SMALL; if (spec.field_width == -1) { - spec.field_width = 2*sizeof(void *); + spec.field_width = 2 * sizeof(void *); spec.flags |= ZEROPAD; } spec.base = 16; From b903c0b8899b46829a9b80ba55b61079b35940ec Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 26 Oct 2010 14:22:50 -0700 Subject: [PATCH 127/176] lib: fix scnprintf() if @size is == 0 scnprintf() should return 0 if @size is == 0. Update the comment for it, as @size is unsigned. Signed-off-by: Changli Gao Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 8378c136b6e1..c150d3dafff4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1504,7 +1504,7 @@ EXPORT_SYMBOL(snprintf); * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including - * the trailing '\0'. If @size is <= 0 the function returns 0. + * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) @@ -1516,7 +1516,11 @@ int scnprintf(char *buf, size_t size, const char *fmt, ...) i = vsnprintf(buf, size, fmt, args); va_end(args); - return (i >= size) ? (size - 1) : i; + if (likely(i < size)) + return i; + if (size != 0) + return size - 1; + return 0; } EXPORT_SYMBOL(scnprintf); From 63ab52db5ba7f362266cfed03109387ca73e5eb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:51 -0700 Subject: [PATCH 128/176] scripts/get_maintainer.pl: Add --git-blame --rolestats "Authored lines" information When options --git-blame and --rolestats are specified, add the maintainers with the qualifying --git-min-percent amount of lines authored of the complete file. Does not add more authors than specified by --git-max-maintainers. 
For anyone using hg, this option works but is _very_ slow. It's orders of magnitude slower than git slow. The get_maintainer.pl version was incremented to 0.25. This can be used with or without --git. For instance: $ ./scripts/get_maintainer.pl --git-blame --nogit --rolestats -f lib/bitmap.c Paul Jackson (authored lines:406/613=66%,commits:7/20=35%) Akinobu Mita (authored lines:87/613=14%,commits:3/20=15%) Reinette Chatre (authored lines:42/613=7%) Andrew Morton (commits:16/20=80%) Paul Mundt (commits:3/20=15%) Randy Dunlap (commits:2/20=10%) $ ./scripts/get_maintainer.pl --git-blame --git --rolestats -f lib/bitmap.c Andrew Morton (commit_signer:4/5=80%,commits:16/20=80%) Akinobu Mita (commit_signer:2/5=40%,authored lines:87/613=14%,commits:3/20=15%) Jack Steiner (commit_signer:1/5=20%) Ben Hutchings (commit_signer:1/5=20%) Lee Schermerhorn (commit_signer:1/5=20%) Paul Jackson (authored lines:406/613=66%,commits:7/20=35%) Reinette Chatre (authored lines:42/613=7%) Paul Mundt (commits:3/20=15%) Randy Dunlap (commits:2/20=10%) linux-kernel@vger.kernel.org (open list) Signed-off-by: Joe Perches Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 60 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index b2281982f52f..a91ae6318403 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.24'; +my $V = '0.25'; use Getopt::Long qw(:config no_auto_abbrev); @@ -88,6 +88,7 @@ my %VCS_cmds_git = ( "available" => '(which("git") ne "") && (-d ".git")', "find_signers_cmd" => "git log --no-color --since=\$email_git_since -- \$file", "find_commit_signers_cmd" => "git log --no-color -1 \$commit", + "find_commit_author_cmd" => "git log -1 --format=\"%an <%ae>\" \$commit", "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", @@ -101,6 +102,7 @@ my %VCS_cmds_hg = ( "hg log --date=\$email_hg_since" . 
" --template='commit {node}\\n{desc}\\n' -- \$file", "find_commit_signers_cmd" => "hg log --template='{desc}\\n' -r \$commit", + "find_commit_author_cmd" => "hg log -l 1 --template='{author}\\n' -r \$commit", "blame_range_cmd" => "", # not supported "blame_file_cmd" => "hg blame -c \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", @@ -1014,6 +1016,9 @@ sub vcs_find_signers { if (!$email_git_penguin_chiefs) { @lines = grep(!/${penguin_chiefs}/i, @lines); } + + return (0, @lines) if !@lines; + # cut -f2- -d":" s/.*:\s*(.+)\s*/$1/ for (@lines); @@ -1027,6 +1032,28 @@ sub vcs_find_signers { return ($commits, @lines); } +sub vcs_find_author { + my ($cmd) = @_; + my @lines = (); + + @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + + return @lines if !@lines; + +## Reformat email addresses (with names) to avoid badly written signatures + + foreach my $line (@lines) { + my ($name, $address) = parse_email($line); + $line = format_email($name, $address, 1); + } + + return @lines; +} + sub vcs_save_commits { my ($cmd) = @_; my @lines = (); @@ -1084,6 +1111,10 @@ sub vcs_blame { @commits = vcs_save_commits($cmd); } + foreach my $commit (@commits) { + $commit =~ s/^\^//g; + } + return @commits; } @@ -1121,6 +1152,8 @@ sub vcs_assign { @lines = mailmap(@lines); } + return if (@lines <= 0); + @lines = sort(@lines); # uniq -c @@ -1165,14 +1198,17 @@ sub vcs_file_blame { my ($file) = @_; my @signers = (); + my @all_commits = (); my @commits = (); my $total_commits; + my $total_lines; return if (!vcs_exists()); - @commits = vcs_blame($file); - @commits = uniq(@commits); + @all_commits = vcs_blame($file); + @commits = uniq(@all_commits); $total_commits = @commits; + $total_lines = @all_commits; foreach my $commit (@commits) { my $commit_count; @@ -1182,10 +1218,28 @@ sub vcs_file_blame { $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd ($commit_count, @commit_signers) = vcs_find_signers($cmd); + push(@signers, @commit_signers); } if ($from_filename) { + if ($output_rolestats) { + my @blame_signers; + foreach my $commit (@commits) { + my $i; + my $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + my @author = vcs_find_author($cmd); + next if !@author; + my $count = grep(/$commit/, @all_commits); + for ($i = 0; $i < $count ; $i++) { + push(@blame_signers, $author[0]); + } + } + if (@blame_signers) { + vcs_assign("authored lines", $total_lines, @blame_signers); + } + } vcs_assign("commits", $total_commits, @signers); } else { vcs_assign("modified commits", $total_commits, @signers); From 6ffd9485f5c9c0b2d2aea9f904dff08e7088010a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:51 -0700 Subject: [PATCH 129/176] scripts/get_maintainer.pl: use correct indentation Fix an overly indented block. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index a91ae6318403..65c6acf1bb3b 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -420,23 +420,23 @@ foreach my $file (@files) { foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { add_categories($line); - if ($sections) { - my $i; - my $start = find_starting_index($line); - my $end = find_ending_index($line); - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ /^[FX]:/) { ##Restore file patterns - $line =~ s/([^\\])\.([^\*])/$1\?$2/g; - $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? - $line =~ s/\\\./\./g; ##Convert \. to . - $line =~ s/\.\*/\*/g; ##Convert .* to * - } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); + if ($sections) { + my $i; + my $start = find_starting_index($line); + my $end = find_ending_index($line); + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ /^[FX]:/) { ##Restore file patterns + $line =~ s/([^\\])\.([^\*])/$1\?$2/g; + $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? + $line =~ s/\\\./\./g; ##Convert \. to . + $line =~ s/\.\*/\*/g; ##Convert .* to * } - print("\n"); + $line =~ s/^([A-Z]):/$1:\t/g; + print("$line\n"); } + print("\n"); + } } if ($email && $email_git) { From fab9ed12fcd0c182a72509382c3da55c527963e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:52 -0700 Subject: [PATCH 130/176] scripts/get_maintainer.pl: don't search MAINTAINERS for keywords or emails Keyword matching uses K: patterns from MAINTAINERS, so if looking for the MAINTAINERS maintainer, don't search MAINTAINERS for pattern matches. MAINTAINERS also has rather a lot of email addresses and is easily searched using grep "^M:", so skip it. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 65c6acf1bb3b..77f4f2e4cd81 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -304,7 +304,7 @@ foreach my $file (@ARGV) { } if ($from_filename) { push(@files, $file); - if (-f $file && ($keywords || $file_emails)) { + if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; From e3e9d11479737692f797bad1762f71468d577a93 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:53 -0700 Subject: [PATCH 131/176] scripts/get_maintainer.pl: add default --git-fallback, remove default --git Adding commit signers when there is a listed MAINTAINER for a file can make the output list longer than necessary. Change the --git default from on to off. Add a new --git-fallback option (default on) used to add commit signers only when there is no MAINTAINER for a file. git history is used when --git-fallback is enabled and the pattern directory depth is not the same as the file directory depth. For instance: X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner M: Ingo Molnar M: "H. 
Peter Anvin" M: x86@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git S: Maintained F: Documentation/x86/ F: arch/x86/ If using "./scripts/get_maintainer -f arch/x86/lib/atomic64_32.c", the pattern for "arch/x86/" does not match the directory depth of "arch/x86/lib" so the MAINTAINERS entries and git history is used to produce: $ ./scripts/get_maintainer.pl -f --rolestats arch/x86/lib/atomic64_32.c Thomas Gleixner (maintainer:X86 ARCHITECTURE...) Ingo Molnar (maintainer:X86 ARCHITECTURE...) "H. Peter Anvin" (maintainer:X86 ARCHITECTURE...,commit_signer:1/1=100%) x86@kernel.org (maintainer:X86 ARCHITECTURE...) Luca Barbieri (commit_signer:1/1=100%) linux-kernel@vger.kernel.org (open list) Luca Barbieri is added because he signed the only commit to arch/x86/lib/atomic64_32.c during the last year and he meets the other default qualifications. --git-min-percent (default:10) --git-min-signatures (default:1) If current users of ./scripts/get_maintainers.pl have scripts that use --nogit that expect git history to be excluded, those scripts should be updated to include --nogit-fallback or a .get_maintainer.conf file should be created with --nogit-fallback. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 77f4f2e4cd81..f46576949ccb 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -24,9 +24,10 @@ my $email_maintainer = 1; my $email_list = 1; my $email_subscriber_list = 0; my $email_git_penguin_chiefs = 0; -my $email_git = 1; +my $email_git = 0; my $email_git_all_signature_types = 0; my $email_git_blame = 0; +my $email_git_fallback = 1; my $email_git_min_signatures = 1; my $email_git_max_maintainers = 5; my $email_git_min_percent = 5; @@ -138,6 +139,7 @@ if (!GetOptions( 'git!' => \$email_git, 'git-all-signature-types!' => \$email_git_all_signature_types, 'git-blame!' => \$email_git_blame, + 'git-fallback!' => \$email_git_fallback, 'git-chief-penguins!' 
=> \$email_git_penguin_chiefs, 'git-min-signatures=i' => \$email_git_min_signatures, 'git-max-maintainers=i' => \$email_git_max_maintainers, @@ -371,6 +373,7 @@ my @status = (); foreach my $file (@files) { my %hash; + my $exact_pattern_match = 0; my $tvi = find_first_section(); while ($tvi < @typevalue) { my $start = find_starting_index($tvi); @@ -405,6 +408,8 @@ foreach my $file (@files) { my $value_pd = ($value =~ tr@/@@); my $file_pd = ($file =~ tr@/@@); $value_pd++ if (substr($value,-1,1) ne "/"); + $value_pd = -1 if ($value =~ /^\.\*/); + $exact_pattern_match = 1 if ($value_pd >= $file_pd); if ($pattern_depth == 0 || (($file_pd - $value_pd) < $pattern_depth)) { $hash{$tvi} = $value_pd; @@ -439,7 +444,8 @@ foreach my $file (@files) { } } - if ($email && $email_git) { + if ($email && + ($email_git || ($email_git_fallback && !$exact_pattern_match))) { vcs_file_signoffs($file); } @@ -540,6 +546,7 @@ MAINTAINER field selection options: --git => include recent git \*-by: signers --git-all-signature-types => include signers regardless of signature type or use only ${signaturePattern} signers (default: $email_git_all_signature_types) + --git-fallback => use git when no exact MAINTAINERS pattern (default: $email_git_fallback) --git-chief-penguins => include ${penguin_chiefs} --git-min-signatures => number of signatures required (default: $email_git_min_signatures) --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers) From bcde44ed7d2a58733efdf04b5392c027d1348bac Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:53 -0700 Subject: [PATCH 132/176] scripts/get_maintainer.pl: use .get_maintainer.conf from . then $HOME then scripts On Mon, 2010-09-13 at 00:01 -0400, Valdis.Kletnieks@vt.edu wrote: > Any chance of getting that to be ~/.get_maintainer.conf rather than > ./.get_maintainer.conf? I've just gotten bit like the 3rd or 4th time by > "oh but you didn't create that file in *this* tree" > (I usually have a linus git tree, a linux-next tree, and 3-4 -mm trees) Sure. Add a search path for the .conf file. 3 paths are added: . 
customized per-tree configurations $HOME user global configuration when per-tree configs don't exist ./scripts lk defaults to override script Signed-off-by: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index f46576949ccb..e5a400c53bf0 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -110,10 +110,12 @@ my %VCS_cmds_hg = ( "blame_commit_pattern" => "^([0-9a-f]+):" ); -if (-f "${lk_path}.get_maintainer.conf") { +my $conf = which_conf(".get_maintainer.conf"); +if (-f $conf) { my @conf_args; - open(my $conffile, '<', "${lk_path}.get_maintainer.conf") - or warn "$P: Can't open .get_maintainer.conf: $!\n"; + open(my $conffile, '<', "$conf") + or warn "$P: Can't find a readable .get_maintainer.conf file $!\n"; + while (<$conffile>) { my $line = $_; @@ -961,6 +963,18 @@ sub which { return ""; } +sub which_conf { + my ($conf) = @_; + + foreach my $path (split(/:/, ".:$ENV{HOME}:.scripts")) { + if (-e "$path/$conf") { + return "$path/$conf"; + } + } + + return ""; +} + sub mailmap { my (@lines) = @_; my %hash; From dace8e300d6820c2842de750d12b498a743bcfe5 Mon Sep 17 00:00:00 2001 From: Florian Mickler Date: Tue, 26 Oct 2010 14:22:54 -0700 Subject: [PATCH 133/176] scripts/get_maintainer.pl: add interactive mode This is a first version of an interactive mode for scripts/get_maintainer.pl. It allows the user to interact with the script. Each cc candidate can be selected and deselected and a shortlog of authored commits can be displayed for each candidate. The menu is displayed via STDERR, the end result is outputted to STDOUT. This unusual mechanism allows using get_maintainer.pl in interactive mode via git send-email --cc-cmd. 
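The STDERR/STDOUT split matters because a --cc-cmd consumer reads only standard output; everything interactive has to stay on standard error. A minimal sketch of the same menu-on-stderr, result-on-stdout pattern in C (the addresses and menu entries are made up; nothing here is taken from the script itself):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16];

	/* The menu goes to stderr, so a pipeline reading our stdout never sees it. */
	fprintf(stderr, "1: maintainer@example.org\n");
	fprintf(stderr, "2: list@example.org\n");
	fprintf(stderr, "select an entry (blank line to finish): ");

	if (fgets(buf, sizeof(buf), stdin)) {
		buf[strcspn(buf, "\n")] = '\0';
		/* Only the final answer is written to stdout. */
		if (!strcmp(buf, "1"))
			puts("maintainer@example.org");
		else if (!strcmp(buf, "2"))
			puts("list@example.org");
	}
	return 0;
}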
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 146 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 5 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index e5a400c53bf0..1ae8c50f1908 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -33,6 +33,7 @@ my $email_git_max_maintainers = 5; my $email_git_min_percent = 5; my $email_git_since = "1-year-ago"; my $email_hg_since = "-365"; +my $interactive = 0; my $email_remove_duplicates = 1; my $output_multiline = 1; my $output_separator = ", "; @@ -52,6 +53,8 @@ my $help = 0; my $exit = 0; +my %shortlog_buffer; + my @penguin_chief = (); push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org"); #Andrew wants in on most everything - 2009/01/14 @@ -93,7 +96,8 @@ my %VCS_cmds_git = ( "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+) " + "blame_commit_pattern" => "^([0-9a-f]+) ", + "shortlog_cmd" => "git log --no-color --oneline --since=\$email_git_since --author=\"\$email\" -- \$file" ); my %VCS_cmds_hg = ( @@ -107,7 +111,8 @@ my %VCS_cmds_hg = ( "blame_range_cmd" => "", # not supported "blame_file_cmd" => "hg blame -c \$file", "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+):" + "blame_commit_pattern" => "^([0-9a-f]+):", + "shortlog_cmd" => "ht log --date=\$email_hg_since" ); my $conf = which_conf(".get_maintainer.conf"); @@ -148,6 +153,7 @@ if (!GetOptions( 'git-min-percent=i' => \$email_git_min_percent, 'git-since=s' => \$email_git_since, 'hg-since=s' => \$email_hg_since, + 'i|interactive!' => \$interactive, 'remove-duplicates!' => \$email_remove_duplicates, 'm!' => \$email_maintainer, 'n!' 
=> \$email_usename, @@ -225,6 +231,8 @@ if ($email_git_all_signature_types) { $signaturePattern = "(.+?)[Bb][Yy]:"; } + + ## Read MAINTAINERS for type/value pairs my @typevalue = (); @@ -450,10 +458,13 @@ foreach my $file (@files) { ($email_git || ($email_git_fallback && !$exact_pattern_match))) { vcs_file_signoffs($file); } - if ($email && $email_git_blame) { vcs_file_blame($file); } + if ($email && $interactive){ + vcs_file_shortlogs($file); + + } } if ($keywords) { @@ -486,9 +497,13 @@ if ($email) { } } + if ($email || $email_list) { my @to = (); if ($email) { + if ($interactive) { + @email_to = @{vcs_interactive_menu(\@email_to)}; + } @to = (@to, @email_to); } if ($email_list) { @@ -501,7 +516,6 @@ if ($scm) { @scm = uniq(@scm); output(@scm); } - if ($status) { @status = uniq(@status); output(@status); @@ -556,6 +570,7 @@ MAINTAINER field selection options: --git-blame => use git blame to find modified commits for patch or file --git-since => git history to use (default: $email_git_since) --hg-since => hg history to use (default: $email_hg_since) + --interactive => display a menu (mostly useful if used with the --git option) --m => include maintainer(s) if any --n => include name 'Full Name ' --l => include list(s) if any @@ -1156,6 +1171,127 @@ sub vcs_exists { return 0; } +sub vcs_interactive_menu { + my $list_ref = shift; + my @list = @$list_ref; + + return if (!vcs_exists()); + + my %selected; + my %shortlog; + my $input; + my $count = 0; + + #select maintainers by default + foreach my $entry (@list){ + my $role = $entry->[1]; + $selected{$count} = ($role =~ /maintainer:|supporter:/); + $count++; + } + + #menu loop + do { + my $count = 0; + foreach my $entry (@list){ + my $email = $entry->[0]; + my $role = $entry->[1]; + if ($selected{$count}){ + print STDERR "* "; + } else { + print STDERR " "; + } + print STDERR "$count: $email,\t\t $role"; + print STDERR "\n"; + if ($shortlog{$count}){ + my $entries_ref = vcs_get_shortlog($email); + foreach my $entry_ref (@{$entries_ref}){ + my $filename = @{$entry_ref}[0]; + my @shortlog = @{@{$entry_ref}[1]}; + print STDERR "\tshortlog for $filename (authored commits: " . @shortlog . 
").\n"; + foreach my $commit (@shortlog){ + print STDERR "\t $commit\n"; + } + print STDERR "\n"; + } + } + $count++; + } + print STDERR "\n"; + print STDERR "Choose whom to cc by entering a commaseperated list of numbers and hitting enter.\n"; + print STDERR "To show a short list of commits, precede the number by a '?',\n"; + print STDERR "A blank line indicates that you are satisfied with your choice.\n"; + $input = ; + chomp($input); + + my @wish = split(/[, ]+/,$input); + foreach my $nr (@wish){ + my $logtoggle = 0; + if ($nr =~ /\?/){ + $nr =~ s/\?//; + $logtoggle = 1; + } + + #skip out of bounds numbers + next unless ($nr <= $count && $nr >= 0); + + if ($logtoggle){ + $shortlog{$nr} = !$shortlog{$nr}; + } else { + $selected{$nr} = !$selected{$nr}; + + #switch shortlog on if an entry get's selected + if ($selected{$nr}){ + $shortlog{$nr}=1; + } + } + }; + } while(length($input) > 0); + + #drop not selected entries + $count = 0; + my @new_emailto; + foreach my $entry (@list){ + if ($selected{$count}){ + push(@new_emailto,$list[$count]); + print STDERR "$count: "; + print STDERR $email_to[$count]->[0]; + print STDERR ",\t\t "; + print STDERR $email_to[$count]->[1]; + print STDERR "\n"; + } + $count++; + } + return \@new_emailto; +} + +sub vcs_get_shortlog { + my $arg = shift; + my ($name, $address) = parse_email($arg); + return $shortlog_buffer{$address}; +} + +sub vcs_file_shortlogs { + my ($file) = @_; + print STDERR "shortlog processing $file:"; + foreach my $entry (@email_to){ + my ($name, $address) = parse_email($entry->[0]); + print STDERR "."; + my $commits_ref = vcs_email_shortlog($address, $file); + push(@{$shortlog_buffer{$address}}, [ $file, $commits_ref ]); + } + print STDERR "\n"; +} + +sub vcs_email_shortlog { + my $email = shift; + my ($file) = @_; + + my $cmd = $VCS_cmds{"shortlog_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables + my @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + return \@lines; +} + sub vcs_assign { my ($role, $divisor, @lines) = @_; @@ -1236,7 +1372,7 @@ sub vcs_file_blame { my @commit_signers = (); my $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd ($commit_count, @commit_signers) = vcs_find_signers($cmd); From 683c6f8fcbcb6de8d07545ba70aff49e50d8bcf2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:55 -0700 Subject: [PATCH 134/176] scripts/get_maintainer.pl: improve --interactive UI o Added searching by git-blame as well as git-history o Added different selection toggles o Added ability to list commits by author or by sign-off-type o Use custom git and hg formats to make searching for subject/author a bit easier. 
o Move inlined section matching and searching git/hg history to new get_maintainer subroutine o Added subroutines save_commits_by_author and save_commits_by_signer o Removed subroutines vcs_get_shortlog and vcs_email_shortlog o Rename camelcase signaturePattern to signature_pattern Update to 0.26 beta3 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 903 ++++++++++++++++++++++++++------------ 1 file changed, 612 insertions(+), 291 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 1ae8c50f1908..f51176039ff5 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.25'; +my $V = '0.26-beta3'; use Getopt::Long qw(:config no_auto_abbrev); @@ -27,6 +27,7 @@ my $email_git_penguin_chiefs = 0; my $email_git = 0; my $email_git_all_signature_types = 0; my $email_git_blame = 0; +my $email_git_blame_signatures = 1; my $email_git_fallback = 1; my $email_git_min_signatures = 1; my $email_git_max_maintainers = 5; @@ -51,9 +52,12 @@ my $pattern_depth = 0; my $version = 0; my $help = 0; +my $vcs_used = 0; + my $exit = 0; -my %shortlog_buffer; +my %commit_author_hash; +my %commit_signer_hash; my @penguin_chief = (); push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org"); @@ -77,7 +81,6 @@ my @signature_tags = (); push(@signature_tags, "Signed-off-by:"); push(@signature_tags, "Reviewed-by:"); push(@signature_tags, "Acked-by:"); -my $signaturePattern = "\(" . join("|", @signature_tags) . "\)"; # rfc822 email address - preloaded methods go here. my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; @@ -90,29 +93,62 @@ my %VCS_cmds; my %VCS_cmds_git = ( "execute_cmd" => \&git_execute_cmd, "available" => '(which("git") ne "") && (-d ".git")', - "find_signers_cmd" => "git log --no-color --since=\$email_git_since -- \$file", - "find_commit_signers_cmd" => "git log --no-color -1 \$commit", - "find_commit_author_cmd" => "git log -1 --format=\"%an <%ae>\" \$commit", + "find_signers_cmd" => + "git log --no-color --since=\$email_git_since " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n' . + '%b%n"' . + " -- \$file", + "find_commit_signers_cmd" => + "git log --no-color " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n' . + '%b%n"' . + " -1 \$commit", + "find_commit_author_cmd" => + "git log --no-color " . + '--format="GitCommit: %H%n' . + 'GitAuthor: %an <%ae>%n' . + 'GitDate: %aD%n' . + 'GitSubject: %s%n"' . + " -1 \$commit", "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", "blame_file_cmd" => "git blame -l \$file", - "commit_pattern" => "^commit [0-9a-f]{40,40}", + "commit_pattern" => "^GitCommit: ([0-9a-f]{40,40})", "blame_commit_pattern" => "^([0-9a-f]+) ", - "shortlog_cmd" => "git log --no-color --oneline --since=\$email_git_since --author=\"\$email\" -- \$file" + "author_pattern" => "^GitAuthor: (.*)", + "subject_pattern" => "^GitSubject: (.*)", ); my %VCS_cmds_hg = ( "execute_cmd" => \&hg_execute_cmd, "available" => '(which("hg") ne "") && (-d ".hg")', "find_signers_cmd" => - "hg log --date=\$email_hg_since" . - " --template='commit {node}\\n{desc}\\n' -- \$file", - "find_commit_signers_cmd" => "hg log --template='{desc}\\n' -r \$commit", - "find_commit_author_cmd" => "hg log -l 1 --template='{author}\\n' -r \$commit", + "hg log --date=\$email_hg_since " . 
+ "--template='HgCommit: {node}\\n" . + "HgAuthor: {author}\\n" . + "HgSubject: {desc}\\n'" . + " -- \$file", + "find_commit_signers_cmd" => + "hg log " . + "--template='HgSubject: {desc}\\n'" . + " -r \$commit", + "find_commit_author_cmd" => + "hg log " . + "--template='HgCommit: {node}\\n" . + "HgAuthor: {author}\\n" . + "HgSubject: {desc|firstline}\\n'" . + " -r \$commit", "blame_range_cmd" => "", # not supported - "blame_file_cmd" => "hg blame -c \$file", - "commit_pattern" => "^commit [0-9a-f]{40,40}", - "blame_commit_pattern" => "^([0-9a-f]+):", - "shortlog_cmd" => "ht log --date=\$email_hg_since" + "blame_file_cmd" => "hg blame -n \$file", + "commit_pattern" => "^HgCommit: ([0-9a-f]{40,40})", + "blame_commit_pattern" => "^([ 0-9a-f]+):", + "author_pattern" => "^HgAuthor: (.*)", + "subject_pattern" => "^HgSubject: (.*)", ); my $conf = which_conf(".get_maintainer.conf"); @@ -146,6 +182,7 @@ if (!GetOptions( 'git!' => \$email_git, 'git-all-signature-types!' => \$email_git_all_signature_types, 'git-blame!' => \$email_git_blame, + 'git-blame-signatures!' => \$email_git_blame_signatures, 'git-fallback!' => \$email_git_fallback, 'git-chief-penguins!' => \$email_git_penguin_chiefs, 'git-min-signatures=i' => \$email_git_min_signatures, @@ -193,13 +230,9 @@ if (-t STDIN && !@ARGV) { die "$P: missing patchfile or -f file - use --help if necessary\n"; } -if ($output_separator ne ", ") { - $output_multiline = 0; -} - -if ($output_rolestats) { - $output_roles = 1; -} +$output_multiline = 0 if ($output_separator ne ", "); +$output_rolestats = 1 if ($interactive); +$output_roles = 1 if ($output_rolestats); if ($sections) { $email = 0; @@ -227,12 +260,6 @@ if (!top_of_kernel_tree($lk_path)) { . "a linux kernel source tree.\n"; } -if ($email_git_all_signature_types) { - $signaturePattern = "(.+?)[Bb][Yy]:"; -} - - - ## Read MAINTAINERS for type/value pairs my @typevalue = (); @@ -371,151 +398,28 @@ foreach my $file (@ARGV) { @file_emails = uniq(@file_emails); +my %email_hash_name; +my %email_hash_address; my @email_to = (); +my %hash_list_to; my @list_to = (); my @scm = (); my @web = (); my @subsystem = (); my @status = (); +my $signature_pattern; -# Find responsible parties +my @to = get_maintainer(); -foreach my $file (@files) { +@to = merge_email(@to); - my %hash; - my $exact_pattern_match = 0; - my $tvi = find_first_section(); - while ($tvi < @typevalue) { - my $start = find_starting_index($tvi); - my $end = find_ending_index($tvi); - my $exclude = 0; - my $i; - - #Do not match excluded file patterns - - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^(\C):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'X') { - if (file_match_pattern($file, $value)) { - $exclude = 1; - last; - } - } - } - } - - if (!$exclude) { - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^(\C):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'F') { - if (file_match_pattern($file, $value)) { - my $value_pd = ($value =~ tr@/@@); - my $file_pd = ($file =~ tr@/@@); - $value_pd++ if (substr($value,-1,1) ne "/"); - $value_pd = -1 if ($value =~ /^\.\*/); - $exact_pattern_match = 1 if ($value_pd >= $file_pd); - if ($pattern_depth == 0 || - (($file_pd - $value_pd) < $pattern_depth)) { - $hash{$tvi} = $value_pd; - } - } - } - } - } - } - - $tvi = $end + 1; - } - - foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { - add_categories($line); - if ($sections) { - my $i; - my $start = find_starting_index($line); - my $end = 
find_ending_index($line); - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ /^[FX]:/) { ##Restore file patterns - $line =~ s/([^\\])\.([^\*])/$1\?$2/g; - $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? - $line =~ s/\\\./\./g; ##Convert \. to . - $line =~ s/\.\*/\*/g; ##Convert .* to * - } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); - } - print("\n"); - } - } - - if ($email && - ($email_git || ($email_git_fallback && !$exact_pattern_match))) { - vcs_file_signoffs($file); - } - if ($email && $email_git_blame) { - vcs_file_blame($file); - } - if ($email && $interactive){ - vcs_file_shortlogs($file); - - } -} - -if ($keywords) { - @keyword_tvi = sort_and_uniq(@keyword_tvi); - foreach my $line (@keyword_tvi) { - add_categories($line); - } -} - -if ($email) { - foreach my $chief (@penguin_chief) { - if ($chief =~ m/^(.*):(.*)/) { - my $email_address; - - $email_address = format_email($1, $2, $email_usename); - if ($email_git_penguin_chiefs) { - push(@email_to, [$email_address, 'chief penguin']); - } else { - @email_to = grep($_->[0] !~ /${email_address}/, @email_to); - } - } - } - - foreach my $email (@file_emails) { - my ($name, $address) = parse_email($email); - - my $tmp_email = format_email($name, $address, $email_usename); - push_email_address($tmp_email, ''); - add_role($tmp_email, 'in file'); - } -} - - -if ($email || $email_list) { - my @to = (); - if ($email) { - if ($interactive) { - @email_to = @{vcs_interactive_menu(\@email_to)}; - } - @to = (@to, @email_to); - } - if ($email_list) { - @to = (@to, @list_to); - } - output(merge_email(@to)); -} +output(@to) if (@to); if ($scm) { @scm = uniq(@scm); output(@scm); } + if ($status) { @status = uniq(@status); output(@status); @@ -533,6 +437,154 @@ if ($web) { exit($exit); +sub get_maintainer { + %email_hash_name = (); + %email_hash_address = (); + %commit_author_hash = (); + %commit_signer_hash = (); + @email_to = (); + %hash_list_to = (); + @list_to = (); + @scm = (); + @web = (); + @subsystem = (); + @status = (); + + if ($email_git_all_signature_types) { + $signature_pattern = "(.+?)[Bb][Yy]:"; + } else { + $signature_pattern = "\(" . join("|", @signature_tags) . 
"\)"; + } + + # Find responsible parties + + foreach my $file (@files) { + + my %hash; + my $exact_pattern_match = 0; + my $tvi = find_first_section(); + while ($tvi < @typevalue) { + my $start = find_starting_index($tvi); + my $end = find_ending_index($tvi); + my $exclude = 0; + my $i; + + #Do not match excluded file patterns + + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ m/^(\C):\s*(.*)/) { + my $type = $1; + my $value = $2; + if ($type eq 'X') { + if (file_match_pattern($file, $value)) { + $exclude = 1; + last; + } + } + } + } + + if (!$exclude) { + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ m/^(\C):\s*(.*)/) { + my $type = $1; + my $value = $2; + if ($type eq 'F') { + if (file_match_pattern($file, $value)) { + my $value_pd = ($value =~ tr@/@@); + my $file_pd = ($file =~ tr@/@@); + $value_pd++ if (substr($value,-1,1) ne "/"); + $value_pd = -1 if ($value =~ /^\.\*/); + $exact_pattern_match = 1 if ($value_pd >= $file_pd); + if ($pattern_depth == 0 || + (($file_pd - $value_pd) < $pattern_depth)) { + $hash{$tvi} = $value_pd; + } + } + } + } + } + } + $tvi = $end + 1; + } + + foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { + add_categories($line); + if ($sections) { + my $i; + my $start = find_starting_index($line); + my $end = find_ending_index($line); + for ($i = $start; $i < $end; $i++) { + my $line = $typevalue[$i]; + if ($line =~ /^[FX]:/) { ##Restore file patterns + $line =~ s/([^\\])\.([^\*])/$1\?$2/g; + $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? + $line =~ s/\\\./\./g; ##Convert \. to . + $line =~ s/\.\*/\*/g; ##Convert .* to * + } + $line =~ s/^([A-Z]):/$1:\t/g; + print("$line\n"); + } + print("\n"); + } + } + + if ($email && ($email_git || + ($email_git_fallback && !$exact_pattern_match))) { + vcs_file_signoffs($file); + } + if ($email && $email_git_blame) { + vcs_file_blame($file); + } + } + + if ($keywords) { + @keyword_tvi = sort_and_uniq(@keyword_tvi); + foreach my $line (@keyword_tvi) { + add_categories($line); + } + } + + if ($email) { + foreach my $chief (@penguin_chief) { + if ($chief =~ m/^(.*):(.*)/) { + my $email_address; + + $email_address = format_email($1, $2, $email_usename); + if ($email_git_penguin_chiefs) { + push(@email_to, [$email_address, 'chief penguin']); + } else { + @email_to = grep($_->[0] !~ /${email_address}/, @email_to); + } + } + } + + foreach my $email (@file_emails) { + my ($name, $address) = parse_email($email); + + my $tmp_email = format_email($name, $address, $email_usename); + push_email_address($tmp_email, ''); + add_role($tmp_email, 'in file'); + } + } + + my @to = (); + if ($email || $email_list) { + if ($email) { + @to = (@to, @email_to); + } + if ($email_list) { + @to = (@to, @list_to); + } + } + + @to = interactive_get_maintainer(\@to) if ($interactive); + + return @to; +} + sub file_match_pattern { my ($file, $pattern) = @_; if (substr($pattern, -1) eq "/") { @@ -561,7 +613,7 @@ MAINTAINER field selection options: --email => print email address(es) if any --git => include recent git \*-by: signers --git-all-signature-types => include signers regardless of signature type - or use only ${signaturePattern} signers (default: $email_git_all_signature_types) + or use only ${signature_pattern} signers (default: $email_git_all_signature_types) --git-fallback => use git when no exact MAINTAINERS pattern (default: $email_git_fallback) --git-chief-penguins => include ${penguin_chiefs} --git-min-signatures => number of signatures required (default: 
$email_git_min_signatures) @@ -847,11 +899,19 @@ sub add_categories { } if ($list_additional =~ m/subscribers-only/) { if ($email_subscriber_list) { - push(@list_to, [$list_address, "subscriber list${list_role}"]); + if (!$hash_list_to{$list_address}) { + $hash_list_to{$list_address} = 1; + push(@list_to, [$list_address, + "subscriber list${list_role}"]); + } } } else { if ($email_list) { - push(@list_to, [$list_address, "open list${list_role}"]); + if (!$hash_list_to{$list_address}) { + $hash_list_to{$list_address} = 1; + push(@list_to, [$list_address, + "open list${list_role}"]); + } } } } elsif ($ptype eq "M") { @@ -882,9 +942,6 @@ sub add_categories { } } -my %email_hash_name; -my %email_hash_address; - sub email_inuse { my ($name, $address) = @_; @@ -1037,10 +1094,31 @@ sub hg_execute_cmd { return @lines; } +sub extract_formatted_signatures { + my (@signature_lines) = @_; + + my @type = @signature_lines; + + s/\s*(.*):.*/$1/ for (@type); + + # cut -f2- -d":" + s/\s*.*:\s*(.+)\s*/$1/ for (@signature_lines); + +## Reformat email addresses (with names) to avoid badly written signatures + + foreach my $signer (@signature_lines) { + my ($name, $address) = parse_email($signer); + $signer = format_email($name, $address, 1); + } + + return (\@type, \@signature_lines); +} + sub vcs_find_signers { my ($cmd) = @_; - my @lines = (); my $commits; + my @lines = (); + my @signatures = (); @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); @@ -1048,24 +1126,20 @@ sub vcs_find_signers { $commits = grep(/$pattern/, @lines); # of commits - @lines = grep(/^[ \t]*${signaturePattern}.*\@.*$/, @lines); + @signatures = grep(/^[ \t]*${signature_pattern}.*\@.*$/, @lines); + + return (0, @signatures) if !@signatures; + + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + if (!$email_git_penguin_chiefs) { - @lines = grep(!/${penguin_chiefs}/i, @lines); + @signatures = grep(!/${penguin_chiefs}/i, @signatures); } - return (0, @lines) if !@lines; + my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); - # cut -f2- -d":" - s/.*:\s*(.+)\s*/$1/ for (@lines); - -## Reformat email addresses (with names) to avoid badly written signatures - - foreach my $line (@lines) { - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, 1); - } - - return ($commits, @lines); + return ($commits, @$signers_ref); } sub vcs_find_author { @@ -1080,14 +1154,20 @@ sub vcs_find_author { return @lines if !@lines; -## Reformat email addresses (with names) to avoid badly written signatures - + my @authors = (); foreach my $line (@lines) { - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, 1); + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $author); + } } - return @lines; + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + + return @authors; } sub vcs_save_commits { @@ -1159,7 +1239,7 @@ sub vcs_exists { %VCS_cmds = %VCS_cmds_git; return 1 if eval $VCS_cmds{"available"}; %VCS_cmds = %VCS_cmds_hg; - return 1 if eval $VCS_cmds{"available"}; + return 2 if eval $VCS_cmds{"available"}; %VCS_cmds = (); if (!$printed_novcs) { warn("$P: No supported VCS found. 
Add --nogit to options?\n"); @@ -1171,125 +1251,309 @@ sub vcs_exists { return 0; } -sub vcs_interactive_menu { - my $list_ref = shift; +sub vcs_is_git { + return $vcs_used == 1; +} + +sub vcs_is_hg { + return $vcs_used == 2; +} + +sub interactive_get_maintainer { + my ($list_ref) = @_; my @list = @$list_ref; - return if (!vcs_exists()); + vcs_exists(); my %selected; - my %shortlog; - my $input; + my %authored; + my %signed; my $count = 0; #select maintainers by default foreach my $entry (@list){ - my $role = $entry->[1]; - $selected{$count} = ($role =~ /maintainer:|supporter:/); - $count++; + my $role = $entry->[1]; + $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/); + $authored{$count} = 0; + $signed{$count} = 0; + $count++; } #menu loop - do { - my $count = 0; - foreach my $entry (@list){ - my $email = $entry->[0]; - my $role = $entry->[1]; - if ($selected{$count}){ - print STDERR "* "; - } else { - print STDERR " "; - } - print STDERR "$count: $email,\t\t $role"; - print STDERR "\n"; - if ($shortlog{$count}){ - my $entries_ref = vcs_get_shortlog($email); - foreach my $entry_ref (@{$entries_ref}){ - my $filename = @{$entry_ref}[0]; - my @shortlog = @{@{$entry_ref}[1]}; - print STDERR "\tshortlog for $filename (authored commits: " . @shortlog . ").\n"; - foreach my $commit (@shortlog){ - print STDERR "\t $commit\n"; + my $done = 0; + my $print_options = 0; + my $redraw = 1; + while (!$done) { + $count = 0; + if ($redraw) { + printf STDERR "\n%1s %2s %-65sauth sign\n", + "*", "#", "email/list and role:stats"; + foreach my $entry (@list) { + my $email = $entry->[0]; + my $role = $entry->[1]; + my $sel = ""; + $sel = "*" if ($selected{$count}); + my $commit_author = $commit_author_hash{$email}; + my $commit_signer = $commit_signer_hash{$email}; + my $authored = 0; + my $signed = 0; + $authored++ for (@{$commit_author}); + $signed++ for (@{$commit_signer}); + printf STDERR "%1s %2d %-65s", $sel, $count + 1, $email; + printf STDERR "%4d %4d", $authored, $signed + if ($authored > 0 || $signed > 0); + printf STDERR "\n %s\n", $role; + if ($authored{$count}) { + my $commit_author = $commit_author_hash{$email}; + foreach my $ref (@{$commit_author}) { + print STDERR " Author: @{$ref}[1]\n"; } - print STDERR "\n"; } + if ($signed{$count}) { + my $commit_signer = $commit_signer_hash{$email}; + foreach my $ref (@{$commit_signer}) { + print STDERR " @{$ref}[2]: @{$ref}[1]\n"; + } + } + + $count++; } - $count++; } - print STDERR "\n"; - print STDERR "Choose whom to cc by entering a commaseperated list of numbers and hitting enter.\n"; - print STDERR "To show a short list of commits, precede the number by a '?',\n"; - print STDERR "A blank line indicates that you are satisfied with your choice.\n"; - $input = ; + my $date_ref = \$email_git_since; + $date_ref = \$email_hg_since if (vcs_is_hg()); + if ($print_options) { + $print_options = 0; + if (vcs_exists()) { + print STDERR +"\nVersion Control options:\n" . +"g use git history [$email_git]\n" . +"gf use git-fallback [$email_git_fallback]\n" . +"b use git blame [$email_git_blame]\n" . +"bs use blame signatures [$email_git_blame_signatures]\n" . +"c# minimum commits [$email_git_min_signatures]\n" . +"%# min percent [$email_git_min_percent]\n" . +"d# history to use [$$date_ref]\n" . +"x# max maintainers [$email_git_max_maintainers]\n" . +"t all signature types [$email_git_all_signature_types]\n"; + } + print STDERR "\nAdditional options:\n" . +"0 toggle all\n" . +"f emails in file [$file_emails]\n" . 
+"k keywords in file [$keywords]\n" . +"r remove duplicates [$email_remove_duplicates]\n" . +"p# pattern match depth [$pattern_depth]\n"; + } + print STDERR +"\n#(toggle), A#(author), S#(signed) *(all), ^(none), O(options), Y(approve): "; + + my $input = ; chomp($input); - my @wish = split(/[, ]+/,$input); - foreach my $nr (@wish){ - my $logtoggle = 0; - if ($nr =~ /\?/){ - $nr =~ s/\?//; - $logtoggle = 1; + $redraw = 1; + my $rerun = 0; + my @wish = split(/[, ]+/, $input); + foreach my $nr (@wish) { + $nr = lc($nr); + my $sel = substr($nr, 0, 1); + my $str = substr($nr, 1); + my $val = 0; + $val = $1 if $str =~ /^(\d+)$/; + + if ($sel eq "y") { + $interactive = 0; + $done = 1; + $output_rolestats = 0; + $output_roles = 0; + last; + } elsif ($nr =~ /^\d+$/ && $nr > 0 && $nr <= $count) { + $selected{$nr - 1} = !$selected{$nr - 1}; + } elsif ($sel eq "*" || $sel eq '^') { + my $toggle = 0; + $toggle = 1 if ($sel eq '*'); + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = $toggle; } - - #skip out of bounds numbers - next unless ($nr <= $count && $nr >= 0); - - if ($logtoggle){ - $shortlog{$nr} = !$shortlog{$nr}; + } elsif ($sel eq "0") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i}; + } + } elsif ($sel eq "a") { + if ($val > 0 && $val <= $count) { + $authored{$val - 1} = !$authored{$val - 1}; + } elsif ($str eq '*' || $str eq '^') { + my $toggle = 0; + $toggle = 1 if ($str eq '*'); + for (my $i = 0; $i < $count; $i++) { + $authored{$i} = $toggle; + } + } + } elsif ($sel eq "s") { + if ($val > 0 && $val <= $count) { + $signed{$val - 1} = !$signed{$val - 1}; + } elsif ($str eq '*' || $str eq '^') { + my $toggle = 0; + $toggle = 1 if ($str eq '*'); + for (my $i = 0; $i < $count; $i++) { + $signed{$i} = $toggle; + } + } + } elsif ($sel eq "o") { + $print_options = 1; + $redraw = 1; + } elsif ($sel eq "g") { + if ($str eq "f") { + bool_invert(\$email_git_fallback); } else { - $selected{$nr} = !$selected{$nr}; - - #switch shortlog on if an entry get's selected - if ($selected{$nr}){ - $shortlog{$nr}=1; - } + bool_invert(\$email_git); } - }; - } while(length($input) > 0); + $rerun = 1; + } elsif ($sel eq "b") { + if ($str eq "s") { + bool_invert(\$email_git_blame_signatures); + } else { + bool_invert(\$email_git_blame); + } + $rerun = 1; + } elsif ($sel eq "c") { + if ($val > 0) { + $email_git_min_signatures = $val; + $rerun = 1; + } + } elsif ($sel eq "x") { + if ($val > 0) { + $email_git_max_maintainers = $val; + $rerun = 1; + } + } elsif ($sel eq "%") { + if ($str ne "" && $val >= 0) { + $email_git_min_percent = $val; + $rerun = 1; + } + } elsif ($sel eq "d") { + if (vcs_is_git()) { + $email_git_since = $str; + } elsif (vcs_is_hg()) { + $email_hg_since = $str; + } + $rerun = 1; + } elsif ($sel eq "t") { + bool_invert(\$email_git_all_signature_types); + $rerun = 1; + } elsif ($sel eq "f") { + bool_invert(\$file_emails); + $rerun = 1; + } elsif ($sel eq "r") { + bool_invert(\$email_remove_duplicates); + $rerun = 1; + } elsif ($sel eq "k") { + bool_invert(\$keywords); + $rerun = 1; + } elsif ($sel eq "p") { + if ($str ne "" && $val >= 0) { + $pattern_depth = $val; + $rerun = 1; + } + } else { + print STDERR "invalid option: '$nr'\n"; + $redraw = 0; + } + } + if ($rerun) { + print STDERR "git-blame can be very slow, please have patience..." 
+ if ($email_git_blame); + goto &get_maintainer; + } + } #drop not selected entries $count = 0; - my @new_emailto; - foreach my $entry (@list){ - if ($selected{$count}){ - push(@new_emailto,$list[$count]); - print STDERR "$count: "; - print STDERR $email_to[$count]->[0]; - print STDERR ",\t\t "; - print STDERR $email_to[$count]->[1]; - print STDERR "\n"; + my @new_emailto = (); + foreach my $entry (@list) { + if ($selected{$count}) { + push(@new_emailto, $list[$count]); } $count++; } - return \@new_emailto; + return @new_emailto; } -sub vcs_get_shortlog { - my $arg = shift; - my ($name, $address) = parse_email($arg); - return $shortlog_buffer{$address}; -} +sub bool_invert { + my ($bool_ref) = @_; -sub vcs_file_shortlogs { - my ($file) = @_; - print STDERR "shortlog processing $file:"; - foreach my $entry (@email_to){ - my ($name, $address) = parse_email($entry->[0]); - print STDERR "."; - my $commits_ref = vcs_email_shortlog($address, $file); - push(@{$shortlog_buffer{$address}}, [ $file, $commits_ref ]); + if ($$bool_ref) { + $$bool_ref = 0; + } else { + $$bool_ref = 1; } - print STDERR "\n"; } -sub vcs_email_shortlog { - my $email = shift; - my ($file) = @_; +sub save_commits_by_author { + my (@lines) = @_; - my $cmd = $VCS_cmds{"shortlog_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables - my @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - return \@lines; + my @authors = (); + my @commits = (); + my @subjects = (); + + foreach my $line (@lines) { + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $author); + } + push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); + push(@subjects, $1) if ($line =~ m/$VCS_cmds{"subject_pattern"}/); + } + + for (my $i = 0; $i < @authors; $i++) { + my $exists = 0; + foreach my $ref(@{$commit_author_hash{$authors[$i]}}) { + if (@{$ref}[0] eq $commits[$i] && + @{$ref}[1] eq $subjects[$i]) { + $exists = 1; + last; + } + } + if (!$exists) { + push(@{$commit_author_hash{$authors[$i]}}, + [ ($commits[$i], $subjects[$i]) ]); + } + } +} + +sub save_commits_by_signer { + my (@lines) = @_; + + my $commit = ""; + my $subject = ""; + + foreach my $line (@lines) { + $commit = $1 if ($line =~ m/$VCS_cmds{"commit_pattern"}/); + $subject = $1 if ($line =~ m/$VCS_cmds{"subject_pattern"}/); + if ($line =~ /^[ \t]*${signature_pattern}.*\@.*$/) { + my @signatures = ($line); + my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); + my @types = @$types_ref; + my @signers = @$signers_ref; + + my $type = $types[0]; + my $signer = $signers[0]; + + my $exists = 0; + foreach my $ref(@{$commit_signer_hash{$signer}}) { + if (@{$ref}[0] eq $commit && + @{$ref}[1] eq $subject && + @{$ref}[2] eq $type) { + $exists = 1; + last; + } + } + if (!$exists) { + push(@{$commit_signer_hash{$signer}}, + [ ($commit, $subject, $type) ]); + } + } + } } sub vcs_assign { @@ -1342,7 +1606,8 @@ sub vcs_file_signoffs { my @signers = (); my $commits; - return if (!vcs_exists()); + $vcs_used = vcs_exists(); + return if (!$vcs_used); my $cmd = $VCS_cmds{"find_signers_cmd"}; $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd @@ -1360,37 +1625,93 @@ sub vcs_file_blame { my $total_commits; my $total_lines; - return if (!vcs_exists()); + $vcs_used = vcs_exists(); + return if (!$vcs_used); @all_commits = vcs_blame($file); @commits = uniq(@all_commits); $total_commits = @commits; $total_lines = @all_commits; - foreach my $commit (@commits) { - my 
$commit_count; - my @commit_signers = (); + if ($email_git_blame_signatures) { + if (vcs_is_hg()) { + my $commit_count; + my @commit_signers = (); + my $commit = join(" -r ", @commits); + my $cmd; - my $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + $cmd = $VCS_cmds{"find_commit_signers_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd - ($commit_count, @commit_signers) = vcs_find_signers($cmd); + ($commit_count, @commit_signers) = vcs_find_signers($cmd); - push(@signers, @commit_signers); + push(@signers, @commit_signers); + } else { + foreach my $commit (@commits) { + my $commit_count; + my @commit_signers = (); + my $cmd; + + $cmd = $VCS_cmds{"find_commit_signers_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + + ($commit_count, @commit_signers) = vcs_find_signers($cmd); + + push(@signers, @commit_signers); + } + } } if ($from_filename) { if ($output_rolestats) { my @blame_signers; - foreach my $commit (@commits) { - my $i; - my $cmd = $VCS_cmds{"find_commit_author_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - my @author = vcs_find_author($cmd); - next if !@author; - my $count = grep(/$commit/, @all_commits); - for ($i = 0; $i < $count ; $i++) { - push(@blame_signers, $author[0]); + if (vcs_is_hg()) {{ # Double brace for last exit + my $commit_count; + my @commit_signers = (); + @commits = uniq(@commits); + @commits = sort(@commits); + my $commit = join(" -r ", @commits); + my $cmd; + + $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd + + my @lines = (); + + @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); + + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + + last if !@lines; + + my @authors = (); + foreach my $line (@lines) { + if ($line =~ m/$VCS_cmds{"author_pattern"}/) { + my $author = $1; + my ($name, $address) = parse_email($author); + $author = format_email($name, $address, 1); + push(@authors, $1); + } + } + + save_commits_by_author(@lines) if ($interactive); + save_commits_by_signer(@lines) if ($interactive); + + push(@signers, @authors); + }} + else { + foreach my $commit (@commits) { + my $i; + my $cmd = $VCS_cmds{"find_commit_author_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd + my @author = vcs_find_author($cmd); + next if !@author; + my $count = grep(/$commit/, @all_commits); + for ($i = 0; $i < $count ; $i++) { + push(@blame_signers, $author[0]); + } } } if (@blame_signers) { From 6ef1c52e122b675acc88a8b016d6477f67988b91 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:56 -0700 Subject: [PATCH 135/176] scripts/get_maintainer.pl: use case insensitive name de-duplication Case insensitive name and email address matching can help reduce duplication when authors don't always use the exact same signature. 
o Add a --interactive per-file exact_match hash so git history can be checked on per-file only when there is no direct maintainer o Make @interactive_to list global so save_commits_by_ can check email names & addresses against this list for duplication o Don't allow --interactive and --sections o rename subroutine get_maintainer to get_maintainers o Added help text option to --interactive menu prompt Update version to 0.26-beta4 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 135 ++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 34 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index f51176039ff5..61d3bb51bddf 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta3'; +my $V = '0.26-beta4'; use Getopt::Long qw(:config no_auto_abbrev); @@ -242,6 +242,7 @@ if ($sections) { $subsystem = 0; $web = 0; $keywords = 0; + $interactive = 0; } else { my $selections = $email + $scm + $status + $subsystem + $web; if ($selections == 0) { @@ -407,13 +408,15 @@ my @scm = (); my @web = (); my @subsystem = (); my @status = (); +my @interactive_to = (); my $signature_pattern; -my @to = get_maintainer(); +my @maintainers = get_maintainers(); -@to = merge_email(@to); - -output(@to) if (@to); +if (@maintainers) { + @maintainers = merge_email(@maintainers); + output(@maintainers); +} if ($scm) { @scm = uniq(@scm); @@ -437,7 +440,7 @@ if ($web) { exit($exit); -sub get_maintainer { +sub get_maintainers { %email_hash_name = (); %email_hash_address = (); %commit_author_hash = (); @@ -449,7 +452,7 @@ sub get_maintainer { @web = (); @subsystem = (); @status = (); - + @interactive_to = (); if ($email_git_all_signature_types) { $signature_pattern = "(.+?)[Bb][Yy]:"; } else { @@ -458,10 +461,11 @@ sub get_maintainer { # Find responsible parties + my %exact_pattern_match_hash; + foreach my $file (@files) { my %hash; - my $exact_pattern_match = 0; my $tvi = find_first_section(); while ($tvi < @typevalue) { my $start = find_starting_index($tvi); @@ -497,7 +501,9 @@ sub get_maintainer { my $file_pd = ($file =~ tr@/@@); $value_pd++ if (substr($value,-1,1) ne "/"); $value_pd = -1 if ($value =~ /^\.\*/); - $exact_pattern_match = 1 if ($value_pd >= $file_pd); + if ($value_pd >= $file_pd) { + $exact_pattern_match_hash{$file} = 1; + } if ($pattern_depth == 0 || (($file_pd - $value_pd) < $pattern_depth)) { $hash{$tvi} = $value_pd; @@ -530,14 +536,6 @@ sub get_maintainer { print("\n"); } } - - if ($email && ($email_git || - ($email_git_fallback && !$exact_pattern_match))) { - vcs_file_signoffs($file); - } - if ($email && $email_git_blame) { - vcs_file_blame($file); - } } if ($keywords) { @@ -547,6 +545,19 @@ sub get_maintainer { } } + @interactive_to = (@email_to, @list_to); + + foreach my $file (@files) { + if ($email && + ($email_git || ($email_git_fallback && + !$exact_pattern_match_hash{$file}))) { + vcs_file_signoffs($file); + } + if ($email && $email_git_blame) { + vcs_file_blame($file); + } + } + if ($email) { foreach my $chief (@penguin_chief) { if ($chief =~ m/^(.*):(.*)/) { @@ -580,7 +591,10 @@ sub get_maintainer { } } - @to = interactive_get_maintainer(\@to) if ($interactive); + if ($interactive) { + @interactive_to = @to; + @to = interactive_get_maintainers(\@interactive_to); + } return @to; } @@ -899,16 +913,16 @@ sub add_categories { } if ($list_additional =~ m/subscribers-only/) { if 
($email_subscriber_list) { - if (!$hash_list_to{$list_address}) { - $hash_list_to{$list_address} = 1; + if (!$hash_list_to{lc($list_address)}) { + $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, "subscriber list${list_role}"]); } } } else { if ($email_list) { - if (!$hash_list_to{$list_address}) { - $hash_list_to{$list_address} = 1; + if (!$hash_list_to{lc($list_address)}) { + $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, "open list${list_role}"]); } @@ -946,8 +960,8 @@ sub email_inuse { my ($name, $address) = @_; return 1 if (($name eq "") && ($address eq "")); - return 1 if (($name ne "") && exists($email_hash_name{$name})); - return 1 if (($address ne "") && exists($email_hash_address{$address})); + return 1 if (($name ne "") && exists($email_hash_name{lc($name)})); + return 1 if (($address ne "") && exists($email_hash_address{lc($address)})); return 0; } @@ -965,8 +979,8 @@ sub push_email_address { push(@email_to, [format_email($name, $address, $email_usename), $role]); } elsif (!email_inuse($name, $address)) { push(@email_to, [format_email($name, $address, $email_usename), $role]); - $email_hash_name{$name}++; - $email_hash_address{$address}++; + $email_hash_name{lc($name)}++; + $email_hash_address{lc($address)}++; } return 1; @@ -1259,7 +1273,7 @@ sub vcs_is_hg { return $vcs_used == 2; } -sub interactive_get_maintainer { +sub interactive_get_maintainers { my ($list_ref) = @_; my @list = @$list_ref; @@ -1269,11 +1283,12 @@ sub interactive_get_maintainer { my %authored; my %signed; my $count = 0; - + my $maintained = 0; #select maintainers by default - foreach my $entry (@list){ + foreach my $entry (@list) { my $role = $entry->[1]; - $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/); + $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/i); + $maintained = 1 if ($role =~ /^(maintainer|supporter)/i); $authored{$count} = 0; $signed{$count} = 0; $count++; @@ -1286,8 +1301,14 @@ sub interactive_get_maintainer { while (!$done) { $count = 0; if ($redraw) { - printf STDERR "\n%1s %2s %-65sauth sign\n", - "*", "#", "email/list and role:stats"; + printf STDERR "\n%1s %2s %-65s", + "*", "#", "email/list and role:stats"; + if ($email_git || + ($email_git_fallback && !$maintained) || + $email_git_blame) { + print STDERR "auth sign"; + } + print STDERR "\n"; foreach my $entry (@list) { my $email = $entry->[0]; my $role = $entry->[1]; @@ -1453,6 +1474,27 @@ sub interactive_get_maintainer { $pattern_depth = $val; $rerun = 1; } + } elsif ($sel eq "h" || $sel eq "?") { + print STDERR <[0]); + if ($email_remove_duplicates && + ((lc($name) eq lc($to_name)) || + (lc($address) eq lc($to_address)))) { + $author = $to->[0]; + $matched = 1; + last; + } + } + $author = format_email($name, $address, 1) if (!$matched); push(@authors, $author); } push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); @@ -1539,6 +1592,20 @@ sub save_commits_by_signer { my $type = $types[0]; my $signer = $signers[0]; + my $matched = 0; + my ($name, $address) = parse_email($signer); + foreach my $to (@interactive_to) { + my ($to_name, $to_address) = parse_email($to->[0]); + if ($email_remove_duplicates && + ((lc($name) eq lc($to_name)) || + (lc($address) eq lc($to_address)))) { + $signer = $to->[0]; + $matched = 1; + last; + } + $signer = format_email($name, $address, 1) if (!$matched); + } + my $exists = 0; foreach my $ref(@{$commit_signer_hash{$signer}}) { if (@{$ref}[0] eq $commit && From 7fa8ff2e0c0f326cdaaa4ae7d00f5d021e43ffa2 Mon Sep 17 
00:00:00 2001 From: Florian Mickler Date: Tue, 26 Oct 2010 14:22:56 -0700 Subject: [PATCH 136/176] scripts/get_maintainer.pl: fix mailmap handling Implement it, like it is described in git-shortlog. Signed-off-by: Florian Mickler Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 147 ++++++++++++++++++++++++++++---------- 1 file changed, 109 insertions(+), 38 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 61d3bb51bddf..faeace4e1fd0 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -295,31 +295,76 @@ while (<$maint>) { } close($maint); -my %mailmap; -if ($email_remove_duplicates) { - open(my $mailmap, '<', "${lk_path}.mailmap") +# +# Read mail address map +# + +my $mailmap = read_mailmap(); + +sub read_mailmap { + my $mailmap = { + names => {}, + addresses => {} + }; + + if (!$email_remove_duplicates) { + return $mailmap; + } + + open(my $mailmap_file, '<', "${lk_path}.mailmap") or warn "$P: Can't open .mailmap: $!\n"; - while (<$mailmap>) { - my $line = $_; - next if ($line =~ m/^\s*#/); - next if ($line =~ m/^\s*$/); + while (<$mailmap_file>) { + s/#.*$//; #strip comments + s/^\s+|\s+$//g; #trim - my ($name, $address) = parse_email($line); - $line = format_email($name, $address, $email_usename); + next if (/^\s*$/); #skip empty lines + #entries have one of the following formats: + # name1 + # + # name1 + # name1 name2 + # (see man git-shortlog) + if (/^(.+)<(.+)>$/) { + my $real_name = $1; + my $address = $2; - next if ($line =~ m/^\s*$/); + $real_name =~ s/\s+$//; + $mailmap->{names}->{$address} = $real_name; - if (exists($mailmap{$name})) { - my $obj = $mailmap{$name}; - push(@$obj, $address); - } else { - my @arr = ($address); - $mailmap{$name} = \@arr; + } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { + my $real_address = $1; + my $wrong_address = $2; + + $mailmap->{addresses}->{$wrong_address} = $real_address; + + } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { + my $real_name= $1; + my $real_address = $2; + my $wrong_address = $3; + + $real_name =~ s/\s+$//; + + $mailmap->{names}->{$wrong_address} = $real_name; + $mailmap->{addresses}->{$wrong_address} = $real_address; + + } elsif (/^(.+)<([^\s]+)>\s*([^\s].*)<([^\s]+)>$/) { + my $real_name = $1; + my $real_address = $2; + my $wrong_name = $3; + my $wrong_address = $4; + + $real_name =~ s/\s+$//; + $wrong_name =~ s/\s+$//; + + $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; + $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; } } - close($mailmap); + close($mailmap_file); + + return $mailmap; } ## use the filenames on the command line or find the filenames in the patchfiles @@ -1061,30 +1106,58 @@ sub which_conf { return ""; } -sub mailmap { - my (@lines) = @_; - my %hash; +sub mailmap_email { + my $line = shift; - foreach my $line (@lines) { my ($name, $address) = parse_email($line); - if (!exists($hash{$name})) { - $hash{$name} = $address; - } elsif ($address ne $hash{$name}) { - $address = $hash{$name}; - $line = format_email($name, $address, $email_usename); - } - if (exists($mailmap{$name})) { - my $obj = $mailmap{$name}; - foreach my $map_address (@$obj) { - if (($map_address eq $address) && - ($map_address ne $hash{$name})) { - $line = format_email($name, $hash{$name}, $email_usename); + my $email = format_email($name, $address, 1); + my $real_name = $name; + my $real_address = $address; + + if (exists $mailmap->{names}->{$email} || exists 
$mailmap->{addresses}->{$email}) { + if (exists $mailmap->{names}->{$email}) { + $real_name = $mailmap->{names}->{$email}; + } + if (exists $mailmap->{addresses}->{$email}) { + $real_address = $mailmap->{addresses}->{$email}; + } + } else { + if (exists $mailmap->{names}->{$address}) { + $real_name = $mailmap->{names}->{$address}; + } + if (exists $mailmap->{addresses}->{$address}) { + $real_address = $mailmap->{addresses}->{$address}; } - } } + return format_email($real_name, $real_address, 1); +} + +sub mailmap { + my (@addresses) = @_; + + my @ret = (); + foreach my $line (@addresses) { + push(@ret, mailmap_email($line), 1); } - return @lines; + merge_by_realname(@ret) if $email_remove_duplicates; + + return @ret; +} + +sub merge_by_realname { + my %address_map; + my (@emails) = @_; + foreach my $email (@emails) { + my ($name, $address) = parse_email($email); + if (!exists $address_map{$name}) { + $address_map{$name} = $address; + } else { + $address = $address_map{$name}; + $email = format_email($name,$address,1); + } + } + } sub git_execute_cmd { @@ -1636,9 +1709,7 @@ sub vcs_assign { $divisor = 1; } - if ($email_remove_duplicates) { - @lines = mailmap(@lines); - } + @lines = mailmap(@lines); return if (@lines <= 0); From 47abc7225761faf28be52b3ac4dc26ffeac7b750 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:57 -0700 Subject: [PATCH 137/176] scripts/get_maintainer.pl: correct indentation in a few places And a miscellaneous conversion of You to you in a help message Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 154 +++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index faeace4e1fd0..0abfdbc5cdff 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -306,7 +306,7 @@ sub read_mailmap { my $mailmap = { names => {}, addresses => {} - }; + }; if (!$email_remove_duplicates) { return $mailmap; @@ -327,39 +327,39 @@ sub read_mailmap { # name1 name2 # (see man git-shortlog) if (/^(.+)<(.+)>$/) { - my $real_name = $1; - my $address = $2; + my $real_name = $1; + my $address = $2; - $real_name =~ s/\s+$//; - $mailmap->{names}->{$address} = $real_name; + $real_name =~ s/\s+$//; + $mailmap->{names}->{$address} = $real_name; } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_address = $1; - my $wrong_address = $2; + my $real_address = $1; + my $wrong_address = $2; - $mailmap->{addresses}->{$wrong_address} = $real_address; + $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_name= $1; - my $real_address = $2; - my $wrong_address = $3; + my $real_name= $1; + my $real_address = $2; + my $wrong_address = $3; - $real_name =~ s/\s+$//; + $real_name =~ s/\s+$//; - $mailmap->{names}->{$wrong_address} = $real_name; - $mailmap->{addresses}->{$wrong_address} = $real_address; + $mailmap->{names}->{$wrong_address} = $real_name; + $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*([^\s].*)<([^\s]+)>$/) { - my $real_name = $1; - my $real_address = $2; - my $wrong_name = $3; - my $wrong_address = $4; + my $real_name = $1; + my $real_address = $2; + my $wrong_name = $3; + my $wrong_address = $4; - $real_name =~ s/\s+$//; - $wrong_name =~ s/\s+$//; + $real_name =~ s/\s+$//; + $wrong_name =~ s/\s+$//; - $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; - 
$mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; + $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; + $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; } } close($mailmap_file); @@ -743,30 +743,30 @@ EOT } sub top_of_kernel_tree { - my ($lk_path) = @_; + my ($lk_path) = @_; - if ($lk_path ne "" && substr($lk_path,length($lk_path)-1,1) ne "/") { - $lk_path .= "/"; - } - if ( (-f "${lk_path}COPYING") - && (-f "${lk_path}CREDITS") - && (-f "${lk_path}Kbuild") - && (-f "${lk_path}MAINTAINERS") - && (-f "${lk_path}Makefile") - && (-f "${lk_path}README") - && (-d "${lk_path}Documentation") - && (-d "${lk_path}arch") - && (-d "${lk_path}include") - && (-d "${lk_path}drivers") - && (-d "${lk_path}fs") - && (-d "${lk_path}init") - && (-d "${lk_path}ipc") - && (-d "${lk_path}kernel") - && (-d "${lk_path}lib") - && (-d "${lk_path}scripts")) { - return 1; - } - return 0; + if ($lk_path ne "" && substr($lk_path,length($lk_path)-1,1) ne "/") { + $lk_path .= "/"; + } + if ( (-f "${lk_path}COPYING") + && (-f "${lk_path}CREDITS") + && (-f "${lk_path}Kbuild") + && (-f "${lk_path}MAINTAINERS") + && (-f "${lk_path}Makefile") + && (-f "${lk_path}README") + && (-d "${lk_path}Documentation") + && (-d "${lk_path}arch") + && (-d "${lk_path}include") + && (-d "${lk_path}drivers") + && (-d "${lk_path}fs") + && (-d "${lk_path}init") + && (-d "${lk_path}ipc") + && (-d "${lk_path}kernel") + && (-d "${lk_path}lib") + && (-d "${lk_path}scripts")) { + return 1; + } + return 0; } sub parse_email { @@ -1107,29 +1107,30 @@ sub which_conf { } sub mailmap_email { - my $line = shift; + my $line = shift; - my ($name, $address) = parse_email($line); - my $email = format_email($name, $address, 1); - my $real_name = $name; - my $real_address = $address; + my ($name, $address) = parse_email($line); + my $email = format_email($name, $address, 1); + my $real_name = $name; + my $real_address = $address; - if (exists $mailmap->{names}->{$email} || exists $mailmap->{addresses}->{$email}) { - if (exists $mailmap->{names}->{$email}) { - $real_name = $mailmap->{names}->{$email}; - } - if (exists $mailmap->{addresses}->{$email}) { - $real_address = $mailmap->{addresses}->{$email}; - } - } else { - if (exists $mailmap->{names}->{$address}) { - $real_name = $mailmap->{names}->{$address}; - } - if (exists $mailmap->{addresses}->{$address}) { - $real_address = $mailmap->{addresses}->{$address}; - } + if (exists $mailmap->{names}->{$email} || + exists $mailmap->{addresses}->{$email}) { + if (exists $mailmap->{names}->{$email}) { + $real_name = $mailmap->{names}->{$email}; } - return format_email($real_name, $real_address, 1); + if (exists $mailmap->{addresses}->{$email}) { + $real_address = $mailmap->{addresses}->{$email}; + } + } else { + if (exists $mailmap->{names}->{$address}) { + $real_name = $mailmap->{names}->{$address}; + } + if (exists $mailmap->{addresses}->{$address}) { + $real_address = $mailmap->{addresses}->{$address}; + } + } + return format_email($real_name, $real_address, 1); } sub mailmap { @@ -1146,18 +1147,17 @@ sub mailmap { } sub merge_by_realname { - my %address_map; - my (@emails) = @_; - foreach my $email (@emails) { - my ($name, $address) = parse_email($email); - if (!exists $address_map{$name}) { - $address_map{$name} = $address; - } else { - $address = $address_map{$name}; - $email = format_email($name,$address,1); - } + my %address_map; + my (@emails) = @_; + foreach my $email (@emails) { + my ($name, $address) = 
parse_email($email); + if (!exists $address_map{$name}) { + $address_map{$name} = $address; + } else { + $address = $address_map{$name}; + $email = format_email($name,$address,1); } - + } } sub git_execute_cmd { @@ -1555,7 +1555,7 @@ commit signers and mailing lists that could be CC'd on a patch. Any *'d entry is selected. -If you have git or hg installed, You can choose to summarize the commit +If you have git or hg installed, you can choose to summarize the commit history of files in the patch. Also, each line of the current file can be matched to its commit author and that commits signers with blame. From b9e2331dd1e0e04f7f2a6f8aa0c05bac2a7f0d7b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:58 -0700 Subject: [PATCH 138/176] scripts/get_maintainer.pl: use mailmap in name deduplication and other updates Use Florian Mickler's mailmap routine to reduce name duplication. o Add subroutine deduplicate_email to centralize code o Add hashes for deduplicate_(name|address)_hash o Remove now unused @interactive_to o Whitespace neatening o Add command line --help text o Add --mailmap command line option control o Interactive changes: - Add toggles for maintainer, git and list selections - Default selection is all - Add mailmap control Update to 0.26-beta5 Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 234 ++++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 85 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 0abfdbc5cdff..e822518bc610 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta4'; +my $V = '0.26-beta5'; use Getopt::Long qw(:config no_auto_abbrev); @@ -36,6 +36,7 @@ my $email_git_since = "1-year-ago"; my $email_hg_since = "-365"; my $interactive = 0; my $email_remove_duplicates = 1; +my $email_use_mailmap = 1; my $output_multiline = 1; my $output_separator = ", "; my $output_roles = 0; @@ -192,6 +193,7 @@ if (!GetOptions( 'hg-since=s' => \$email_hg_since, 'i|interactive!' => \$interactive, 'remove-duplicates!' => \$email_remove_duplicates, + 'mailmap!' => \$email_use_mailmap, 'm!' => \$email_maintainer, 'n!' => \$email_usename, 'l!' 
=> \$email_list, @@ -300,17 +302,17 @@ close($maint); # Read mail address map # -my $mailmap = read_mailmap(); +my $mailmap; + +read_mailmap(); sub read_mailmap { - my $mailmap = { + $mailmap = { names => {}, addresses => {} }; - if (!$email_remove_duplicates) { - return $mailmap; - } + return if (!$email_use_mailmap || !(-f "${lk_path}.mailmap")); open(my $mailmap_file, '<', "${lk_path}.mailmap") or warn "$P: Can't open .mailmap: $!\n"; @@ -331,6 +333,7 @@ sub read_mailmap { my $address = $2; $real_name =~ s/\s+$//; + ($real_name, $address) = parse_email("$real_name <$address>"); $mailmap->{names}->{$address} = $real_name; } elsif (/^<([^\s]+)>\s*<([^\s]+)>$/) { @@ -340,12 +343,13 @@ sub read_mailmap { $mailmap->{addresses}->{$wrong_address} = $real_address; } elsif (/^(.+)<([^\s]+)>\s*<([^\s]+)>$/) { - my $real_name= $1; + my $real_name = $1; my $real_address = $2; my $wrong_address = $3; $real_name =~ s/\s+$//; - + ($real_name, $real_address) = + parse_email("$real_name <$real_address>"); $mailmap->{names}->{$wrong_address} = $real_name; $mailmap->{addresses}->{$wrong_address} = $real_address; @@ -356,15 +360,19 @@ sub read_mailmap { my $wrong_address = $4; $real_name =~ s/\s+$//; - $wrong_name =~ s/\s+$//; + ($real_name, $real_address) = + parse_email("$real_name <$real_address>"); - $mailmap->{names}->{format_email($wrong_name,$wrong_address,1)} = $real_name; - $mailmap->{addresses}->{format_email($wrong_name,$wrong_address,1)} = $real_address; + $wrong_name =~ s/\s+$//; + ($wrong_name, $wrong_address) = + parse_email("$wrong_name <$wrong_address>"); + + my $wrong_email = format_email($wrong_name, $wrong_address, 1); + $mailmap->{names}->{$wrong_email} = $real_name; + $mailmap->{addresses}->{$wrong_email} = $real_address; } } close($mailmap_file); - - return $mailmap; } ## use the filenames on the command line or find the filenames in the patchfiles @@ -453,7 +461,8 @@ my @scm = (); my @web = (); my @subsystem = (); my @status = (); -my @interactive_to = (); +my %deduplicate_name_hash = (); +my %deduplicate_address_hash = (); my $signature_pattern; my @maintainers = get_maintainers(); @@ -497,7 +506,8 @@ sub get_maintainers { @web = (); @subsystem = (); @status = (); - @interactive_to = (); + %deduplicate_name_hash = (); + %deduplicate_address_hash = (); if ($email_git_all_signature_types) { $signature_pattern = "(.+?)[Bb][Yy]:"; } else { @@ -506,7 +516,7 @@ sub get_maintainers { # Find responsible parties - my %exact_pattern_match_hash; + my %exact_pattern_match_hash = (); foreach my $file (@files) { @@ -590,7 +600,9 @@ sub get_maintainers { } } - @interactive_to = (@email_to, @list_to); + foreach my $email (@email_to, @list_to) { + $email->[0] = deduplicate_email($email->[0]); + } foreach my $file (@files) { if ($email && @@ -637,8 +649,7 @@ sub get_maintainers { } if ($interactive) { - @interactive_to = @to; - @to = interactive_get_maintainers(\@interactive_to); + @to = interactive_get_maintainers(\@to); } return @to; @@ -702,8 +713,9 @@ Output type options: Other options: --pattern-depth => Number of pattern directory traversals (default: 0 (all)) - --keywords => scan patch for keywords (default: 1 (on)) - --sections => print the entire subsystem sections with pattern matches + --keywords => scan patch for keywords (default: $keywords) + --sections => print all of the subsystem sections with pattern matches + --mailmap => use .mailmap file (default: $email_use_mailmap) --version => show version --help => show this help information @@ -1107,7 +1119,7 @@ sub which_conf { } sub 
mailmap_email { - my $line = shift; + my ($line) = @_; my ($name, $address) = parse_email($line); my $email = format_email($name, $address, 1); @@ -1136,26 +1148,25 @@ sub mailmap_email { sub mailmap { my (@addresses) = @_; - my @ret = (); + my @mapped_emails = (); foreach my $line (@addresses) { - push(@ret, mailmap_email($line), 1); + push(@mapped_emails, mailmap_email($line)); } - - merge_by_realname(@ret) if $email_remove_duplicates; - - return @ret; + merge_by_realname(@mapped_emails) if ($email_use_mailmap); + return @mapped_emails; } sub merge_by_realname { my %address_map; my (@emails) = @_; + foreach my $email (@emails) { my ($name, $address) = parse_email($email); - if (!exists $address_map{$name}) { - $address_map{$name} = $address; - } else { + if (exists $address_map{$name}) { $address = $address_map{$name}; - $email = format_email($name,$address,1); + $email = format_email($name, $address, 1); + } else { + $address_map{$name} = $address; } } } @@ -1194,8 +1205,7 @@ sub extract_formatted_signatures { ## Reformat email addresses (with names) to avoid badly written signatures foreach my $signer (@signature_lines) { - my ($name, $address) = parse_email($signer); - $signer = format_email($name, $address, 1); + $signer = deduplicate_email($signer); } return (\@type, \@signature_lines); @@ -1339,6 +1349,7 @@ sub vcs_exists { } sub vcs_is_git { + vcs_exists(); return $vcs_used == 1; } @@ -1357,11 +1368,9 @@ sub interactive_get_maintainers { my %signed; my $count = 0; my $maintained = 0; - #select maintainers by default foreach my $entry (@list) { - my $role = $entry->[1]; - $selected{$count} = ($role =~ /^(maintainer|supporter|open list)/i); - $maintained = 1 if ($role =~ /^(maintainer|supporter)/i); + $maintained = 1 if ($entry->[1] =~ /^(maintainer|supporter)/i); + $selected{$count} = 1; $authored{$count} = 0; $signed{$count} = 0; $count++; @@ -1418,24 +1427,34 @@ sub interactive_get_maintainers { if ($print_options) { $print_options = 0; if (vcs_exists()) { - print STDERR -"\nVersion Control options:\n" . -"g use git history [$email_git]\n" . -"gf use git-fallback [$email_git_fallback]\n" . -"b use git blame [$email_git_blame]\n" . -"bs use blame signatures [$email_git_blame_signatures]\n" . -"c# minimum commits [$email_git_min_signatures]\n" . -"%# min percent [$email_git_min_percent]\n" . -"d# history to use [$$date_ref]\n" . -"x# max maintainers [$email_git_max_maintainers]\n" . 
-"t all signature types [$email_git_all_signature_types]\n"; + print STDERR <[1] =~ /^(maintainer|supporter)/i); + } + } elsif (lc($str) eq "g") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(author|commit|signer)/i); + } + } elsif (lc($str) eq "l") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(open list)/i); + } + } elsif (lc($str) eq "s") { + for (my $i = 0; $i < $count; $i++) { + $selected{$i} = !$selected{$i} + if ($list[$i]->[1] =~ /^(subscriber list)/i); + } + } } elsif ($sel eq "a") { if ($val > 0 && $val <= $count) { $authored{$val - 1} = !$authored{$val - 1}; @@ -1539,6 +1580,10 @@ sub interactive_get_maintainers { } elsif ($sel eq "r") { bool_invert(\$email_remove_duplicates); $rerun = 1; + } elsif ($sel eq "m") { + bool_invert(\$email_use_mailmap); + read_mailmap(); + $rerun = 1; } elsif ($sel eq "k") { bool_invert(\$keywords); $rerun = 1; @@ -1602,6 +1647,36 @@ sub bool_invert { } } +sub deduplicate_email { + my ($email) = @_; + + my $matched = 0; + my ($name, $address) = parse_email($email); + $email = format_email($name, $address, 1); + $email = mailmap_email($email); + + return $email if (!$email_remove_duplicates); + + ($name, $address) = parse_email($email); + + if ($deduplicate_name_hash{lc($name)}) { + $name = $deduplicate_name_hash{lc($name)}->[0]; + $address = $deduplicate_name_hash{lc($name)}->[1]; + $matched = 1; + } elsif ($deduplicate_address_hash{lc($address)}) { + $name = $deduplicate_address_hash{lc($address)}->[0]; + $address = $deduplicate_address_hash{lc($address)}->[1]; + $matched = 1; + } + if (!$matched) { + $deduplicate_name_hash{lc($name)} = [ $name, $address ]; + $deduplicate_address_hash{lc($address)} = [ $name, $address ]; + } + $email = format_email($name, $address, 1); + $email = mailmap_email($email); + return $email; +} + sub save_commits_by_author { my (@lines) = @_; @@ -1611,20 +1686,8 @@ sub save_commits_by_author { foreach my $line (@lines) { if ($line =~ m/$VCS_cmds{"author_pattern"}/) { - my $matched = 0; my $author = $1; - my ($name, $address) = parse_email($author); - foreach my $to (@interactive_to) { - my ($to_name, $to_address) = parse_email($to->[0]); - if ($email_remove_duplicates && - ((lc($name) eq lc($to_name)) || - (lc($address) eq lc($to_address)))) { - $author = $to->[0]; - $matched = 1; - last; - } - } - $author = format_email($name, $address, 1) if (!$matched); + $author = deduplicate_email($author); push(@authors, $author); } push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); @@ -1665,19 +1728,7 @@ sub save_commits_by_signer { my $type = $types[0]; my $signer = $signers[0]; - my $matched = 0; - my ($name, $address) = parse_email($signer); - foreach my $to (@interactive_to) { - my ($to_name, $to_address) = parse_email($to->[0]); - if ($email_remove_duplicates && - ((lc($name) eq lc($to_name)) || - (lc($address) eq lc($to_address)))) { - $signer = $to->[0]; - $matched = 1; - last; - } - $signer = format_email($name, $address, 1) if (!$matched); - } + $signer = deduplicate_email($signer); my $exists = 0; foreach my $ref(@{$commit_signer_hash{$signer}}) { @@ -1751,6 +1802,11 @@ sub vcs_file_signoffs { $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd ($commits, @signers) = vcs_find_signers($cmd); + + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } + vcs_assign("commit_signer", $commits, @signers); } @@ -1828,9 +1884,8 @@ sub vcs_file_blame { foreach my $line (@lines) { if ($line =~ 
m/$VCS_cmds{"author_pattern"}/) { my $author = $1; - my ($name, $address) = parse_email($author); - $author = format_email($name, $address, 1); - push(@authors, $1); + $author = deduplicate_email($author); + push(@authors, $author); } } @@ -1846,9 +1901,12 @@ sub vcs_file_blame { $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd my @author = vcs_find_author($cmd); next if !@author; + + my $formatted_author = deduplicate_email($author[0]); + my $count = grep(/$commit/, @all_commits); for ($i = 0; $i < $count ; $i++) { - push(@blame_signers, $author[0]); + push(@blame_signers, $formatted_author); } } } @@ -1856,8 +1914,14 @@ sub vcs_file_blame { vcs_assign("authored lines", $total_lines, @blame_signers); } } + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } vcs_assign("commits", $total_commits, @signers); } else { + foreach my $signer (@signers) { + $signer = deduplicate_email($signer); + } vcs_assign("modified commits", $total_commits, @signers); } } From fae99206769b6bbf8a20ab883726b164945771d7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:58 -0700 Subject: [PATCH 139/176] scripts/get_maintainer.pl: don't deduplicate unnamed addresses ie: mailing lists Fix a defect with the first mailing list address being used for each subsequent mailing list. Updated to 0.26-beta6. Signed-off-by: Joe Perches Cc: Florian Mickler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index e822518bc610..d21ec3a89603 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.26-beta5'; +my $V = '0.26-beta6'; use Getopt::Long qw(:config no_auto_abbrev); @@ -1036,7 +1036,7 @@ sub push_email_address { push(@email_to, [format_email($name, $address, $email_usename), $role]); } elsif (!email_inuse($name, $address)) { push(@email_to, [format_email($name, $address, $email_usename), $role]); - $email_hash_name{lc($name)}++; + $email_hash_name{lc($name)}++ if ($name ne ""); $email_hash_address{lc($address)}++; } @@ -1659,7 +1659,7 @@ sub deduplicate_email { ($name, $address) = parse_email($email); - if ($deduplicate_name_hash{lc($name)}) { + if ($name ne "" && $deduplicate_name_hash{lc($name)}) { $name = $deduplicate_name_hash{lc($name)}->[0]; $address = $deduplicate_name_hash{lc($name)}->[1]; $matched = 1; From ec15408206e6b87a11dc1560b2bd758d2ad7ccb1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:59 -0700 Subject: [PATCH 140/176] MAINTAINERS: fix Colibri PXA270 file pattern The original commit 403d29713e0a ("pxa/income: Add Income SBC support") started with the wrong file pattern. 
Signed-off-by: Joe Perches Acked-by: Marek Vasut Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 146b8a068a4e..a8d1f5686cee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -710,8 +710,7 @@ ARM/INCOME PXA270 SUPPORT M: Marek Vasut L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -F: arch/arm/mach-pxa/income.c -F: arch/arm/mach-pxa/include/mach-pxa/income.h +F: arch/arm/mach-pxa/colibri-pxa270-income.c ARM/INTEL IOP32X ARM ARCHITECTURE M: Lennert Buytenhek From 838553c5a151f0d76c5558b85d6a4131c835a20c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:22:59 -0700 Subject: [PATCH 141/176] MAINTAINERS: merge imote2 and stargate Removed by commit dcd925f95194da4 ("pxa: merge stargate2 and imote2 board files"). Signed-off-by: Joe Perches Acked-by: Jonathan Cameron Acked-by: Eric Miao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a8d1f5686cee..5bb0f896bc24 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -757,13 +757,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ixp4xx/ -ARM/INTEL RESEARCH IMOTE 2 MACHINE SUPPORT -M: Jonathan Cameron -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained -F: arch/arm/mach-pxa/imote2.c - -ARM/INTEL RESEARCH STARGATE 2 MACHINE SUPPORT +ARM/INTEL RESEARCH IMOTE/STARGATE 2 MACHINE SUPPORT M: Jonathan Cameron L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained From 85743532a8ffa826edba3d1ca7d8f0dbf3f2a67d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:00 -0700 Subject: [PATCH 142/176] MAINTAINERS: merge s3c-244x sections Removed by commit 70556b143ae4c ("ARM: S3C24XX: Remove old mach-s3c2442"). Signed-off-by: Joe Perches Acked-by: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5bb0f896bc24..feda2cff24ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -922,25 +922,12 @@ W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2410/ -ARM/S3C2440 ARM ARCHITECTURE +ARM/S3C244x ARM ARCHITECTURE M: Ben Dooks L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2440/ - -ARM/S3C2442 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained -F: arch/arm/mach-s3c2442/ - -ARM/S3C2443 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained F: arch/arm/mach-s3c2443/ ARM/S3C6400 ARM ARCHITECTURE From f996231b808f889df77561c86c01cee3c45e254e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:01 -0700 Subject: [PATCH 143/176] MAINTAINERS: merge s3c6400 and 6410 to 64xx Removed by commit 431107ea5b680a24a ("ARM: S3C64XX: Merge mach-s3c6400 and mach-s3c6410"). 
Signed-off-by: Joe Perches Acked-by: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index feda2cff24ff..abf4436a3741 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -930,19 +930,12 @@ S: Maintained F: arch/arm/mach-s3c2440/ F: arch/arm/mach-s3c2443/ -ARM/S3C6400 ARM ARCHITECTURE +ARM/S3C64xx ARM ARCHITECTURE M: Ben Dooks L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained -F: arch/arm/mach-s3c6400/ - -ARM/S3C6410 ARM ARCHITECTURE -M: Ben Dooks -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.fluff.org/ben/linux/ -S: Maintained -F: arch/arm/mach-s3c6410/ +F: arch/arm/mach-s3c64xx/ ARM/S5P ARM ARCHITECTURES M: Kukjin Kim From cefea792137c163529c12090c93b693d29166c60 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:01 -0700 Subject: [PATCH 144/176] MAINTAINERS: remove USB OV511 DRIVER Removed by commit 7373ab3669aec93 ("V4L/DVB: Remove obsolete ov511 driver"). Signed-off-by: Joe Perches Acked-by: Amerigo Wang Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index abf4436a3741..f471adfab417 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6114,13 +6114,6 @@ L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/serial/option.c -USB OV511 DRIVER -M: Mark McClelland -L: linux-usb@vger.kernel.org -W: http://alpha.dyndns.org/ov511/ -S: Maintained -F: drivers/media/video/ov511.* - USB PEGASUS DRIVER M: Petko Manolov L: linux-usb@vger.kernel.org From be8c268a221cbe25025f30e697968cfcd8e78c94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:02 -0700 Subject: [PATCH 145/176] MAINTAINERS: remove USB ZC0301 DRIVER Removed by commit 0d58cef664e01f ("V4L/DVB: Remove obsolete zc0301 v4l driver"). Signed-off-by: Joe Perches Acked-by: Amerigo Wang Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f471adfab417..c67d606e8477 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6274,16 +6274,6 @@ S: Supported F: drivers/usb/host/xhci* F: drivers/usb/host/pci-quirks* -USB ZC0301 DRIVER -M: Luca Risolia -L: linux-usb@vger.kernel.org -L: linux-media@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git -W: http://www.linux-projects.org -S: Maintained -F: Documentation/video4linux/zc0301.txt -F: drivers/media/video/zc0301/ - USB ZD1201 DRIVER L: linux-wireless@vger.kernel.org W: http://linux-lc100020.sourceforge.net From 1fa7e5473cba543b02a396ced9e407f614bb117c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:02 -0700 Subject: [PATCH 146/176] MAINTAINERS: use "T: git" and whitespace trivia Add missing git as a prefix for git repositories in the few places it wasn't already used. Convert a space delimiter to a tab. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c67d606e8477..014b648f99ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -657,7 +657,7 @@ ARM/FARADAY FA526 PORT M: Hans Ulli Kroll L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git://git.berlios.de/gemini-board +T: git git://git.berlios.de/gemini-board F: arch/arm/mm/*-fa* ARM/FOOTBRIDGE ARCHITECTURE @@ -672,7 +672,7 @@ ARM/FREESCALE IMX / MXC ARM ARCHITECTURE M: Sascha Hauer L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git://git.pengutronix.de/git/imx/linux-2.6.git +T: git git://git.pengutronix.de/git/imx/linux-2.6.git F: arch/arm/mach-mx*/ F: arch/arm/plat-mxc/ @@ -3840,7 +3840,7 @@ F: drivers/net/wireless/mwl8k.c MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER M: Nicolas Pitre S: Odd Fixes -F: drivers/mmc/host/mvsdio.* +F: drivers/mmc/host/mvsdio.* MARVELL YUKON / SYSKONNECT DRIVER M: Mirko Lindner @@ -4931,7 +4931,7 @@ RCUTORTURE MODULE M: Josh Triplett M: "Paul E. McKenney" S: Supported -T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/torture.txt F: kernel/rcutorture.c @@ -4956,7 +4956,7 @@ M: Dipankar Sarma M: "Paul E. McKenney" W: http://www.rdrop.com/users/paulmck/rclock/ S: Supported -T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/ F: include/linux/rcu* F: include/linux/srcu* From 66f1991bc2357436498ac990302b6f5bf403d7ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 26 Oct 2010 14:23:03 -0700 Subject: [PATCH 147/176] lib/bitmap.c: use hex_to_bin() Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index ffb78c916ccd..741fae905ae3 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -359,7 +359,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area); #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) -#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** @@ -466,7 +465,7 @@ int __bitmap_parse(const char *buf, unsigned int buflen, if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1)) return -EOVERFLOW; - chunk = (chunk << 4) | unhex(c); + chunk = (chunk << 4) | hex_to_bin(c); ndigits++; totaldigits++; } if (ndigits == 0) From 066a9be6c0124edc9527088231f03c6236be375d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Oct 2010 14:23:03 -0700 Subject: [PATCH 148/176] idr: fix idr_pre_get() locking description Despite the idr_pre_get() kernel-doc, there are some cases where you can call idr_pre_get() from within locked regions. Add a description for such cases. 
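For reference, the allocation pattern that the updated kernel-doc describes looks roughly like the sketch below; my_idr, my_lock and my_ptr are illustrative names only, not part of this patch, and error handling is trimmed:

	int id, err;

	do {
		/* Preallocate outside the lock; GFP_KERNEL may sleep. */
		if (!idr_pre_get(&my_idr, GFP_KERNEL))
			return -ENOMEM;

		spin_lock(&my_lock);
		err = idr_get_new(&my_idr, my_ptr, &id);
		spin_unlock(&my_lock);
	} while (err == -EAGAIN);

	if (err)
		return err;	/* -ENOSPC: the idr is full */
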
See also: http://lkml.org/lkml/2010/9/16/462 [akpm@linux-foundation.org: cleanups, grammatical fixes] Signed-off-by: Naohiro Aota Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index 5e0966be0f7c..e35850d3004a 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -110,9 +110,10 @@ static void idr_mark_full(struct idr_layer **pa, int id) * @idp: idr handle * @gfp_mask: memory allocation flags * - * This function should be called prior to locking and calling the - * idr_get_new* functions. It preallocates enough memory to satisfy - * the worst possible allocation. + * This function should be called prior to calling the idr_get_new* functions. + * It preallocates enough memory to satisfy the worst possible allocation. The + * caller should pass in GFP_KERNEL if possible. This of course requires that + * no spinning locks be held. * * If the system is REALLY out of memory this function returns 0, * otherwise 1. @@ -290,9 +291,11 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) * This is the allocate id function. It should be called with any * required locks. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return -EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. + * + * If the idr is full idr_get_new_above() will return -ENOSPC. * * @id returns a value in the range @starting_id ... 0x7fffffff */ @@ -318,12 +321,11 @@ EXPORT_SYMBOL(idr_get_new_above); * @ptr: pointer you want associated with the id * @id: pointer to the allocated handle * - * This is the allocate id function. It should be called with any - * required locks. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return -EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If the idr is full idr_get_new_above() will return -ENOSPC. * * @id returns a value in the range 0 ... 0x7fffffff */ From e2852ae825dba5ebc159788720baec1a28a57125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 Oct 2010 14:23:05 -0700 Subject: [PATCH 149/176] percpu_counter: add debugobj support All percpu counters are linked to a global list on initialization and removed from it on destruction. The list is walked during CPU up/down. If a percpu counter is freed without being properly destroyed, the system will oops only on the next CPU up/down making it pretty nasty to track down. This patch adds debugobj support for percpu counters so that such problems can be found easily. As percpu counters don't make sense on stack and can't be statically initialized, debugobj support is pretty simple. It's initialized and activated on counter initialization, and deactivatd and destroyed on counter destruction. With this patch applied, the bug fixed by commit 602586a83b719df0fbd94196a1359ed35aeb2df3 (shmem: put_super must percpu_counter_destroy) triggers the following warning on tmpfs unmount and the system won't oops on the next cpu up/down operation. 
------------[ cut here ]------------ WARNING: at lib/debugobjects.c:259 debug_print_object+0x5c/0x70() Hardware name: Bochs ODEBUG: free active (active state 0) object type: percpu_counter Modules linked in: Pid: 3999, comm: umount Not tainted 2.6.36-rc2-work+ #5 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] debug_print_object+0x5c/0x70 [] debug_check_no_obj_freed+0x125/0x210 [] kfree+0xb3/0x2f0 [] shmem_put_super+0x1d/0x30 [] generic_shutdown_super+0x56/0xe0 [] kill_anon_super+0x16/0x60 [] kill_litter_super+0x27/0x30 [] deactivate_locked_super+0x45/0x60 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0x86/0xe0 [] sys_umount+0x6f/0x360 [] system_call_fastpath+0x16/0x1b ---[ end trace cce2a341ba3611a7 ]--- Signed-off-by: Tejun Heo Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 8 ++++++++ lib/percpu_counter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 69a32664c289..0d5c762532a5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -317,6 +317,14 @@ config DEBUG_OBJECTS_RCU_HEAD help Enable this to turn on debugging of RCU list heads (call_rcu() usage). +config DEBUG_OBJECTS_PERCPU_COUNTER + bool "Debug percpu counter objects" + depends on DEBUG_OBJECTS + help + If you say Y here, additional code will be inserted into the + percpu counter routines to track the life time of percpu counter + objects and validate the percpu counter operations. + config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 209448e1d2b9..1d954ea72331 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -8,10 +8,53 @@ #include #include #include +#include static LIST_HEAD(percpu_counters); static DEFINE_MUTEX(percpu_counters_lock); +#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER + +static struct debug_obj_descr percpu_counter_debug_descr; + +static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +{ + struct percpu_counter *fbc = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + percpu_counter_destroy(fbc); + debug_object_free(fbc, &percpu_counter_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr percpu_counter_debug_descr = { + .name = "percpu_counter", + .fixup_free = percpu_counter_fixup_free, +}; + +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ + debug_object_init(fbc, &percpu_counter_debug_descr); + debug_object_activate(fbc, &percpu_counter_debug_descr); +} + +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ + debug_object_deactivate(fbc, &percpu_counter_debug_descr); + debug_object_free(fbc, &percpu_counter_debug_descr); +} + +#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ } +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ } +#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ + void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; @@ -75,6 +118,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, fbc->counters = alloc_percpu(s32); if (!fbc->counters) return -ENOMEM; + + debug_percpu_counter_activate(fbc); + #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); @@ -90,6 +136,8 @@ void 
percpu_counter_destroy(struct percpu_counter *fbc) if (!fbc->counters) return; + debug_percpu_counter_deactivate(fbc); + #ifdef CONFIG_HOTPLUG_CPU mutex_lock(&percpu_counters_lock); list_del(&fbc->list); From 6d411e6c01608cefb7b9ea6712110538a1432f9f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:05 -0700 Subject: [PATCH 150/176] lib/Kconfig.debug: add list_sort debugging switch While hunting a non-existing bug in 'list_sort()', I've improved the 'list_sort_test()' function which tests the 'list_sort()' library call. Although at the end I found a bug in my code, but not in 'list_sort()', I think my clean-ups and improvements are worth merging because they make the test function better. This patch: Make the self-tests selectable via Kconfig rather than by manual enabling of DEBUG_LIST_SORT. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 +++++++++ lib/list_sort.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0d5c762532a5..95bda87a3e84 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -748,6 +748,15 @@ config DEBUG_LIST If unsure, say N. +config TEST_LIST_SORT + bool "Linked list sorting test" + depends on DEBUG_KERNEL + help + Enable this to turn on 'list_sort()' function test. This test is + executed only once during system boot, so affects only boot time. + + If unsure, say N. + config DEBUG_SG bool "Debug SG table operations" depends on DEBUG_KERNEL diff --git a/lib/list_sort.c b/lib/list_sort.c index a7616fa3162e..827794016bfb 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -141,7 +141,7 @@ void list_sort(void *priv, struct list_head *head, } EXPORT_SYMBOL(list_sort); -#ifdef DEBUG_LIST_SORT +#ifdef CONFIG_TEST_LIST_SORT struct debug_el { struct list_head l_h; int value; @@ -214,4 +214,4 @@ static int __init list_sort_test(void) return 0; } module_init(list_sort_test); -#endif +#endif /* CONFIG_TEST_LIST_SORT */ From bb2ab10fa693110cffa7087ffe2749d6e9a27d5f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:06 -0700 Subject: [PATCH 151/176] lib/list_sort: test: use more reasonable printk levels I do not see any reason to use KERN_WARN for normal messages and KERN_EMERG for error messages in the lib_sort testing routine. Let's use more reasonable KERN_NORM and KERN_ERR levels. 
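Concretely, the diff below keeps the informational message at KERN_DEBUG and reports real failures with KERN_ERR, along the lines of:

	printk(KERN_DEBUG "testing list_sort()\n");		/* progress/info */
	printk(KERN_ERR "list_sort() failed to sort!\n");	/* genuine failure */
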
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 827794016bfb..679b3a060e7e 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -166,7 +166,7 @@ static int __init list_sort_test(void) struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); struct list_head *cur; - printk(KERN_WARNING "testing list_sort()\n"); + printk(KERN_DEBUG "testing list_sort()\n"); cur = head; for (i = 0; i < LIST_SORT_TEST_LENGTH; i++) { @@ -189,17 +189,17 @@ static int __init list_sort_test(void) struct debug_el *el = container_of(cur, struct debug_el, l_h); int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { - printk(KERN_EMERG "list_sort() returned " - "a corrupted list!\n"); + printk(KERN_ERR "list_sort() returned " + "a corrupted list!\n"); return 1; } else if (cmp_result > 0) { - printk(KERN_EMERG "list_sort() failed to sort!\n"); + printk(KERN_ERR "list_sort() failed to sort!\n"); return 1; } else if (cmp_result == 0 && el->serial >= container_of(cur->next, struct debug_el, l_h)->serial) { - printk(KERN_EMERG "list_sort() failed to preserve order" - " of equivalent elements!\n"); + printk(KERN_ERR "list_sort() failed to preserve order " + "of equivalent elements!\n"); return 1; } kfree(cur->prev); @@ -207,8 +207,8 @@ static int __init list_sort_test(void) } kfree(cur); if (count != LIST_SORT_TEST_LENGTH) { - printk(KERN_EMERG "list_sort() returned list of" - "different length!\n"); + printk(KERN_ERR "list_sort() returned list of " + "different length!\n"); return 1; } return 0; From eeee9ebb54b76a33a13d2c926ffb018a4aea410f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:06 -0700 Subject: [PATCH 152/176] lib/list_sort: test: use generic random32 Instead of using own pseudo-random generator, use generic linux 'random32()' function. Presumably, this should improve test coverage. At the same time, do the following changes: o Use shorter macro name for test list length o Do not use strange 'l_h' name for 'struct list_head' element, use 'list', because it is traditional name and thus, makes the code more obvious and readable. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 679b3a060e7e..8f3c24415ae5 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -142,42 +142,45 @@ void list_sort(void *priv, struct list_head *head, EXPORT_SYMBOL(list_sort); #ifdef CONFIG_TEST_LIST_SORT + +#include + struct debug_el { - struct list_head l_h; + struct list_head list; int value; unsigned serial; }; static int cmp(void *priv, struct list_head *a, struct list_head *b) { - return container_of(a, struct debug_el, l_h)->value - - container_of(b, struct debug_el, l_h)->value; + return container_of(a, struct debug_el, list)->value + - container_of(b, struct debug_el, list)->value; } /* * The pattern of set bits in the list length determines which cases * are hit in list_sort(). 
*/ -#define LIST_SORT_TEST_LENGTH (512+128+2) /* not including head */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ static int __init list_sort_test(void) { - int i, r = 1, count; + int i, count; struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); struct list_head *cur; printk(KERN_DEBUG "testing list_sort()\n"); cur = head; - for (i = 0; i < LIST_SORT_TEST_LENGTH; i++) { + for (i = 0; i < TEST_LIST_LEN; i++) { struct debug_el *el = kmalloc(sizeof(*el), GFP_KERNEL); BUG_ON(!el); /* force some equivalencies */ - el->value = (r = (r * 725861) % 6599) % (LIST_SORT_TEST_LENGTH/3); + el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; - el->l_h.prev = cur; - cur->next = &el->l_h; + el->list.prev = cur; + cur->next = &el->list; cur = cur->next; } head->prev = cur; @@ -186,7 +189,7 @@ static int __init list_sort_test(void) count = 1; for (cur = head->next; cur->next != head; cur = cur->next) { - struct debug_el *el = container_of(cur, struct debug_el, l_h); + struct debug_el *el = container_of(cur, struct debug_el, list); int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { printk(KERN_ERR "list_sort() returned " @@ -197,7 +200,7 @@ static int __init list_sort_test(void) return 1; } else if (cmp_result == 0 && el->serial >= container_of(cur->next, - struct debug_el, l_h)->serial) { + struct debug_el, list)->serial) { printk(KERN_ERR "list_sort() failed to preserve order " "of equivalent elements!\n"); return 1; @@ -206,7 +209,7 @@ static int __init list_sort_test(void) count++; } kfree(cur); - if (count != LIST_SORT_TEST_LENGTH) { + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort() returned list of " "different length!\n"); return 1; From f3dc0e384248ea6fda0987f909007fa9ab5fb51a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:07 -0700 Subject: [PATCH 153/176] lib/list_sort: test: improve error handling The 'list_sort()' test does not free memory if it fails, and it makes the kernel panic if it cannot allocate memory. This patch fixes the problem. This patch also changes several small things: o use the 'list_add_tail()' helper instead of adding elements manually o introduce a temporary 'el1' variable to avoid an ugly and unreadable "if" statement o make 'head' a stack variable instead of a 'kmalloc()'ed one, which simplifies the code a bit Overall, this is a clean-up patch.
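As a rough sketch of the single-exit cleanup pattern this patch switches to (illustrative names only; 'example_el' and 'example_build_list' are not from the kernel tree): elements go onto a stack-allocated LIST_HEAD with list_add_tail(), any allocation failure jumps to one exit label, and everything that was added is released there with list_for_each_safe().

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct example_el {
        struct list_head list;
        int value;
};

static int example_build_list(int n)
{
        struct example_el *el;
        struct list_head *cur, *tmp;
        int i, err = -ENOMEM;
        LIST_HEAD(head);        /* head lives on the stack, no kmalloc() needed */

        for (i = 0; i < n; i++) {
                el = kmalloc(sizeof(*el), GFP_KERNEL);
                if (!el)
                        goto exit;      /* free whatever was already added */
                el->value = i;
                list_add_tail(&el->list, &head);
        }
        err = 0;
exit:
        /* like the self-test, always release the elements before returning */
        list_for_each_safe(cur, tmp, &head) {
                list_del(cur);
                kfree(container_of(cur, struct example_el, list));
        }
        return err;
}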
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 65 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 8f3c24415ae5..2b99ff80f4be 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -165,56 +165,67 @@ static int cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count; - struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); - struct list_head *cur; + int i, count = 1, err = -EINVAL; + struct debug_el *el; + struct list_head *cur, *tmp; + LIST_HEAD(head); printk(KERN_DEBUG "testing list_sort()\n"); - cur = head; for (i = 0; i < TEST_LIST_LEN; i++) { - struct debug_el *el = kmalloc(sizeof(*el), GFP_KERNEL); - BUG_ON(!el); + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + printk(KERN_ERR "cancel list_sort() testing - cannot " + "allocate memory\n"); + goto exit; + } /* force some equivalencies */ el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; - - el->list.prev = cur; - cur->next = &el->list; - cur = cur->next; + list_add_tail(&el->list, &head); } - head->prev = cur; - list_sort(NULL, head, cmp); + list_sort(NULL, &head, cmp); + + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; - count = 1; - for (cur = head->next; cur->next != head; cur = cur->next) { - struct debug_el *el = container_of(cur, struct debug_el, list); - int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { printk(KERN_ERR "list_sort() returned " "a corrupted list!\n"); - return 1; - } else if (cmp_result > 0) { + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { printk(KERN_ERR "list_sort() failed to sort!\n"); - return 1; - } else if (cmp_result == 0 && - el->serial >= container_of(cur->next, - struct debug_el, list)->serial) { + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { printk(KERN_ERR "list_sort() failed to preserve order " "of equivalent elements!\n"); - return 1; + goto exit; } - kfree(cur->prev); count++; } - kfree(cur); + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort() returned list of " "different length!\n"); - return 1; + goto exit; } - return 0; + + err = 0; +exit: + list_for_each_safe(cur, tmp, &head) { + list_del(cur); + kfree(container_of(cur, struct debug_el, list)); + } + return err; } module_init(list_sort_test); #endif /* CONFIG_TEST_LIST_SORT */ From 014afa943d44f0df8e65bc4bd071c67772277d93 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:08 -0700 Subject: [PATCH 154/176] lib/list_sort: test: unify test messages This patch unifies 'list_sort_test()' messages a bit and makes sure all of them start with the "list_sort_test:" prefix to make it obvious for users where the messages come from. 
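As an aside, and not what this patch does (the patch spells the prefix out in every printk string): the more common kernel idiom for a uniform message prefix is to define pr_fmt() before the includes and use the pr_*() helpers. A minimal hypothetical sketch:

#define pr_fmt(fmt) "list_sort_test: " fmt

#include <linux/printk.h>

static void example_report(int count, int expected)
{
        if (count != expected)
                pr_err("error: bad list length %d, expected %d\n", count, expected);
        else
                pr_debug("list length OK\n");   /* prefix is prepended automatically */
}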
Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 2b99ff80f4be..01aff9e80821 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -170,12 +170,12 @@ static int __init list_sort_test(void) struct list_head *cur, *tmp; LIST_HEAD(head); - printk(KERN_DEBUG "testing list_sort()\n"); + printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "cancel list_sort() testing - cannot " + printk(KERN_ERR "list_sort_test: error: cannot " "allocate memory\n"); goto exit; } @@ -192,30 +192,31 @@ static int __init list_sort_test(void) int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort() returned " - "a corrupted list!\n"); + printk(KERN_ERR "list_sort_test: error: list is " + "corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort() failed to sort!\n"); + printk(KERN_ERR "list_sort_test: error: list is not " + "sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort() failed to preserve order " - "of equivalent elements!\n"); + printk(KERN_ERR "list_sort_test: error: order of " + "equivalent elements not preserved\n"); goto exit; } count++; } if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort() returned list of " - "different length!\n"); + printk(KERN_ERR "list_sort_test: error: bad list length %d", + count); goto exit; } From 041b78f232bb87b2de8ca3fed50384bc7dc9c2de Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 26 Oct 2010 14:23:08 -0700 Subject: [PATCH 155/176] lib/list_sort: test: check element addresses Improve the 'list_sort()' test and check that: o 'cmp()' is called only for elements which were present in the original list, i.e., the 'a' and 'b' parameters are valid o the resulting (sorted) list consists only of the original elements o "poison" fields are introduced to make sure the data around the 'struct list_head' field is not corrupted. Signed-off-by: Artem Bityutskiy Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 83 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 12 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 01aff9e80821..d7325c6b103f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -145,24 +145,66 @@ EXPORT_SYMBOL(list_sort); #include -struct debug_el { - struct list_head list; - int value; - unsigned serial; -}; - -static int cmp(void *priv, struct list_head *a, struct list_head *b) -{ - return container_of(a, struct debug_el, list)->value - - container_of(b, struct debug_el, list)->value; -} - /* * The pattern of set bits in the list length determines which cases * are hit in list_sort().
*/ #define TEST_LIST_LEN (512+128+2) /* not including head */ +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + printk(KERN_ERR "list_sort_test: error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + static int __init list_sort_test(void) { int i, count = 1, err = -EINVAL; @@ -172,6 +214,13 @@ static int __init list_sort_test(void) printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + if (!elts) { + printk(KERN_ERR "list_sort_test: error: cannot allocate " + "memory\n"); + goto exit; + } + for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { @@ -182,6 +231,9 @@ static int __init list_sort_test(void) /* force some equivalencies */ el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; list_add_tail(&el->list, &head); } @@ -211,6 +263,12 @@ static int __init list_sort_test(void) "equivalent elements not preserved\n"); goto exit; } + + if (check(el, el1)) { + printk(KERN_ERR "list_sort_test: error: element check " + "failed\n"); + goto exit; + } count++; } @@ -222,6 +280,7 @@ static int __init list_sort_test(void) err = 0; exit: + kfree(elts); list_for_each_safe(cur, tmp, &head) { list_del(cur); kfree(container_of(cur, struct debug_el, list)); From ea00c30b5b31baa91be29bee966204eccc15e9d3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 26 Oct 2010 14:23:09 -0700 Subject: [PATCH 156/176] percpu_counter: use this_cpu_ptr() instead of per_cpu_ptr() this_cpu_ptr() avoids an array lookup and can use the percpu offset of the local cpu directly. 
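Roughly, the two access patterns being swapped look like this (illustrative helper, not from the kernel tree): per_cpu_ptr() needs an explicit CPU number, hence the get_cpu()/put_cpu() pair, while this_cpu_ptr() resolves through the local CPU's per-cpu offset under a plain preempt_disable()/preempt_enable().

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>

static void example_bump(int __percpu *counters)
{
        int *p;

        /* old pattern: pin the CPU, then index the per-cpu area by number */
        p = per_cpu_ptr(counters, get_cpu());
        (*p)++;
        put_cpu();

        /* new pattern: take the local CPU's copy directly */
        preempt_disable();
        p = this_cpu_ptr(counters);
        (*p)++;
        preempt_enable();
}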
Signed-off-by: Christoph Lameter Cc: Eric Dumazet Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 1d954ea72331..604678d7d06d 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -73,9 +73,9 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; s32 *pcount; - int cpu = get_cpu(); - pcount = per_cpu_ptr(fbc->counters, cpu); + preempt_disable(); + pcount = this_cpu_ptr(fbc->counters); count = *pcount + amount; if (count >= batch || count <= -batch) { spin_lock(&fbc->lock); @@ -85,7 +85,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) } else { *pcount = count; } - put_cpu(); + preempt_enable(); } EXPORT_SYMBOL(__percpu_counter_add); From 5d051decfc27cdf33fbbd2bfca958d0d2c903569 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:23:09 -0700 Subject: [PATCH 157/176] lib/parser: cleanup match_number() Use new variable 'len' to make code more readable. Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/parser.c b/lib/parser.c index fb34977246bb..6e89eca5cca0 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -128,12 +128,13 @@ static int match_number(substring_t *s, int *result, int base) char *endp; char *buf; int ret; + size_t len = s->to - s->from; - buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - memcpy(buf, s->from, s->to - s->from); - buf[s->to - s->from] = '\0'; + memcpy(buf, s->from, len); + buf[len] = '\0'; *result = simple_strtol(buf, &endp, base); ret = 0; if (endp == buf) From 658716d19f8f155c67d4677ba68034b8e492dfbe Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 26 Oct 2010 14:23:10 -0700 Subject: [PATCH 158/176] div64_u64(): improve precision on 32bit platforms The current implementation of div64_u64 for 32bit systems returns an approximately correct result when the divisor exceeds 32bits. Since doing 64bit division using 32bit hardware is a long since solved problem we just use one of the existing proven methods. Additionally, add a div64_s64 function to correctly handle doing signed 64bit division. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=616105 Signed-off-by: Brian Behlendorf Signed-off-by: Oleg Nesterov Cc: Ben Woodard Cc: Jeremy Fitzhardinge Cc: Mark Grondona Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 ++++ include/linux/math64.h | 12 ++++++++++ lib/div64.c | 52 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77b04ed037df..450092c1e35f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -173,6 +173,11 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#define abs64(x) ({ \ + s64 __x = (x); \ + (__x < 0) ? 
-__x : __x; \ + }) + #ifdef CONFIG_PROVE_LOCKING void might_fault(void); #else diff --git a/include/linux/math64.h b/include/linux/math64.h index c87f1528703a..23fcdfcba81b 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -35,6 +35,14 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) return dividend / divisor; } +/** + * div64_s64 - signed 64bit divide with 64bit divisor + */ +static inline s64 div64_s64(s64 dividend, s64 divisor) +{ + return dividend / divisor; +} + #elif BITS_PER_LONG == 32 #ifndef div_u64_rem @@ -53,6 +61,10 @@ extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); extern u64 div64_u64(u64 dividend, u64 divisor); #endif +#ifndef div64_s64 +extern s64 div64_s64(s64 dividend, s64 divisor); +#endif + #endif /* BITS_PER_LONG */ /** diff --git a/lib/div64.c b/lib/div64.c index a111eb8de9cf..5b4919191778 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -77,26 +77,58 @@ s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) EXPORT_SYMBOL(div_s64_rem); #endif -/* 64bit divisor, dividend and result. dynamic precision */ +/** + * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + * + * This implementation is a modified version of the algorithm proposed + * by the book 'Hacker's Delight'. The original source and full proof + * can be found here and is available for use without restriction. + * + * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c' + */ #ifndef div64_u64 u64 div64_u64(u64 dividend, u64 divisor) { - u32 high, d; + u32 high = divisor >> 32; + u64 quot; - high = divisor >> 32; - if (high) { - unsigned int shift = fls(high); + if (high == 0) { + quot = div_u64(dividend, divisor); + } else { + int n = 1 + fls(high); + quot = div_u64(dividend >> n, divisor >> n); - d = divisor >> shift; - dividend >>= shift; - } else - d = divisor; + if (quot != 0) + quot--; + if ((dividend - quot * divisor) >= divisor) + quot++; + } - return div_u64(dividend, d); + return quot; } EXPORT_SYMBOL(div64_u64); #endif +/** + * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + */ +#ifndef div64_s64 +s64 div64_s64(s64 dividend, s64 divisor) +{ + s64 quot, t; + + quot = div64_u64(abs64(dividend), abs64(divisor)); + t = (dividend ^ divisor) >> 63; + + return (quot ^ t) - t; +} +EXPORT_SYMBOL(div64_s64); +#endif + #endif /* BITS_PER_LONG == 32 */ /* From 6b4c5bebcebb0a48d29947e9aa749650751a7696 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:11 -0700 Subject: [PATCH 159/176] checkpatch: fix regressions in "fix handling of leading spaces" The patch "checkpatch: fix handling of leading spaces" added checks for leading spaces on lines, but this introduces regressions. Firstly it does not correctly detect when we are in a comment. Secondly it does not allow for preprocessor command spacing. Finally it does not allow for label indentation which is required to be less than one tab. Fix these up: Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2039acdf5122..0a87c7417ada 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1459,10 +1459,13 @@ sub process { } # check for spaces at the beginning of a line. 
- if ($rawline =~ /^\+ / && $rawline !~ /\+ +\*/) { +# Exceptions: +# 1) within comments +# 2) indented preprocessor commands +# 3) hanging labels + if ($rawline =~ /^\+ / && $line !~ /\+ *(?:$;|#|$Ident:)/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; - WARN("please, no space for starting a line, \ - excluding comments\n" . $herevet); + WARN("please, no spaces at the start of a line\n" . $herevet); } # check we are in a valid C source file if not then ignore this hunk From e91b6e263ed6735c766cb14bbe63b9c7bd774526 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:11 -0700 Subject: [PATCH 160/176] checkpatch: types may sit on a line on their own When the following form is used we have a type which fully fills a line. This means that a type may end at the end of line as well as at the following identifier. int ** foo; Reported-by: Daniel Walker Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0a87c7417ada..41f59b1e209b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -843,7 +843,7 @@ sub annotate_values { $av_preprocessor = 0; } - } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\()/) { + } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; From d2c0a23514d8ac4ed10a8e742467cfb72ca3bed8 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:12 -0700 Subject: [PATCH 161/176] checkpatch: suggest cleanpatch and cleanfile when appropriate When we hit types of whitespace which may be fixed by scripts/cleanpatch and scripts/cleanfile suggest their use in our report. Suggested-by: Bartlomiej Zolnierkiewicz Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 41f59b1e209b..32d6a236570d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -103,6 +103,8 @@ for my $key (keys %debug) { die "$@" if ($@); } +my $rpt_cleaners = 0; + if ($terse) { $emacs = 1; $quiet++; @@ -1389,6 +1391,7 @@ sub process { } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("trailing whitespace\n" . $herevet); + $rpt_cleaners = 1; } # check for Kconfig help text having a real description @@ -1450,6 +1453,7 @@ sub process { $rawline =~ /^\+\s* \s*/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("code indent should use tabs where possible\n" . $herevet); + $rpt_cleaners = 1; } # check for space before tabs. @@ -2842,6 +2846,15 @@ sub process { print "\n" if ($quiet == 0); } + if ($quiet == 0) { + # If there were whitespace errors which cleanpatch can fix + # then suggest that. 
+ if ($rpt_cleaners) { + print "NOTE: whitespace errors detected, you may wish to use scripts/cleanpatch or\n"; + print " scripts/cleanfile\n\n"; + } + } + if ($clean == 1 && $quiet == 0) { print "$vname has no obvious style problems and is ready for submission.\n" } From fb2d2c1b5825503d30fb6f2dc328dbe4a47d9794 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:12 -0700 Subject: [PATCH 162/176] checkpatch: ensure we do not collapse bracketed sections into constants When determining if a return () sequence is a function style bracketing we simplify the expression one bracket at a time replacing each with a constant. However this can trigger a false merge with expressions as below: return (foo)0; Prevent this false merging. Reported-by: Hitoshi Mitake Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 32d6a236570d..8d010ac0efe1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2183,15 +2183,16 @@ sub process { my $value = $2; # Flatten any parentheses - $value =~ s/\)\(/\) \(/g; + $value =~ s/\(/ \(/g; + $value =~ s/\)/\) /g; while ($value =~ s/\[[^\{\}]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && $value =~ s/\([^\(\)]*\)/1/) { } - - if ($value =~ /^(?:$Ident|-?$Constant)$/) { +#print "value<$value>\n"; + if ($value =~ /^\s*(?:$Ident|-?$Constant)\s*$/) { ERROR("return is not a function, parentheses are not required\n" . $herecurr); } elsif ($spacing !~ /\s+/) { From 9446ef569c288e683225fec8337a0b2b81e75cc5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:13 -0700 Subject: [PATCH 163/176] checkpatch: handle casts better fixing false categorisation of : as binary The following incantation is triggering categorisation of its colon (:) as a binary form, which it is not: return foo ? (s8)bar : baz; Handle casts differently from types in the categoriser, allowing us to better track (s8)bar as a value and not a declaration. Reported-by: Jean Delvare Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8d010ac0efe1..3cec2990d51e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -845,6 +845,11 @@ sub annotate_values { $av_preprocessor = 0; } + } elsif ($cur =~ /^(\(\s*$Type\s*)\)/) { + print "CAST($1)\n" if ($dbg_values > 1); + push(@av_paren_type, $type); + $type = 'C'; + } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; From 53a3c4487a05b8f26ef72fe434a750a3402c998f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:14 -0700 Subject: [PATCH 164/176] checkpatch: returning errno typically should be negative Add a (strict mode only) test to check for non-negative returns of what appear to be errno values as the majority case these should indeed be negative. 
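The convention the new check enforces, shown on a made-up function ('example_probe' is not a real kernel API): error paths return a negative errno, so a bare positive E-constant in a return statement is almost always a bug.

#include <linux/errno.h>

static int example_probe(void *dev)
{
        if (!dev)
                return -ENODEV; /* correct: negative errno on failure */

        /*
         * "return ENODEV;" would now draw the new checkpatch warning,
         * because callers test for ret < 0.
         */
        return 0;
}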
Suggested-by: Andrew Morton Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3cec2990d51e..bcdb54bd61a0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2204,6 +2204,13 @@ sub process { ERROR("space required before the open parenthesis '('\n" . $herecurr); } } +# Return of what appears to be an errno should normally be -'ve + if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) { + my $name = $1; + if ($name ne 'EOF' && $name ne 'ERROR') { + WARN("return of an errno should typically be -ve (return -$1)\n" . $herecurr); + } + } # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { From 8cf6de7145943caa38c56c61cd83b17687afd900 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:14 -0700 Subject: [PATCH 165/176] checkpatch: add check for space after struct, union, and enum Add spacing checks for struct, union, and enum definitions. Check the spacing after type and before the equals (=) and open brace ({). Based on a patch by Joe Perches. Cc: Joe Perches Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bcdb54bd61a0..983ac1816da0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1905,6 +1905,16 @@ sub process { ERROR("open brace '{' following function declarations go on the next line\n" . $herecurr); } +# missing space after union or struct definition + if ($rawline =~ /^\+\s*(union|struct)\s+$Ident[=\{]/) { + WARN("Missing space after struct or union definition\n" . $herecurr); + } + +# missing space after enum definition + if ($rawline =~ /^\+\s*enum\{/) { + WARN("Missing space after enum definition\n" . $herecurr); + } + # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { From 0c73b4eb7a825a5aff16d8a9701f6c28056de058 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:15 -0700 Subject: [PATCH 166/176] checkpatch: simplify and consolidate "missing space after" checks Commonise the code for missing spaces after struct, union, and enum such that they share the same code. Ensure we cover all the common cases in each case. Check against the sanitised line to ensure we do not report on comments and strings. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 983ac1816da0..cb19f54b4b2a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1905,22 +1905,17 @@ sub process { ERROR("open brace '{' following function declarations go on the next line\n" . $herecurr); } -# missing space after union or struct definition - if ($rawline =~ /^\+\s*(union|struct)\s+$Ident[=\{]/) { - WARN("Missing space after struct or union definition\n" . $herecurr); - } - -# missing space after enum definition - if ($rawline =~ /^\+\s*enum\{/) { - WARN("Missing space after enum definition\n" . $herecurr); - } - # open braces for enum, union and struct go on the same line. 
if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { ERROR("open brace '{' following $1 go on the same line\n" . $hereprev); } +# missing space after union, struct or enum definition + if ($line =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?(?:\s+$Ident)?[=\{]/) { + WARN("missing space after $1 definition\n" . $herecurr); + } + # check for spacing round square brackets; allowed: # 1. with a type on the left -- int [] a; # 2. at the beginning of a line for slice initialisers -- [0...10] = 5, From 9fe287d79b0af983050d24e7916cf3d1f019f553 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:15 -0700 Subject: [PATCH 167/176] checkpatch: ensure kconfig help checks only apply when we are adding help When checking the length of the help we need to be sure we are seeing the whole story before erroring. Firstly we only want to check when adding the help in the first place. Second we need to be sure that we are seeing the end of the entry, nominally when there is no context below or that context shows the start of the next entry. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index cb19f54b4b2a..e44ff91e811d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1400,18 +1400,34 @@ sub process { } # check for Kconfig help text having a real description +# Only applies when adding the entry originally, after that we do not have +# sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /\+?\s*(---)?help(---)?$/) { + $line =~ /\+\s*(?:---)?help(?:---)?$/) { my $length = 0; - for (my $l = $linenr; defined($lines[$l]); $l++) { - my $f = $lines[$l]; + my $cnt = $realcnt; + my $ln = $linenr + 1; + my $f; + my $is_end = 0; + while ($cnt > 0 && defined $lines[$ln - 1]) { + $f = $lines[$ln - 1]; + $cnt-- if ($lines[$ln - 1] !~ /^-/); + $is_end = $lines[$ln - 1] =~ /^\+/; + $ln++; + + next if ($f =~ /^-/); + $f =~ s/^.//; $f =~ s/#.*//; $f =~ s/^\s+//; next if ($f =~ /^$/); - last if ($f =~ /^\s*config\s/); + if ($f =~ /^\s*config\s/) { + $is_end = 1; + last; + } $length++; } - WARN("please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($length < 4); + WARN("please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_end && $length < 4); + #print "is_end<$is_end> length<$length>\n"; } # check we are in a valid source file if not then ignore this hunk From 3bf9a009fccea422bc355414a3bdf5f35fff9f36 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 26 Oct 2010 14:23:16 -0700 Subject: [PATCH 168/176] checkpatch: check for incorrect permissions Throw an error when a source file has been given execute permissions using the mode change line present in git diffs. Also alow the filename matching to use the "diff" line in addition to the "+++" line, since the mode change lines appear before any "+++" lines. 
[apw@canonical.com: simplified filename logic slightly, added tests] Cc: Andy Whitcroft Acked-by: Linus Walleij Signed-off-by: Rabin Vincent Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e44ff91e811d..93fa145671a0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1315,7 +1315,11 @@ sub process { $here = "#$realline: " if ($file); # extract the filename as it passes - if ($line=~/^\+\+\+\s+(\S+)/) { + if ($line =~ /^diff --git.*?(\S+)$/) { + $realfile = $1; + $realfile =~ s@^([^/]*)/@@; + + } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; @@ -1339,6 +1343,14 @@ sub process { $cnt_lines++ if ($realcnt != 0); +# Check for incorrect file permissions + if ($line =~ /^new (file )?mode.*[7531]\d{0,2}$/) { + my $permhere = $here . "FILE: $realfile\n"; + if ($realfile =~ /(Makefile|Kconfig|\.c|\.h|\.S|\.tmpl)$/) { + ERROR("do not set execute permissions for source files\n" . $permhere); + } + } + #check the patch for a signoff: if ($line =~ /^\s*signed-off-by:/i) { # This is a signoff, if ugly, so do not double report. From 03f1df7da5696ddfa6e167b37e0c0ce5aad3de79 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:16 -0700 Subject: [PATCH 169/176] checkpatch: Add additional attribute #defines On Wed, 2010-08-11 at 12:35 -0400, Dave Jones wrote: > I just got this from a patch I merged.. > > ERROR: need consistent spacing around '*' (ctx:WxV) > #121: FILE: arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c:113: > +static struct pcc_cpu __percpu *pcc_cpu_info; > ^ > which doesn't seem right. Perhaps these need to be added to checkpatch. [apw@canonical.com: added tests] Signed-off-by: Joe Perches Signed-off-by: Andy Whitcroft Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93fa145671a0..c151c9e142ad 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -152,6 +152,20 @@ our $Sparse = qr{ # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| + __percpu| + __nocast| + __safe| + __bitwise__| + __packed__| + __packed2__| + __naked| + __maybe_unused| + __always_unused| + __noreturn| + __used| + __cold| + __noclone| + __deprecated| __read_mostly| __kprobes| __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| From 015830be9779aeae7de7060b07a3157a8e41bcb4 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:17 -0700 Subject: [PATCH 170/176] checkpatch: update copyright dates Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c151c9e142ad..53b2eaecba0c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2,7 +2,7 @@ # (c) 2001, Dave Jones. 
(the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007,2008, Andy Whitcroft (new conditions, test suite) -# (c) 2008,2009, Andy Whitcroft +# (c) 2008-2010 Andy Whitcroft # Licensed under the terms of the GNU GPL License version 2 use strict; From 5eaa20b984eb316533b4a098d8de3912e434df6a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:18 -0700 Subject: [PATCH 171/176] checkpatch: clean up structure definition macro handling Handle definitions such as the following correctly; it is not a complex statement: #define PREALLOC(NAME, START, END, FLAGS) { \ .name = (NAME), \ .start = (START), \ .end = (END), \ .flags = (FLAGS) \ }, Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 53b2eaecba0c..d5361e49abad 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2476,8 +2476,8 @@ sub process { \.$Ident\s*=\s*| ^\"|\"$ }x; - #print "REST<$rest> dstat<$dstat>\n"; - if ($rest ne '') { + #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; + if ($rest ne '' && $rest ne ',') { if ($rest !~ /while\s*\(/ && $dstat !~ /$exceptions/) { From 3cbf62df3a8ce61cb1aa20b7dae964058988bfdd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:18 -0700 Subject: [PATCH 172/176] checkpatch: handle EXPORT_SYMBOL for DEVICE_ATTR and similar Handle definitions similar to the one below. The definition macro spits out a symbol with a prefix. Add matching of any identifier prefix: DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_put); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d5361e49abad..8ab45b72b396 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1822,8 +1822,17 @@ sub process { !defined $suppress_export{$realline_next} && ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/ || $lines[$realline_next - 1] =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) { + # Handle definitions which produce identifiers with + # a prefix: + # XXX(foo); + # EXPORT_SYMBOL(something_foo); my $name = $1; - if ($stat !~ /(?: + if ($stat =~ /^.([A-Z_]+)\s*\(\s*($Ident)/ && + $name =~ /^${Ident}_$2/) { +#print "FOO C name<$name>\n"; + $suppress_export{$realline_next} = 1; + + } elsif ($stat !~ /(?: \n.}\s*$| ^.DEFINE_$Ident\(\Q$name\E\)| ^.DECLARE_$Ident\(\Q$name\E\)| From 01464f30a97c5c30bf9633309b27cce055cef8fd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:19 -0700 Subject: [PATCH 173/176] checkpatch: statement/block context analyser should look at sanitised lines When tracking context to find a block or statement we need to use the sanitised lines, else parentheses '(' & ')' and braces '{' & '}' can throw the scanner out. Also fix up a couple of error outputs which include those sanitised lines incorrectly.
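For illustration (hypothetical C input, not taken from the patch): braces and parentheses inside a string literal or a comment look unbalanced to a scanner that counts characters on the raw line, which is exactly why the context tracking has to run on the sanitised lines where string and comment bodies are blanked out.

/* the stray ")}" here is only comment text, not real block structure */
static const char *example_fmt = "looks like an open brace { and paren (";

static int example(int x)
{
        return x;       /* unremarkable once strings and comments are blanked */
}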
Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8ab45b72b396..d086ffe377d5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -691,15 +691,15 @@ sub ctx_block_get { $blk .= $rawlines[$line]; # Handle nested #if/#else. - if ($rawlines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { + if ($lines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, $level); - } elsif ($rawlines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { + } elsif ($lines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { $level = $stack[$#stack - 1]; - } elsif ($rawlines[$line] =~ /^.\s*#\s*endif\b/) { + } elsif ($lines[$line] =~ /^.\s*#\s*endif\b/) { $level = pop(@stack); } - foreach my $c (split(//, $rawlines[$line])) { + foreach my $c (split(//, $lines[$line])) { ##print "C<$c>L<$level><$open$close>O<$off>\n"; if ($off > 0) { $off--; @@ -1652,7 +1652,7 @@ sub process { if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("that open brace { should be on the previous line\n" . - "$here\n$ctx\n$lines[$ctx_ln - 1]\n"); + "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } if ($level == 0 && $pre_ctx !~ /}\s*while\s*\($/ && $ctx =~ /\)\s*\;\s*$/ && @@ -1661,7 +1661,7 @@ sub process { my ($nlength, $nindent) = line_stats($lines[$ctx_ln - 1]); if ($nindent > $indent) { WARN("trailing semicolon indicates no statements, indent implies otherwise\n" . - "$here\n$ctx\n$lines[$ctx_ln - 1]\n"); + "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } } } From 267ad8f42644c2fa4ff6c2e7596d2b02c7397c85 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 26 Oct 2010 14:23:19 -0700 Subject: [PATCH 174/176] checkpatch: version 0.31 Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d086ffe377d5..c1e7fb3eab44 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -10,7 +10,7 @@ use strict; my $P = $0; $P =~ s@.*/@@g; -my $V = '0.30'; +my $V = '0.31'; use Getopt::Long qw(:config no_auto_abbrev); From cb710eca6820493add0ddd3d7e8e3ee53f2b6e57 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:20 -0700 Subject: [PATCH 175/176] scripts/checkpatch.pl: add warnings for static char that could be static const char Add warnings for possible missing const uses of static char foo[] = "bar" that could be static const char foo[] = "bar" and static const char *foo[] = {"bar", "baz"} that could be static const char * const foo[] = {"bar", "baz"} Signed-off-by: Joe Perches Cc: Mike Frysinger Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c1e7fb3eab44..2ec5fc6a4046 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1869,6 +1869,18 @@ sub process { $herecurr); } +# check for static const char * arrays. + if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { + WARN("static const char * array should probably be static const char * const\n" . + $herecurr); + } + +# check for static char foo[] = "bar" declarations. 
+ if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { + WARN("static char array declaration should probably be static const char\n" . + $herecurr); + } + # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ && From 93ed0e2d07b25aff4db1d61bfbcd1e82074c0ad5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 Oct 2010 14:23:21 -0700 Subject: [PATCH 176/176] scripts/checkpatch.pl: add check for declaration of pci_device_id Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2ec5fc6a4046..90b54d4697fd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1881,6 +1881,11 @@ sub process { $herecurr); } +# check for declarations of struct pci_device_id + if ($line =~ /\bstruct\s+pci_device_id\s+\w+\s*\[\s*\]\s*\=\s*\{/) { + WARN("Use DEFINE_PCI_DEVICE_TABLE for struct pci_device_id\n" . $herecurr); + } + # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ &&