From bd624d75db21ea5402f9ecf4450b311794d80352 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 13 Feb 2015 08:54:56 +0800 Subject: [PATCH 01/98] clockevents: Introduce mode specific callbacks It is not possible for the clockevents core to know which modes (other than those with a corresponding feature flag) are supported by a particular implementation. And drivers are expected to handle transition to all modes elegantly, as ->set_mode() would be issued for them unconditionally. Now, adding support for a new mode complicates things a bit if we want to use the legacy ->set_mode() callback. We need to closely review all clockevents drivers to see if they would break on addition of a new mode. And after such reviews, it is found that we have to do non-trivial changes to most of the drivers [1]. Introduce mode-specific set_mode_*() callbacks, some of which the drivers may or may not implement. A missing callback would clearly convey the message that the corresponding mode isn't supported. A driver may still choose to keep supporting the legacy ->set_mode() callback, but ->set_mode() wouldn't be supporting any new modes beyond RESUME. If a driver wants to benefit from using a new mode, it would be required to migrate to the mode specific callbacks. The legacy ->set_mode() callback and the newly introduced mode-specific callbacks are mutually exclusive. Only one of them should be supported by the driver. Sanity check is done at the time of registration to distinguish between optional and required callbacks and to make error recovery and handling simpler. If the legacy ->set_mode() callback is provided, all mode specific ones would be ignored by the core but a warning is thrown if they are present. Call sites calling ->set_mode() directly are also updated to use __clockevents_set_mode() instead, as ->set_mode() may not be available anymore for few drivers. [1] https://lkml.org/lkml/2014/12/9/605 [2] https://lkml.org/lkml/2015/1/23/255 Suggested-by: Thomas Gleixner [2] Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Frederic Weisbecker Cc: John Stultz Cc: Kevin Hilman Cc: Linus Torvalds Cc: Preeti U Murthy Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Link: http://lkml.kernel.org/r/792d59a40423f0acffc9bb0bec9de1341a06fa02.1423788565.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 21 ++++++++- kernel/time/clockevents.c | 88 +++++++++++++++++++++++++++++++++++++- kernel/time/timer_list.c | 32 ++++++++++++-- 3 files changed, 134 insertions(+), 7 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 2e4cb67f6e56..59af26b54d15 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -39,6 +39,8 @@ enum clock_event_mode { CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, CLOCK_EVT_MODE_RESUME, + + /* Legacy ->set_mode() callback doesn't support below modes */ }; /* @@ -81,7 +83,11 @@ enum clock_event_mode { * @mode: operating mode assigned by the management code * @features: features * @retries: number of forced programming retries - * @set_mode: set mode function + * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. 
+ * @set_mode_periodic: switch mode to periodic, if !set_mode + * @set_mode_oneshot: switch mode to oneshot, if !set_mode + * @set_mode_shutdown: switch mode to shutdown, if !set_mode + * @set_mode_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration @@ -108,9 +114,20 @@ struct clock_event_device { unsigned int features; unsigned long retries; - void (*broadcast)(const struct cpumask *mask); + /* + * Mode transition callback(s): Only one of the two groups should be + * defined: + * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. + * - set_mode_{shutdown|periodic|oneshot|resume}(). + */ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); + int (*set_mode_periodic)(struct clock_event_device *); + int (*set_mode_oneshot)(struct clock_event_device *); + int (*set_mode_shutdown)(struct clock_event_device *); + int (*set_mode_resume)(struct clock_event_device *); + + void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); void (*resume)(struct clock_event_device *); unsigned long min_delta_ticks; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..489642b08d64 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,6 +94,57 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); +static int __clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + /* Transition with legacy set_mode() callback */ + if (dev->set_mode) { + /* Legacy callback doesn't support new modes */ + if (mode > CLOCK_EVT_MODE_RESUME) + return -ENOSYS; + dev->set_mode(mode, dev); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new mode-specific callbacks */ + switch (mode) { + case CLOCK_EVT_MODE_UNUSED: + /* + * This is an internal state, which is guaranteed to go from + * SHUTDOWN to UNUSED. No driver interaction required. 
+ */ + return 0; + + case CLOCK_EVT_MODE_SHUTDOWN: + return dev->set_mode_shutdown(dev); + + case CLOCK_EVT_MODE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + return dev->set_mode_periodic(dev); + + case CLOCK_EVT_MODE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + return dev->set_mode_oneshot(dev); + + case CLOCK_EVT_MODE_RESUME: + /* Optional callback */ + if (dev->set_mode_resume) + return dev->set_mode_resume(dev); + else + return 0; + + default: + return -ENOSYS; + } +} + /** * clockevents_set_mode - set the operating mode of a clock event device * @dev: device to modify @@ -105,7 +156,9 @@ void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode) { if (dev->mode != mode) { - dev->set_mode(mode, dev); + if (__clockevents_set_mode(dev, mode)) + return; + dev->mode = mode; /* @@ -373,6 +426,35 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind); +/* Sanity check of mode transition callbacks */ +static int clockevents_sanity_check(struct clock_event_device *dev) +{ + /* Legacy set_mode() callback */ + if (dev->set_mode) { + /* We shouldn't be supporting new modes now */ + WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot || + dev->set_mode_shutdown || dev->set_mode_resume); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* New mode-specific callbacks */ + if (!dev->set_mode_shutdown) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !dev->set_mode_periodic) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && + !dev->set_mode_oneshot) + return -EINVAL; + + return 0; +} + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -382,6 +464,8 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(clockevents_sanity_check(dev)); + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); @@ -449,7 +533,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) return clockevents_program_event(dev, dev->next_event, false); if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); return 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..2cfd19485824 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); + if (dev->set_mode) { + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + } else { + if (dev->set_mode_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_mode_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_mode_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_mode_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, 
dev->set_mode_resume); + SEQ_printf(m, "\n"); + } + } SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); From 7d70e15480c0450d2bfafaad338a32e884fc215e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2015 10:37:43 -0500 Subject: [PATCH 02/98] writeback: add missing INITIAL_JIFFIES init in global_update_bandwidth() global_update_bandwidth() uses static variable update_time as the timestamp for the last update but forgets to initialize it to INITIALIZE_JIFFIES. This means that global_dirty_limit will be 5 mins into the future on 32bit and some large amount jiffies into the past on 64bit. This isn't critical as the only effect is that global_dirty_limit won't be updated for the first 5 mins after booting on 32bit machines, especially given the auxiliary nature of global_dirty_limit's role - protecting against global dirty threshold's sudden dips; however, it does lead to unintended suboptimal behavior. Fix it. Fixes: c42843f2f0bb ("writeback: introduce smoothed global dirty limit") Signed-off-by: Tejun Heo Acked-by: Jan Kara Cc: Wu Fengguang Cc: Jens Axboe Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45e187b2d971..b4fd980a93eb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -922,7 +922,7 @@ static void global_update_bandwidth(unsigned long thresh, unsigned long now) { static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time; + static unsigned long update_time = INITIAL_JIFFIES; /* * check locklessly first to optimize away locking for the most time From ff6b8090e26ef7649ef0cc6b42389141ef48b0cf Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 27 Jan 2015 18:08:22 +0530 Subject: [PATCH 03/98] nbd: fix possible memory leak we have already allocated memory for nbd_dev, but we were not releasing that memory and just returning the error value. Signed-off-by: Sudip Mukherjee Acked-by: Paul Clements Cc: Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4bc2a5cb9935..a98c41f72c63 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -803,10 +803,6 @@ static int __init nbd_init(void) return -EINVAL; } - nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL); - if (!nbd_dev) - return -ENOMEM; - part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); @@ -828,6 +824,10 @@ static int __init nbd_init(void) if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; + nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL); + if (!nbd_dev) + return -ENOMEM; + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = alloc_disk(1 << part_shift); if (!disk) From 12cb89e37a0c25fae7a0f1d2e4985558db9d0b13 Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Fri, 6 Mar 2015 17:26:17 +0200 Subject: [PATCH 04/98] spi: qup: Fix cs-num DT property parsing num-cs is 32 bit property, don't read just upper 16 bits. Fixes: 4a8573abe965 (spi: qup: Remove chip select function) Signed-off-by: Ivan T. 
Ivanov Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-qup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-qup.c b/drivers/spi/spi-qup.c index ff9cdbdb6672..2b2c359f5a50 100644 --- a/drivers/spi/spi-qup.c +++ b/drivers/spi/spi-qup.c @@ -498,7 +498,7 @@ static int spi_qup_probe(struct platform_device *pdev) struct resource *res; struct device *dev; void __iomem *base; - u32 max_freq, iomode; + u32 max_freq, iomode, num_cs; int ret, irq, size; dev = &pdev->dev; @@ -550,10 +550,11 @@ static int spi_qup_probe(struct platform_device *pdev) } /* use num-cs unless not present or out of range */ - if (of_property_read_u16(dev->of_node, "num-cs", - &master->num_chipselect) || - (master->num_chipselect > SPI_NUM_CHIPSELECTS)) + if (of_property_read_u32(dev->of_node, "num-cs", &num_cs) || + num_cs > SPI_NUM_CHIPSELECTS) master->num_chipselect = SPI_NUM_CHIPSELECTS; + else + master->num_chipselect = num_cs; master->bus_num = pdev->id; master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LOOP; From 854d2f241d71f6ca08ccde30e6c7c2e403363e52 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 6 Mar 2015 14:42:01 +0200 Subject: [PATCH 05/98] spi: dw-mid: clear BUSY flag fist and test other one The logic of DMA completion is broken now since test_and_clear_bit() never returns the other bit is set. It means condition are always false and we have spi_finalize_current_transfer() called per each DMA completion which is wrong. The patch fixes logic by clearing BUSY bit first and then check for the other one. Fixes: 30c8eb52cc4a (spi: dw-mid: split rx and tx callbacks when DMA) Signed-off-by: Andy Shevchenko Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-dw-mid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-dw-mid.c b/drivers/spi/spi-dw-mid.c index 3ce39d10fafb..4f8c798e0633 100644 --- a/drivers/spi/spi-dw-mid.c +++ b/drivers/spi/spi-dw-mid.c @@ -108,7 +108,8 @@ static void dw_spi_dma_tx_done(void *arg) { struct dw_spi *dws = arg; - if (test_and_clear_bit(TX_BUSY, &dws->dma_chan_busy) & BIT(RX_BUSY)) + clear_bit(TX_BUSY, &dws->dma_chan_busy); + if (test_bit(RX_BUSY, &dws->dma_chan_busy)) return; dw_spi_xfer_done(dws); } @@ -156,7 +157,8 @@ static void dw_spi_dma_rx_done(void *arg) { struct dw_spi *dws = arg; - if (test_and_clear_bit(RX_BUSY, &dws->dma_chan_busy) & BIT(TX_BUSY)) + clear_bit(RX_BUSY, &dws->dma_chan_busy); + if (test_bit(TX_BUSY, &dws->dma_chan_busy)) return; dw_spi_xfer_done(dws); } From 6086e346fdea1ae64d974c94c1acacc2605567ae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:29 -0700 Subject: [PATCH 06/98] clocksource: Simplify the clocks_calc_max_nsecs() logic The previous clocks_calc_max_nsecs() code had some unecessarily complex bit logic to find the max interval that could cause multiplication overflows. Since this is not in the hot path, just do the divide to make it easier to read. The previous implementation also had a subtle issue that it avoided overflows with signed 64-bit values, where as the intervals are always unsigned. This resulted in overly conservative intervals, which other safety margins were then added to, reducing the intended interval length. 
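As a rough worked example of how conservative the old bound was (the
mult value here is purely illustrative):

	/*
	 * For mult + maxadj == 1 << 24:
	 *   old bound: 1ULL << (63 - (ilog2(1 << 24) + 1)) == 1ULL << 38
	 *   new bound: ULLONG_MAX / (1 << 24)              ~= 1ULL << 40
	 * i.e. the old code undershot the real overflow point by roughly 4x,
	 * and the other safety margins were then taken off that already
	 * shrunken value.
	 */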
Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..2148f413256c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -476,19 +476,10 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) /* * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) - * which is equivalent to the below. - * max_cycles < (2^63)/(mult + maxadj) - * max_cycles < 2^(log2((2^63)/(mult + maxadj))) - * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) - * max_cycles < 2^(63 - log2(mult + maxadj)) - * max_cycles < 1 << (63 - log2(mult + maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. + * cyc2ns() function without overflowing a 64-bit result. */ - max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); /* * The actual maximum number of cycles we can defer the clocksource is From 362fde0410377e468ca00ad363fdf3e3ec42eb6a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:30 -0700 Subject: [PATCH 07/98] clocksource: Simplify the logic around clocksource wrapping safety margins The clocksource logic has a number of places where we try to include a safety margin. Most of these are 12% safety margins, but they are inconsistently applied and sometimes are applied on top of each other. Additionally, in the previous patch, we corrected an issue where we unintentionally in effect created a 50% safety margin, which these 12.5% margins where then added to. So to simplify the logic here, this patch removes the various 12.5% margins, and consolidates adding the margin in one place: clocks_calc_max_nsecs(). Additionally, Linus prefers a 50% safety margin, as it allows bad clock values to be more easily caught. This should really have no net effect, due to the corrected issue earlier which caused greater then 50% margins to be used w/o issue. Signed-off-by: John Stultz Acked-by: Stephen Boyd (for the sched_clock.c bit) Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 26 ++++++++++++-------------- kernel/time/sched_clock.c | 4 ++-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2148f413256c..ace95763b3a6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -469,6 +469,9 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * + * NOTE: This function includes a safety margin of 50%, so that bad clock values + * can be detected. 
*/ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) { @@ -490,11 +493,14 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + return max_nsecs; } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred + * clocksource_max_deferment - Returns max time the clocksource should be deferred * @cs: Pointer to clocksource * */ @@ -504,13 +510,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask); - /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. - */ - return max_nsecs - (max_nsecs >> 3); + return max_nsecs; } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -659,10 +659,9 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) * conversion precision. 10 minutes is still a reasonable * amount. That results in a shift value of 24 for a * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. We apply the same 12.5% - * margin as we do in clocksource_max_deferment() + * ~ 0.06ppm granularity for NTP. */ - sec = (cs->mask - (cs->mask >> 3)); + sec = cs->mask; do_div(sec, freq); do_div(sec, scale); if (!sec) @@ -674,9 +673,8 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC / scale, sec * scale); /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin - * + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); while ((cs->mult + cs->maxadj < cs->mult) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3b8ae45020c1 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -125,9 +125,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); - /* calculate how many ns until we wrap */ + /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); From fb82fe2fe8588745edd73aa3a6229facac5c1e15 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:31 -0700 Subject: [PATCH 08/98] clocksource: Add 'max_cycles' to 'struct clocksource' In order to facilitate clocksource validation, add a 'max_cycles' field to the clocksource structure which will hold the maximum cycle value that can safely be multiplied without potentially causing an overflow. 
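Sketched out (the actual plumbing is in the diff below), the value
being stored is the quantity clocks_calc_max_nsecs() already computes
internally before converting to nanoseconds:

	max_cycles = ULLONG_MAX;
	do_div(max_cycles, mult + maxadj);  /* largest delta that won't overflow */
	max_cycles = min(max_cycles, mask); /* also bounded by the counter width */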
Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 5 +++-- kernel/time/clocksource.c | 28 ++++++++++++++++------------ kernel/time/sched_clock.c | 2 +- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9c78d15d33e4..16d048cadebb 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -56,6 +56,7 @@ struct module; * @shift: cycle to nanosecond divisor (power of two) * @max_idle_ns: max idle time permitted by the clocksource (nsecs) * @maxadj: maximum adjustment value to mult (~11%) + * @max_cycles: maximum safe cycle value which won't overflow on multiplication * @flags: flags describing special properties * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary @@ -76,7 +77,7 @@ struct clocksource { #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif - + u64 max_cycles; const char *name; struct list_head list; int rating; @@ -189,7 +190,7 @@ extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask); +clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles); extern void clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ace95763b3a6..fc2a9de43ca1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -469,11 +469,13 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) * * NOTE: This function includes a safety margin of 50%, so that bad clock values * can be detected. 
*/ -u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; @@ -493,6 +495,10 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + /* Return 50% of the actual maximum, so we can detect bad values */ max_nsecs >>= 1; @@ -500,17 +506,15 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) } /** - * clocksource_max_deferment - Returns max time the clocksource should be deferred - * @cs: Pointer to clocksource + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static inline void clocksource_update_max_deferment(struct clocksource *cs) { - u64 max_nsecs; - - max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, - cs->mask); - return max_nsecs; + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -684,7 +688,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + clocksource_update_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -730,8 +734,8 @@ int clocksource_register(struct clocksource *cs) "Clocksource %s might overflow on 11%% adjustment\n", cs->name); - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); + /* Update max idle time permitted for this clocksource */ + clocksource_update_max_deferment(cs); mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3b8ae45020c1..ca3bc5c7027c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -126,7 +126,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); /* calculate how many nanosecs until we risk wrapping */ - wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ From 3a43477fa36eee2997313380cb52f486e2bff316 Mon Sep 17 00:00:00 2001 From: Roger Tseng Date: Fri, 6 Mar 2015 11:36:06 +0800 Subject: [PATCH 09/98] mfd: rtsx_usb: Prevent DMA from stack Functions rtsx_usb_ep0_read_register() and rtsx_usb_get_card_status() both use arbitrary buffer addresses from arguments directly for DMA and the buffers could be located in stack. This was caught by DMA-API debug check. Fixes this by using double-buffers via kzalloc in both functions to guarantee the validity of DMA buffer. 
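In both functions the double-buffering pattern boils down to the
following sketch (mirroring the rtsx_usb_ep0_read_register() hunk
below): never hand a stack address to usb_control_msg(), bounce
through a heap buffer instead and copy the result back.

	u8 *buf = kzalloc(sizeof(u8), GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = usb_control_msg(ucr->pusb_dev,
		usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_REG_OP,
		USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
		value, 0, buf, 1, 100);
	*data = *buf;	/* copy back to the caller-supplied pointer */

	kfree(buf);
	return ret;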
WARNING: CPU: 1 PID: 25 at lib/dma-debug.c:1166 check_for_stack+0x96/0xe0() ehci-pci 0000:00:1a.0: DMA-API: device driver maps memory from stack [addr=ffff8801199e3cef] Modules linked in: rtsx_usb_ms arc4 memstick intel_rapl iosf_mbi rtl8192ce snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel rtl_pci rtl8192c_common snd_hda_controller x86_pkg_temp_thermal snd_hda_codec rtlwifi mac80211 coretemp kvm_intel kvm iTCO_wdt snd_hwdep snd_seq snd_seq_device crct10dif_pclmul iTCO_vendor_support sparse_keymap cfg80211 crc32_pclmul snd_pcm crc32c_intel ghash_clmulni_intel rfkill i2c_i801 snd_timer shpchp snd serio_raw mei_me lpc_ich soundcore mei tpm_tis tpm wmi nfsd auth_rpcgss nfs_acl lockd grace sunrpc i915 rtsx_usb_sdmmc mmc_core 8021q uas garp stp i2c_algo_bit llc mrp drm_kms_helper usb_storage drm rtsx_usb mfd_core r8169 mii video CPU: 1 PID: 25 Comm: kworker/1:2 Not tainted 3.20.0-0.rc0.git7.3.fc22.x86_64 #1 Hardware name: WB WB-B06211/WB-B0621, BIOS EB062IWB V1.0 12/12/2013 Workqueue: events rtsx_usb_ms_handle_req [rtsx_usb_ms] 0000000000000000 000000003d188e66 ffff8801199e3808 ffffffff8187642b 0000000000000000 ffff8801199e3860 ffff8801199e3848 ffffffff810ab39a ffff8801199e3864 ffff8801199e3cef ffff880119b57098 ffff880119b37320 Call Trace: [] dump_stack+0x4c/0x65 [] warn_slowpath_common+0x8a/0xc0 [] warn_slowpath_fmt+0x55/0x70 [] ? _raw_spin_unlock_irqrestore+0x36/0x70 [] check_for_stack+0x96/0xe0 [] debug_dma_map_page+0x104/0x150 [] usb_hcd_map_urb_for_dma+0x646/0x790 [] usb_hcd_submit_urb+0x1d5/0xa90 [] ? mark_held_locks+0x7f/0xc0 [] ? mark_held_locks+0x7f/0xc0 [] ? lockdep_init_map+0x65/0x5d0 [] usb_submit_urb+0x42e/0x5f0 [] usb_start_wait_urb+0x77/0x190 [] ? __kmalloc+0x205/0x2d0 [] usb_control_msg+0xdc/0x130 [] rtsx_usb_ep0_read_register+0x59/0x70 [rtsx_usb] [] ? 
rtsx_usb_get_rsp+0x41/0x50 [rtsx_usb] [] rtsx_usb_ms_handle_req+0x7ce/0x9c5 [rtsx_usb_ms] Reported-by: Josh Boyer Signed-off-by: Roger Tseng Signed-off-by: Lee Jones --- drivers/mfd/rtsx_usb.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/rtsx_usb.c b/drivers/mfd/rtsx_usb.c index ede50244f265..dbd907d7170e 100644 --- a/drivers/mfd/rtsx_usb.c +++ b/drivers/mfd/rtsx_usb.c @@ -196,18 +196,27 @@ EXPORT_SYMBOL_GPL(rtsx_usb_ep0_write_register); int rtsx_usb_ep0_read_register(struct rtsx_ucr *ucr, u16 addr, u8 *data) { u16 value; + u8 *buf; + int ret; if (!data) return -EINVAL; - *data = 0; + + buf = kzalloc(sizeof(u8), GFP_KERNEL); + if (!buf) + return -ENOMEM; addr |= EP0_READ_REG_CMD << EP0_OP_SHIFT; value = swab16(addr); - return usb_control_msg(ucr->pusb_dev, + ret = usb_control_msg(ucr->pusb_dev, usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_REG_OP, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - value, 0, data, 1, 100); + value, 0, buf, 1, 100); + *data = *buf; + + kfree(buf); + return ret; } EXPORT_SYMBOL_GPL(rtsx_usb_ep0_read_register); @@ -288,18 +297,27 @@ static int rtsx_usb_get_status_with_bulk(struct rtsx_ucr *ucr, u16 *status) int rtsx_usb_get_card_status(struct rtsx_ucr *ucr, u16 *status) { int ret; + u16 *buf; if (!status) return -EINVAL; - if (polling_pipe == 0) + if (polling_pipe == 0) { + buf = kzalloc(sizeof(u16), GFP_KERNEL); + if (!buf) + return -ENOMEM; + ret = usb_control_msg(ucr->pusb_dev, usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_POLL, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0, 0, status, 2, 100); - else + 0, 0, buf, 2, 100); + *status = *buf; + + kfree(buf); + } else { ret = rtsx_usb_get_status_with_bulk(ucr, status); + } /* usb_control_msg may return positive when success */ if (ret < 0) From c8648508ebfc597058d2cd00b6c539110264a167 Mon Sep 17 00:00:00 2001 From: Ameya Palande <2ameya@gmail.com> Date: Thu, 26 Feb 2015 12:05:51 -0800 Subject: [PATCH 10/98] mfd: kempld-core: Fix callback return value check On success, callback function returns 0. So invert the if condition check so that we can break out of loop. Cc: stable@vger.kernel.org Signed-off-by: Ameya Palande <2ameya@gmail.com> Signed-off-by: Lee Jones --- drivers/mfd/kempld-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/kempld-core.c b/drivers/mfd/kempld-core.c index f38ec424872e..5615522f8d62 100644 --- a/drivers/mfd/kempld-core.c +++ b/drivers/mfd/kempld-core.c @@ -739,7 +739,7 @@ static int __init kempld_init(void) for (id = kempld_dmi_table; id->matches[0].slot != DMI_NONE; id++) if (strstr(id->ident, force_device_id)) - if (id->callback && id->callback(id)) + if (id->callback && !id->callback(id)) break; if (id->matches[0].slot == DMI_NONE) return -ENODEV; From 78146572b9cd20452da47951812f35b1ad4906be Mon Sep 17 00:00:00 2001 From: Ian Wilson Date: Thu, 12 Mar 2015 09:37:58 +0000 Subject: [PATCH 11/98] netfilter: Zero the tuple in nfnl_cthelper_parse_tuple() nfnl_cthelper_parse_tuple() is called from nfnl_cthelper_new(), nfnl_cthelper_get() and nfnl_cthelper_del(). In each case they pass a pointer to an nf_conntrack_tuple data structure local variable: struct nf_conntrack_tuple tuple; ... ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); The problem is that this local variable is not initialized, and nfnl_cthelper_parse_tuple() only initializes two fields: src.l3num and dst.protonum. 
This leaves all other fields with undefined values based on whatever is on the stack: tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); The symptom observed was that when the rpc and tns helpers were added then traffic to port 1536 was being sent to user-space. Signed-off-by: Ian Wilson Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index a5599fc51a6f..54330fb5efaf 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -77,6 +77,9 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; + /* Not all fields are initialized so first zero the tuple */ + memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); + tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); From 3c17ad19f0697ffe5ef7438cdafc2d2b7757d8a5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:32 -0700 Subject: [PATCH 12/98] timekeeping: Add debugging checks to warn if we see delays Recently there's been requests for better sanity checking in the time code, so that it's more clear when something is going wrong, since timekeeping issues could manifest in a large number of strange ways in various subsystems. Thus, this patch adds some extra infrastructure to add a check to update_wall_time() to print two new warnings: 1) if we see the call delayed beyond the 'max_cycles' overflow point, 2) or if we see the call delayed beyond the clocksource's 'max_idle_ns' value, which is currently 50% of the overflow point. This extra infrastructure is conditional on a new CONFIG_DEBUG_TIMEKEEPING option, also added in this patch - default off. Tested this a bit by halting qemu for specified lengths of time to trigger the warnings. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-5-git-send-email-john.stultz@linaro.org [ Improved the changelog and the messages a bit. 
] Signed-off-by: Ingo Molnar --- kernel/time/jiffies.c | 1 + kernel/time/timekeeping.c | 28 ++++++++++++++++++++++++++++ lib/Kconfig.debug | 13 +++++++++++++ 3 files changed, 42 insertions(+) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..7e413902aa6a 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..acf049144cf6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,31 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +#ifdef CONFIG_DEBUG_TIMEKEEPING +static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ + + cycle_t max_cycles = tk->tkr.clock->max_cycles; + const char *name = tk->tkr.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ +} +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -1630,6 +1655,9 @@ void update_wall_time(void) if (offset < real_tk->cycle_interval) goto out; + /* Do some additional sanity checking */ + timekeeping_check_update(real_tk, offset); + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..36b6fa88ce5b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -865,6 +865,19 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config DEBUG_TIMEKEEPING + bool "Enable extra timekeeping sanity checking" + help + This option will enable additional timekeeping sanity checks + which may be helpful when diagnosing issues where timekeeping + problems are suspected. + + This may include checks in the timekeeping hotpaths, so this + option may have a (very small) performance impact to some + workloads. + + If unsure, say N. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS From a558cd021d83b65c47ee5b9bec1fcfe5298a769f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:33 -0700 Subject: [PATCH 13/98] timekeeping: Add checks to cap clocksource reads to the 'max_cycles' value When calculating the current delta since the last tick, we currently have no hard protections to prevent a multiplication overflow from occuring. 
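To make the failure mode concrete (the mult magnitude here is only an
example):

	/*
	 * timekeeping_get_ns() effectively computes:
	 *	nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
	 * With a mult on the order of 2^24, a delta beyond roughly 2^40
	 * cycles makes 'delta * tkr->mult' wrap around 64 bits, so the
	 * result silently becomes a much smaller, bogus value.
	 */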
This patch introduces infrastructure to allow a cap that limits the clocksource read delta value to the 'max_cycles' value, which is where an overflow would occur. Since this is in the hotpath, it adds the extra checking under CONFIG_DEBUG_TIMEKEEPING=y. There was some concern that capping time like this could cause problems as we may stop expiring timers, which could go circular if the timer that triggers time accumulation were mis-scheduled too far in the future, which would cause time to stop. However, since the mult overflow would result in a smaller time value, we would effectively have the same problem there. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 49 ++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acf049144cf6..657414cf2e46 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -126,9 +126,9 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) const char *name = tk->tkr.clock->name; if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow\n", + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope\n"); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", @@ -137,10 +137,39 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } } + +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > tkr->clock->max_cycles)) + delta = tkr->clock->max_cycles; + + return delta; +} #else static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { } +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} #endif /** @@ -218,14 +247,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + cycle_t delta; s64 nsec; - /* read clocksource: */ - cycle_now = tkr->read(tkr->clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + delta = timekeeping_get_delta(tkr); nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ -237,14 +262,10 @@ static 
inline s64 timekeeping_get_ns(struct tk_read_base *tkr) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { struct clocksource *clock = tk->tkr.clock; - cycle_t cycle_now, delta; + cycle_t delta; s64 nsec; - /* read clocksource: */ - cycle_now = tk->tkr.read(clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + delta = timekeeping_get_delta(&tk->tkr); /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); From 057b87e3161d1194a095718f9918c01b2c389e74 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:34 -0700 Subject: [PATCH 14/98] timekeeping: Try to catch clocksource delta underflows In the case where there is a broken clocksource where there are multiple actual clocks that aren't perfectly aligned, we may see small "negative" deltas when we subtract 'now' from 'cycle_last'. The values are actually negative with respect to the clocksource mask value, not necessarily negative if cast to a s64, but we can check by checking the delta to see if it is a small (relative to the mask) negative value (again negative relative to the mask). If so, we assume we jumped backwards somehow and instead use zero for our delta. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-7-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 657414cf2e46..187149be83ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -148,6 +148,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & tkr->mask) < (tkr->mask >> 3))) + delta = 0; + /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > tkr->clock->max_cycles)) delta = tkr->clock->max_cycles; From 4ca22c2648f9c1cec0b242f58d7302136f5a4cbb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:35 -0700 Subject: [PATCH 15/98] timekeeping: Add warnings when overflows or underflows are observed It was suggested that the underflow/overflow protection should probably throw some sort of warning out, rather than just silently fixing the issue. So this patch adds some warnings here. The flag variables used are not protected by locks, but since we can't print from the reading functions, just being able to say we saw an issue in the update interval is useful enough, and can be slightly racy without real consequence. The big complication is that we're only under a read seqlock, so the data could shift under us during our calculation to see if there was a problem. This patch avoids this issue by nesting another seqlock which allows us to snapshot the just required values atomically. So we shouldn't see false positives. I also added some basic rate-limiting here, since on one build machine w/ skewed TSCs it was fairly noisy at bootup. 
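In sketch form, that snapshot simply re-reads the relevant fields
under the timekeeper seqcount until a consistent set is observed
(this mirrors the loop added to timekeeping_get_delta() below):

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		now = tkr->read(tkr->clock);
		last = tkr->cycle_last;
		mask = tkr->mask;
		max = tkr->clock->max_cycles;
	} while (read_seqcount_retry(&tk_core.seq, seq));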
Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-8-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 64 ++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 187149be83ea..892f6cbf1e67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -119,6 +119,20 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) } #ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ +/* + * These simple flag variables are managed + * without locks, which is racy, but ok since + * we don't really care about being super + * precise about how many events were seen, + * just that a problem was observed. + */ +static int timekeeping_underflow_seen; +static int timekeeping_overflow_seen; + +/* last_warning is only modified under the timekeeping lock */ +static long timekeeping_last_warning; + static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -136,28 +150,64 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } } + + if (timekeeping_underflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_underflow_seen = 0; + } + + if (timekeeping_overflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_overflow_seen = 0; + } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + cycle_t now, last, mask, max, delta; + unsigned int seq; - /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tkr->read(tkr->clock); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + delta = clocksource_delta(now, last, mask); /* * Try to catch underflows by checking if we are seeing small * mask-relative negative values. 
*/ - if (unlikely((~delta & tkr->mask) < (tkr->mask >> 3))) + if (unlikely((~delta & mask) < (mask >> 3))) { + timekeeping_underflow_seen = 1; delta = 0; + } /* Cap delta value to the max_cycles values to avoid mult overflows */ - if (unlikely(delta > tkr->clock->max_cycles)) + if (unlikely(delta > max)) { + timekeeping_overflow_seen = 1; delta = tkr->clock->max_cycles; + } return delta; } From 0b046b217ad4c64fbbeaaac24d0648cb1fa49ad8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:36 -0700 Subject: [PATCH 16/98] clocksource: Improve clocksource watchdog reporting The clocksource watchdog reporting has been less helpful then desired, as it just printed the delta between the two clocksources. This prevents any useful analysis of why the skew occurred. Thus this patch tries to improve the output when we mark a clocksource as unstable, printing out the cycle last and now values for both the current clocksource and the watchdog clocksource. This will allow us to see if the result was due to a false positive caused by a problematic watchdog. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-9-git-send-email-john.stultz@linaro.org [ Minor cleanups of kernel messages. ] Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fc2a9de43ca1..c4cc04bec698 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) schedule_work(&watchdog_work); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - /** * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable @@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow, delta; + cycle_t csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) delta = clocksource_delta(csnow, cs->cs_last, cs->mask); cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; cs->cs_last = csnow; cs->wd_last = wdnow; @@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. 
*/ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); continue; } From f8935983f110505daa38e8d36ee406807f83a069 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:37 -0700 Subject: [PATCH 17/98] clocksource: Mostly kill clocksource_register() A long running project has been to clean up remaining uses of clocksource_register(), replacing it with the simpler clocksource_register_khz/hz() functions. However, there are a few cases where we need to self-define our mult/shift values, so switch the function to a more obviously internal __clocksource_register() name, and consolidate much of the internal logic so we don't have duplication. Signed-off-by: John Stultz Cc: Dave Jones Cc: David S. Miller Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-10-git-send-email-john.stultz@linaro.org [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/s390/kernel/time.c | 2 +- arch/sparc/kernel/time_32.c | 2 +- include/linux/clocksource.h | 10 ++++- kernel/time/clocksource.c | 79 ++++++++++++++++--------------------- kernel/time/jiffies.c | 4 +- 5 files changed, 46 insertions(+), 51 deletions(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 20660dddb2d6..6c273cd815bb 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -283,7 +283,7 @@ void __init time_init(void) if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt)) panic("Couldn't request external interrupt 0x1406"); - if (clocksource_register(&clocksource_tod) != 0) + if (__clocksource_register(&clocksource_tod) != 0) panic("Could not register TOD clock source"); /* Enable TOD clock interrupts on the boot cpu. */ diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 2f80d23a0a44..a31c0c822beb 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -191,7 +191,7 @@ static __init int setup_timer_cs(void) timer_cs.mult = clocksource_hz2mult(sparc_config.clock_rate, timer_cs.shift); - return clocksource_register(&timer_cs); + return __clocksource_register(&timer_cs); } #ifdef CONFIG_SMP diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 16d048cadebb..bd98eaa4d005 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -179,7 +179,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) } -extern int clocksource_register(struct clocksource*); extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); @@ -203,6 +202,15 @@ __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); extern void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq); +/* + * Don't call this unless you are a default clocksource + * (AKA: jiffies) and absolutely have to. 
+ */ +static inline int __clocksource_register(struct clocksource *cs) +{ + return __clocksource_register_scale(cs, 1, 0); +} + static inline int clocksource_register_hz(struct clocksource *cs, u32 hz) { return __clocksource_register_scale(cs, 1, hz); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c4cc04bec698..5cdf17eb4fa6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -656,38 +656,52 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; + /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. */ - sec = cs->mask; - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } /* * Ensure clocksources that have large 'mult' values don't overflow * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { cs->mult >>= 1; cs->shift--; cs->maxadj = clocksource_max_adjustment(cs); } + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. + */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + clocksource_update_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -719,33 +733,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); - -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); - - /* Update max idle time permitted for this clocksource */ - clocksource_update_max_deferment(cs); - - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); - static void __clocksource_change_rating(struct clocksource *cs, int rating) { list_del(&cs->list); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7e413902aa6a..c4bb518725b5 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(jiffies); static int __init init_jiffies_clocksource(void) { - return clocksource_register(&clocksource_jiffies); + return __clocksource_register(&clocksource_jiffies); } core_initcall(init_jiffies_clocksource); @@ -131,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; - clocksource_register(&refined_jiffies); + __clocksource_register(&refined_jiffies); return 0; } From 3142f76022fe46f6e0a0d3940b23fb6ccb794692 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:38 -0700 Subject: [PATCH 18/98] clocksource, sparc32: Convert to using clocksource_register_hz() While cleaning up some clocksource code, I noticed the time_32 implementation uses the clocksource_hz2mult() helper, but doesn't use the clocksource_register_hz() method. I don't believe the Sparc clocksource is a default clocksource, so we shouldn't need to self-define the mult/shift pair. So convert the time_32.c implementation to use clocksource_register_hz(). Untested. Signed-off-by: John Stultz Acked-by: David S. Miller Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-11-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/sparc/kernel/time_32.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index a31c0c822beb..18147a5523d9 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -181,17 +181,13 @@ static struct clocksource timer_cs = { .rating = 100, .read = timer_cs_read, .mask = CLOCKSOURCE_MASK(64), - .shift = 2, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static __init int setup_timer_cs(void) { timer_cs_enabled = 1; - timer_cs.mult = clocksource_hz2mult(sparc_config.clock_rate, - timer_cs.shift); - - return __clocksource_register(&timer_cs); + return clocksource_register_hz(&timer_cs, sparc_config.clock_rate); } #ifdef CONFIG_SMP From 8cc8c525ad4e7b581cacf84119e1a28dcb4044db Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:39 -0700 Subject: [PATCH 19/98] clocksource: Add some debug info about clocksources being registered Print the mask, max_cycles, and max_idle_ns values for clocksources being registered. 
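To make concrete when this new output fires: any driver that goes through the registration helpers ends up in __clocksource_updatefreq_scale() and will now have its limits printed at registration time. A minimal sketch, assuming a made-up memory-mapped 24 MHz counter (all names below are illustrative and not part of the patch):

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

static void __iomem *example_counter_base;

static cycle_t example_cs_read(struct clocksource *cs)
{
	return (cycle_t)readl_relaxed(example_counter_base);
}

static struct clocksource example_cs = {
	.name	= "example-timer",
	.rating	= 200,
	.read	= example_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_timer_init(void)
{
	/* mult/shift and max_idle_ns are derived by the core from the 24 MHz rate */
	return clocksource_register_hz(&example_cs, 24000000);
}

With the patch applied, registering such a clocksource would emit the new pr_info() line for "example-timer" during boot.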
Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-12-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5cdf17eb4fa6..1977ebabd922 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -703,6 +703,9 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) cs->name); clocksource_update_max_deferment(cs); + + pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); From fba9e07208c0f9d92d9f73761c99c8612039da44 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:40 -0700 Subject: [PATCH 20/98] clocksource: Rename __clocksource_updatefreq_*() to __clocksource_update_freq_*() Ingo requested this function be renamed to improve readability, so I've renamed __clocksource_updatefreq_scale() as well as the __clocksource_updatefreq_hz/khz() functions to avoid squishedtogethernames. This touches some of the sh clocksources, which I've not tested. The arch/arm/plat-omap change is just a comment change for consistency. Signed-off-by: John Stultz Cc: Daniel Lezcano Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-13-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/arm/plat-omap/counter_32k.c | 2 +- drivers/clocksource/em_sti.c | 2 +- drivers/clocksource/sh_cmt.c | 2 +- drivers/clocksource/sh_tmu.c | 2 +- include/linux/clocksource.h | 10 +++++----- kernel/time/clocksource.c | 11 ++++++----- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index 61b4d705c267..43cf74561cfd 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -103,7 +103,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase) /* * 120000 rough estimate from the calculations in - * __clocksource_updatefreq_scale. + * __clocksource_update_freq_scale. 
*/ clocks_calc_mult_shift(&persistent_mult, &persistent_shift, 32768, NSEC_PER_SEC, 120000); diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index d0a7bd66b8b9..dc3c6ee04aaa 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -210,7 +210,7 @@ static int em_sti_clocksource_enable(struct clocksource *cs) ret = em_sti_start(p, USER_CLOCKSOURCE); if (!ret) - __clocksource_updatefreq_hz(cs, p->rate); + __clocksource_update_freq_hz(cs, p->rate); return ret; } diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 2bd13b53b727..b8ff3c64cc45 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -641,7 +641,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs) ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE); if (!ret) { - __clocksource_updatefreq_hz(cs, ch->rate); + __clocksource_update_freq_hz(cs, ch->rate); ch->cs_enabled = true; } return ret; diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index f150ca82bfaf..b6b8fa3cd211 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -272,7 +272,7 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs) ret = sh_tmu_enable(ch); if (!ret) { - __clocksource_updatefreq_hz(cs, ch->rate); + __clocksource_update_freq_hz(cs, ch->rate); ch->cs_enabled = true; } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index bd98eaa4d005..135509821c39 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -200,7 +200,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); extern int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); extern void -__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq); +__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq); /* * Don't call this unless you are a default clocksource @@ -221,14 +221,14 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz) return __clocksource_register_scale(cs, 1000, khz); } -static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz) +static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz) { - __clocksource_updatefreq_scale(cs, 1, hz); + __clocksource_update_freq_scale(cs, 1, hz); } -static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz) +static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz) { - __clocksource_updatefreq_scale(cs, 1000, khz); + __clocksource_update_freq_scale(cs, 1000, khz); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1977ebabd922..c3be3c71bbad 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -643,7 +643,7 @@ static void clocksource_enqueue(struct clocksource *cs) } /** - * __clocksource_updatefreq_scale - Used update clocksource with new freq + * __clocksource_update_freq_scale - Used update clocksource with new freq * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale @@ -651,9 +651,10 @@ static void clocksource_enqueue(struct clocksource *cs) * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. 
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; @@ -707,7 +708,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -724,7 +725,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); + __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); From d8bdff59cea141d2e5f7e98c1b11d3e0271640bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 13 Mar 2015 10:52:14 +1100 Subject: [PATCH 21/98] netfilter: Fix potential crash in nft_hash walker When we get back an EAGAIN from rhashtable_walk_next we were treating it as a valid object which obviously doesn't work too well. Luckily this is hard to trigger so it seems nobody has run into it yet. This patch fixes it by redoing the next call when we get an EAGAIN. Signed-off-by: Herbert Xu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_hash.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c82df0a48fcd..37c15e674884 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -153,6 +153,8 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, iter->err = err; goto out; } + + continue; } if (iter->count < iter->skip) From 0a64815091bd0ad6c6cdfaac2fae55b0f3ecf974 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Mon, 9 Mar 2015 17:34:18 +0100 Subject: [PATCH 22/98] s390/cpum_sf: add diagnostic sampling event only if it is authorized The SF_CYCLES_BASIC_DIAG is always registered even if it is turned of in the current hardware configuration. Because diagnostic-sampling is typically not turned on in the hardware configuration, do not register this perf event by default. Enable it only if the diagnostic-sampling function is authorized. 
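The mechanism is worth calling out: the events attribute array is NULL-terminated, so the patch reserves an empty slot and only fills it in during initialisation once the authorization bit has been checked. A rough sketch of that idiom with made-up names (the real code uses CPUMF_EVENT_PTR() and the si.ad bit, as the hunk below shows):

#include <linux/init.h>
#include <linux/sysfs.h>

static struct attribute basic_attr = { .name = "basic", .mode = 0444 };
static struct attribute diag_attr  = { .name = "diag",  .mode = 0444 };

static struct attribute *example_events_attr[] = {
	&basic_attr,
	NULL,			/* slot for the optional diagnostic event */
	NULL,			/* array terminator */
};

static int __init example_pmu_init(void)
{
	if (diag_authorized())	/* hypothetical stand-in for the si.ad check */
		example_events_attr[1] = &diag_attr;

	return 0;
}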
Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c3f8d157cb0d..e6a1578fc000 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1415,7 +1415,7 @@ CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); static struct attribute *cpumsf_pmu_events_attr[] = { CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC), - CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG), + NULL, NULL, }; @@ -1606,8 +1606,11 @@ static int __init init_cpum_sampling_pmu(void) return -EINVAL; } - if (si.ad) + if (si.ad) { sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB); + cpumsf_pmu_events_attr[1] = + CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); + } sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); if (!sfdbg) From 20e76ee184a7cea155268377c4e414eea1dab6fd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 13 Mar 2015 10:31:42 +0100 Subject: [PATCH 23/98] s390/ftrace: fix compile error if CONFIG_KPROBES is disabled Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ftrace.c | 61 +++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 82c19899574f..6c79f1b44fe7 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -57,6 +57,44 @@ unsigned long ftrace_plt; +static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn) +{ +#ifdef CC_USING_HOTPATCH + /* brcl 0,0 */ + insn->opc = 0xc004; + insn->disp = 0; +#else + /* stg r14,8(r15) */ + insn->opc = 0xe3e0; + insn->disp = 0xf0080024; +#endif +} + +static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + if (insn->opc == BREAKPOINT_INSTRUCTION) + return 1; +#endif + return 0; +} + +static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + insn->opc = BREAKPOINT_INSTRUCTION; + insn->disp = KPROBE_ON_FTRACE_NOP; +#endif +} + +static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + insn->opc = BREAKPOINT_INSTRUCTION; + insn->disp = KPROBE_ON_FTRACE_CALL; +#endif +} + int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { @@ -72,16 +110,9 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return -EFAULT; if (addr == MCOUNT_ADDR) { /* Initial code replacement */ -#ifdef CC_USING_HOTPATCH - /* We expect to see brcl 0,0 */ - ftrace_generate_nop_insn(&orig); -#else - /* We expect to see stg r14,8(r15) */ - orig.opc = 0xe3e0; - orig.disp = 0xf0080024; -#endif + ftrace_generate_orig_insn(&orig); ftrace_generate_nop_insn(&new); - } else if (old.opc == BREAKPOINT_INSTRUCTION) { + } else if (is_kprobe_on_ftrace(&old)) { /* * If we find a breakpoint instruction, a kprobe has been * placed at the beginning of the function. We write the @@ -89,9 +120,8 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, * bytes of the original instruction so that the kprobes * handler can execute a nop, if it reaches this breakpoint. */ - new.opc = orig.opc = BREAKPOINT_INSTRUCTION; - orig.disp = KPROBE_ON_FTRACE_CALL; - new.disp = KPROBE_ON_FTRACE_NOP; + ftrace_generate_kprobe_call_insn(&orig); + ftrace_generate_kprobe_nop_insn(&new); } else { /* Replace ftrace call with a nop. 
*/ ftrace_generate_call_insn(&orig, rec->ip); @@ -111,7 +141,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) return -EFAULT; - if (old.opc == BREAKPOINT_INSTRUCTION) { + if (is_kprobe_on_ftrace(&old)) { /* * If we find a breakpoint instruction, a kprobe has been * placed at the beginning of the function. We write the @@ -119,9 +149,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * bytes of the original instruction so that the kprobes * handler can execute a brasl if it reaches this breakpoint. */ - new.opc = orig.opc = BREAKPOINT_INSTRUCTION; - orig.disp = KPROBE_ON_FTRACE_NOP; - new.disp = KPROBE_ON_FTRACE_CALL; + ftrace_generate_kprobe_nop_insn(&orig); + ftrace_generate_kprobe_call_insn(&new); } else { /* Replace nop with an ftrace call. */ ftrace_generate_nop_insn(&orig); From e143fa93c28669a7f0851e6850b1da5b1945fd53 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 13 Mar 2015 11:19:09 +0100 Subject: [PATCH 24/98] s390/mm: limit STACK_RND_MASK for compat tasks For compat tasks the mmap randomization does not use the maximum randomization value from mmap_rnd_mask but the fixed value of 0x7ff. This needs to be respected in the definition of STACK_RND_MASK as well. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c9df40b5c0ac..c9c875d9ed31 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -211,7 +211,7 @@ do { \ extern unsigned long mmap_rnd_mask; -#define STACK_RND_MASK (mmap_rnd_mask) +#define STACK_RND_MASK (test_thread_flag(TIF_31BIT) ? 0x7ff : mmap_rnd_mask) #define ARCH_DLINFO \ do { \ From 9a30b096b543932de218dd3501b5562e00a8792d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Mar 2015 23:53:26 -0400 Subject: [PATCH 25/98] blk-mq: fix use of incorrect goto label in blk_mq_init_queue error path If percpu_ref_init() fails the allocated q and hctxs must get cleaned up; using 'err_map' doesn't allow that to happen. 
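For readers less familiar with the idiom: this is the usual goto-based unwind ladder, where each label must release everything allocated before the step that failed. A generic sketch with made-up names (not the blk-mq code itself):

static int example_init(void)
{
	struct foo *a;
	struct bar *b;
	int ret;

	a = alloc_foo();
	if (!a)
		return -ENOMEM;

	b = alloc_bar();
	if (!b) {
		ret = -ENOMEM;
		goto err_a;
	}

	ret = start(a, b);
	if (ret)
		goto err_b;	/* jumping to err_a here would leak 'b' */

	return 0;

err_b:
	free_bar(b);
err_a:
	free_foo(a);
	return ret;
}

The blk_mq_init_queue() bug is the 'goto err_a' variant of this: percpu_ref_init() failed after q and the hctxs had already been set up, but the err_map label did not release them.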
Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4f4bea21052e..b7b8933ec241 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1938,7 +1938,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) */ if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_map; + goto err_mq_usage; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, 30000); @@ -1981,7 +1981,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_init_cpu_queues(q, set->nr_hw_queues); if (blk_mq_init_hw_queues(q, set)) - goto err_hw; + goto err_mq_usage; mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); @@ -1993,7 +1993,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; -err_hw: +err_mq_usage: blk_cleanup_queue(q); err_hctxs: kfree(map); From e03826d5045e81a66a4fad7be9a8ecdaeb7911cf Mon Sep 17 00:00:00 2001 From: Keerthy Date: Tue, 17 Mar 2015 15:56:04 +0530 Subject: [PATCH 26/98] regulator: palmas: Correct TPS659038 register definition for REGEN2 The register offset for REGEN2_CTRL in different for TPS659038 chip as when compared with other Palmas family PMICs. In the case of TPS659038 the wrong offset pointed to PLLEN_CTRL and was causing a hang. Correcting the same. Signed-off-by: Keerthy Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/regulator/palmas-regulator.c | 4 ++++ include/linux/mfd/palmas.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/drivers/regulator/palmas-regulator.c b/drivers/regulator/palmas-regulator.c index 9205f433573c..18198316b6cf 100644 --- a/drivers/regulator/palmas-regulator.c +++ b/drivers/regulator/palmas-regulator.c @@ -1572,6 +1572,10 @@ static int palmas_regulators_probe(struct platform_device *pdev) if (!pmic) return -ENOMEM; + if (of_device_is_compatible(node, "ti,tps659038-pmic")) + palmas_generic_regs_info[PALMAS_REG_REGEN2].ctrl_addr = + TPS659038_REGEN2_CTRL; + pmic->dev = &pdev->dev; pmic->palmas = palmas; palmas->pmic = pmic; diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index fb0390a1a498..ee7b1ce7a6f8 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -2999,6 +2999,9 @@ enum usb_irq_events { #define PALMAS_GPADC_TRIM15 0x0E #define PALMAS_GPADC_TRIM16 0x0F +/* TPS659038 regen2_ctrl offset iss different from palmas */ +#define TPS659038_REGEN2_CTRL 0x12 + /* TPS65917 Interrupt registers */ /* Registers for function INTERRUPT */ From d6b6cb1d3e6f78d55c2d4043d77d0d8def3f3b99 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Mar 2015 13:21:42 +0100 Subject: [PATCH 27/98] netfilter: nf_tables: allow to change chain policy without hook if it exists If there's an existing base chain, we have to allow to change the default policy without indicating the hook information. However, if the chain doesn't exists, we have to enforce the presence of the hook attribute. 
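In other words, the policy attribute is now validated with two independent checks instead of one combined condition; restated as a sketch (the real hunk follows below):

	if (nla[NFTA_CHAIN_POLICY]) {
		/* A policy only makes sense on a base chain. */
		if (chain != NULL && !(chain->flags & NFT_BASE_CHAIN))
			return -EOPNOTSUPP;

		/* A chain being created with a policy must also carry hook info. */
		if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL)
			return -EOPNOTSUPP;
	}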
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6ab777912237..ac1a9528dbf2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1225,7 +1225,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_CHAIN_POLICY]) { if ((chain != NULL && - !(chain->flags & NFT_BASE_CHAIN)) || + !(chain->flags & NFT_BASE_CHAIN))) + return -EOPNOTSUPP; + + if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL) return -EOPNOTSUPP; From 0790ec172de1bd2e23f1dbd4925426b6cc3c1b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 17 Mar 2015 14:02:32 +0100 Subject: [PATCH 28/98] KVM: nVMX: mask unrestricted_guest if disabled on L0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If EPT was enabled, unrestricted_guest was allowed in L1 regardless of L0. L1 triple faulted when running L2 guest that required emulation. Another side effect was 'WARN_ON_ONCE(vmx->nested.nested_run_pending)' in L0's dmesg: WARNING: CPU: 0 PID: 0 at arch/x86/kvm/vmx.c:9190 nested_vmx_vmexit+0x96e/0xb00 [kvm_intel] () Prevent this scenario by masking SECONDARY_EXEC_UNRESTRICTED_GUEST when the host doesn't have it enabled. Fixes: 78051e3b7e35 ("KVM: nVMX: Disable unrestricted mode if ept=0") Cc: stable@vger.kernel.org Tested-By: Kashyap Chamarthy Signed-off-by: Radim Krčmář Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 10a481b7674d..ae4f6d35d19c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2479,8 +2479,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2494,6 +2493,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_unrestricted_guest) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, vmx->nested.nested_vmx_misc_low, From 391949b6f02121371e3d7d9082c6d17fd9853034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 18 Mar 2015 11:27:28 +0100 Subject: [PATCH 29/98] spi: trigger trace event for message-done before mesg->complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With spidev the mesg->complete callback points to spidev_complete. Calling this unblocks spidev_sync and so spidev_sync_write finishes. As the struct spi_message just read is a local variable in spidev_sync_write and recording the trace event accesses this message the recording is better done first. The same can happen for spidev_sync_read. This fixes an oops observed on a 3.14-rt system with spidev activity after echo 1 > /sys/kernel/debug/tracing/events/spi/enable . 
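Put differently: the spi_message typically lives on the waiting caller's stack, and mesg->complete() is what lets that caller resume, so the message must not be touched once the callback has run. The ordering the patch establishes looks roughly like this (sketch of the relevant tail of spi_finalize_current_message(); see the full hunk below):

	trace_spi_message_done(mesg);		/* record while 'mesg' is guaranteed valid */

	master->cur_msg_prepared = false;
	mesg->state = NULL;
	if (mesg->complete)
		mesg->complete(mesg->context);	/* caller may now free or reuse 'mesg' */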
Signed-off-by: Uwe Kleine-König Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index c64a3e59fce3..57a195041dc7 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1105,13 +1105,14 @@ void spi_finalize_current_message(struct spi_master *master) "failed to unprepare message: %d\n", ret); } } + + trace_spi_message_done(mesg); + master->cur_msg_prepared = false; mesg->state = NULL; if (mesg->complete) mesg->complete(mesg->context); - - trace_spi_message_done(mesg); } EXPORT_SYMBOL_GPL(spi_finalize_current_message); From cf39284d41f67964cf42b21bb386c012cf5b7f65 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Wed, 18 Mar 2015 08:57:41 +0800 Subject: [PATCH 30/98] regulator: Fix documentation for regmap in the config dev_get_regulator() does not exist, fix the typo. Signed-off-by: Axel Lin Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index d4ad5b5a02bb..045f709cb89b 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -316,7 +316,7 @@ struct regulator_desc { * @driver_data: private regulator data * @of_node: OpenFirmware node to parse for device tree bindings (may be * NULL). - * @regmap: regmap to use for core regmap helpers if dev_get_regulator() is + * @regmap: regmap to use for core regmap helpers if dev_get_regmap() is * insufficient. * @ena_gpio_initialized: GPIO controlling regulator enable was properly * initialized, meaning that >= 0 is a valid gpio From bc188d818edf325ae38cfa43254a0b10a4defd65 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 18 Mar 2015 17:06:18 -0600 Subject: [PATCH 31/98] blkmq: Fix NULL pointer deref when all reserved tags in When allocating from the reserved tags pool, bt_get() is called with a NULL hctx. If all tags are in use, the hw queue is kicked to push out any pending IO, potentially freeing tags, and tag allocation is retried. The problem is that blk_mq_run_hw_queue() doesn't check for a NULL hctx. So we avoid it with a simple NULL hctx test. Tested by hammering mtip32xx with concurrent smartctl/hdparm. Signed-off-by: Sam Bradshaw Signed-off-by: Selvan Mani Fixes: b32232073e80 ("blk-mq: fix hang in bt_get()") Cc: stable@kernel.org Added appropriate comment. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d53a764b05ea..be3290cc0644 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -278,9 +278,11 @@ static int bt_get(struct blk_mq_alloc_data *data, /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for - * some to complete. + * some to complete. Note that hctx can be NULL here for + * reserved tag allocation. 
*/ - blk_mq_run_hw_queue(hctx, false); + if (hctx) + blk_mq_run_hw_queue(hctx, false); /* * Retry tag allocation after running the hardware queue, From 4017a7ee693d1cae6735c0dac21594a7c6416c4c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2015 01:10:28 +0100 Subject: [PATCH 32/98] netfilter: restore rule tracing via nfnetlink_log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fab4085 ("netfilter: log: nf_log_packet() as real unified interface"), the loginfo structure that is passed to nf_log_packet() is used to explicitly indicate the logger type you want to use. This is a problem for people tracing rules through nfnetlink_log since packets are always routed to the NF_LOG_TYPE logger after the aforementioned patch. We can fix this by removing the trace loginfo structures, but that still changes the log level from 4 to 5 for tracing messages and there may be someone relying on this outthere. So let's just introduce a new nf_log_trace() function that restores the former behaviour. Reported-by: Markus Kötter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 10 ++++++++++ net/ipv4/netfilter/ip_tables.c | 6 +++--- net/ipv6/netfilter/ip6_tables.c | 6 +++--- net/netfilter/nf_log.c | 24 ++++++++++++++++++++++++ net/netfilter/nf_tables_core.c | 8 ++++---- 5 files changed, 44 insertions(+), 10 deletions(-) diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 534e1f2ac4fc..57639fca223a 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -79,6 +79,16 @@ void nf_log_packet(struct net *net, const struct nf_loginfo *li, const char *fmt, ...); +__printf(8, 9) +void nf_log_trace(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li, + const char *fmt, ...); + struct nf_log_buf; struct nf_log_buf *nf_log_buf_open(void); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 99e810f84671..cf5e82f39d3b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -272,9 +272,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e080fbbbc0e5..bb00c6f2a885 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -298,9 +298,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 0d8448f19dfe..675d12c69e32 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -212,6 +212,30 @@ void nf_log_packet(struct net *net, } EXPORT_SYMBOL(nf_log_packet); +void nf_log_trace(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct 
net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + const struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_trace); + #define S_SIZE (1024 - (sizeof(unsigned int) + 1)) struct nf_log_buf { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 3b90eb2b2c55..2d298dccb6dd 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -94,10 +94,10 @@ static void nft_trace_packet(const struct nft_pktinfo *pkt, { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, - pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", - chain->table->name, chain->name, comments[type], - rulenum); + nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); } unsigned int From 67d8712dcc70aa16d8e14a52eb73870e3cbddfc2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 18 Mar 2015 11:57:39 -0600 Subject: [PATCH 33/98] selftests: Fix build failures when invoked from kselftest target Several tests that rely on implicit build rules fail to build, when invoked from the main Makefile kselftest target. These failures are due to --no-builtin-rules and --no-builtin-variables options set in the inherited MAKEFLAGS. --no-builtin-rules eliminates the use of built-in implicit rules and --no-builtin-variables is for not defining built-in variables. These two options override the use of implicit rules resulting in build failures. In addition, inherited LDFLAGS result in build failures and there is no need to define LDFLAGS. Clear LDFLAGS and MAKEFLAG when make is invoked from the main Makefile kselftest target. Fixing this at selftests Makefile avoids changing the main Makefile and keeps this change self contained at selftests level. Signed-off-by: Shuah Khan Acked-by: Michael Ellerman --- tools/testing/selftests/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4e511221a0c1..0db571340edb 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -22,6 +22,14 @@ TARGETS += vm TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug +# Clear LDFLAGS and MAKEFLAGS if called from main +# Makefile to avoid test build failures when test +# Makefile doesn't have explicit build rules. +ifeq (1,$(MAKELEVEL)) +undefine LDFLAGS +override MAKEFLAGS = +endif + all: for TARGET in $(TARGETS); do \ make -C $$TARGET; \ From 5067c0469c643512f24786990e315f9c15cc7d24 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Mar 2015 10:32:18 -0700 Subject: [PATCH 34/98] ata: Add a new flag to destinguish sas controller SAS controller has its own tag allocation, which doesn't directly match to ATA tag, so SAS and SATA have different code path for ata tags. Originally we use port->scsi_host (98bd4be1) to destinguish SAS controller, but libsas set ->scsi_host too, so we can't use it for the destinguish, we add a new flag for this purpose. 
Without this patch, the following oops can happen because scsi-mq uses a host-wide tag map shared among all devices with some integer tag values >= ATA_MAX_QUEUE. These unexpectedly high tag values cause __ata_qc_from_tag() to return NULL, which is then dereferenced in ata_qc_new_init(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [] ata_qc_new_init+0x3e/0x120 PGD 32adf0067 PUD 32adf1067 PMD 0 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi igb i2c_algo_bit ptp pps_core pm80xx libsas scsi_transport_sas sg coretemp eeprom w83795 i2c_i801 CPU: 4 PID: 1450 Comm: cydiskbench Not tainted 4.0.0-rc3 #1 Hardware name: Supermicro X8DTH-i/6/iF/6F/X8DTH, BIOS 2.1b 05/04/12 task: ffff8800ba86d500 ti: ffff88032a064000 task.ti: ffff88032a064000 RIP: 0010:[] [] ata_qc_new_init+0x3e/0x120 RSP: 0018:ffff88032a067858 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff8800ba0d2230 RCX: 000000000000002a RDX: ffffffff80505ae0 RSI: 0000000000000020 RDI: ffff8800ba0d2230 RBP: ffff88032a067868 R08: 0000000000000201 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800ba0d0000 R13: ffff8800ba0d2230 R14: ffffffff80505ae0 R15: ffff8800ba0d0000 FS: 0000000041223950(0063) GS:ffff88033e480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000058 CR3: 000000032a0a3000 CR4: 00000000000006e0 Stack: ffff880329eee758 ffff880329eee758 ffff88032a0678a8 ffffffff80502dad ffff8800ba167978 ffff880329eee758 ffff88032bf9c520 ffff8800ba167978 ffff88032bf9c520 ffff88032bf9a290 ffff88032a0678b8 ffffffff80506909 Call Trace: [] ata_scsi_translate+0x3d/0x1b0 [] ata_sas_queuecmd+0x149/0x2a0 [] sas_queuecommand+0xa0/0x1f0 [libsas] [] scsi_dispatch_cmd+0xd4/0x1a0 [] scsi_queue_rq+0x66f/0x7f0 [] __blk_mq_run_hw_queue+0x208/0x3f0 [] blk_mq_run_hw_queue+0x88/0xc0 [] blk_mq_insert_request+0xc4/0x130 [] blk_execute_rq_nowait+0x73/0x160 [] sg_common_write+0x3da/0x720 [sg] [] sg_new_write+0x250/0x360 [sg] [] sg_write+0x13b/0x450 [sg] [] vfs_write+0xd1/0x1b0 [] SyS_write+0x54/0xc0 [] system_call_fastpath+0x12/0x17 tj: updated description. 
Fixes: 12cb5ce101ab ("libata: use blk taging") Reported-and-tested-by: Tony Battersby Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 4 ++-- drivers/scsi/ipr.c | 3 ++- drivers/scsi/libsas/sas_ata.c | 3 ++- include/linux/libata.h | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4c35f0822d06..ef150ebb4c30 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4737,7 +4737,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) return NULL; /* libsas case */ - if (!ap->scsi_host) { + if (ap->flags & ATA_FLAG_SAS_HOST) { tag = ata_sas_allocate_tag(ap); if (tag < 0) return NULL; @@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - if (!ap->scsi_host) + if (ap->flags & ATA_FLAG_SAS_HOST) ata_sas_free_tag(tag, ap); } } diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 9219953ee949..d9afc51af7d3 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -6815,7 +6815,8 @@ static struct ata_port_operations ipr_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4_ONLY, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 932d9cc98d2f..9c706d8c1441 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -547,7 +547,8 @@ static struct ata_port_operations sas_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/include/linux/libata.h b/include/linux/libata.h index fc03efa64ffe..6b08cc106c21 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -232,6 +232,7 @@ enum { * led */ ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */ ATA_FLAG_LOWTAG = (1 << 24), /* host wants lowest available tag */ + ATA_FLAG_SAS_HOST = (1 << 25), /* SAS host */ /* bits 24:31 of ap->flags are reserved for LLD specific flags */ From c6b570d97c0e77f570bb6b2ed30d372b2b1e9aae Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 9 Mar 2015 12:20:13 +0100 Subject: [PATCH 35/98] regmap: introduce regmap_name to fix syscon regmap trace events This patch fixes a NULL pointer dereference when enabling regmap event tracing in the presence of a syscon regmap, introduced by commit bdb0066df96e ("mfd: syscon: Decouple syscon interface from platform devices"). That patch introduced syscon regmaps that have their dev field set to NULL. 
The regmap trace events expect it to point to a valid struct device and feed it to dev_name(): $ echo 1 > /sys/kernel/debug/tracing/events/regmap/enable Unable to handle kernel NULL pointer dereference at virtual address 0000002c pgd = 80004000 [0000002c] *pgd=00000000 Internal error: Oops: 17 [#1] SMP ARM Modules linked in: coda videobuf2_vmalloc CPU: 0 PID: 304 Comm: kworker/0:2 Not tainted 4.0.0-rc2+ #9197 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Workqueue: events_freezable thermal_zone_device_check task: 9f25a200 ti: 9f1ee000 task.ti: 9f1ee000 PC is at ftrace_raw_event_regmap_block+0x3c/0xe4 LR is at _regmap_raw_read+0x1bc/0x1cc pc : [<803636e8>] lr : [<80365f2c>] psr: 600f0093 sp : 9f1efd78 ip : 9f1efdb8 fp : 9f1efdb4 r10: 00000004 r9 : 00000001 r8 : 00000001 r7 : 00000180 r6 : 00000000 r5 : 9f00e3c0 r4 : 00000003 r3 : 00000001 r2 : 00000180 r1 : 00000000 r0 : 9f00e3c0 Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment kernel Control: 10c5387d Table: 2d91004a DAC: 00000015 Process kworker/0:2 (pid: 304, stack limit = 0x9f1ee210) Stack: (0x9f1efd78 to 0x9f1f0000) fd60: 9f1efda4 9f1efd88 fd80: 800708c0 805f9510 80927140 800f0013 9f1fc800 9eb2f490 00000000 00000180 fda0: 808e3840 00000001 9f1efdfc 9f1efdb8 80365f2c 803636b8 805f8958 800708e0 fdc0: a00f0013 803636ac 9f16de00 00000180 80927140 9f1fc800 9f1fc800 9f1efe6c fde0: 9f1efe6c 9f732400 00000000 00000000 9f1efe1c 9f1efe00 80365f70 80365d7c fe00: 80365f3c 9f1fc800 9f1fc800 00000180 9f1efe44 9f1efe20 803656a4 80365f48 fe20: 9f1fc800 00000180 9f1efe6c 9f1efe6c 9f732400 00000000 9f1efe64 9f1efe48 fe40: 803657bc 80365634 00000001 9e95f910 9f1fc800 9f1efeb4 9f1efe8c 9f1efe68 fe60: 80452ac0 80365778 9f1efe8c 9f1efe78 9e93d400 9e93d5e8 9f1efeb4 9f72ef40 fe80: 9f1efeac 9f1efe90 8044e11c 80452998 8045298c 9e93d608 9e93d400 808e1978 fea0: 9f1efecc 9f1efeb0 8044fd14 8044e0d0 ffffffff 9f25a200 9e93d608 9e481380 fec0: 9f1efedc 9f1efed0 8044fde8 8044fcec 9f1eff1c 9f1efee0 80038d50 8044fdd8 fee0: 9f1ee020 9f72ef40 9e481398 00000000 00000008 9f72ef54 9f1ee020 9f72ef40 ff00: 9e481398 9e481380 00000008 9f72ef40 9f1eff5c 9f1eff20 80039754 80038bfc ff20: 00000000 9e481380 80894100 808e1662 00000000 9e4f2ec0 00000000 9e481380 ff40: 800396f8 00000000 00000000 00000000 9f1effac 9f1eff60 8003e020 80039704 ff60: ffffffff 00000000 ffffffff 9e481380 00000000 00000000 9f1eff78 9f1eff78 ff80: 00000000 00000000 9f1eff88 9f1eff88 9e4f2ec0 8003df30 00000000 00000000 ffa0: 00000000 9f1effb0 8000eb60 8003df3c 00000000 00000000 00000000 00000000 ffc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ffe0: 00000000 00000000 00000000 00000000 00000013 00000000 ffffffff ffffffff Backtrace: [<803636ac>] (ftrace_raw_event_regmap_block) from [<80365f2c>] (_regmap_raw_read+0x1bc/0x1cc) r9:00000001 r8:808e3840 r7:00000180 r6:00000000 r5:9eb2f490 r4:9f1fc800 [<80365d70>] (_regmap_raw_read) from [<80365f70>] (_regmap_bus_read+0x34/0x6c) r10:00000000 r9:00000000 r8:9f732400 r7:9f1efe6c r6:9f1efe6c r5:9f1fc800 r4:9f1fc800 [<80365f3c>] (_regmap_bus_read) from [<803656a4>] (_regmap_read+0x7c/0x144) r6:00000180 r5:9f1fc800 r4:9f1fc800 r3:80365f3c [<80365628>] (_regmap_read) from [<803657bc>] (regmap_read+0x50/0x70) r9:00000000 r8:9f732400 r7:9f1efe6c r6:9f1efe6c r5:00000180 r4:9f1fc800 [<8036576c>] (regmap_read) from [<80452ac0>] (imx_get_temp+0x134/0x1a4) r6:9f1efeb4 r5:9f1fc800 r4:9e95f910 r3:00000001 [<8045298c>] (imx_get_temp) from [<8044e11c>] (thermal_zone_get_temp+0x58/0x74) r7:9f72ef40 r6:9f1efeb4 r5:9e93d5e8 r4:9e93d400 
[<8044e0c4>] (thermal_zone_get_temp) from [<8044fd14>] (thermal_zone_device_update+0x34/0xec) r6:808e1978 r5:9e93d400 r4:9e93d608 r3:8045298c [<8044fce0>] (thermal_zone_device_update) from [<8044fde8>] (thermal_zone_device_check+0x1c/0x20) r5:9e481380 r4:9e93d608 [<8044fdcc>] (thermal_zone_device_check) from [<80038d50>] (process_one_work+0x160/0x3d4) [<80038bf0>] (process_one_work) from [<80039754>] (worker_thread+0x5c/0x4f4) r10:9f72ef40 r9:00000008 r8:9e481380 r7:9e481398 r6:9f72ef40 r5:9f1ee020 r4:9f72ef54 [<800396f8>] (worker_thread) from [<8003e020>] (kthread+0xf0/0x108) r10:00000000 r9:00000000 r8:00000000 r7:800396f8 r6:9e481380 r5:00000000 r4:9e4f2ec0 [<8003df30>] (kthread) from [<8000eb60>] (ret_from_fork+0x14/0x34) r7:00000000 r6:00000000 r5:8003df30 r4:9e4f2ec0 Code: e3140040 1a00001a e3140020 1a000016 (e596002c) ---[ end trace 193c15c2494ec960 ]--- Fixes: bdb0066df96e (mfd: syscon: Decouple syscon interface from platform devices) Signed-off-by: Philipp Zabel Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/base/regmap/internal.h | 8 +++ drivers/base/regmap/regcache.c | 16 ++--- drivers/base/regmap/regmap.c | 32 ++++----- include/trace/events/regmap.h | 123 ++++++++++++++++----------------- 4 files changed, 91 insertions(+), 88 deletions(-) diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index beb8b27d4621..a13587b5c2be 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -243,4 +243,12 @@ extern struct regcache_ops regcache_rbtree_ops; extern struct regcache_ops regcache_lzo_ops; extern struct regcache_ops regcache_flat_ops; +static inline const char *regmap_name(const struct regmap *map) +{ + if (map->dev) + return dev_name(map->dev); + + return map->name; +} + #endif diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c index f373c35f9e1d..f5db662e951e 100644 --- a/drivers/base/regmap/regcache.c +++ b/drivers/base/regmap/regcache.c @@ -218,7 +218,7 @@ int regcache_read(struct regmap *map, ret = map->cache_ops->read(map, reg, value); if (ret == 0) - trace_regmap_reg_read_cache(map->dev, reg, *value); + trace_regmap_reg_read_cache(map, reg, *value); return ret; } @@ -311,7 +311,7 @@ int regcache_sync(struct regmap *map) dev_dbg(map->dev, "Syncing %s cache\n", map->cache_ops->name); name = map->cache_ops->name; - trace_regcache_sync(map->dev, name, "start"); + trace_regcache_sync(map, name, "start"); if (!map->cache_dirty) goto out; @@ -346,7 +346,7 @@ out: regmap_async_complete(map); - trace_regcache_sync(map->dev, name, "stop"); + trace_regcache_sync(map, name, "stop"); return ret; } @@ -381,7 +381,7 @@ int regcache_sync_region(struct regmap *map, unsigned int min, name = map->cache_ops->name; dev_dbg(map->dev, "Syncing %s cache from %d-%d\n", name, min, max); - trace_regcache_sync(map->dev, name, "start region"); + trace_regcache_sync(map, name, "start region"); if (!map->cache_dirty) goto out; @@ -401,7 +401,7 @@ out: regmap_async_complete(map); - trace_regcache_sync(map->dev, name, "stop region"); + trace_regcache_sync(map, name, "stop region"); return ret; } @@ -428,7 +428,7 @@ int regcache_drop_region(struct regmap *map, unsigned int min, map->lock(map->lock_arg); - trace_regcache_drop_region(map->dev, min, max); + trace_regcache_drop_region(map, min, max); ret = map->cache_ops->drop(map, min, max); @@ -455,7 +455,7 @@ void regcache_cache_only(struct regmap *map, bool enable) map->lock(map->lock_arg); WARN_ON(map->cache_bypass && enable); map->cache_only = enable; - 
trace_regmap_cache_only(map->dev, enable); + trace_regmap_cache_only(map, enable); map->unlock(map->lock_arg); } EXPORT_SYMBOL_GPL(regcache_cache_only); @@ -493,7 +493,7 @@ void regcache_cache_bypass(struct regmap *map, bool enable) map->lock(map->lock_arg); WARN_ON(map->cache_only && enable); map->cache_bypass = enable; - trace_regmap_cache_bypass(map->dev, enable); + trace_regmap_cache_bypass(map, enable); map->unlock(map->lock_arg); } EXPORT_SYMBOL_GPL(regcache_cache_bypass); diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index f99b098ddabf..dbfe6a69c3da 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1281,7 +1281,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, if (map->async && map->bus->async_write) { struct regmap_async *async; - trace_regmap_async_write_start(map->dev, reg, val_len); + trace_regmap_async_write_start(map, reg, val_len); spin_lock_irqsave(&map->async_lock, flags); async = list_first_entry_or_null(&map->async_free, @@ -1339,8 +1339,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, return ret; } - trace_regmap_hw_write_start(map->dev, reg, - val_len / map->format.val_bytes); + trace_regmap_hw_write_start(map, reg, val_len / map->format.val_bytes); /* If we're doing a single register write we can probably just * send the work_buf directly, otherwise try to do a gather @@ -1372,8 +1371,7 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, kfree(buf); } - trace_regmap_hw_write_done(map->dev, reg, - val_len / map->format.val_bytes); + trace_regmap_hw_write_done(map, reg, val_len / map->format.val_bytes); return ret; } @@ -1407,12 +1405,12 @@ static int _regmap_bus_formatted_write(void *context, unsigned int reg, map->format.format_write(map, reg, val); - trace_regmap_hw_write_start(map->dev, reg, 1); + trace_regmap_hw_write_start(map, reg, 1); ret = map->bus->write(map->bus_context, map->work_buf, map->format.buf_size); - trace_regmap_hw_write_done(map->dev, reg, 1); + trace_regmap_hw_write_done(map, reg, 1); return ret; } @@ -1470,7 +1468,7 @@ int _regmap_write(struct regmap *map, unsigned int reg, dev_info(map->dev, "%x <= %x\n", reg, val); #endif - trace_regmap_reg_write(map->dev, reg, val); + trace_regmap_reg_write(map, reg, val); return map->reg_write(context, reg, val); } @@ -1773,7 +1771,7 @@ static int _regmap_raw_multi_reg_write(struct regmap *map, for (i = 0; i < num_regs; i++) { int reg = regs[i].reg; int val = regs[i].def; - trace_regmap_hw_write_start(map->dev, reg, 1); + trace_regmap_hw_write_start(map, reg, 1); map->format.format_reg(u8, reg, map->reg_shift); u8 += reg_bytes + pad_bytes; map->format.format_val(u8, val, 0); @@ -1788,7 +1786,7 @@ static int _regmap_raw_multi_reg_write(struct regmap *map, for (i = 0; i < num_regs; i++) { int reg = regs[i].reg; - trace_regmap_hw_write_done(map->dev, reg, 1); + trace_regmap_hw_write_done(map, reg, 1); } return ret; } @@ -2059,15 +2057,13 @@ static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val, */ u8[0] |= map->read_flag_mask; - trace_regmap_hw_read_start(map->dev, reg, - val_len / map->format.val_bytes); + trace_regmap_hw_read_start(map, reg, val_len / map->format.val_bytes); ret = map->bus->read(map->bus_context, map->work_buf, map->format.reg_bytes + map->format.pad_bytes, val, val_len); - trace_regmap_hw_read_done(map->dev, reg, - val_len / map->format.val_bytes); + trace_regmap_hw_read_done(map, reg, val_len / map->format.val_bytes); return ret; } @@ -2123,7 +2119,7 @@ static int 
_regmap_read(struct regmap *map, unsigned int reg, dev_info(map->dev, "%x => %x\n", reg, *val); #endif - trace_regmap_reg_read(map->dev, reg, *val); + trace_regmap_reg_read(map, reg, *val); if (!map->cache_bypass) regcache_write(map, reg, *val); @@ -2480,7 +2476,7 @@ void regmap_async_complete_cb(struct regmap_async *async, int ret) struct regmap *map = async->map; bool wake; - trace_regmap_async_io_complete(map->dev); + trace_regmap_async_io_complete(map); spin_lock(&map->async_lock); list_move(&async->list, &map->async_free); @@ -2525,7 +2521,7 @@ int regmap_async_complete(struct regmap *map) if (!map->bus || !map->bus->async_write) return 0; - trace_regmap_async_complete_start(map->dev); + trace_regmap_async_complete_start(map); wait_event(map->async_waitq, regmap_async_is_done(map)); @@ -2534,7 +2530,7 @@ int regmap_async_complete(struct regmap *map) map->async_ret = 0; spin_unlock_irqrestore(&map->async_lock, flags); - trace_regmap_async_complete_done(map->dev); + trace_regmap_async_complete_done(map); return ret; } diff --git a/include/trace/events/regmap.h b/include/trace/events/regmap.h index 23d561512f64..22317d2b52ab 100644 --- a/include/trace/events/regmap.h +++ b/include/trace/events/regmap.h @@ -7,27 +7,26 @@ #include #include -struct device; -struct regmap; +#include "../../../drivers/base/regmap/internal.h" /* * Log register events */ DECLARE_EVENT_CLASS(regmap_reg, - TP_PROTO(struct device *dev, unsigned int reg, + TP_PROTO(struct regmap *map, unsigned int reg, unsigned int val), - TP_ARGS(dev, reg, val), + TP_ARGS(map, reg, val), TP_STRUCT__entry( - __string( name, dev_name(dev) ) - __field( unsigned int, reg ) - __field( unsigned int, val ) + __string( name, regmap_name(map) ) + __field( unsigned int, reg ) + __field( unsigned int, val ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); __entry->reg = reg; __entry->val = val; ), @@ -39,45 +38,45 @@ DECLARE_EVENT_CLASS(regmap_reg, DEFINE_EVENT(regmap_reg, regmap_reg_write, - TP_PROTO(struct device *dev, unsigned int reg, + TP_PROTO(struct regmap *map, unsigned int reg, unsigned int val), - TP_ARGS(dev, reg, val) + TP_ARGS(map, reg, val) ); DEFINE_EVENT(regmap_reg, regmap_reg_read, - TP_PROTO(struct device *dev, unsigned int reg, + TP_PROTO(struct regmap *map, unsigned int reg, unsigned int val), - TP_ARGS(dev, reg, val) + TP_ARGS(map, reg, val) ); DEFINE_EVENT(regmap_reg, regmap_reg_read_cache, - TP_PROTO(struct device *dev, unsigned int reg, + TP_PROTO(struct regmap *map, unsigned int reg, unsigned int val), - TP_ARGS(dev, reg, val) + TP_ARGS(map, reg, val) ); DECLARE_EVENT_CLASS(regmap_block, - TP_PROTO(struct device *dev, unsigned int reg, int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count), + TP_ARGS(map, reg, count), TP_STRUCT__entry( - __string( name, dev_name(dev) ) - __field( unsigned int, reg ) - __field( int, count ) + __string( name, regmap_name(map) ) + __field( unsigned int, reg ) + __field( int, count ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); __entry->reg = reg; __entry->count = count; ), @@ -89,48 +88,48 @@ DECLARE_EVENT_CLASS(regmap_block, DEFINE_EVENT(regmap_block, regmap_hw_read_start, - TP_PROTO(struct device *dev, unsigned int reg, int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count) + TP_ARGS(map, reg, count) ); DEFINE_EVENT(regmap_block, regmap_hw_read_done, - TP_PROTO(struct device *dev, unsigned int reg, 
int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count) + TP_ARGS(map, reg, count) ); DEFINE_EVENT(regmap_block, regmap_hw_write_start, - TP_PROTO(struct device *dev, unsigned int reg, int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count) + TP_ARGS(map, reg, count) ); DEFINE_EVENT(regmap_block, regmap_hw_write_done, - TP_PROTO(struct device *dev, unsigned int reg, int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count) + TP_ARGS(map, reg, count) ); TRACE_EVENT(regcache_sync, - TP_PROTO(struct device *dev, const char *type, + TP_PROTO(struct regmap *map, const char *type, const char *status), - TP_ARGS(dev, type, status), + TP_ARGS(map, type, status), TP_STRUCT__entry( - __string( name, dev_name(dev) ) - __string( status, status ) - __string( type, type ) - __field( int, type ) + __string( name, regmap_name(map) ) + __string( status, status ) + __string( type, type ) + __field( int, type ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); __assign_str(status, status); __assign_str(type, type); ), @@ -141,17 +140,17 @@ TRACE_EVENT(regcache_sync, DECLARE_EVENT_CLASS(regmap_bool, - TP_PROTO(struct device *dev, bool flag), + TP_PROTO(struct regmap *map, bool flag), - TP_ARGS(dev, flag), + TP_ARGS(map, flag), TP_STRUCT__entry( - __string( name, dev_name(dev) ) - __field( int, flag ) + __string( name, regmap_name(map) ) + __field( int, flag ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); __entry->flag = flag; ), @@ -161,32 +160,32 @@ DECLARE_EVENT_CLASS(regmap_bool, DEFINE_EVENT(regmap_bool, regmap_cache_only, - TP_PROTO(struct device *dev, bool flag), + TP_PROTO(struct regmap *map, bool flag), - TP_ARGS(dev, flag) + TP_ARGS(map, flag) ); DEFINE_EVENT(regmap_bool, regmap_cache_bypass, - TP_PROTO(struct device *dev, bool flag), + TP_PROTO(struct regmap *map, bool flag), - TP_ARGS(dev, flag) + TP_ARGS(map, flag) ); DECLARE_EVENT_CLASS(regmap_async, - TP_PROTO(struct device *dev), + TP_PROTO(struct regmap *map), - TP_ARGS(dev), + TP_ARGS(map), TP_STRUCT__entry( - __string( name, dev_name(dev) ) + __string( name, regmap_name(map) ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); ), TP_printk("%s", __get_str(name)) @@ -194,50 +193,50 @@ DECLARE_EVENT_CLASS(regmap_async, DEFINE_EVENT(regmap_block, regmap_async_write_start, - TP_PROTO(struct device *dev, unsigned int reg, int count), + TP_PROTO(struct regmap *map, unsigned int reg, int count), - TP_ARGS(dev, reg, count) + TP_ARGS(map, reg, count) ); DEFINE_EVENT(regmap_async, regmap_async_io_complete, - TP_PROTO(struct device *dev), + TP_PROTO(struct regmap *map), - TP_ARGS(dev) + TP_ARGS(map) ); DEFINE_EVENT(regmap_async, regmap_async_complete_start, - TP_PROTO(struct device *dev), + TP_PROTO(struct regmap *map), - TP_ARGS(dev) + TP_ARGS(map) ); DEFINE_EVENT(regmap_async, regmap_async_complete_done, - TP_PROTO(struct device *dev), + TP_PROTO(struct regmap *map), - TP_ARGS(dev) + TP_ARGS(map) ); TRACE_EVENT(regcache_drop_region, - TP_PROTO(struct device *dev, unsigned int from, + TP_PROTO(struct regmap *map, unsigned int from, unsigned int to), - TP_ARGS(dev, from, to), + TP_ARGS(map, from, to), TP_STRUCT__entry( - __string( name, dev_name(dev) ) - __field( unsigned int, from ) - __field( unsigned int, to ) + __string( name, regmap_name(map) ) + __field( unsigned int, from ) + 
__field( unsigned int, to ) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name, regmap_name(map)); __entry->from = from; __entry->to = to; ), From 5b0d4b5514bbcce69b516d0742f2cfc84ebd6db3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:05:57 -0400 Subject: [PATCH 36/98] sparc: perf: Remove redundant perf_pmu_{en|dis}able calls perf_pmu_disable is called by core perf code before pmu->del and the enable function is called by core perf code afterwards. No need to call again within sparc_pmu_del. Ditto for pmu->add and sparc_pmu_add. Signed-off-by: David Ahern Acked-by: Bob Picco Signed-off-by: David S. Miller --- arch/sparc/kernel/perf_event.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 46a5e4508752..6dc4e793df4c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1101,7 +1101,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags) int i; local_irq_save(flags); - perf_pmu_disable(event->pmu); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event[i]) { @@ -1127,7 +1126,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags) } } - perf_pmu_enable(event->pmu); local_irq_restore(flags); } @@ -1361,7 +1359,6 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags) unsigned long flags; local_irq_save(flags); - perf_pmu_disable(event->pmu); n0 = cpuc->n_events; if (n0 >= sparc_pmu->max_hw_events) @@ -1394,7 +1391,6 @@ nocheck: ret = 0; out: - perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } From d51291cb8f32bfae6b331e1838651f3ddefa73a5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:06:17 -0400 Subject: [PATCH 37/98] sparc: perf: Make counting mode actually work Currently perf-stat (aka, counting mode) does not work: $ perf stat ls ... Performance counter stats for 'ls': 1.585665 task-clock (msec) # 0.580 CPUs utilized 24 context-switches # 0.015 M/sec 0 cpu-migrations # 0.000 K/sec 86 page-faults # 0.054 M/sec cycles stalled-cycles-frontend stalled-cycles-backend instructions branches branch-misses 0.002735100 seconds time elapsed The reason is that state is never reset (stays with PERF_HES_UPTODATE set). Add a call to sparc_pmu_enable_event during the added_event handling. Clean up the encoding since pmu_start calls sparc_pmu_enable_event which does the same. Passing PERF_EF_RELOAD to sparc_pmu_start means the call to sparc_perf_event_set_period can be removed as well. With this patch: $ perf stat ls ... Performance counter stats for 'ls': 1.552890 task-clock (msec) # 0.552 CPUs utilized 24 context-switches # 0.015 M/sec 0 cpu-migrations # 0.000 K/sec 86 page-faults # 0.055 M/sec 5,748,997 cycles # 3.702 GHz stalled-cycles-frontend:HG stalled-cycles-backend:HG 1,684,362 instructions:HG # 0.29 insns per cycle 295,133 branches:HG # 190.054 M/sec 28,007 branch-misses:HG # 9.49% of all branches 0.002815665 seconds time elapsed Signed-off-by: David Ahern Acked-by: Bob Picco Signed-off-by: David S. 
Miller --- arch/sparc/kernel/perf_event.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6dc4e793df4c..af53c25da2e7 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -960,6 +960,8 @@ out: cpuc->pcr[0] |= cpuc->event[0]->hw.config_base; } +static void sparc_pmu_start(struct perf_event *event, int flags); + /* On this PMU each PIC has it's own PCR control register. */ static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc) { @@ -972,20 +974,13 @@ static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc) struct perf_event *cp = cpuc->event[i]; struct hw_perf_event *hwc = &cp->hw; int idx = hwc->idx; - u64 enc; if (cpuc->current_idx[i] != PIC_NO_INDEX) continue; - sparc_perf_event_set_period(cp, hwc, idx); cpuc->current_idx[i] = idx; - enc = perf_event_get_enc(cpuc->events[i]); - cpuc->pcr[idx] &= ~mask_for_index(idx); - if (hwc->state & PERF_HES_STOPPED) - cpuc->pcr[idx] |= nop_for_index(idx); - else - cpuc->pcr[idx] |= event_encoding(enc, idx); + sparc_pmu_start(cp, PERF_EF_RELOAD); } out: for (i = 0; i < cpuc->n_events; i++) { From b5aff55d89c27aedcae9521155b81b6aebb6c5d8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:06:37 -0400 Subject: [PATCH 38/98] sparc: perf: Add support M7 processor The M7 processor has a different hypervisor group id and different PCR fast trap values. PIC read/write functions and PCR bit fields are the same as the T4 so those are reused. Signed-off-by: David Ahern Acked-by: Bob Picco Signed-off-by: David S. Miller --- arch/sparc/include/asm/hypervisor.h | 12 +++++++++ arch/sparc/kernel/hvapi.c | 1 + arch/sparc/kernel/hvcalls.S | 16 ++++++++++++ arch/sparc/kernel/pcr.c | 33 ++++++++++++++++++++++++ arch/sparc/kernel/perf_event.c | 40 +++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+) diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h index 4f6725ff4c33..f5b6537306f0 100644 --- a/arch/sparc/include/asm/hypervisor.h +++ b/arch/sparc/include/asm/hypervisor.h @@ -2957,6 +2957,17 @@ unsigned long sun4v_t5_set_perfreg(unsigned long reg_num, unsigned long reg_val); #endif + +#define HV_FAST_M7_GET_PERFREG 0x43 +#define HV_FAST_M7_SET_PERFREG 0x44 + +#ifndef __ASSEMBLY__ +unsigned long sun4v_m7_get_perfreg(unsigned long reg_num, + unsigned long *reg_val); +unsigned long sun4v_m7_set_perfreg(unsigned long reg_num, + unsigned long reg_val); +#endif + /* Function numbers for HV_CORE_TRAP. 
*/ #define HV_CORE_SET_VER 0x00 #define HV_CORE_PUTCHAR 0x01 @@ -2981,6 +2992,7 @@ unsigned long sun4v_t5_set_perfreg(unsigned long reg_num, #define HV_GRP_SDIO 0x0108 #define HV_GRP_SDIO_ERR 0x0109 #define HV_GRP_REBOOT_DATA 0x0110 +#define HV_GRP_M7_PERF 0x0114 #define HV_GRP_NIAG_PERF 0x0200 #define HV_GRP_FIRE_PERF 0x0201 #define HV_GRP_N2_CPU 0x0202 diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c index 5c55145bfbf0..662500fa555f 100644 --- a/arch/sparc/kernel/hvapi.c +++ b/arch/sparc/kernel/hvapi.c @@ -48,6 +48,7 @@ static struct api_info api_table[] = { { .group = HV_GRP_VT_CPU, }, { .group = HV_GRP_T5_CPU, }, { .group = HV_GRP_DIAG, .flags = FLAG_PRE_API }, + { .group = HV_GRP_M7_PERF, }, }; static DEFINE_SPINLOCK(hvapi_lock); diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S index caedf8320416..afbaba52d2f1 100644 --- a/arch/sparc/kernel/hvcalls.S +++ b/arch/sparc/kernel/hvcalls.S @@ -837,3 +837,19 @@ ENTRY(sun4v_t5_set_perfreg) retl nop ENDPROC(sun4v_t5_set_perfreg) + +ENTRY(sun4v_m7_get_perfreg) + mov %o1, %o4 + mov HV_FAST_M7_GET_PERFREG, %o5 + ta HV_FAST_TRAP + stx %o1, [%o4] + retl + nop +ENDPROC(sun4v_m7_get_perfreg) + +ENTRY(sun4v_m7_set_perfreg) + mov HV_FAST_M7_SET_PERFREG, %o5 + ta HV_FAST_TRAP + retl + nop +ENDPROC(sun4v_m7_set_perfreg) diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 7e967c8018c8..eb978c77c76a 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -217,6 +217,31 @@ static const struct pcr_ops n5_pcr_ops = { .pcr_nmi_disable = PCR_N4_PICNPT, }; +static u64 m7_pcr_read(unsigned long reg_num) +{ + unsigned long val; + + (void) sun4v_m7_get_perfreg(reg_num, &val); + + return val; +} + +static void m7_pcr_write(unsigned long reg_num, u64 val) +{ + (void) sun4v_m7_set_perfreg(reg_num, val); +} + +static const struct pcr_ops m7_pcr_ops = { + .read_pcr = m7_pcr_read, + .write_pcr = m7_pcr_write, + .read_pic = n4_pic_read, + .write_pic = n4_pic_write, + .nmi_picl_value = n4_picl_value, + .pcr_nmi_enable = (PCR_N4_PICNPT | PCR_N4_STRACE | + PCR_N4_UTRACE | PCR_N4_TOE | + (26 << PCR_N4_SL_SHIFT)), + .pcr_nmi_disable = PCR_N4_PICNPT, +}; static unsigned long perf_hsvc_group; static unsigned long perf_hsvc_major; @@ -248,6 +273,10 @@ static int __init register_perf_hsvc(void) perf_hsvc_group = HV_GRP_T5_CPU; break; + case SUN4V_CHIP_SPARC_M7: + perf_hsvc_group = HV_GRP_M7_PERF; + break; + default: return -ENODEV; } @@ -293,6 +322,10 @@ static int __init setup_sun4v_pcr_ops(void) pcr_ops = &n5_pcr_ops; break; + case SUN4V_CHIP_SPARC_M7: + pcr_ops = &m7_pcr_ops; + break; + default: ret = -ENODEV; break; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index af53c25da2e7..86eebfa3b158 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -792,6 +792,42 @@ static const struct sparc_pmu niagara4_pmu = { .num_pic_regs = 4, }; +static void sparc_m7_write_pmc(int idx, u64 val) +{ + u64 pcr; + + pcr = pcr_ops->read_pcr(idx); + /* ensure ov and ntc are reset */ + pcr &= ~(PCR_N4_OV | PCR_N4_NTC); + + pcr_ops->write_pic(idx, val & 0xffffffff); + + pcr_ops->write_pcr(idx, pcr); +} + +static const struct sparc_pmu sparc_m7_pmu = { + .event_map = niagara4_event_map, + .cache_map = &niagara4_cache_map, + .max_events = ARRAY_SIZE(niagara4_perfmon_event_map), + .read_pmc = sparc_vt_read_pmc, + .write_pmc = sparc_m7_write_pmc, + .upper_shift = 5, + .lower_shift = 5, + .event_mask = 0x7ff, + .user_bit = PCR_N4_UTRACE, + .priv_bit = PCR_N4_STRACE, + + /* 
We explicitly don't support hypervisor tracing. */ + .hv_bit = 0, + + .irq_bit = PCR_N4_TOE, + .upper_nop = 0, + .lower_nop = 0, + .flags = 0, + .max_hw_events = 4, + .num_pcrs = 4, + .num_pic_regs = 4, +}; static const struct sparc_pmu *sparc_pmu __read_mostly; static u64 event_encoding(u64 event_id, int idx) @@ -1658,6 +1694,10 @@ static bool __init supported_pmu(void) sparc_pmu = &niagara4_pmu; return true; } + if (!strcmp(sparc_pmu_type, "sparc-m7")) { + sparc_pmu = &sparc_m7_pmu; + return true; + } return false; } From 31aaa98c248da766ece922bbbe8cc78cfd0bc920 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:06:53 -0400 Subject: [PATCH 39/98] sparc: Touch NMI watchdog when walking cpus and calling printk With the increase in number of CPUs calls to functions that dump output to console (e.g., arch_trigger_all_cpu_backtrace) can take a long time to complete. If IRQs are disabled eventually the NMI watchdog kicks in and creates more havoc. Avoid by telling the NMI watchdog everything is ok. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- arch/sparc/kernel/process_64.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 0be7bf978cb1..46a59643bb1c 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -287,6 +287,8 @@ void arch_trigger_all_cpu_backtrace(bool include_self) printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n", gp->tpc, gp->o7, gp->i7, gp->rpc); } + + touch_nmi_watchdog(); } memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); @@ -362,6 +364,8 @@ static void pmu_snapshot_all_cpus(void) (cpu == this_cpu ? '*' : ' '), cpu, pp->pcr[0], pp->pcr[1], pp->pcr[2], pp->pcr[3], pp->pic[0], pp->pic[1], pp->pic[2], pp->pic[3]); + + touch_nmi_watchdog(); } memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); From 755563bc79c764c90b9f44db5e4fe6c556d3440c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 19 Mar 2015 19:29:01 +1100 Subject: [PATCH 40/98] powerpc/powernv: Fixes for hypervisor doorbell handling Since we can now use hypervisor doorbells for host IPIs, this makes sure we clear the host IPI flag when taking a doorbell interrupt, and clears any pending doorbell IPI in pnv_smp_cpu_kill_self() (as we already do for IPIs sent via the XICS interrupt controller). Otherwise if there did happen to be a leftover pending doorbell interrupt for an offline CPU thread for any reason, it would prevent that thread from going into a power-saving mode; it would instead keep waking up because of the interrupt. 
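To make the fix concrete before the diff, the key addition in pnv_smp_cpu_kill_self() (a simplified sketch of the hunk below, using the same symbols the patch introduces) decodes the SRR1 wakeup reason with the POWER8 mask and, if the wakeup was a hypervisor doorbell, clears it with msgclr and drops the host IPI flag:

    unsigned long wmask = cpu_has_feature(CPU_FTR_ARCH_207S) ?
                          SRR1_WAKEMASK_P8 : SRR1_WAKEMASK;

    if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

        /* clear the pending doorbell and the host IPI flag */
        asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
        kvmppc_set_host_ipi(cpu, 0);
    }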
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-opcode.h | 3 +++ arch/powerpc/include/asm/reg.h | 3 +++ arch/powerpc/kernel/dbell.c | 2 ++ arch/powerpc/platforms/powernv/smp.c | 14 ++++++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 03cd858a401c..4cbe23af400a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -153,6 +153,7 @@ #define PPC_INST_MFSPR_PVR_MASK 0xfc1fffff #define PPC_INST_MFTMR 0x7c0002dc #define PPC_INST_MSGSND 0x7c00019c +#define PPC_INST_MSGCLR 0x7c0001dc #define PPC_INST_MSGSNDP 0x7c00011c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 @@ -309,6 +310,8 @@ ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) +#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ + ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ ___PPC_RB(b)) #define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1c874fb533bb..af56b5c6c81a 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -608,13 +608,16 @@ #define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */ #define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ +#define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 */ #define SRR1_WAKESYSERR 0x00300000 /* System error */ #define SRR1_WAKEEE 0x00200000 /* External interrupt */ #define SRR1_WAKEMT 0x00280000 /* mtctrl */ #define SRR1_WAKEHMI 0x00280000 /* Hypervisor maintenance */ #define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */ +#define SRR1_WAKEDBELL 0x00140000 /* Privileged doorbell on P8 */ #define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */ #define SRR1_WAKERESET 0x00100000 /* System reset */ +#define SRR1_WAKEHDBELL 0x000c0000 /* Hypervisor doorbell on P8 */ #define SRR1_WAKESTATE 0x00030000 /* Powersave exit mask [46:47] */ #define SRR1_WS_DEEPEST 0x00030000 /* Some resources not maintained, * may not be recoverable */ diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index f4217819cc31..2128f3a96c32 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -17,6 +17,7 @@ #include #include +#include #ifdef CONFIG_SMP void doorbell_setup_this_cpu(void) @@ -41,6 +42,7 @@ void doorbell_exception(struct pt_regs *regs) may_hard_irq_enable(); + kvmppc_set_host_ipi(smp_processor_id(), 0); __this_cpu_inc(irq_stat.doorbell_irqs); smp_ipi_demux(); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index fc34025ef822..38a45088f633 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "powernv.h" @@ -149,7 +151,7 @@ static int pnv_smp_cpu_disable(void) static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; - unsigned long srr1; + unsigned long srr1, wmask; u32 idle_states; /* Standard hot unplug procedure */ @@ -161,6 +163,10 @@ static void pnv_smp_cpu_kill_self(void) generic_set_cpu_dead(cpu); smp_wmb(); + wmask = SRR1_WAKEMASK; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + wmask = SRR1_WAKEMASK_P8; + idle_states = pnv_get_supported_cpuidle_states(); /* We 
don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 enabled. @@ -191,10 +197,14 @@ * having finished executing in a KVM guest, then srr1 * contains 0. */ - if ((srr1 & SRR1_WAKEMASK) == SRR1_WAKEEE) { + if ((srr1 & wmask) == SRR1_WAKEEE) { icp_native_flush_interrupt(); local_paca->irq_happened &= PACA_IRQ_HARD_DIS; smp_mb(); + } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); + kvmppc_set_host_ipi(cpu, 0); } if (cpu_core_split_required()) From ddee09c099c35074e50aaf9157efd22429d3acdf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 19 Mar 2015 14:12:06 +1100 Subject: [PATCH 41/98] powerpc: Add PVR for POWER8NVL processor There's a new variant of POWER8 coming called "POWER8 with NVLink". The core is identical to POWER8 but unfortunately they strapped it with a different PVR, so we need to add an explicit entry for it. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/cputable.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index f337666768a7..f83046878336 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -437,6 +437,26 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p8, .platform = "power8", }, + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, { /* Power8 DD1: Does not support doorbell IPIs */ .pvr_mask = 0xffffff00, .pvr_value = 0x004d0100, From f6ff04149637723261aa4738958b0098b929ee9e Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 4 Mar 2015 11:59:33 -0800 Subject: [PATCH 42/98] powerpc/pseries: Little endian fixes for post mobility device tree update We currently use the device tree update code in the kernel after resuming from a suspend operation to re-sync the kernel's view of the device tree with that of the hypervisor. The code as it stands is not endian safe as it relies on parsing buffers returned by RTAS calls that thus contain data in big endian format. This patch annotates variables and structure members with __be types as well as performing necessary byte swaps to cpu endian for data that needs to be parsed.
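As an illustration of the conversion pattern used throughout this patch (a minimal sketch, not the complete functions), raw RTAS data stays in __be32 variables and is byte-swapped only at the point of use:

    __be32 *data = (__be32 *)rtas_buf + 4;   /* RTAS buffer is big endian */
    u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
    u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
    __be32 phandle;

    data++;
    phandle = *data++;                       /* still big endian here */
    delete_dt_node(phandle);                 /* helper does be32_to_cpu() */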
Signed-off-by: Tyrel Datwyler Cc: Nathan Fontenot Cc: Cyril Bur Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/mobility.c | 44 ++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 90cf3dcbd9f2..8f35d525cede 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -25,10 +25,10 @@ static struct kobject *mobility_kobj; struct update_props_workarea { - u32 phandle; - u32 state; - u64 reserved; - u32 nprops; + __be32 phandle; + __be32 state; + __be64 reserved; + __be32 nprops; } __packed; #define NODE_ACTION_MASK 0xff000000 @@ -54,11 +54,11 @@ static int mobility_rtas_call(int token, char *buf, s32 scope) return rc; } -static int delete_dt_node(u32 phandle) +static int delete_dt_node(__be32 phandle) { struct device_node *dn; - dn = of_find_node_by_phandle(phandle); + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); if (!dn) return -ENOENT; @@ -127,7 +127,7 @@ static int update_dt_property(struct device_node *dn, struct property **prop, return 0; } -static int update_dt_node(u32 phandle, s32 scope) +static int update_dt_node(__be32 phandle, s32 scope) { struct update_props_workarea *upwa; struct device_node *dn; @@ -136,6 +136,7 @@ static int update_dt_node(u32 phandle, s32 scope) char *prop_data; char *rtas_buf; int update_properties_token; + u32 nprops; u32 vd; update_properties_token = rtas_token("ibm,update-properties"); @@ -146,7 +147,7 @@ static int update_dt_node(u32 phandle, s32 scope) if (!rtas_buf) return -ENOMEM; - dn = of_find_node_by_phandle(phandle); + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); if (!dn) { kfree(rtas_buf); return -ENOENT; @@ -162,6 +163,7 @@ static int update_dt_node(u32 phandle, s32 scope) break; prop_data = rtas_buf + sizeof(*upwa); + nprops = be32_to_cpu(upwa->nprops); /* On the first call to ibm,update-properties for a node the * the first property value descriptor contains an empty @@ -170,17 +172,17 @@ static int update_dt_node(u32 phandle, s32 scope) */ if (*prop_data == 0) { prop_data++; - vd = *(u32 *)prop_data; + vd = be32_to_cpu(*(__be32 *)prop_data); prop_data += vd + sizeof(vd); - upwa->nprops--; + nprops--; } - for (i = 0; i < upwa->nprops; i++) { + for (i = 0; i < nprops; i++) { char *prop_name; prop_name = prop_data; prop_data += strlen(prop_name) + 1; - vd = *(u32 *)prop_data; + vd = be32_to_cpu(*(__be32 *)prop_data); prop_data += sizeof(vd); switch (vd) { @@ -212,13 +214,13 @@ static int update_dt_node(u32 phandle, s32 scope) return 0; } -static int add_dt_node(u32 parent_phandle, u32 drc_index) +static int add_dt_node(__be32 parent_phandle, __be32 drc_index) { struct device_node *dn; struct device_node *parent_dn; int rc; - parent_dn = of_find_node_by_phandle(parent_phandle); + parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle)); if (!parent_dn) return -ENOENT; @@ -237,7 +239,7 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) int pseries_devicetree_update(s32 scope) { char *rtas_buf; - u32 *data; + __be32 *data; int update_nodes_token; int rc; @@ -254,17 +256,17 @@ int pseries_devicetree_update(s32 scope) if (rc && rc != 1) break; - data = (u32 *)rtas_buf + 4; - while (*data & NODE_ACTION_MASK) { + data = (__be32 *)rtas_buf + 4; + while (be32_to_cpu(*data) & NODE_ACTION_MASK) { int i; - u32 action = *data & NODE_ACTION_MASK; - int node_count = *data & NODE_COUNT_MASK; + u32 action = 
be32_to_cpu(*data) & NODE_ACTION_MASK; + u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK; data++; for (i = 0; i < node_count; i++) { - u32 phandle = *data++; - u32 drc_index; + __be32 phandle = *data++; + __be32 drc_index; switch (action) { case DELETE_DT_NODE: From 3d8c6dce53a349df8878d078e56bf429bad572f9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 20 Mar 2015 13:56:06 +0100 Subject: [PATCH 43/98] netfilter: xt_TPROXY: fix invflags check in tproxy_tg6_check() We have to check for IP6T_INV_PROTO in invflags, instead of flags. Signed-off-by: Pablo Neira Ayuso Acked-by: Balazs Scheidler --- net/netfilter/xt_TPROXY.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index ef8a926752a9..50e1e5aaf4ce 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -513,8 +513,8 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_ip6 *i = par->entryinfo; - if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) - && !(i->flags & IP6T_INV_PROTO)) + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && + !(i->invflags & IP6T_INV_PROTO)) return 0; pr_info("Can be used only in combination with " From 7ee8e4f3983c4ff700958a6099c8fd212ea67b94 Mon Sep 17 00:00:00 2001 From: Wenbo Wang Date: Fri, 20 Mar 2015 01:04:54 -0400 Subject: [PATCH 44/98] Fix bug in blk_rq_merge_ok Use the right array index to reference the last element of rq->biotail->bi_io_vec[] Signed-off-by: Wenbo Wang Reviewed-by: Chong Yuan Fixes: 66cb45aa41315 ("block: add support for limiting gaps in SG lists") Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index fc1ff3b1ea1f..fd3fee81c23c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -592,7 +592,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { struct bio_vec *bprev; - bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; + bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) return false; } From 8e199dfd82ee097b522b00344af6448715d8ee0c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 19 Mar 2015 11:22:32 +0100 Subject: [PATCH 45/98] ipv6: call ipv6_proxy_select_ident instead of ipv6_select_ident in udp6_ufo_fragment Matt Grant reported frequent crashes in ipv6_select_ident when udp6_ufo_fragment is called from openvswitch on a skb that doesn't have a dst_entry set. ipv6_proxy_select_ident generates the frag_id without using the dst associated with the skb. This approach was suggested by Vladislav Yasevich. Fixes: 0508c07f5e0c ("ipv6: Select fragment id during UFO segmentation if not set.") Cc: Vladislav Yasevich Reported-by: Matt Grant Tested-by: Matt Grant Signed-off-by: Sabrina Dubroca Acked-by: Vladislav Yasevich Signed-off-by: David S. 
Miller --- net/ipv6/udp_offload.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ab889bb16b3c..be2c0ba82c85 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -112,11 +112,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - if (skb_shinfo(skb)->ip6_frag_id) - fptr->identification = skb_shinfo(skb)->ip6_frag_id; - else - ipv6_select_ident(fptr, - (struct rt6_info *)skb_dst(skb)); + if (!skb_shinfo(skb)->ip6_frag_id) + ipv6_proxy_select_ident(skb); + fptr->identification = skb_shinfo(skb)->ip6_frag_id; /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() From 87f966d97b89774162df04d2106c6350c8fe4cb3 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 19 Mar 2015 10:28:14 +0000 Subject: [PATCH 46/98] net: ethernet: pcnet32: Setup the SRAM and NOUFLO on Am79C97{3, 5} On a MIPS Malta board, tons of fifo underflow errors have been observed when using u-boot as bootloader instead of YAMON. The reason for that is that YAMON used to set the pcnet device to SRAM mode but u-boot does not. As a result, the default Tx threshold (64 bytes) is now too small to keep the fifo relatively used and it can result in Tx fifo underflow errors. It is therefore best to set up the SRAM on supported controllers so we can always use the NOUFLO bit. Cc: Cc: Cc: Cc: Don Fry Signed-off-by: Markos Chandras Signed-off-by: David S.
The SRAM_SIZE + * holds the value of the upper 8 bits of the 16-bit SRAM size. + * The low 8-bits start at 0x00 and end at 0xff. So the + * address range is from 0x0000 up to 0x17ff. Therefore, + * the SRAM_SIZE is set to 0x17. The next step is to set + * the BCR26:SRAM_BND midway through so the Tx and Rx + * buffers can share the SRAM equally. + */ + a->write_bcr(ioaddr, 25, 0x17); + a->write_bcr(ioaddr, 26, 0xc); + /* And finally enable the NOUFLO bit */ + a->write_bcr(ioaddr, 18, a->read_bcr(ioaddr, 18) | (1 << 11)); + } + dev = alloc_etherdev(sizeof(*lp)); if (!dev) { ret = -ENOMEM; From 73ba57bfae4a1914f6a6dac71e3168dd900e00af Mon Sep 17 00:00:00 2001 From: Steven Barth Date: Thu, 19 Mar 2015 16:16:04 +0100 Subject: [PATCH 47/98] ipv6: fix backtracking for throw routes for throw routes to trigger evaluation of other policy rules EAGAIN needs to be propagated up to fib_rules_lookup similar to how its done for IPv4 A simple testcase for verification is: ip -6 rule add lookup 33333 priority 33333 ip -6 route add throw 2001:db8::1 ip -6 route add 2001:db8::1 via fe80::1 dev wlan0 table 33333 ip route get 2001:db8::1 Signed-off-by: Steven Barth Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b4d5e1d97c1b..27ca79682efb 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -104,6 +104,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto again; flp6->saddr = saddr; } + err = rt->dst.error; goto out; } again: From d22e1537181188e5dc8cbc51451832625035bdc2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 19 Mar 2015 19:19:30 -0400 Subject: [PATCH 48/98] tcp: fix tcp fin memory accounting tcp_send_fin() does not account for the memory it allocates properly, so sk_forward_alloc can be negative in cases where we've sent a FIN: ss example output (ss -amn | grep -B1 f4294): tcp FIN-WAIT-1 0 1 192.168.0.1:45520 192.0.2.1:8080 skmem:(r0,rb87380,t0,tb87380,f4294966016,w1280,o0,bl0) Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a2a796c5536b..1db253e36045 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2773,15 +2773,11 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (skb) break; yield(); } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); From 435452aa88474fae5a31fd14fca88f0802e66f53 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 20 Mar 2015 06:28:23 -0400 Subject: [PATCH 49/98] be2net: Prevent VFs from enabling VLAN promiscuous mode Currently, a PF does not restrict its VF interface from enabling vlan promiscuous mode. This breaks vlan isolation when a vlan (transparent tagging) is configured on a VF. This patch fixes this problem by disabling the vlan promisc capability for VFs. Reported-by: Yoann Juet Signed-off-by: Vasundhara Volam Signed-off-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be.h | 1 + drivers/net/ethernet/emulex/benet/be_cmds.c | 3 +- drivers/net/ethernet/emulex/benet/be_cmds.h | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 98 ++++++++++++++++----- 4 files changed, 80 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 27de37aa90af..92eb0c82bb04 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -354,6 +354,7 @@ struct be_vf_cfg { u16 vlan_tag; u32 tx_rate; u32 plink_tracking; + u32 privileges; }; enum vf_state { diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 36916cfa70f9..3e894f449cc1 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -1918,7 +1918,7 @@ int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd, /* Uses sycnhronous mcc */ int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, - u32 num) + u32 num, u32 domain) { struct be_mcc_wrb *wrb; struct be_cmd_req_vlan_config *req; @@ -1936,6 +1936,7 @@ int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, OPCODE_COMMON_NTWK_VLAN_CONFIG, sizeof(*req), wrb, NULL); + req->hdr.domain = domain; req->interface_id = if_id; req->untagged = BE_IF_FLAGS_UNTAGGED & be_if_cap_flags(adapter) ? 1 : 0; diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index db761e8e42a3..a7634a3f052a 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -2256,7 +2256,7 @@ int lancer_cmd_get_pport_stats(struct be_adapter *adapter, int be_cmd_get_fw_ver(struct be_adapter *adapter); int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *, int num); int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, - u32 num); + u32 num, u32 domain); int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 status); int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc); int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 0a816859aca5..eb2bed92bf60 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1171,7 +1171,7 @@ static int be_vid_config(struct be_adapter *adapter) for_each_set_bit(i, adapter->vids, VLAN_N_VID) vids[num++] = cpu_to_le16(i); - status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num); + status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num, 0); if (status) { dev_err(dev, "Setting HW VLAN filtering failed\n"); /* Set to VLAN promisc mode as setting VLAN filter failed */ @@ -1380,11 +1380,67 @@ static int be_get_vf_config(struct net_device *netdev, int vf, return 0; } +static int be_set_vf_tvt(struct be_adapter *adapter, int vf, u16 vlan) +{ + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + u16 vids[BE_NUM_VLANS_SUPPORTED]; + int vf_if_id = vf_cfg->if_handle; + int status; + + /* Enable Transparent VLAN Tagging */ + status = be_cmd_set_hsw_config(adapter, vlan, vf + 1, vf_if_id, 0); + if (status) + return status; + + /* Clear pre-programmed VLAN filters on VF if any, if TVT is enabled */ + vids[0] = 0; + status = 
be_cmd_vlan_config(adapter, vf_if_id, vids, 1, vf + 1); + if (!status) + dev_info(&adapter->pdev->dev, + "Cleared guest VLANs on VF%d", vf); + + /* After TVT is enabled, disallow VFs to program VLAN filters */ + if (vf_cfg->privileges & BE_PRIV_FILTMGMT) { + status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges & + ~BE_PRIV_FILTMGMT, vf + 1); + if (!status) + vf_cfg->privileges &= ~BE_PRIV_FILTMGMT; + } + return 0; +} + +static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) +{ + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + struct device *dev = &adapter->pdev->dev; + int status; + + /* Reset Transparent VLAN Tagging. */ + status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID, vf + 1, + vf_cfg->if_handle, 0); + if (status) + return status; + + /* Allow VFs to program VLAN filtering */ + if (!(vf_cfg->privileges & BE_PRIV_FILTMGMT)) { + status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges | + BE_PRIV_FILTMGMT, vf + 1); + if (!status) { + vf_cfg->privileges |= BE_PRIV_FILTMGMT; + dev_info(dev, "VF%d: FILTMGMT priv enabled", vf); + } + } + + dev_info(dev, + "Disable/re-enable i/f in VM to clear Transparent VLAN tag"); + return 0; +} + static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) { struct be_adapter *adapter = netdev_priv(netdev); struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; - int status = 0; + int status; if (!sriov_enabled(adapter)) return -EPERM; @@ -1394,24 +1450,19 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) if (vlan || qos) { vlan |= qos << VLAN_PRIO_SHIFT; - if (vf_cfg->vlan_tag != vlan) - status = be_cmd_set_hsw_config(adapter, vlan, vf + 1, - vf_cfg->if_handle, 0); + status = be_set_vf_tvt(adapter, vf, vlan); } else { - /* Reset Transparent Vlan Tagging. 
*/ - status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID, - vf + 1, vf_cfg->if_handle, 0); + status = be_clear_vf_tvt(adapter, vf); } if (status) { dev_err(&adapter->pdev->dev, - "VLAN %d config on VF %d failed : %#x\n", vlan, - vf, status); + "VLAN %d config on VF %d failed : %#x\n", vlan, vf, + status); return be_cmd_status(status); } vf_cfg->vlan_tag = vlan; - return 0; } @@ -3339,7 +3390,6 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle, u32 cap_flags, u32 vf) { u32 en_flags; - int status; en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS | @@ -3347,10 +3397,7 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle, en_flags &= cap_flags; - status = be_cmd_if_create(adapter, cap_flags, en_flags, - if_handle, vf); - - return status; + return be_cmd_if_create(adapter, cap_flags, en_flags, if_handle, vf); } static int be_vfs_if_create(struct be_adapter *adapter) @@ -3368,8 +3415,13 @@ static int be_vfs_if_create(struct be_adapter *adapter) if (!BE3_chip(adapter)) { status = be_cmd_get_profile_config(adapter, &res, vf + 1); - if (!status) + if (!status) { cap_flags = res.if_cap_flags; + /* Prevent VFs from enabling VLAN promiscuous + * mode + */ + cap_flags &= ~BE_IF_FLAGS_VLAN_PROMISCUOUS; + } } status = be_if_create(adapter, &vf_cfg->if_handle, @@ -3403,7 +3455,6 @@ static int be_vf_setup(struct be_adapter *adapter) struct device *dev = &adapter->pdev->dev; struct be_vf_cfg *vf_cfg; int status, old_vfs, vf; - u32 privileges; old_vfs = pci_num_vf(adapter->pdev); @@ -3433,15 +3484,18 @@ static int be_vf_setup(struct be_adapter *adapter) for_all_vfs(adapter, vf_cfg, vf) { /* Allow VFs to programs MAC/VLAN filters */ - status = be_cmd_get_fn_privileges(adapter, &privileges, vf + 1); - if (!status && !(privileges & BE_PRIV_FILTMGMT)) { + status = be_cmd_get_fn_privileges(adapter, &vf_cfg->privileges, + vf + 1); + if (!status && !(vf_cfg->privileges & BE_PRIV_FILTMGMT)) { status = be_cmd_set_fn_privileges(adapter, - privileges | + vf_cfg->privileges | BE_PRIV_FILTMGMT, vf + 1); - if (!status) + if (!status) { + vf_cfg->privileges |= BE_PRIV_FILTMGMT; dev_info(dev, "VF%d has FILTMGMT privilege\n", vf); + } } /* Allow full available bandwidth */ From c8ba4ad0b59c511578f8f706aae711f01c990794 Mon Sep 17 00:00:00 2001 From: Suresh Reddy Date: Fri, 20 Mar 2015 06:28:24 -0400 Subject: [PATCH 50/98] be2net: restrict MODIFY_EQ_DELAY cmd to a max of 8 EQs Issuing this cmd for more than 8 EQs does not have the intended effect even on BEx and Skyhawk-R. This patch fixes this by issuing this cmd for upto 8 EQs at a time. Signed-off-by: Suresh Reddy Signed-off-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_cmds.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 3e894f449cc1..7f05f309e935 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -1902,15 +1902,11 @@ int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd, { int num_eqs, i = 0; - if (lancer_chip(adapter) && num > 8) { - while (num) { - num_eqs = min(num, 8); - __be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs); - i += num_eqs; - num -= num_eqs; - } - } else { - __be_cmd_modify_eqd(adapter, set_eqd, num); + while (num) { + num_eqs = min(num, 8); + __be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs); + i += num_eqs; + num -= num_eqs; } return 0; From 25848c9015964d9d97dffd48bbb88b75a25d0a1b Mon Sep 17 00:00:00 2001 From: Suresh Reddy Date: Fri, 20 Mar 2015 06:28:25 -0400 Subject: [PATCH 51/98] be2net: use PCI MMIO read instead of config read for errors When an EEH error occurs, the device/slot is disconnected. This condition is more reliably detected (i.e., returns all ones) with an MMIO read rather than a config read -- especially on power platforms. Hence, this patch fixes EEH error detection by replacing config reads with MMIO reads for reading the error registers. The error registers in Skyhawk-R/BE2/BE3 are accessible both via the config space and the PCICFG (BAR0) memory space. Reported-by: Gavin Shan Signed-off-by: Suresh Reddy Signed-off-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be.h | 1 + drivers/net/ethernet/emulex/benet/be_main.c | 33 ++++++++++++++------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 92eb0c82bb04..27b9fe99a9bd 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -424,6 +424,7 @@ struct be_adapter { u8 __iomem *csr; /* CSR BAR used only for BE2/3 */ u8 __iomem *db; /* Door Bell */ + u8 __iomem *pcicfg; /* On SH,BEx only. 
Shadow of PCI config space */ struct mutex mbox_lock; /* For serializing mbox cmds to BE card */ struct be_dma_mem mbox_mem; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index eb2bed92bf60..e6b790f0d9dc 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2823,14 +2823,12 @@ void be_detect_error(struct be_adapter *adapter) } } } else { - pci_read_config_dword(adapter->pdev, - PCICFG_UE_STATUS_LOW, &ue_lo); - pci_read_config_dword(adapter->pdev, - PCICFG_UE_STATUS_HIGH, &ue_hi); - pci_read_config_dword(adapter->pdev, - PCICFG_UE_STATUS_LOW_MASK, &ue_lo_mask); - pci_read_config_dword(adapter->pdev, - PCICFG_UE_STATUS_HI_MASK, &ue_hi_mask); + ue_lo = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_LOW); + ue_hi = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_HIGH); + ue_lo_mask = ioread32(adapter->pcicfg + + PCICFG_UE_STATUS_LOW_MASK); + ue_hi_mask = ioread32(adapter->pcicfg + + PCICFG_UE_STATUS_HI_MASK); ue_lo = (ue_lo & ~ue_lo_mask); ue_hi = (ue_hi & ~ue_hi_mask); @@ -4874,24 +4872,37 @@ static int be_roce_map_pci_bars(struct be_adapter *adapter) static int be_map_pci_bars(struct be_adapter *adapter) { + struct pci_dev *pdev = adapter->pdev; u8 __iomem *addr; if (BEx_chip(adapter) && be_physfn(adapter)) { - adapter->csr = pci_iomap(adapter->pdev, 2, 0); + adapter->csr = pci_iomap(pdev, 2, 0); if (!adapter->csr) return -ENOMEM; } - addr = pci_iomap(adapter->pdev, db_bar(adapter), 0); + addr = pci_iomap(pdev, db_bar(adapter), 0); if (!addr) goto pci_map_err; adapter->db = addr; + if (skyhawk_chip(adapter) || BEx_chip(adapter)) { + if (be_physfn(adapter)) { + /* PCICFG is the 2nd BAR in BE2 */ + addr = pci_iomap(pdev, BE2_chip(adapter) ? 1 : 0, 0); + if (!addr) + goto pci_map_err; + adapter->pcicfg = addr; + } else { + adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET; + } + } + be_roce_map_pci_bars(adapter); return 0; pci_map_err: - dev_err(&adapter->pdev->dev, "Error in mapping PCI BARs\n"); + dev_err(&pdev->dev, "Error in mapping PCI BARs\n"); be_unmap_pci_bars(adapter); return -ENOMEM; } From 91edd096e224941131f896b86838b1e59553696a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 20 Mar 2015 16:48:13 +0000 Subject: [PATCH 52/98] net: compat: Update get_compat_msghdr() to match copy_msghdr_from_user() behaviour Commit db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) introduced the clamping of msg_namelen when the unsigned value was larger than sizeof(struct sockaddr_storage). This caused a msg_namelen of -1 to be valid. The native code was subsequently fixed by commit dbb490b96584 (net: socket: error on a negative msg_namelen). In addition, the native code sets msg_namelen to 0 when msg_name is NULL. This was done in commit (6a2a2b3ae075 net:socket: set msg_namelen to 0 if msg_name is passed as NULL in msghdr struct from userland) and subsequently updated by 08adb7dabd48 (fold verify_iovec() into copy_msghdr_from_user()). This patch brings the get_compat_msghdr() in line with copy_msghdr_from_user(). Fixes: db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) Cc: David S. Miller Cc: Dan Carpenter Signed-off-by: Catalin Marinas Signed-off-by: David S. 
Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 94d3d5e97883..f7bd286a8280 100644 --- a/net/compat.c +++ b/net/compat.c @@ -49,6 +49,13 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; + + if (!uaddr) + kmsg->msg_namelen = 0; + + if (kmsg->msg_namelen < 0) + return -EINVAL; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control = compat_ptr(tmp3); From 4de930efc23b92ddf88ce91c405ee645fe6e27ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Mar 2015 17:41:43 +0000 Subject: [PATCH 53/98] net: validate the range we feed to iov_iter_init() in sys_sendto/sys_recvfrom Cc: stable@vger.kernel.org # v3.19 Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..245330ca0015 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1702,6 +1702,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (len > INT_MAX) len = INT_MAX; + if (unlikely(!access_ok(VERIFY_READ, buff, len))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1760,6 +1762,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, if (size > INT_MAX) size = INT_MAX; + if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; From f40bff4239d45ac061044a8a79cf6868c62df345 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Sat, 21 Mar 2015 11:29:37 +0100 Subject: [PATCH 54/98] cx82310_eth: wait for firmware to become ready When the device is powered up, some (older) firmware versions fail to work properly if we send commands before the boot is complete (everything is OK when the device is hot-plugged). The firmware indicates its ready status by putting the link up. Newer firmwares delay the first command so they don't suffer from this problem. They also report the link being always up. Wait for firmware to become ready (link up) before sending any commands and/or data. This also allows lowering CMD_TIMEOUT value to a reasonable time. Tested with 4.1.0.9 (old) and 4.1.0.30 (new) firmware versions. Signed-off-by: Ondrej Zary Signed-off-by: David S. 
Miller --- drivers/net/usb/cx82310_eth.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/cx82310_eth.c b/drivers/net/usb/cx82310_eth.c index fe48f4c51373..1762ad3910b2 100644 --- a/drivers/net/usb/cx82310_eth.c +++ b/drivers/net/usb/cx82310_eth.c @@ -46,8 +46,7 @@ enum cx82310_status { }; #define CMD_PACKET_SIZE 64 -/* first command after power on can take around 8 seconds */ -#define CMD_TIMEOUT 15000 +#define CMD_TIMEOUT 100 #define CMD_REPLY_RETRY 5 #define CX82310_MTU 1514 @@ -78,8 +77,9 @@ static int cx82310_cmd(struct usbnet *dev, enum cx82310_cmd cmd, bool reply, ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, CMD_EP), buf, CMD_PACKET_SIZE, &actual_len, CMD_TIMEOUT); if (ret < 0) { - dev_err(&dev->udev->dev, "send command %#x: error %d\n", - cmd, ret); + if (cmd != CMD_GET_LINK_STATUS) + dev_err(&dev->udev->dev, "send command %#x: error %d\n", + cmd, ret); goto end; } @@ -90,8 +90,10 @@ static int cx82310_cmd(struct usbnet *dev, enum cx82310_cmd cmd, bool reply, buf, CMD_PACKET_SIZE, &actual_len, CMD_TIMEOUT); if (ret < 0) { - dev_err(&dev->udev->dev, - "reply receive error %d\n", ret); + if (cmd != CMD_GET_LINK_STATUS) + dev_err(&dev->udev->dev, + "reply receive error %d\n", + ret); goto end; } if (actual_len > 0) @@ -134,6 +136,8 @@ static int cx82310_bind(struct usbnet *dev, struct usb_interface *intf) int ret; char buf[15]; struct usb_device *udev = dev->udev; + u8 link[3]; + int timeout = 50; /* avoid ADSL modems - continue only if iProduct is "USB NET CARD" */ if (usb_string(udev, udev->descriptor.iProduct, buf, sizeof(buf)) > 0 @@ -160,6 +164,20 @@ static int cx82310_bind(struct usbnet *dev, struct usb_interface *intf) if (!dev->partial_data) return -ENOMEM; + /* wait for firmware to become ready (indicated by the link being up) */ + while (--timeout) { + ret = cx82310_cmd(dev, CMD_GET_LINK_STATUS, true, NULL, 0, + link, sizeof(link)); + /* the command can time out during boot - it's not an error */ + if (!ret && link[0] == 1 && link[2] == 1) + break; + msleep(500); + }; + if (!timeout) { + dev_err(&udev->dev, "firmware not ready in time\n"); + return -ETIMEDOUT; + } + /* enable ethernet mode (?) */ ret = cx82310_cmd(dev, CMD_ETHERNET_MODE, true, "\x01", 1, NULL, 0); if (ret) { From 749177ccc74f9c6d0f51bd78a15c652a2134aa11 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Mar 2015 19:25:05 +0100 Subject: [PATCH 55/98] netfilter: nft_compat: set IP6T_F_PROTO flag if protocol is set ip6tables extensions check for this flag to restrict match/target to a given protocol. Without this flag set, SYNPROXY6 returns an error. Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy --- net/netfilter/nft_compat.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 213584cf04b3..65f3e2b6be44 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -133,6 +133,9 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; @@ -344,6 +347,9 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; break; From 44d5f6f5901e996744858c175baee320ccf1eda3 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 17 Mar 2015 16:14:41 +0530 Subject: [PATCH 56/98] powerpc/book3s: Fix the MCE code to use CONFIG_KVM_BOOK3S_64_HANDLER commit id 2ba9f0d has changed CONFIG_KVM_BOOK3S_64_HV to tristate to allow HV/PR bits to be built as modules. But the MCE code still depends on CONFIG_KVM_BOOK3S_64_HV which is wrong. When user selects CONFIG_KVM_BOOK3S_64_HV=m to build HV/PR bits as a separate module the relevant MCE code gets excluded. This patch fixes the MCE code to use CONFIG_KVM_BOOK3S_64_HANDLER. This makes sure that the relevant MCE code is included when HV/PR bits are built as a separate modules. Fixes: 2ba9f0d88750 ("kvm: powerpc: book3s: Support building HV and PR KVM as module") Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c2df8150bd7a..9519e6bdc6d7 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1408,7 +1408,7 @@ machine_check_handle_early: bne 9f /* continue in V mode if we are. */ 5: -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* * We are coming from kernel context. Check if we are coming from * guest. if yes, then we can continue. We will fall through From 0164a711c97b0beeb7994b7d32ccddf586b6d81a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 23 Mar 2015 11:17:56 +0000 Subject: [PATCH 57/98] metag: Fix ioremap_wc/ioremap_cached build errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ioremap_wc() or ioremap_cached() are used without first including asm/pgtable.h, the _PAGE_CACHEABLE or _PAGE_WR_COMBINE definitions aren't found, resulting in build errors like the following (in next-20150323 due to "lib: devres: add a helper function for ioremap_wc"): lib/devres.c: In function ‘devm_ioremap_wc’: lib/devres.c:91: error: ‘_PAGE_WR_COMBINE’ undeclared We can't easily include asm/pgtable.h in asm/io.h due to dependency problems, so split out the _PAGE_* definitions from asm/pgtable.h into a separate asm/pgtable-bits.h header (as a couple of other architectures already do), and include that in io.h instead. Signed-off-by: James Hogan Cc: linux-metag@vger.kernel.org Cc: Abhilash Kesavan Cc: Greg Kroah-Hartman --- arch/metag/include/asm/io.h | 1 + arch/metag/include/asm/pgtable-bits.h | 104 ++++++++++++++++++++++++++ arch/metag/include/asm/pgtable.h | 95 +---------------------- 3 files changed, 106 insertions(+), 94 deletions(-) create mode 100644 arch/metag/include/asm/pgtable-bits.h diff --git a/arch/metag/include/asm/io.h b/arch/metag/include/asm/io.h index 9359e5048442..d5779b0ec573 100644 --- a/arch/metag/include/asm/io.h +++ b/arch/metag/include/asm/io.h @@ -2,6 +2,7 @@ #define _ASM_METAG_IO_H #include +#include #define IO_SPACE_LIMIT 0 diff --git a/arch/metag/include/asm/pgtable-bits.h b/arch/metag/include/asm/pgtable-bits.h new file mode 100644 index 000000000000..25ba6729f496 --- /dev/null +++ b/arch/metag/include/asm/pgtable-bits.h @@ -0,0 +1,104 @@ +/* + * Meta page table definitions. 
+ */ + +#ifndef _METAG_PGTABLE_BITS_H +#define _METAG_PGTABLE_BITS_H + +#include + +/* + * Definitions for MMU descriptors + * + * These are the hardware bits in the MMCU pte entries. + * Derived from the Meta toolkit headers. + */ +#define _PAGE_PRESENT MMCU_ENTRY_VAL_BIT +#define _PAGE_WRITE MMCU_ENTRY_WR_BIT +#define _PAGE_PRIV MMCU_ENTRY_PRIV_BIT +/* Write combine bit - this can cause writes to occur out of order */ +#define _PAGE_WR_COMBINE MMCU_ENTRY_WRC_BIT +/* Sys coherent bit - this bit is never used by Linux */ +#define _PAGE_SYS_COHERENT MMCU_ENTRY_SYS_BIT +#define _PAGE_ALWAYS_ZERO_1 0x020 +#define _PAGE_CACHE_CTRL0 0x040 +#define _PAGE_CACHE_CTRL1 0x080 +#define _PAGE_ALWAYS_ZERO_2 0x100 +#define _PAGE_ALWAYS_ZERO_3 0x200 +#define _PAGE_ALWAYS_ZERO_4 0x400 +#define _PAGE_ALWAYS_ZERO_5 0x800 + +/* These are software bits that we stuff into the gaps in the hardware + * pte entries that are not used. Note, these DO get stored in the actual + * hardware, but the hardware just does not use them. + */ +#define _PAGE_ACCESSED _PAGE_ALWAYS_ZERO_1 +#define _PAGE_DIRTY _PAGE_ALWAYS_ZERO_2 + +/* Pages owned, and protected by, the kernel. */ +#define _PAGE_KERNEL _PAGE_PRIV + +/* No cacheing of this page */ +#define _PAGE_CACHE_WIN0 (MMCU_CWIN_UNCACHED << MMCU_ENTRY_CWIN_S) +/* burst cacheing - good for data streaming */ +#define _PAGE_CACHE_WIN1 (MMCU_CWIN_BURST << MMCU_ENTRY_CWIN_S) +/* One cache way per thread */ +#define _PAGE_CACHE_WIN2 (MMCU_CWIN_C1SET << MMCU_ENTRY_CWIN_S) +/* Full on cacheing */ +#define _PAGE_CACHE_WIN3 (MMCU_CWIN_CACHED << MMCU_ENTRY_CWIN_S) + +#define _PAGE_CACHEABLE (_PAGE_CACHE_WIN3 | _PAGE_WR_COMBINE) + +/* which bits are used for cache control ... */ +#define _PAGE_CACHE_MASK (_PAGE_CACHE_CTRL0 | _PAGE_CACHE_CTRL1 | \ + _PAGE_WR_COMBINE) + +/* This is a mask of the bits that pte_modify is allowed to change. 
*/ +#define _PAGE_CHG_MASK (PAGE_MASK) + +#define _PAGE_SZ_SHIFT 1 +#define _PAGE_SZ_4K (0x0) +#define _PAGE_SZ_8K (0x1 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_16K (0x2 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_32K (0x3 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_64K (0x4 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_128K (0x5 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_256K (0x6 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_512K (0x7 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_1M (0x8 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_2M (0x9 << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_4M (0xa << _PAGE_SZ_SHIFT) +#define _PAGE_SZ_MASK (0xf << _PAGE_SZ_SHIFT) + +#if defined(CONFIG_PAGE_SIZE_4K) +#define _PAGE_SZ (_PAGE_SZ_4K) +#elif defined(CONFIG_PAGE_SIZE_8K) +#define _PAGE_SZ (_PAGE_SZ_8K) +#elif defined(CONFIG_PAGE_SIZE_16K) +#define _PAGE_SZ (_PAGE_SZ_16K) +#endif +#define _PAGE_TABLE (_PAGE_SZ | _PAGE_PRESENT) + +#if defined(CONFIG_HUGETLB_PAGE_SIZE_8K) +# define _PAGE_SZHUGE (_PAGE_SZ_8K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_16K) +# define _PAGE_SZHUGE (_PAGE_SZ_16K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_32K) +# define _PAGE_SZHUGE (_PAGE_SZ_32K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) +# define _PAGE_SZHUGE (_PAGE_SZ_64K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_128K) +# define _PAGE_SZHUGE (_PAGE_SZ_128K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K) +# define _PAGE_SZHUGE (_PAGE_SZ_256K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) +# define _PAGE_SZHUGE (_PAGE_SZ_512K) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1M) +# define _PAGE_SZHUGE (_PAGE_SZ_1M) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_2M) +# define _PAGE_SZHUGE (_PAGE_SZ_2M) +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4M) +# define _PAGE_SZHUGE (_PAGE_SZ_4M) +#endif + +#endif /* _METAG_PGTABLE_BITS_H */ diff --git a/arch/metag/include/asm/pgtable.h b/arch/metag/include/asm/pgtable.h index d0604c0a8702..ffa3a3a2ecad 100644 --- a/arch/metag/include/asm/pgtable.h +++ b/arch/metag/include/asm/pgtable.h @@ -5,6 +5,7 @@ #ifndef _METAG_PGTABLE_H #define _METAG_PGTABLE_H +#include #include /* Invalid regions on Meta: 0x00000000-0x001FFFFF and 0xFFFF0000-0xFFFFFFFF */ @@ -20,100 +21,6 @@ #define VMALLOC_END 0x7FFFFFFF #endif -/* - * Definitions for MMU descriptors - * - * These are the hardware bits in the MMCU pte entries. - * Derived from the Meta toolkit headers. - */ -#define _PAGE_PRESENT MMCU_ENTRY_VAL_BIT -#define _PAGE_WRITE MMCU_ENTRY_WR_BIT -#define _PAGE_PRIV MMCU_ENTRY_PRIV_BIT -/* Write combine bit - this can cause writes to occur out of order */ -#define _PAGE_WR_COMBINE MMCU_ENTRY_WRC_BIT -/* Sys coherent bit - this bit is never used by Linux */ -#define _PAGE_SYS_COHERENT MMCU_ENTRY_SYS_BIT -#define _PAGE_ALWAYS_ZERO_1 0x020 -#define _PAGE_CACHE_CTRL0 0x040 -#define _PAGE_CACHE_CTRL1 0x080 -#define _PAGE_ALWAYS_ZERO_2 0x100 -#define _PAGE_ALWAYS_ZERO_3 0x200 -#define _PAGE_ALWAYS_ZERO_4 0x400 -#define _PAGE_ALWAYS_ZERO_5 0x800 - -/* These are software bits that we stuff into the gaps in the hardware - * pte entries that are not used. Note, these DO get stored in the actual - * hardware, but the hardware just does not use them. - */ -#define _PAGE_ACCESSED _PAGE_ALWAYS_ZERO_1 -#define _PAGE_DIRTY _PAGE_ALWAYS_ZERO_2 - -/* Pages owned, and protected by, the kernel. 
*/ -#define _PAGE_KERNEL _PAGE_PRIV - -/* No cacheing of this page */ -#define _PAGE_CACHE_WIN0 (MMCU_CWIN_UNCACHED << MMCU_ENTRY_CWIN_S) -/* burst cacheing - good for data streaming */ -#define _PAGE_CACHE_WIN1 (MMCU_CWIN_BURST << MMCU_ENTRY_CWIN_S) -/* One cache way per thread */ -#define _PAGE_CACHE_WIN2 (MMCU_CWIN_C1SET << MMCU_ENTRY_CWIN_S) -/* Full on cacheing */ -#define _PAGE_CACHE_WIN3 (MMCU_CWIN_CACHED << MMCU_ENTRY_CWIN_S) - -#define _PAGE_CACHEABLE (_PAGE_CACHE_WIN3 | _PAGE_WR_COMBINE) - -/* which bits are used for cache control ... */ -#define _PAGE_CACHE_MASK (_PAGE_CACHE_CTRL0 | _PAGE_CACHE_CTRL1 | \ - _PAGE_WR_COMBINE) - -/* This is a mask of the bits that pte_modify is allowed to change. */ -#define _PAGE_CHG_MASK (PAGE_MASK) - -#define _PAGE_SZ_SHIFT 1 -#define _PAGE_SZ_4K (0x0) -#define _PAGE_SZ_8K (0x1 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_16K (0x2 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_32K (0x3 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_64K (0x4 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_128K (0x5 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_256K (0x6 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_512K (0x7 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_1M (0x8 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_2M (0x9 << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_4M (0xa << _PAGE_SZ_SHIFT) -#define _PAGE_SZ_MASK (0xf << _PAGE_SZ_SHIFT) - -#if defined(CONFIG_PAGE_SIZE_4K) -#define _PAGE_SZ (_PAGE_SZ_4K) -#elif defined(CONFIG_PAGE_SIZE_8K) -#define _PAGE_SZ (_PAGE_SZ_8K) -#elif defined(CONFIG_PAGE_SIZE_16K) -#define _PAGE_SZ (_PAGE_SZ_16K) -#endif -#define _PAGE_TABLE (_PAGE_SZ | _PAGE_PRESENT) - -#if defined(CONFIG_HUGETLB_PAGE_SIZE_8K) -# define _PAGE_SZHUGE (_PAGE_SZ_8K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_16K) -# define _PAGE_SZHUGE (_PAGE_SZ_16K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_32K) -# define _PAGE_SZHUGE (_PAGE_SZ_32K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -# define _PAGE_SZHUGE (_PAGE_SZ_64K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_128K) -# define _PAGE_SZHUGE (_PAGE_SZ_128K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K) -# define _PAGE_SZHUGE (_PAGE_SZ_256K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -# define _PAGE_SZHUGE (_PAGE_SZ_512K) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1M) -# define _PAGE_SZHUGE (_PAGE_SZ_1M) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_2M) -# define _PAGE_SZHUGE (_PAGE_SZ_2M) -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4M) -# define _PAGE_SZHUGE (_PAGE_SZ_4M) -#endif - /* * The Linux memory management assumes a three-level page table setup. On * Meta, we use that, but "fold" the mid level into the top-level page From e6e96d73a2aaaa54ed2c0f98693f4bf572712f1c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Mar 2015 09:32:37 -0600 Subject: [PATCH 58/98] NVMe: Initialize device list head before starting Driver recovery requires the device's list node to have been initialized. 
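To spell out why (an illustrative sketch, not code from this patch): an embedded struct list_head is not a valid empty list until INIT_LIST_HEAD() has run, so list operations that the recovery and removal paths may perform on dev->node would otherwise follow uninitialized pointers:

    INIT_LIST_HEAD(&dev->node);   /* node.next = node.prev = &node */
    /* ... later, even if the device was never added to dev_list ... */
    list_del_init(&dev->node);    /* now a harmless self-unlink */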
Fixes: https://lkml.org/lkml/2015/3/22/262 Reported-by: Steven Noonan Signed-off-by: Keith Busch Cc: Matthew Wilcox Cc: Jens Axboe Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ceb32dd52a6c..e23be20a3417 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -3003,6 +3003,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) } get_device(dev->device); + INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->probe_work, nvme_async_probe); schedule_work(&dev->probe_work); return 0; From c72efb658f7c8b27ca3d0efb5cfd5ded9fcac89e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:18:48 -0400 Subject: [PATCH 59/98] writeback: fix possible underflow in write bandwidth calculation From 1ebf33901ecc75d9496862dceb1ef0377980587c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:08:19 -0400 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") introduced account_page_redirty() which reverts stat updates for a redirtied page, making BDI_DIRTIED no longer monotonically increasing. bdi_update_write_bandwidth() uses the delta in BDI_DIRTIED as the basis for bandwidth calculation. While unlikely, since the above patch, the newer value may be lower than the recorded past value and underflow the bandwidth calculation leading to a wild result. Fix it by subtracing min of the old and new values when calculating delta. AFAIK, there hasn't been any report of it happening but the resulting erratic behavior would be non-critical and temporary, so it's possible that the issue is happening without being reported. The risk of the fix is very low, so tagged for -stable. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Fixes: 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b4fd980a93eb..644bcb665773 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period + * + * @written may have decreased due to account_page_redirty(). + * Avoid underflowing @bw calculation. */ - bw = written - bdi->written_stamp; + bw = written - min(written, bdi->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); From 2077cef4d5c29cf886192ec32066f783d6a80db8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Mar 2015 09:22:10 -0700 Subject: [PATCH 60/98] sparc64: Fix several bugs in memmove(). Firstly, handle zero length calls properly. Believe it or not there are a few of these happening during early boot. Next, we can't just drop to a memcpy() call in the forward copy case where dst <= src. The reason is that the cache initializing stores used in the Niagara memcpy() implementations can end up clearing out cache lines before we've sourced their original contents completely. For example, considering NG4memcpy, the main unrolled loop begins like this: load src + 0x00 load src + 0x08 load src + 0x10 load src + 0x18 load src + 0x20 store dst + 0x00 Assume dst is 64 byte aligned and let's say that dst is src - 8 for this memcpy() call. 
That store at the end there is the one to the first line in the cache line, thus clearing the whole line, which thus clobbers "src + 0x28" before it even gets loaded. To avoid this, just fall through to a simple copy only mildly optimized for the case where src and dst are 8 byte aligned and the length is a multiple of 8 as well. We could get fancy and call GENmemcpy() but this is good enough for how this thing is actually used. Reported-by: David Ahern Reported-by: Bob Picco Signed-off-by: David S. Miller --- arch/sparc/lib/memmove.S | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/sparc/lib/memmove.S b/arch/sparc/lib/memmove.S index b7f6334e159f..857ad4f8905f 100644 --- a/arch/sparc/lib/memmove.S +++ b/arch/sparc/lib/memmove.S @@ -8,9 +8,11 @@ .text ENTRY(memmove) /* o0=dst o1=src o2=len */ - mov %o0, %g1 + brz,pn %o2, 99f + mov %o0, %g1 + cmp %o0, %o1 - bleu,pt %xcc, memcpy + bleu,pt %xcc, 2f add %o1, %o2, %g7 cmp %g7, %o0 bleu,pt %xcc, memcpy @@ -24,7 +26,34 @@ ENTRY(memmove) /* o0=dst o1=src o2=len */ stb %g7, [%o0] bne,pt %icc, 1b sub %o0, 1, %o0 - +99: retl mov %g1, %o0 + + /* We can't just call memcpy for these memmove cases. On some + * chips the memcpy uses cache initializing stores and when dst + * and src are close enough, those can clobber the source data + * before we've loaded it in. + */ +2: or %o0, %o1, %g7 + or %o2, %g7, %g7 + andcc %g7, 0x7, %g0 + bne,pn %xcc, 4f + nop + +3: ldx [%o1], %g7 + add %o1, 8, %o1 + subcc %o2, 8, %o2 + add %o0, 8, %o0 + bne,pt %icc, 3b + stx %g7, [%o0 - 0x8] + ba,a,pt %xcc, 99b + +4: ldub [%o1], %g7 + add %o1, 1, %o1 + subcc %o2, 1, %o2 + add %o0, 1, %o0 + bne,pt %icc, 4b + stb %g7, [%o0 - 0x1] + ba,a,pt %xcc, 99b ENDPROC(memmove) From e53f21bce4d35a93b23d8fa1a840860f6c74f59e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Mar 2015 15:06:50 +0000 Subject: [PATCH 61/98] arm64: Use the reserved TTBR0 if context switching to the init_mm The idle_task_exit() function may call switch_mm() with next == &init_mm. On arm64, init_mm.pgd cannot be used for user mappings, so this patch simply sets the reserved TTBR0. Cc: Reported-by: Jon Medhurst (Tixy) Tested-by: Jon Medhurst (Tixy) Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a9eee33dfa62..101a42bde728 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -151,6 +151,15 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, { unsigned int cpu = smp_processor_id(); + /* + * init_mm.pgd does not contain any user mappings and it is always + * active for kernel addresses in TTBR1. Just set the reserved TTBR0. + */ + if (next == &init_mm) { + cpu_set_reserved_ttbr0(); + return; + } + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) check_and_switch_context(next, tsk); } From 63a4f065ece613b6d575b538234375b0e9c23bbc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 23 Mar 2015 17:01:43 -0400 Subject: [PATCH 62/98] dm: fix add_disk() NULL pointer due to race with free_dev() Commit c4db59d31e39 ("fs: don't reassign dirty inodes to default_backing_dev_info") exposed DM to a latent race in free_dev() vs add_disk() in relation to management of the device's minor number. Fix this by refactoring free_dev() to match cleanup order of the alloc_dev() error path. 
Move cleanup of the gendisk, queue, and bdev to _before_ the cleanup of the idr managed minor number. Also, purely due to cleanup that fell out during the free_dev() audit: - adjust dm_blk_close() to access the gendisk's private_data under the _minor_lock spinlock. - move __dm_destroy()'s dm_get_live_table() call out from under the _minor_lock spinlock. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1202449 Reported-by: Zdenek Kabelac Reported-by: Jeff Moyer Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9b641b38b857..8001fe9e3434 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -433,7 +433,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) dm_get(md); atomic_inc(&md->open_count); - out: spin_unlock(&_minor_lock); @@ -442,16 +441,20 @@ out: static void dm_blk_close(struct gendisk *disk, fmode_t mode) { - struct mapped_device *md = disk->private_data; + struct mapped_device *md; spin_lock(&_minor_lock); + md = disk->private_data; + if (WARN_ON(!md)) + goto out; + if (atomic_dec_and_test(&md->open_count) && (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) queue_work(deferred_remove_workqueue, &deferred_remove_work); dm_put(md); - +out: spin_unlock(&_minor_lock); } @@ -2241,7 +2244,6 @@ static void free_dev(struct mapped_device *md) int minor = MINOR(disk_devt(md->disk)); unlock_fs(md); - bdput(md->bdev); destroy_workqueue(md->wq); if (md->kworker_task) @@ -2252,19 +2254,22 @@ static void free_dev(struct mapped_device *md) mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); - blk_integrity_unregister(md->disk); - del_gendisk(md->disk); + cleanup_srcu_struct(&md->io_barrier); free_table_devices(&md->table_devices); - free_minor(minor); + dm_stats_cleanup(&md->stats); spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - + if (blk_get_integrity(md->disk)) + blk_integrity_unregister(md->disk); + del_gendisk(md->disk); put_disk(md->disk); blk_cleanup_queue(md->queue); - dm_stats_cleanup(&md->stats); + bdput(md->bdev); + free_minor(minor); + module_put(THIS_MODULE); kfree(md); } @@ -2642,8 +2647,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait) might_sleep(); - spin_lock(&_minor_lock); map = dm_get_live_table(md, &srcu_idx); + + spin_lock(&_minor_lock); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); From c806a6ad35bfa6c92249cd0ca4772d5ac3f8cb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 18 Mar 2015 19:38:22 +0100 Subject: [PATCH 63/98] KVM: x86: call irq notifiers with directed EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_ioapic_update_eoi() wasn't called if directed EOI was enabled. We need to do that for irq notifiers. (Like with edge interrupts.) Fix it by skipping EOI broadcast only. 
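A simplified sketch of the intended EOI flow after this change (illustrative only; the helper names below are hypothetical stand-ins for the real lapic/ioapic code):

    /* Hypothetical helpers standing in for the real ioapic code. */
    void notify_acked_irq(int pin);
    void clear_remote_irr_and_resample(int pin);

    static void eoi_flow_sketch(int pin, bool level_triggered, bool directed_eoi)
    {
            /* Ack notifiers must always run on EOI, directed or not. */
            notify_acked_irq(pin);

            /* Only the level-triggered EOI-broadcast side effect is skipped. */
            if (!level_triggered || directed_eoi)
                    return;

            clear_remote_irr_and_resample(pin);
    }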
Bug: https://bugzilla.kernel.org/show_bug.cgi?id=82211 Signed-off-by: Radim Krčmář Reviewed-by: Paolo Bonzini Tested-by: Bandan Das Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/lapic.c | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index b1947e0f3e10..46d4449772bc 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -422,6 +422,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { int i; + struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -443,7 +444,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG) + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bd4e34de24c7..4ee827d7bf36 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -833,8 +833,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; From 744961341d472db6272ed9b42319a90f5a2aa7c4 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 20 Mar 2015 12:21:37 +0000 Subject: [PATCH 64/98] kvm: avoid page allocation failure in kvm_set_memory_region() KVM guest can fail to startup with following trace on host: qemu-system-x86: page allocation failure: order:4, mode:0x40d0 Call Trace: dump_stack+0x47/0x67 warn_alloc_failed+0xee/0x150 __alloc_pages_direct_compact+0x14a/0x150 __alloc_pages_nodemask+0x776/0xb80 alloc_kmem_pages+0x3a/0x110 kmalloc_order+0x13/0x50 kmemdup+0x1b/0x40 __kvm_set_memory_region+0x24a/0x9f0 [kvm] kvm_set_ioapic+0x130/0x130 [kvm] kvm_set_memory_region+0x21/0x40 [kvm] kvm_vm_ioctl+0x43f/0x750 [kvm] Failure happens when attempting to allocate pages for 'struct kvm_memslots', however it doesn't have to be present in physically contiguous (kmalloc-ed) address space, change allocation to kvm_kvzalloc() so that it will be vmalloc-ed when its size is more then a page. 
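For reference, the allocation pattern being switched to looks roughly like the sketch below (illustrative, not the exact kvm_kvzalloc()/kvfree() implementation): small requests stay on kmalloc, larger ones fall back to vmalloc so no high-order physically contiguous allocation is needed, and the free side must cope with either origin.

    static void *kvzalloc_sketch(unsigned long size)
    {
            if (size > PAGE_SIZE)
                    return vzalloc(size);           /* virtually contiguous is enough */
            return kzalloc(size, GFP_KERNEL);
    }

    static void kvfree_sketch(const void *addr)
    {
            if (is_vmalloc_addr(addr))
                    vfree(addr);
            else
                    kfree(addr);
    }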
Signed-off-by: Igor Mammedov Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a2214d9609bd..cc6a25d95fbf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -471,7 +471,7 @@ static struct kvm *kvm_create_vm(unsigned long type) BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); r = -ENOMEM; - kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots)); if (!kvm->memslots) goto out_err_no_srcu; @@ -522,7 +522,7 @@ out_err_no_srcu: out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); - kfree(kvm->memslots); + kvfree(kvm->memslots); kvm_arch_free_vm(kvm); return ERR_PTR(r); } @@ -578,7 +578,7 @@ static void kvm_free_physmem(struct kvm *kvm) kvm_for_each_memslot(memslot, slots) kvm_free_physmem_slot(kvm, memslot, NULL); - kfree(kvm->memslots); + kvfree(kvm->memslots); } static void kvm_destroy_devices(struct kvm *kvm) @@ -871,10 +871,10 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), - GFP_KERNEL); + slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); if (!slots) goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { slot = id_to_memslot(slots, mem->slot); @@ -917,7 +917,7 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_arch_commit_memory_region(kvm, mem, &old, change); kvm_free_physmem_slot(kvm, &old, &new); - kfree(old_memslots); + kvfree(old_memslots); /* * IOMMU mapping: New slots need to be mapped. Old slots need to be @@ -936,7 +936,7 @@ int __kvm_set_memory_region(struct kvm *kvm, return 0; out_slots: - kfree(slots); + kvfree(slots); out_free: kvm_free_physmem_slot(kvm, &new, &old); out: From 8218c3f4df3bb1c637c17552405039a6dd3c1ee1 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 27 Feb 2015 12:58:13 +0100 Subject: [PATCH 65/98] drm: Fixup racy refcounting in plane_force_disable Originally it was impossible to be dropping the last refcount in this function since there was always one around still from the idr. But in commit 83f45fc360c8e16a330474860ebda872d1384c8c Author: Daniel Vetter Date: Wed Aug 6 09:10:18 2014 +0200 drm: Don't grab an fb reference for the idr we've switched to weak references, broke that assumption but forgot to fix it up. Since we still force-disable planes it's only possible to hit this when racing multiple rmfb with fbdev restoring or similar evil things. As long as userspace is nice it's impossible to hit the BUG_ON. But the BUG_ON would most likely be hit from fbdev code, which usually invovles the console_lock besides all modeset locks. So very likely we'd never get the bug reports if this was hit in the wild, hence better be safe than sorry and backport. Spotted by Matt Roper while reviewing other patches. 
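Illustrative contrast (not the drm code, names made up): a kref_put() whose release callback is a BUG() encodes the assumption "this can never be the last reference". Once the idr only holds weak references that assumption is gone, so the put must go through the real release path:

    struct my_obj {
            struct kref ref;
    };

    static void my_obj_release(struct kref *kref)
    {
            kfree(container_of(kref, struct my_obj, ref));
    }

    static void my_obj_put(struct my_obj *obj)
    {
            /* Safe even when this drops the last reference. */
            kref_put(&obj->ref, my_obj_release);
    }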
[airlied: pull this back into 4.0 - the oops happens there] Cc: stable@vger.kernel.org Cc: Matt Roper Reviewed-by: Matt Roper Signed-off-by: Daniel Vetter Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_crtc.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index f6d04c7b5115..679b10e34fb5 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -525,17 +525,6 @@ void drm_framebuffer_reference(struct drm_framebuffer *fb) } EXPORT_SYMBOL(drm_framebuffer_reference); -static void drm_framebuffer_free_bug(struct kref *kref) -{ - BUG(); -} - -static void __drm_framebuffer_unreference(struct drm_framebuffer *fb) -{ - DRM_DEBUG("%p: FB ID: %d (%d)\n", fb, fb->base.id, atomic_read(&fb->refcount.refcount)); - kref_put(&fb->refcount, drm_framebuffer_free_bug); -} - /** * drm_framebuffer_unregister_private - unregister a private fb from the lookup idr * @fb: fb to unregister @@ -1320,7 +1309,7 @@ void drm_plane_force_disable(struct drm_plane *plane) return; } /* disconnect the plane from the fb and crtc: */ - __drm_framebuffer_unreference(plane->old_fb); + drm_framebuffer_unreference(plane->old_fb); plane->old_fb = NULL; plane->fb = NULL; plane->crtc = NULL; From f3eab7184ddcd4867cf42e3274ba24a66e1e093d Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Sun, 22 Mar 2015 14:51:51 +0000 Subject: [PATCH 66/98] arm64: percpu: Make this_cpu accessors pre-empt safe this_cpu operations were implemented for arm64 in: 5284e1b arm64: xchg: Implement cmpxchg_double f97fc81 arm64: percpu: Implement this_cpu operations Unfortunately, it is possible for pre-emption to take place between address generation and data access. This can lead to cases where data is being manipulated by this_cpu for a different CPU than it was called on. Which effectively breaks the spec. This patch disables pre-emption for the this_cpu operations guaranteeing that address generation and data manipulation take place without a pre-emption in-between. 
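The window being closed can be sketched as follows (illustrative only; pcp_counter is an assumed example variable, not kernel code):

    static DEFINE_PER_CPU(unsigned long, pcp_counter);

    static void broken_inc(void)
    {
            unsigned long *p = raw_cpu_ptr(&pcp_counter);  /* address of CPU A's copy   */
                                                           /* <-- preemption may migrate
                                                                  the task to CPU B here */
            *p += 1;                                       /* still writes CPU A's data */
    }

    static void fixed_inc(void)
    {
            preempt_disable();
            *raw_cpu_ptr(&pcp_counter) += 1;               /* address + access, no window */
            preempt_enable();
    }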
Fixes: 5284e1b4bc8a ("arm64: xchg: Implement cmpxchg_double") Fixes: f97fc810798c ("arm64: percpu: Implement this_cpu operations") Reported-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Steve Capper [catalin.marinas@arm.com: remove space after type cast] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cmpxchg.h | 30 ++++++++++++++++----- arch/arm64/include/asm/percpu.h | 46 +++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index cb9593079f29..d8c25b7b18fb 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -246,14 +246,30 @@ static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old, __ret; \ }) -#define this_cpu_cmpxchg_1(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n) -#define this_cpu_cmpxchg_2(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n) -#define this_cpu_cmpxchg_4(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n) -#define this_cpu_cmpxchg_8(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n) +#define _protect_cmpxchg_local(pcp, o, n) \ +({ \ + typeof(*raw_cpu_ptr(&(pcp))) __ret; \ + preempt_disable(); \ + __ret = cmpxchg_local(raw_cpu_ptr(&(pcp)), o, n); \ + preempt_enable(); \ + __ret; \ +}) -#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \ - cmpxchg_double_local(raw_cpu_ptr(&(ptr1)), raw_cpu_ptr(&(ptr2)), \ - o1, o2, n1, n2) +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) +#define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) +#define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) + +#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \ +({ \ + int __ret; \ + preempt_disable(); \ + __ret = cmpxchg_double_local( raw_cpu_ptr(&(ptr1)), \ + raw_cpu_ptr(&(ptr2)), \ + o1, o2, n1, n2); \ + preempt_enable(); \ + __ret; \ +}) #define cmpxchg64(ptr,o,n) cmpxchg((ptr),(o),(n)) #define cmpxchg64_local(ptr,o,n) cmpxchg_local((ptr),(o),(n)) diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 09da25bc596f..4fde8c1df97f 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -204,25 +204,47 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val, return ret; } -#define _percpu_add(pcp, val) \ - __percpu_add(raw_cpu_ptr(&(pcp)), val, sizeof(pcp)) +#define _percpu_read(pcp) \ +({ \ + typeof(pcp) __retval; \ + preempt_disable(); \ + __retval = (typeof(pcp))__percpu_read(raw_cpu_ptr(&(pcp)), \ + sizeof(pcp)); \ + preempt_enable(); \ + __retval; \ +}) -#define _percpu_add_return(pcp, val) (typeof(pcp)) (_percpu_add(pcp, val)) +#define _percpu_write(pcp, val) \ +do { \ + preempt_disable(); \ + __percpu_write(raw_cpu_ptr(&(pcp)), (unsigned long)(val), \ + sizeof(pcp)); \ + preempt_enable(); \ +} while(0) \ + +#define _pcp_protect(operation, pcp, val) \ +({ \ + typeof(pcp) __retval; \ + preempt_disable(); \ + __retval = (typeof(pcp))operation(raw_cpu_ptr(&(pcp)), \ + (val), sizeof(pcp)); \ + preempt_enable(); \ + __retval; \ +}) + +#define _percpu_add(pcp, val) \ + _pcp_protect(__percpu_add, pcp, val) + +#define _percpu_add_return(pcp, val) _percpu_add(pcp, val) #define _percpu_and(pcp, val) \ - __percpu_and(raw_cpu_ptr(&(pcp)), val, sizeof(pcp)) + _pcp_protect(__percpu_and, pcp, val) #define _percpu_or(pcp, val) \ - __percpu_or(raw_cpu_ptr(&(pcp)), val, sizeof(pcp)) - 
-#define _percpu_read(pcp) (typeof(pcp)) \ - (__percpu_read(raw_cpu_ptr(&(pcp)), sizeof(pcp))) - -#define _percpu_write(pcp, val) \ - __percpu_write(raw_cpu_ptr(&(pcp)), (unsigned long)(val), sizeof(pcp)) + _pcp_protect(__percpu_or, pcp, val) #define _percpu_xchg(pcp, val) (typeof(pcp)) \ - (__percpu_xchg(raw_cpu_ptr(&(pcp)), (unsigned long)(val), sizeof(pcp))) + _pcp_protect(__percpu_xchg, pcp, (unsigned long)(val)) #define this_cpu_add_1(pcp, val) _percpu_add(pcp, val) #define this_cpu_add_2(pcp, val) _percpu_add(pcp, val) From 59a58cb34d3fe73e6c899cc5d9a87428ca662925 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Thu, 5 Feb 2015 18:30:20 +0000 Subject: [PATCH 67/98] drm/i915: Don't try to reference the fb in get_initial_plane_config() Tvrtko noticed a new warning on boot: WARNING: CPU: 1 PID: 353 at include/linux/kref.h:47 drm_framebuffer_reference+0x6c/0x80 [drm]() Call Trace: [] dump_stack+0x4f/0x7b [] warn_slowpath_common+0xaa/0xd0 [] warn_slowpath_null+0x1a/0x20 [] drm_framebuffer_reference+0x6c/0x80 [drm] [] update_state_fb.isra.54+0x47/0x50 [i915] [] skylake_get_initial_plane_config+0x93c/0x950 [i915] [] intel_modeset_init+0x1551/0x17c0 [i915] [] i915_driver_load+0xed0/0x11e0 [i915] [] ? _raw_spin_unlock_irqrestore+0x51/0x70 [] drm_dev_register+0x77/0x110 [drm] [] drm_get_pci_dev+0x11b/0x1f0 [drm] [] ? trace_hardirqs_on+0xd/0x10 [] ? _raw_spin_unlock_irqrestore+0x51/0x70 [] i915_pci_probe+0x56/0x60 [i915] [] pci_device_probe+0x7c/0x100 [] driver_probe_device+0x16d/0x380 We cannot take a reference at this point, not before intel_framebuffer_init() and the underlying drm_framebuffer_init(). Introduced in: commit 706dc7b549175e47f23e913b7f1e52874a7d0f56 Author: Matt Roper Date: Tue Feb 3 13:10:04 2015 -0800 drm/i915: Ensure plane->state->fb stays in sync with plane->fb v2: Don't move update_state_fb(). It was moved around because I originally put update_state_fb() in intel_alloc_plane_obj() before finding a better place. 
(Matt) Reviewed-by: Matt Roper Reported-by: Tvrtko Ursulin Cc: Matt Roper Cc: Tvrtko Ursulin Signed-off-by: Damien Lespiau Signed-off-by: Daniel Vetter From drm-next: (cherry picked from commit f55548b5af87ebfc586ca75748947f1c1b1a4a52) Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/intel_display.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 6d22128d97b1..1c12262029fb 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2438,8 +2438,10 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc, if (!intel_crtc->base.primary->fb) return; - if (intel_alloc_plane_obj(intel_crtc, plane_config)) + if (intel_alloc_plane_obj(intel_crtc, plane_config)) { + update_state_fb(intel_crtc->base.primary); return; + } kfree(intel_crtc->base.primary->fb); intel_crtc->base.primary->fb = NULL; @@ -6663,7 +6665,6 @@ i9xx_get_initial_plane_config(struct intel_crtc *crtc, plane_config->size); crtc->base.primary->fb = fb; - update_state_fb(crtc->base.primary); } static void chv_crtc_clock_get(struct intel_crtc *crtc, @@ -7704,7 +7705,6 @@ skylake_get_initial_plane_config(struct intel_crtc *crtc, plane_config->size); crtc->base.primary->fb = fb; - update_state_fb(crtc->base.primary); return; error: @@ -7798,7 +7798,6 @@ ironlake_get_initial_plane_config(struct intel_crtc *crtc, plane_config->size); crtc->base.primary->fb = fb; - update_state_fb(crtc->base.primary); } static bool ironlake_get_pipe_config(struct intel_crtc *crtc, From 1833c9f647e9bda1cd24653ff8f9c207b5f5b911 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 21 Mar 2015 12:43:08 +0100 Subject: [PATCH 68/98] s390/smp: reenable smt after resume After a suspend/resume cycle we missed to enable smt again, which leads to all sorts of bugs, since the kernel assumes smt is enabled, while the hardware thinks it is not. Reported-and-tested-by: Sebastian Ott Reported-by: Stefan Haberland Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/swsusp_asm64.S | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/s390/kernel/swsusp_asm64.S b/arch/s390/kernel/swsusp_asm64.S index 6b09fdffbd2f..ca6294645dd3 100644 --- a/arch/s390/kernel/swsusp_asm64.S +++ b/arch/s390/kernel/swsusp_asm64.S @@ -177,6 +177,17 @@ restart_entry: lhi %r1,1 sigp %r1,%r0,SIGP_SET_ARCHITECTURE sam64 +#ifdef CONFIG_SMP + larl %r1,smp_cpu_mt_shift + icm %r1,15,0(%r1) + jz smt_done + llgfr %r1,%r1 +smt_loop: + sigp %r1,%r0,SIGP_SET_MULTI_THREADING + brc 8,smt_done /* accepted */ + brc 2,smt_loop /* busy, try again */ +smt_done: +#endif larl %r1,.Lnew_pgm_check_psw lpswe 0(%r1) pgm_check_entry: From fb903811c4e3bcd93ccc247235834e2632a3508d Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 25 Mar 2015 15:55:06 -0700 Subject: [PATCH 69/98] aoe: update aoe maintainer information The coraid.com email address is defunct. The old aoe support area hosted at coraid.com is no longer up. These changes update the email and website to current ones. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 358eb0105e00..c6cd0f60635c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1675,8 +1675,8 @@ F: drivers/misc/eeprom/at24.c F: include/linux/platform_data/at24.h ATA OVER ETHERNET (AOE) DRIVER -M: "Ed L. 
Cashin" -W: http://support.coraid.com/support/linux +M: "Ed L. Cashin" +W: http://www.openaoe.org/ S: Supported F: Documentation/aoe/ F: drivers/block/aoe/ From ddd2a30d41a5bc578ab094f6dbf080697ea1a7dd Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 25 Mar 2015 15:55:09 -0700 Subject: [PATCH 70/98] drivers/rtc/rtc-mrst: fix suspend/resume The Moorestown RTC driver implements suspend and resume callbacks and assigns them to the suspend and resume fields of the device_driver struct. These callbacks are never actually called by anything though. Modify the driver to properly use dev_pm_ops so that the suspend and resume functions are actually executed upon suspend/resume. [akpm@linux-foundation.org: device_driver.name is const char *] Signed-off-by: Lars-Peter Clausen Cc: Alessandro Zummo Cc: Feng Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mrst.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index e2436d140175..3a6fd3a8a2ec 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -413,8 +413,8 @@ static void rtc_mrst_do_remove(struct device *dev) mrst->dev = NULL; } -#ifdef CONFIG_PM -static int mrst_suspend(struct device *dev, pm_message_t mesg) +#ifdef CONFIG_PM_SLEEP +static int mrst_suspend(struct device *dev) { struct mrst_rtc *mrst = dev_get_drvdata(dev); unsigned char tmp; @@ -453,7 +453,7 @@ static int mrst_suspend(struct device *dev, pm_message_t mesg) */ static inline int mrst_poweroff(struct device *dev) { - return mrst_suspend(dev, PMSG_HIBERNATE); + return mrst_suspend(dev); } static int mrst_resume(struct device *dev) @@ -490,9 +490,11 @@ static int mrst_resume(struct device *dev) return 0; } +static SIMPLE_DEV_PM_OPS(mrst_pm_ops, mrst_suspend, mrst_resume); +#define MRST_PM_OPS (&mrst_pm_ops) + #else -#define mrst_suspend NULL -#define mrst_resume NULL +#define MRST_PM_OPS NULL static inline int mrst_poweroff(struct device *dev) { @@ -529,9 +531,8 @@ static struct platform_driver vrtc_mrst_platform_driver = { .remove = vrtc_mrst_platform_remove, .shutdown = vrtc_mrst_platform_shutdown, .driver = { - .name = (char *) driver_name, - .suspend = mrst_suspend, - .resume = mrst_resume, + .name = driver_name, + .pm = MRST_PM_OPS, } }; From 3fe89b3e2a7bbf3e97657104b9b33a9d81b950b3 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Wed, 25 Mar 2015 15:55:11 -0700 Subject: [PATCH 71/98] mm: fix anon_vma->degree underflow in anon_vma endless growing prevention I have constantly stumbled upon "kernel BUG at mm/rmap.c:399!" after upgrading to 3.19 and had no luck with 4.0-rc1 neither. So, after looking into new logic introduced by commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy"), I found chances are that unlink_anon_vmas() is called without incrementing dst->anon_vma->degree in anon_vma_clone() due to allocation failure. If dst->anon_vma is not NULL in error path, its degree will be incorrectly decremented in unlink_anon_vmas() and eventually underflow when exiting as a result of another call to unlink_anon_vmas(). That's how "kernel BUG at mm/rmap.c:399!" is triggered for me. This patch fixes the underflow by dropping dst->anon_vma when allocation fails. It's safe to do so regardless of original value of dst->anon_vma because dst->anon_vma doesn't have valid meaning if anon_vma_clone() fails. Besides, callers don't care dst->anon_vma in such case neither. 
Also suggested by Michal Hocko, we can clean up vma_adjust() a bit as anon_vma_clone() now does the work. [akpm@linux-foundation.org: tweak comment] Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Signed-off-by: Leon Yu Signed-off-by: Konstantin Khlebnikov Reviewed-by: Michal Hocko Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 +--- mm/rmap.c | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index da9990acc08b..9ec50a368634 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end); importer->anon_vma = exporter->anon_vma; error = anon_vma_clone(importer, exporter); - if (error) { - importer->anon_vma = NULL; + if (error) return error; - } } } diff --git a/mm/rmap.c b/mm/rmap.c index 5e3e09081164..c161a14b6a8f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return 0; enomem_failure: + /* + * dst->anon_vma is dropped here otherwise its degree can be incorrectly + * decremented in unlink_anon_vmas(). + * We can safely do this because callers of anon_vma_clone() don't care + * about dst->anon_vma if anon_vma_clone() failed. + */ + dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } From f683739539e819e9b821a197d80e52258510837b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 25 Mar 2015 15:55:14 -0700 Subject: [PATCH 72/98] mm/pagewalk.c: prevent positive return value of walk_page_test() from being passed to callers walk_page_test() is purely pagewalk's internal stuff, and its positive return values are not intended to be passed to the callers of pagewalk. However, in the current code if the last vma in the do-while loop in walk_page_range() happens to return a positive value, it leaks outside walk_page_range(). So the user visible effect is invalid/unexpected return value (according to the reporter, mbind() causes it.) This patch fixes it simply by reinitializing the return value after checked. Another exposed interface, walk_page_vma(), already returns 0 for such cases so no problem. Fixes: fafaa4264eba ("pagewalk: improve vma handling") Signed-off-by: Naoya Horiguchi Signed-off-by: Kazutomo Yoshii Reported-by: Kazutomo Yoshii Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 75c1f2878519..29f2f8b853ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end, vma = vma->vm_next; err = walk_page_test(start, next, walk); - if (err > 0) + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. + */ + err = 0; continue; + } if (err < 0) break; } From 59ec96719f7521002804c0862888971410e288af Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Mar 2015 15:55:17 -0700 Subject: [PATCH 73/98] MAINTAINERS: correct rtc armada38x pattern entry Commit c6a95dbee793 ("MAINTAINERS: add the RTC driver for the Armada38x") typoed the pattern, fix it. 
Signed-off-by: Joe Perches Acked-by: Gregory CLEMENT Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c6cd0f60635c..12ebef95224d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1186,7 +1186,7 @@ M: Sebastian Hesselbarth L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-mvebu/ -F: drivers/rtc/armada38x-rtc +F: drivers/rtc/rtc-armada38x.c ARM/Marvell Berlin SoC support M: Sebastian Hesselbarth From b0dc3a342af36f95a68fe229b8f0f73552c5ca08 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Mar 2015 15:55:20 -0700 Subject: [PATCH 74/98] mm/memory hotplug: postpone the reset of obsolete pgdat Qiu Xishi reported the following BUG when testing hot-add/hot-remove node under stress condition: BUG: unable to handle kernel paging request at 0000000000025f60 IP: next_online_pgdat+0x1/0x50 PGD 0 Oops: 0000 [#1] SMP ACPI: Device does not support D3cold Modules linked in: fuse nls_iso8859_1 nls_cp437 vfat fat loop dm_mod coretemp mperf crc32c_intel ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul glue_helper aes_x86_64 pcspkr microcode igb dca i2c_algo_bit ipv6 megaraid_sas iTCO_wdt i2c_i801 i2c_core iTCO_vendor_support tg3 sg hwmon ptp lpc_ich pps_core mfd_core acpi_pad rtc_cmos button ext3 jbd mbcache sd_mod crc_t10dif scsi_dh_alua scsi_dh_rdac scsi_dh_hp_sw scsi_dh_emc scsi_dh ahci libahci libata scsi_mod [last unloaded: rasf] CPU: 23 PID: 238 Comm: kworker/23:1 Tainted: G O 3.10.15-5885-euler0302 #1 Hardware name: HUAWEI TECHNOLOGIES CO.,LTD. Huawei N1/Huawei N1, BIOS V100R001 03/02/2015 Workqueue: events vmstat_update task: ffffa800d32c0000 ti: ffffa800d32ae000 task.ti: ffffa800d32ae000 RIP: 0010: next_online_pgdat+0x1/0x50 RSP: 0018:ffffa800d32afce8 EFLAGS: 00010286 RAX: 0000000000001440 RBX: ffffffff81da53b8 RCX: 0000000000000082 RDX: 0000000000000000 RSI: 0000000000000082 RDI: 0000000000000000 RBP: ffffa800d32afd28 R08: ffffffff81c93bfc R09: ffffffff81cbdc96 R10: 00000000000040ec R11: 00000000000000a0 R12: ffffa800fffb3440 R13: ffffa800d32afd38 R14: 0000000000000017 R15: ffffa800e6616800 FS: 0000000000000000(0000) GS:ffffa800e6600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000025f60 CR3: 0000000001a0b000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: refresh_cpu_vm_stats+0xd0/0x140 vmstat_update+0x11/0x50 process_one_work+0x194/0x3d0 worker_thread+0x12b/0x410 kthread+0xc6/0xd0 ret_from_fork+0x7c/0xb0 The cause is the "memset(pgdat, 0, sizeof(*pgdat))" at the end of try_offline_node, which will reset all the content of pgdat to 0, as the pgdat is accessed lock-free, so that the users still using the pgdat will panic, such as the vmstat_update routine. 
process A: offline node XX: vmstat_updat() refresh_cpu_vm_stats() for_each_populated_zone() find online node XX cond_resched() offline cpu and memory, then try_offline_node() node_set_offline(nid), and memset(pgdat, 0, sizeof(*pgdat)) zone = next_zone(zone) pg_data_t *pgdat = zone->zone_pgdat; // here pgdat is NULL now next_online_pgdat(pgdat) next_online_node(pgdat->node_id); // NULL pointer access So the solution here is postponing the reset of obsolete pgdat from try_offline_node() to hotadd_new_pgdat(), and just resetting pgdat->nr_zones and pgdat->classzone_idx to be 0 rather than the memset 0 to avoid breaking pointer information in pgdat. Signed-off-by: Gu Zheng Reported-by: Xishi Qiu Suggested-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Tang Chen Cc: Xie XiuQi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fab10795bea..65842d688b7c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1092,6 +1092,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return NULL; arch_refresh_nodedata(nid, pgdat); + } else { + /* Reset the nr_zones and classzone_idx to 0 before reuse */ + pgdat->nr_zones = 0; + pgdat->classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1977,15 +1981,6 @@ void try_offline_node(int nid) if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } - - /* - * Since there is no way to guarentee the address of pgdat/zone is not - * on stack of any kernel threads or used by other kernel objects - * without reference counting or other symchronizing method, do not - * reset node_data and free pgdat here. Just reset it to 0 and reuse - * the memory when the node is online again. - */ - memset(pgdat, 0, sizeof(*pgdat)); } EXPORT_SYMBOL(try_offline_node); From 859b7a0e89120505c304d7afbbe90325abaa0a6b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 25 Mar 2015 15:55:23 -0700 Subject: [PATCH 75/98] mm/slub: fix lockups on PREEMPT && !SMP kernels Commit 9aabf810a67c ("mm/slub: optimize alloc/free fastpath by removing preemption on/off") introduced an occasional hang for kernels built with CONFIG_PREEMPT && !CONFIG_SMP. The problem is the following loop the patch introduced to slab_alloc_node and slab_free: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); GCC 4.9 has been observed to hoist the load of c and c->tid above the loop for !SMP kernels (as in this case raw_cpu_ptr(x) is compile-time constant and does not force a reload). On arm64 the generated assembly looks like: ldr x4, [x0,#8] loop: ldr x1, [x0,#8] cmp x1, x4 b.ne loop If the thread is preempted between the load of c->tid (into x1) and tid (into x4), and an allocation or free occurs in another thread (bumping the cpu_slab's tid), the thread will be stuck in the loop until s->cpu_slab->tid wraps, which may be forever in the absence of allocations/frees on the same CPU. This patch changes the loop condition to access c->tid with READ_ONCE. This ensures that the value is reloaded even when the compiler would otherwise assume it could cache the value, and also ensures that the load will not be torn. 
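The effect of READ_ONCE() can be seen on a generic example, unrelated to slub itself (illustrative sketch; 'flag' is an assumed shared variable):

    static int flag;

    static void wait_for_flag(void)
    {
            /*
             * Without READ_ONCE() the compiler may hoist the load and spin
             * on a stale cached value forever; READ_ONCE() forces a fresh,
             * non-torn load on every iteration.
             */
            while (!READ_ONCE(flag))
                    cpu_relax();
    }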
Signed-off-by: Mark Rutland Cc: Catalin Marinas Acked-by: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6832c4eab104..82c473780c91 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2449,7 +2449,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* * Irqless object alloc/free algorithm used here depends on sequence @@ -2718,7 +2719,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); From cfa869438282be84ad4110bba5027ef1fbbe71e4 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 25 Mar 2015 15:55:26 -0700 Subject: [PATCH 76/98] mm/page_alloc.c: call kernel_map_pages in unset_migrateype_isolate Commit 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") changed the logic of unset_migratetype_isolate to check the buddy allocator and explicitly call __free_pages to merge. The page that is being freed in this path never had prep_new_page called so set_page_refcounted is called explicitly but there is no call to kernel_map_pages. With the default kernel_map_pages this is mostly harmless but if kernel_map_pages does any manipulation of the page tables (unmapping or setting pages to read only) this may trigger a fault: alloc_contig_range test_pages_isolated(ceb00, ced00) failed Unable to handle kernel paging request at virtual address ffffffc0cec00000 pgd = ffffffc045fc4000 [ffffffc0cec00000] *pgd=0000000000000000 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: exfatfs CPU: 1 PID: 23237 Comm: TimedEventQueue Not tainted 3.10.49-gc72ad36-dirty #1 task: ffffffc03de52100 ti: ffffffc015388000 task.ti: ffffffc015388000 PC is at memset+0xc8/0x1c0 LR is at kernel_map_pages+0x1ec/0x244 Fix this by calling kernel_map_pages to ensure the page is set in the page table properly Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Signed-off-by: Laura Abbott Cc: Naoya Horiguchi Cc: Mel Gorman Acked-by: Rik van Riel Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Xishi Qiu Cc: Vladimir Davydov Acked-by: Joonsoo Kim Cc: Gioh Kim Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 72f5ac381ab3..755a42c76eb4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); + kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); isolated_page = page; } From 3d5d472cf55d1be091e8a145f80602bf435ece85 Mon Sep 17 00:00:00 2001 From: Taesoo Kim Date: Wed, 25 Mar 2015 15:55:29 -0700 Subject: [PATCH 77/98] fs/affs/file.c: unlock/release page on error When affs_bread_ino() fails, correctly unlock the page and release the page 
cache with proper error value. All write_end() should unlock/release the page that was locked by write_beg(). Signed-off-by: Taesoo Kim Cc: Fabian Frederick Cc: Al Viro Cc: Geert Uytterhoeven Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/file.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..a91795e01a7f 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, boff = tmp % bsize; if (boff) { bh = affs_bread_ino(inode, bidx, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); @@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, bidx++; } else if (bidx) { bh = affs_bread_ino(inode, bidx - 1, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } } while (from + bsize <= to) { prev_bh = bh; bh = affs_getemptyblk_ino(inode, bidx); if (IS_ERR(bh)) - goto out; + goto err_bh; memcpy(AFFS_DATA(bh), data + from, bsize); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); @@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, prev_bh = bh; bh = affs_bread_ino(inode, bidx, 1); if (IS_ERR(bh)) - goto out; + goto err_bh; tmp = min(bsize, to - from); BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); @@ -790,12 +794,13 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; +err_first_bh: unlock_page(page); page_cache_release(page); return written; -out: +err_bh: bh = prev_bh; if (!written) written = PTR_ERR(bh); From 1f31e1b1963c240ced453489730bdfc9b0110ceb Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 25 Mar 2015 15:55:31 -0700 Subject: [PATCH 78/98] MAINTAINERS: add Jan as DMI/SMBIOS support maintainer I am familiar with these drivers and I care about them so let me add myself as their maintainer. Signed-off-by: Jean Delvare Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 12ebef95224d..88c09ca2584f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3252,6 +3252,13 @@ S: Maintained F: Documentation/hwmon/dme1737 F: drivers/hwmon/dme1737.c +DMI/SMBIOS SUPPORT +M: Jean Delvare +S: Maintained +F: drivers/firmware/dmi-id.c +F: drivers/firmware/dmi_scan.c +F: include/linux/dmi.h + DOCKING STATION DRIVER M: Shaohua Li L: linux-acpi@vger.kernel.org From 98cf21c61a7f5419d82f847c4d77bf6e96a76f5f Mon Sep 17 00:00:00 2001 From: Sergei Antonov Date: Wed, 25 Mar 2015 15:55:34 -0700 Subject: [PATCH 79/98] hfsplus: fix B-tree corruption after insertion at position 0 Fix B-tree corruption when a new record is inserted at position 0 in the node in hfs_brec_insert(). In this case a hfs_brec_update_parent() is called to update the parent index node (if exists) and it is passed hfs_find_data with a search_key containing a newly inserted key instead of the key to be updated. This results in an inconsistent index node. The bug reproduces on my machine after an extents overflow record for the catalog file (CNID=4) is inserted into the extents overflow B-tree. 
Because of a low (reserved) value of CNID=4, it has to become the first record in the first leaf node. The resulting first leaf node is correct: ---------------------------------------------------- | key0.CNID=4 | key1.CNID=123 | key2.CNID=456, ... | ---------------------------------------------------- But the parent index key0 still contains the previous key CNID=123: ----------------------- | key0.CNID=123 | ... | ----------------------- A change in hfs_brec_insert() makes hfs_brec_update_parent() work correctly by preventing it from getting fd->record=-1 value from __hfs_brec_find(). Along the way, I removed duplicate code with unification of the if condition. The resulting code is equivalent to the original code because node is never 0. Also hfs_brec_update_parent() will now return an error after getting a negative fd->record value. However, the return value of hfs_brec_update_parent() is not checked anywhere in the file and I'm leaving it unchanged by this patch. brec.c lacks error checking after some other calls too, but this issue is of less importance than the one being fixed by this patch. Signed-off-by: Sergei Antonov Cc: Joe Perches Reviewed-by: Vyacheslav Dubeyko Acked-by: Hin-Tak Leung Cc: Anton Altaparmakov Cc: Al Viro Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/brec.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 6e560d56094b..754fdf8c6356 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -168,9 +171,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -370,6 +370,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; From bea66fbd11af1ca98ae26855eea41eda8582923e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:37 -0700 Subject: [PATCH 80/98] mm: numa: group related processes based on VMA flags instead of page table flags These are three follow-on patches based on the xfsrepair workload Dave Chinner reported was problematic in 4.0-rc1 due to changes in page table management -- https://lkml.org/lkml/2015/3/1/226. Much of the problem was reduced by commit 53da3bc2ba9e ("mm: fix up numa read-only thread grouping logic") and commit ba68bc0115eb ("mm: thp: Return the correct value for change_huge_pmd"). It was known that the performance in 3.19 was still better even if is far less safe. This series aims to restore the performance without compromising on safety. 
For the test of this mail, I'm comparing 3.19 against 4.0-rc4 and the three patches applied on top autonumabench 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanilla vmwrite-v5r8 preserve-v5r8 slowscan-v5r8 Time System-NUMA01 124.00 ( 0.00%) 161.86 (-30.53%) 107.13 ( 13.60%) 103.13 ( 16.83%) 145.01 (-16.94%) Time System-NUMA01_THEADLOCAL 115.54 ( 0.00%) 107.64 ( 6.84%) 131.87 (-14.13%) 83.30 ( 27.90%) 92.35 ( 20.07%) Time System-NUMA02 9.35 ( 0.00%) 10.44 (-11.66%) 8.95 ( 4.28%) 10.72 (-14.65%) 8.16 ( 12.73%) Time System-NUMA02_SMT 3.87 ( 0.00%) 4.63 (-19.64%) 4.57 (-18.09%) 3.99 ( -3.10%) 3.36 ( 13.18%) Time Elapsed-NUMA01 570.06 ( 0.00%) 567.82 ( 0.39%) 515.78 ( 9.52%) 517.26 ( 9.26%) 543.80 ( 4.61%) Time Elapsed-NUMA01_THEADLOCAL 393.69 ( 0.00%) 384.83 ( 2.25%) 384.10 ( 2.44%) 384.31 ( 2.38%) 380.73 ( 3.29%) Time Elapsed-NUMA02 49.09 ( 0.00%) 49.33 ( -0.49%) 48.86 ( 0.47%) 48.78 ( 0.63%) 50.94 ( -3.77%) Time Elapsed-NUMA02_SMT 47.51 ( 0.00%) 47.15 ( 0.76%) 47.98 ( -0.99%) 48.12 ( -1.28%) 49.56 ( -4.31%) 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 User 46334.60 46391.94 44383.95 43971.89 44372.12 System 252.84 284.66 252.61 201.24 249.00 Elapsed 1062.14 1050.96 998.68 1000.94 1026.78 Overall the system CPU usage is comparable and the test is naturally a bit variable. The slowing of the scanner hurts numa01 but on this machine it is an adverse workload and patches that dramatically help it often hurt absolutely everything else. Due to patch 2, the fault activity is interesting 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Minor Faults 2097811 2656646 2597249 1981230 1636841 Major Faults 362 450 365 364 365 Note the impact preserving the write bit across protection updates and fault reduces faults. NUMA alloc hit 1229008 1217015 1191660 1178322 1199681 NUMA alloc miss 0 0 0 0 0 NUMA interleave hit 0 0 0 0 0 NUMA alloc local 1228514 1216317 1190871 1177448 1199021 NUMA base PTE updates 245706197 240041607 238195516 244704842 115012800 NUMA huge PMD updates 479530 468448 464868 477573 224487 NUMA page range updates 491225557 479886983 476207932 489222218 229950144 NUMA hint faults 659753 656503 641678 656926 294842 NUMA hint local faults 381604 373963 360478 337585 186249 NUMA hint local percent 57 56 56 51 63 NUMA pages migrated 5412140 6374899 6266530 5277468 5755096 AutoNUMA cost 5121% 5083% 4994% 5097% 2388% Here the impact of slowing the PTE scanner on migratrion failures is obvious as "NUMA base PTE updates" and "NUMA huge PMD updates" are massively reduced even though the headline performance is very similar. As xfsrepair was the reported workload here is the impact of the series on it. 
xfsrepair 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanilla vmwrite-v5r8 preserve-v5r8 slowscan-v5r8 Min real-fsmark 1183.29 ( 0.00%) 1165.73 ( 1.48%) 1152.78 ( 2.58%) 1153.64 ( 2.51%) 1177.62 ( 0.48%) Min syst-fsmark 4107.85 ( 0.00%) 4027.75 ( 1.95%) 3986.74 ( 2.95%) 3979.16 ( 3.13%) 4048.76 ( 1.44%) Min real-xfsrepair 441.51 ( 0.00%) 463.96 ( -5.08%) 449.50 ( -1.81%) 440.08 ( 0.32%) 439.87 ( 0.37%) Min syst-xfsrepair 195.76 ( 0.00%) 278.47 (-42.25%) 262.34 (-34.01%) 203.70 ( -4.06%) 143.64 ( 26.62%) Amean real-fsmark 1188.30 ( 0.00%) 1177.34 ( 0.92%) 1157.97 ( 2.55%) 1158.21 ( 2.53%) 1182.22 ( 0.51%) Amean syst-fsmark 4111.37 ( 0.00%) 4055.70 ( 1.35%) 3987.19 ( 3.02%) 3998.72 ( 2.74%) 4061.69 ( 1.21%) Amean real-xfsrepair 450.88 ( 0.00%) 468.32 ( -3.87%) 454.14 ( -0.72%) 442.36 ( 1.89%) 440.59 ( 2.28%) Amean syst-xfsrepair 199.66 ( 0.00%) 290.60 (-45.55%) 277.20 (-38.84%) 204.68 ( -2.51%) 150.55 ( 24.60%) Stddev real-fsmark 4.12 ( 0.00%) 10.82 (-162.29%) 4.14 ( -0.28%) 5.98 (-45.05%) 4.60 (-11.53%) Stddev syst-fsmark 2.63 ( 0.00%) 20.32 (-671.82%) 0.37 ( 85.89%) 16.47 (-525.59%) 15.05 (-471.79%) Stddev real-xfsrepair 6.87 ( 0.00%) 4.55 ( 33.75%) 3.46 ( 49.58%) 1.78 ( 74.12%) 0.52 ( 92.50%) Stddev syst-xfsrepair 3.02 ( 0.00%) 10.30 (-241.37%) 13.17 (-336.37%) 0.71 ( 76.63%) 5.00 (-65.61%) CoeffVar real-fsmark 0.35 ( 0.00%) 0.92 (-164.73%) 0.36 ( -2.91%) 0.52 (-48.82%) 0.39 (-12.10%) CoeffVar syst-fsmark 0.06 ( 0.00%) 0.50 (-682.41%) 0.01 ( 85.45%) 0.41 (-543.22%) 0.37 (-478.78%) CoeffVar real-xfsrepair 1.52 ( 0.00%) 0.97 ( 36.21%) 0.76 ( 49.94%) 0.40 ( 73.62%) 0.12 ( 92.33%) CoeffVar syst-xfsrepair 1.51 ( 0.00%) 3.54 (-134.54%) 4.75 (-214.31%) 0.34 ( 77.20%) 3.32 (-119.63%) Max real-fsmark 1193.39 ( 0.00%) 1191.77 ( 0.14%) 1162.90 ( 2.55%) 1166.66 ( 2.24%) 1188.50 ( 0.41%) Max syst-fsmark 4114.18 ( 0.00%) 4075.45 ( 0.94%) 3987.65 ( 3.08%) 4019.45 ( 2.30%) 4082.80 ( 0.76%) Max real-xfsrepair 457.80 ( 0.00%) 474.60 ( -3.67%) 457.82 ( -0.00%) 444.42 ( 2.92%) 441.03 ( 3.66%) Max syst-xfsrepair 203.11 ( 0.00%) 303.65 (-49.50%) 294.35 (-44.92%) 205.33 ( -1.09%) 155.28 ( 23.55%) The really relevant lines as syst-xfsrepair which is the system CPU usage when running xfsrepair. Note that on my machine the overhead was 45% higher on 4.0-rc4 which may be part of what Dave is seeing. Once we preserve the write bit across faults, it's only 2.51% higher on average. With the full series applied, system CPU usage is 24.6% lower on average. Again, the impact of preserving the write bit on minor faults is obvious and the impact of slowing scanning after migration failures is obvious on the PTE updates. Note also that the number of pages migrated is much reduced even though the headline performance is comparable. 
3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Minor Faults 153466827 254507978 249163829 153501373 105737890 Major Faults 610 702 690 649 724 NUMA base PTE updates 217735049 210756527 217729596 216937111 144344993 NUMA huge PMD updates 129294 85044 106921 127246 79887 NUMA pages migrated 21938995 29705270 28594162 22687324 16258075 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Mean sdb-avgqusz 13.47 2.54 2.55 2.47 2.49 Mean sdb-avgrqsz 202.32 140.22 139.50 139.02 138.12 Mean sdb-await 25.92 5.09 5.33 5.02 5.22 Mean sdb-r_await 4.71 0.19 0.83 0.51 0.11 Mean sdb-w_await 104.13 5.21 5.38 5.05 5.32 Mean sdb-svctm 0.59 0.13 0.14 0.13 0.14 Mean sdb-rrqm 0.16 0.00 0.00 0.00 0.00 Mean sdb-wrqm 3.59 1799.43 1826.84 1812.21 1785.67 Max sdb-avgqusz 111.06 12.13 14.05 11.66 15.60 Max sdb-avgrqsz 255.60 190.34 190.01 187.33 191.78 Max sdb-await 168.24 39.28 49.22 44.64 65.62 Max sdb-r_await 660.00 52.00 280.00 76.00 12.00 Max sdb-w_await 7804.00 39.28 49.22 44.64 65.62 Max sdb-svctm 4.00 2.82 2.86 1.98 2.84 Max sdb-rrqm 8.30 0.00 0.00 0.00 0.00 Max sdb-wrqm 34.20 5372.80 5278.60 5386.60 5546.15 FWIW, I also checked SPECjbb in different configurations but it's similar observations -- minor faults lower, PTE update activity lower and performance is roughly comparable against 3.19. This patch (of 3): Threads that share writable data within pages are grouped together as related tasks. This decision is based on whether the PTE is marked dirty which is subject to timing races between the PTE scanner update and when the application writes the page. If the page is file-backed, then background flushes and sync also affect placement. This is unpredictable behaviour which is impossible to reason about so this patch makes grouping decisions based on the VMA flags. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++----------- mm/memory.c | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 626e93db28ba..2f12e9fcf1a2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1291,17 +1291,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, flags |= TNF_FAULT_LOCAL; } - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - * - * FIXME! This checks "pmd_dirty()" as an approximation of - * "is this a read-only page", since checking "pmd_write()" - * is even more broken. We haven't actually turned this into - * a writable page, so pmd_write() will always be false. - */ - if (!pmd_dirty(pmd)) + /* See similar comment in do_numa_page for explanation */ + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 411144f977b1..20beb6647dba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3069,16 +3069,19 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. 
+	 * the case where a mapping is writable but the process never writes
+	 * to it but pte_write gets cleared during protection updates and
+	 * pte_dirty has unpredictable behaviour between PTE scan updates,
+	 * background writeback, dirty balancing and application behaviour.
 	 *
-	 * FIXME! This checks "pmd_dirty()" as an approximation of
-	 * "is this a read-only page", since checking "pmd_write()"
-	 * is even more broken. We haven't actually turned this into
-	 * a writable page, so pmd_write() will always be false.
+	 * TODO: Note that the ideal here would be to avoid a situation where a
+	 * NUMA fault is taken immediately followed by a write fault in
+	 * some cases which would have lower overhead overall but would be
+	 * invasive as the fault paths would need to be unified.
 	 */
-	if (!pte_dirty(pte))
+	if (!(vma->vm_flags & VM_WRITE))
 		flags |= TNF_NO_GROUP;
 
 	/*

From b191f9b106ea1a24a711dbebb2925d3313da5852 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 25 Mar 2015 15:55:40 -0700
Subject: [PATCH 81/98] mm: numa: preserve PTE write permissions across a NUMA
 hinting fault

Protecting a PTE to trap a NUMA hinting fault clears the writable bit and
further faults are needed after trapping a NUMA hinting fault to set the
writable bit again.  This patch preserves the writable bit when trapping
NUMA hinting faults.  The impact is obvious from the number of minor faults
trapped during the basic balancing benchmark and the system CPU usage;

autonumabench
                                          4.0.0-rc4             4.0.0-rc4
                                           baseline              preserve
Time System-NUMA01                  107.13 (  0.00%)      103.13 (  3.73%)
Time System-NUMA01_THEADLOCAL       131.87 (  0.00%)       83.30 ( 36.83%)
Time System-NUMA02                    8.95 (  0.00%)       10.72 (-19.78%)
Time System-NUMA02_SMT                4.57 (  0.00%)        3.99 ( 12.69%)
Time Elapsed-NUMA01                 515.78 (  0.00%)      517.26 ( -0.29%)
Time Elapsed-NUMA01_THEADLOCAL      384.10 (  0.00%)      384.31 ( -0.05%)
Time Elapsed-NUMA02                  48.86 (  0.00%)       48.78 (  0.16%)
Time Elapsed-NUMA02_SMT              47.98 (  0.00%)       48.12 ( -0.29%)

                 4.0.0-rc4   4.0.0-rc4
                  baseline    preserve
User              44383.95    43971.89
System              252.61      201.24
Elapsed             998.68     1000.94
Minor Faults       2597249     1981230
Major Faults           365         364

There is a similar drop in system CPU usage using Dave Chinner's xfsrepair
workload

                                 4.0.0-rc4           4.0.0-rc4
                                  baseline            preserve
Amean    real-xfsrepair     454.14 (  0.00%)    442.36 (  2.60%)
Amean    syst-xfsrepair     277.20 (  0.00%)    204.68 ( 26.16%)

The patch looks hacky but the alternatives looked worse.  The tidiest was to
rewalk the page tables after a hinting fault but it was more complex than
this approach and the performance was worse.  It's not generally safe to
just mark the page writable during the fault if it's a write fault as it may
have been read-only for COW so that approach was discarded.
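For illustration only, a minimal sketch of the idea follows (the helper name
and reduced argument list are hypothetical; the actual change is the small
diff below): remember whether the PTE was writable before the hinting
protection is cleared, then put the write bit back when making the PTE
present again, so the next store does not take an extra fault.

	static void numa_hinting_fault_restore(struct vm_area_struct *vma,
					       unsigned long addr, pte_t *ptep,
					       pte_t pte)
	{
		bool was_writable = pte_write(pte);	/* state before clearing */

		/* Make the PTE present again and keep the referenced bit */
		pte = pte_modify(pte, vma->vm_page_prot);
		pte = pte_mkyoung(pte);
		if (was_writable)
			pte = pte_mkwrite(pte);		/* preserve write permission */

		set_pte_at(vma->vm_mm, addr, ptep, pte);
		update_mmu_cache(vma, addr, ptep);
	}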
Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 9 ++++++++- mm/memory.c | 8 +++----- mm/mprotect.c | 3 +++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f12e9fcf1a2..0a42d1521aa4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + bool was_writable; int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -1354,7 +1355,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); + was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + if (was_writable) + pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); @@ -1478,6 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; /* @@ -1493,9 +1498,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (!prot_numa || !pmd_protnone(*pmd)) { entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); + BUG_ON(!preserve_write && pmd_write(entry)); } spin_unlock(ptl); } diff --git a/mm/memory.c b/mm/memory.c index 20beb6647dba..d20e12da3a3c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int last_cpupid; int target_nid; bool migrated = false; + bool was_writable = pte_write(pte); int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Make it present again */ pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3075,11 +3078,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. - * - * TODO: Note that the ideal here would be to avoid a situation where a - * NUMA fault is taken immediately followed by a write fault in - * some cases which would have lower overhead overall but would be - * invasive as the fault paths would need to be unified. 
*/ if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; diff --git a/mm/mprotect.c b/mm/mprotect.c index 44727811bf4c..88584838e704 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + if (preserve_write) + ptent = pte_mkwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && From 074c238177a75f5e79af3b2cb6a84e54823ef950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:42 -0700 Subject: [PATCH 82/98] mm: numa: slow PTE scan rate if migration failures occur Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++---- kernel/sched/fair.c | 8 ++++++-- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..a419b65770d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1625,11 +1625,11 @@ struct task_struct { /* * numa_faults_locality tracks if faults recorded during the last - * scan window were remote/local. The task scan period is adapted - * based on the locality of the faults with different weights - * depending on whether they were shared or private faults + * scan window were remote/local or failed to migrate. 
The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults */ - unsigned long numa_faults_locality[2]; + unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1719,6 +1719,7 @@ struct task_struct { #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 #define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a42d1521aa4..51b3e7c64622 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: diff --git a/mm/memory.c b/mm/memory.c index d20e12da3a3c..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) From b7b04004ecd9e58cdc6c6ff92f251d5ac5c0adb2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:45 -0700 Subject: [PATCH 83/98] mm: numa: mark huge PTEs young when clearing NUMA hinting faults Base PTEs are marked young when the NUMA hinting information is cleared but the same does not happen for huge pages which this patch addresses. Note that migrated pages are not marked young as the base page migration code does not assume that migrated pages have been referenced. This could be addressed but beyond the scope of this series which is aimed at Dave Chinners shrink workload that is unlikely to be affected by this issue. 
Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 51b3e7c64622..6817b0350c71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1358,6 +1358,7 @@ clear_pmdnuma: BUG_ON(!PageLocked(page)); was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); From 832a3aad1e2927b1684e7369c9f36a370e0b95da Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 18 Mar 2015 18:19:22 +0000 Subject: [PATCH 84/98] drm/i915: Keep ring->active_list and ring->requests_list consistent If we retire requests last, we may use a later seqno and so clear the requests lists without clearing the active list, leading to confusion. Hence we should retire requests first for consistency with the early return. The order used to be important as the lifecycle for the object on the active list was determined by request->seqno. However, the requests themselves are now reference counted removing the constraint from the order of retirement. Fixes regression from commit 1b5a433a4dd967b125131da42b89b5cc0d5b1f57 Author: John Harrison Date: Mon Nov 24 18:49:42 2014 +0000 drm/i915: Convert 'i915_seqno_passed' calls into 'i915_gem_request_completed ' and a WARNING: CPU: 0 PID: 1383 at drivers/gpu/drm/i915/i915_gem_evict.c:279 i915_gem_evict_vm+0x10c/0x140() WARN_ON(!list_empty(&vm->active_list)) Identified by updating WATCH_LISTS: [drm:i915_verify_lists] *ERROR* blitter ring: active list not empty, but no requests WARNING: CPU: 0 PID: 681 at drivers/gpu/drm/i915/i915_gem.c:2751 i915_gem_retire_requests_ring+0x149/0x230() WARN_ON(i915_verify_lists(ring->dev)) Note that this is only a problem in evict_vm where the following happens after a retire_request has cleaned out all requests, but not all active bo: - intel_ring_idle called from i915_gpu_idle notices that no requests are outstanding and immediately returns. - i915_gem_retire_requests_ring called from i915_gem_retire_requests also immediately returns when there's no request, still leaving the bo on the active list. - evict_vm hits the WARN_ON(!list_empty(&vm->active_list)) after evicting all active objects that there's still stuff left that shouldn't be there. Signed-off-by: Chris Wilson Cc: John Harrison Cc: Daniel Vetter Reviewed-by: Daniel Vetter Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5b205863b659..27ea6bdebce7 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2737,24 +2737,11 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) WARN_ON(i915_verify_lists(ring->dev)); - /* Move any buffers on the active list that are no longer referenced - * by the ringbuffer to the flushing/inactive lists as appropriate, - * before we free the context associated with the requests. + /* Retire requests first as we use it above for the early return. + * If we retire requests last, we may use a later seqno and so clear + * the requests lists without clearing the active list, leading to + * confusion. 
*/ - while (!list_empty(&ring->active_list)) { - struct drm_i915_gem_object *obj; - - obj = list_first_entry(&ring->active_list, - struct drm_i915_gem_object, - ring_list); - - if (!i915_gem_request_completed(obj->last_read_req, true)) - break; - - i915_gem_object_move_to_inactive(obj); - } - - while (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; struct intel_ringbuffer *ringbuf; @@ -2789,6 +2776,23 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) i915_gem_free_request(request); } + /* Move any buffers on the active list that are no longer referenced + * by the ringbuffer to the flushing/inactive lists as appropriate, + * before we free the context associated with the requests. + */ + while (!list_empty(&ring->active_list)) { + struct drm_i915_gem_object *obj; + + obj = list_first_entry(&ring->active_list, + struct drm_i915_gem_object, + ring_list); + + if (!i915_gem_request_completed(obj->last_read_req, true)) + break; + + i915_gem_object_move_to_inactive(obj); + } + if (unlikely(ring->trace_irq_req && i915_gem_request_completed(ring->trace_irq_req, true))) { ring->irq_put(ring); From 3164a803416832d61268b758112e8dfd7e35cdf7 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Thu, 5 Feb 2015 19:24:25 +0000 Subject: [PATCH 85/98] drm/i915: Fix atomic state when reusing the firmware fb Right now, we get a warning when taking over the firmware fb: [drm:drm_atomic_plane_check] FB set but no CRTC with the following backtrace: [] drm_atomic_check_only+0x35d/0x510 [drm] [] drm_atomic_commit+0x17/0x60 [drm] [] drm_atomic_helper_plane_set_property+0x8d/0xd0 [drm_kms_helper] [] drm_mode_plane_set_obj_prop+0x2d/0x90 [drm] [] restore_fbdev_mode+0x6b/0xf0 [drm_kms_helper] [] drm_fb_helper_restore_fbdev_mode_unlocked+0x29/0x80 [drm_kms_helper] [] drm_fb_helper_set_par+0x22/0x50 [drm_kms_helper] [] intel_fbdev_set_par+0x1a/0x60 [i915] [] fbcon_init+0x4f4/0x580 That's because we update the plane state with the fb from the firmware, but we never associate the plane to that CRTC. We don't quite have the full DRM take over from HW state just yet, so fake enough of the plane atomic state to pass the checks. v2: Fix the state on which we set the CRTC in the case we're sharing the initial fb with another pipe. 
(Matt) Signed-off-by: Damien Lespiau Reviewed-by: Matt Roper Signed-off-by: Daniel Vetter [Jani: backported to drm-intel-fixes for v4.0-rc] Reference: http://mid.gmane.org/CA+5PVA7yXH=U757w8V=Zj2U1URG4nYNav20NpjtQ4svVueyPNw@mail.gmail.com Reference: http://lkml.kernel.org/r/CA+55aFweWR=nDzc2Y=rCtL_H8JfdprQiCimN5dwc+TgyD4Bjsg@mail.gmail.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 1c12262029fb..30faf6c262e5 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2439,7 +2439,11 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc, return; if (intel_alloc_plane_obj(intel_crtc, plane_config)) { - update_state_fb(intel_crtc->base.primary); + struct drm_plane *primary = intel_crtc->base.primary; + + primary->state->crtc = &intel_crtc->base; + update_state_fb(primary); + return; } @@ -2464,11 +2468,14 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc, continue; if (i915_gem_obj_ggtt_offset(obj) == plane_config->base) { + struct drm_plane *primary = intel_crtc->base.primary; + if (obj->tiling_mode != I915_TILING_NONE) dev_priv->preserve_bios_swizzle = true; drm_framebuffer_reference(c->primary->fb); - intel_crtc->base.primary->fb = c->primary->fb; + primary->fb = c->primary->fb; + primary->state->crtc = &intel_crtc->base; obj->frontbuffer_bits |= INTEL_FRONTBUFFER_PRIMARY(intel_crtc->pipe); break; } From 5f407751b0ca9bd876fe8f15ff28153661c6ba0a Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 25 Mar 2015 18:30:38 +0100 Subject: [PATCH 86/98] drm/i915: Fixup legacy plane->crtc link for initial fb config This is a very similar bug in the load detect code fixed in commit 9128b040eb774e04bc23777b005ace2b66ab2a85 Author: Daniel Vetter Date: Tue Mar 3 17:31:21 2015 +0100 drm/i915: Fix modeset state confusion in the load detect code But this time around it was the initial fb code that forgot to update the plane->crtc pointer. Otherwise it's the exact same bug, with the exact same restrains (any set_config call/ioctl that doesn't disable the pipe papers over the bug for free, so fairly hard to hit in normal testing). So if you want the full explanation just go read that one over there - it's rather long ... 
Cc: Matt Roper Cc: Linus Torvalds Cc: Chris Wilson Cc: Josh Boyer Cc: Jani Nikula Reported-and-tested-by: Josh Boyer Signed-off-by: Daniel Vetter [Jani: backported to drm-intel-fixes for v4.0-rc] Reference: http://mid.gmane.org/CA+5PVA7ChbtJrknqws1qvZcbrg1CW2pQAFkSMURWWgyASRyGXg@mail.gmail.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 30faf6c262e5..f75173c20f47 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2442,6 +2442,7 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc, struct drm_plane *primary = intel_crtc->base.primary; primary->state->crtc = &intel_crtc->base; + primary->crtc = &intel_crtc->base; update_state_fb(primary); return; @@ -2476,6 +2477,7 @@ intel_find_plane_obj(struct intel_crtc *intel_crtc, drm_framebuffer_reference(c->primary->fb); primary->fb = c->primary->fb; primary->state->crtc = &intel_crtc->base; + primary->crtc = &intel_crtc->base; obj->frontbuffer_bits |= INTEL_FRONTBUFFER_PRIMARY(intel_crtc->pipe); break; } From 8710e914027e4f64058ebbf0501cc6db3cc8454f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:22 -0700 Subject: [PATCH 87/98] timers, sched/clock: Match scope of read and write seqcounts Currently the scope of the raw_write_seqcount_begin/end() in sched_clock_register() far exceeds the scope of the read section in sched_clock(). This gives the impression of safety during cursory review but achieves little. Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves. This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register. We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section. 
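To make the resulting read-side discipline concrete, a schematic reader is
sketched below (types and names are illustrative, not the kernel's): every
field that the registration path may modify is sampled inside the retry loop,
so a torn update is simply retried rather than silently mixed into the result.

	struct clk_snapshot {
		u64 epoch_ns, epoch_cyc, mask;
		u32 mult, shift;
		bool suspended;
	};

	static u64 read_ns_consistent(seqcount_t *seq,
				      const struct clk_snapshot *shared,
				      u64 (*read_cycles)(void))
	{
		struct clk_snapshot s;
		unsigned long start;
		u64 cyc, ns;

		do {
			start = raw_read_seqcount_begin(seq);
			s = *shared;	/* copy everything a writer may change */

			ns = s.epoch_ns;
			if (!s.suspended) {
				cyc = (read_cycles() - s.epoch_cyc) & s.mask;
				ns += (cyc * s.mult) >> s.shift;
			}
		} while (read_seqcount_retry(seq, start));	/* writer raced: retry */

		return ns;
	}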
Signed-off-by: Daniel Thompson [ Reworked it slightly to apply to tip/timers/core] Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ca3bc5c7027c..1751e956add9 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq; - if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq)); - cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; } /* @@ -111,7 +109,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; @@ -124,10 +121,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); - new_wrap_kt = ns_to_ktime(wrap); + cd.wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +136,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch; From cf7c9c170787d6870af54684822f58acc00a966c Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:23 -0700 Subject: [PATCH 88/98] timers, sched/clock: Optimize cache line usage Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular: 1. cd is not ____cacheline_aligned, 2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath, 3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd). This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line. 
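A minimal sketch of the layout idea follows (struct and field names here are
illustrative; the real structures are in the diff below): the fields the hot
reader needs are gathered into one small struct placed immediately after the
seqcount, and the containing object is cacheline-aligned so the fast path
touches a single 64-byte line, while registration-only data trails behind it.

	/* Everything the hot reader needs: roughly 40 bytes. */
	struct hot_read_data {
		u64 epoch_ns;
		u64 epoch_cyc;
		u64 mask;
		u64 (*read_cycles)(void);
		u32 mult;
		u32 shift;
	};

	struct clock_state {
		seqcount_t seq;			/* shares the hot line with read_data */
		struct hot_read_data read_data;
		/* cold, registration-time only fields below */
		ktime_t wrap_kt;
		unsigned long rate;
	};

	/* Alignment keeps seq + read_data within one 64-byte cache line. */
	static struct clock_state state ____cacheline_aligned;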
Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 112 ++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 35 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1751e956add9..872e0685d1fb 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,59 @@ #include #include -struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * @epoch_ns: sched_clock value at last update + * @epoch_cyc: Clock cycle value at last update + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks + * @read_sched_clock: Current clock source (or dummy source when suspended) + * @mult: Multipler for scaled math conversion + * @shift: Shift value for scaled math conversion + * @suspended: Flag to indicate if the clock is suspended (stopped) + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; }; +/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping + * @rate: Tick rate of the registered clock + * @actual_read_sched_clock: Registered clock read function + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. 
+ */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +80,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +}; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +94,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data; do { seq = raw_read_seqcount_begin(&cd.seq); - res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq)); @@ -83,16 +118,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data; - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -111,6 +147,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data; if (cd.rate > rate) return; @@ -129,17 +166,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq); r = rate; @@ -171,7 +209,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. 
*/ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -187,17 +225,21 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; } static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; } static struct syscore_ops sched_clock_ops = { From 13dbeb384d2d3aa555ea48d511e8cb110bd172e0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:24 -0700 Subject: [PATCH 89/98] timers, sched/clock: Remove suspend from clock_read_data() Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath. In fact, once it is removed, we can remove the conditional branches from sched_clock() and install a dummy read_sched_clock function to suspend the clock. The new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock itself. Suggested-by: Thomas Gleixner Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 872e0685d1fb..52ea5d976393 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -28,10 +28,9 @@ * @read_sched_clock: Current clock source (or dummy source when suspended) * @mult: Multipler for scaled math conversion * @shift: Shift value for scaled math conversion - * @suspended: Flag to indicate if the clock is suspended (stopped) * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. 
*/ @@ -42,7 +41,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; }; /** @@ -64,6 +62,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; @@ -83,6 +82,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -99,12 +100,9 @@ unsigned long long notrace sched_clock(void) do { seq = raw_read_seqcount_begin(&cd.seq); - res = rd->epoch_ns; - if (!rd->suspended) { - cyc = rd->read_sched_clock(); - cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; - res += cyc_to_ns(cyc, rd->mult, rd->shift); - } + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq)); return res; @@ -120,7 +118,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data; - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -166,10 +164,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read; raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -209,7 +208,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -223,13 +222,24 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. 
+ */ +static u64 notrace suspended_sched_clock_read(void) +{ + return cd.read_data.epoch_cyc; +} + static int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; return 0; } @@ -237,9 +247,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data; - rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { From 9fee69a8c8070b38b558161a3f18bd5e2b664682 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:25 -0700 Subject: [PATCH 90/98] timers, sched/clock: Remove redundant notrace from update function Currently update_sched_clock() is marked as notrace but this function is not called by ftrace. This is trivially fixed by removing the mark up. Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-5-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 52ea5d976393..8adb9d0c969a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -111,7 +111,7 @@ unsigned long long notrace sched_clock(void) /* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { unsigned long flags; u64 cyc; From 1809bfa44e1019e397fabaa6f2349bb7237e57a4 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:26 -0700 Subject: [PATCH 91/98] timers, sched/clock: Avoid deadlock during read from NMI Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin(). This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC"). Suggested-by: Stephen Boyd Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 101 +++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 34 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8adb9d0c969a..eeea1e950b72 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -47,19 +47,20 @@ struct clock_read_data { * struct clock_data - all data needed for sched_clock (including * registration of a new clock source) * - * @seq: Sequence counter for protecting updates. + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. 
* @wrap_kt: Duration for which clock can run before wrapping * @rate: Tick rate of the registered clock * @actual_read_sched_clock: Registered clock read function * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -80,10 +81,9 @@ static u64 notrace jiffy_sched_clock_read(void) } static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -95,10 +95,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd; do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; @@ -108,27 +109,51 @@ unsigned long long notrace sched_clock(void) return res; } +/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + /* * Atomically update the sched_clock epoch. 
*/ static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -145,7 +170,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; if (cd.rate > rate) return; @@ -162,22 +187,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); + rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read; - raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { @@ -227,15 +253,22 @@ void __init sched_clock_postinit(void) * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of epoch_cyc. */ static u64 notrace suspended_sched_clock_read(void) { - return cd.read_data.epoch_cyc; + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; } static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -245,7 +278,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); From 32fea568aec5b73ae27253125522b5c2a970a1f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Mar 2015 07:08:06 +0100 Subject: [PATCH 92/98] timers, sched/clock: Clean up the code a bit Trivial cleanups, to improve the readability of the generic sched_clock() code: - Improve and standardize comments - Standardize the coding style - Use vertical spacing where appropriate - etc. 
No code changed: md5: 19a053b31e0c54feaeff1492012b019a sched_clock.o.before.asm 19a053b31e0c54feaeff1492012b019a sched_clock.o.after.asm Cc: Catalin Marinas Cc: Daniel Thompson Cc: John Stultz Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Russell King Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 105 ++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index eeea1e950b72..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,5 +1,6 @@ /* - * sched_clock.c: support for extending counters to full 64-bit ns counter + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,15 +20,15 @@ #include /** - * struct clock_read_data - data required to read from sched_clock + * struct clock_read_data - data required to read from sched_clock() * - * @epoch_ns: sched_clock value at last update - * @epoch_cyc: Clock cycle value at last update + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks - * @read_sched_clock: Current clock source (or dummy source when suspended) - * @mult: Multipler for scaled math conversion - * @shift: Shift value for scaled math conversion + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by * some very hot code paths. It occupies <=40 bytes and, when combined @@ -44,25 +45,26 @@ struct clock_read_data { }; /** - * struct clock_data - all data needed for sched_clock (including + * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * * @seq: Sequence counter for protecting updates. The lowest * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. - * @wrap_kt: Duration for which clock can run before wrapping - * @rate: Tick rate of the registered clock - * @actual_read_sched_clock: Registered clock read function + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data[0] (combined) should fit - * into a single 64 byte cache line. + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; - struct clock_read_data read_data[2]; - ktime_t wrap_kt; - unsigned long rate; + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; @@ -112,10 +114,10 @@ unsigned long long notrace sched_clock(void) /* * Updating the data required to read the clock. * - * sched_clock will never observe mis-matched data even if called from + * sched_clock() will never observe mis-matched data even if called from * an NMI. 
We do this by maintaining an odd/even copy of the data and - * steering sched_clock to one or the other using a sequence counter. - * In order to preserve the data cache profile of sched_clock as much + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much * as possible the system reverts back to the even copy when the update * completes; the odd copy is used *only* during an update. */ @@ -135,7 +137,7 @@ static void update_clock_read_data(struct clock_read_data *rd) } /* - * Atomically update the sched_clock epoch. + * Atomically update the sched_clock() epoch. */ static void update_sched_clock(void) { @@ -146,9 +148,7 @@ static void update_sched_clock(void) rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); rd.epoch_ns = ns; rd.epoch_cyc = cyc; @@ -160,11 +160,12 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init sched_clock_register(u64 (*read)(void), int bits, - unsigned long rate) +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -177,51 +178,53 @@ void __init sched_clock_register(u64 (*read)(void), int bits, WARN_ON(!irqs_disabled()); - /* calculate the mult/shift to convert counter ticks to ns. */ + /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; - /* calculate how many nanosecs until we risk wrapping */ + /* Calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); rd = cd.read_data[0]; - /* update epoch for new counter and update epoch_ns from old counter*/ + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); cd.actual_read_sched_clock = read; - rd.read_sched_clock = read; - rd.sched_clock_mask = new_mask; - rd.mult = new_mult; - rd.shift = new_shift; - rd.epoch_cyc = new_epoch; - rd.epoch_ns = ns; + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { r /= 1000000; r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } - /* calculate the ns resolution of this counter */ + /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - /* Enable IRQ time accounting if we have a fast enough sched_clock */ + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 
enable_sched_clock_irqtime(); @@ -231,7 +234,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, void __init sched_clock_postinit(void) { /* - * If no sched_clock function has been provided at that point, + * If no sched_clock() function has been provided at that point, * make it the final one one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) @@ -257,7 +260,7 @@ void __init sched_clock_postinit(void) * This function must only be called from the critical * section in sched_clock(). It relies on the read_seqcount_retry() * at the end of the critical section to be sure we observe the - * correct copy of epoch_cyc. + * correct copy of 'epoch_cyc'. */ static u64 notrace suspended_sched_clock_read(void) { @@ -273,6 +276,7 @@ static int sched_clock_suspend(void) update_sched_clock(); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; + return 0; } @@ -286,13 +290,14 @@ static void sched_clock_resume(void) } static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); + return 0; } device_initcall(sched_clock_syscore_init); From 876e78818def2983be55878b21f7152fbaebbd36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 10:09:06 +0100 Subject: [PATCH 93/98] time: Rename timekeeper::tkr to timekeeper::tkr_mono In preparation of adding another tkr field, rename this one to tkr_mono. Also rename tk_read_base::base_mono to tk_read_base::base, since the structure is not specific to CLOCK_MONOTONIC and the mono name got added to the tk_read_base instance. Lots of trivial churn. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.344679419@infradead.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/vdso.c | 10 +- arch/s390/kernel/time.c | 18 ++-- arch/tile/kernel/time.c | 24 ++--- arch/x86/kernel/vsyscall_gtod.c | 24 ++--- arch/x86/kvm/x86.c | 14 +-- include/linux/timekeeper_internal.h | 12 +-- kernel/time/timekeeping.c | 150 ++++++++++++++-------------- 7 files changed, 126 insertions(+), 126 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 32aeea083d93..ec37ab3f524f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -200,7 +200,7 @@ up_fail: void update_vsyscall(struct timekeeper *tk) { struct timespec xtime_coarse; - u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter"); + u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter"); ++vdso_data->tb_seq_count; smp_wmb(); @@ -213,11 +213,11 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; if (!use_syscall) { - vdso_data->cs_cycle_last = tk->tkr.cycle_last; + vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec; - vdso_data->cs_mult = tk->tkr.mult; - vdso_data->cs_shift = tk->tkr.shift; + vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; + vdso_data->cs_mult = tk->tkr_mono.mult; + vdso_data->cs_shift = tk->tkr_mono.shift; } smp_wmb(); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6c273cd815bb..170ddd2018b3 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -215,20 +215,20 @@ void update_vsyscall(struct timekeeper *tk) { u64 nsecps; - if (tk->tkr.clock != &clocksource_tod) + if (tk->tkr_mono.clock != &clocksource_tod) return; /* Make userspace gettimeofday spin until we're done. 
*/ ++vdso_data->tb_update_count; smp_wmb(); - vdso_data->xtime_tod_stamp = tk->tkr.cycle_last; + vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec; + vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; vdso_data->wtom_clock_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec + - + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift); - nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift; + vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + + + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; while (vdso_data->wtom_clock_nsec >= nsecps) { vdso_data->wtom_clock_nsec -= nsecps; vdso_data->wtom_clock_sec++; @@ -236,7 +236,7 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->xtime_coarse_sec = tk->xtime_sec; vdso_data->xtime_coarse_nsec = - (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); vdso_data->wtom_coarse_sec = vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; vdso_data->wtom_coarse_nsec = @@ -246,8 +246,8 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtom_coarse_sec++; } - vdso_data->tk_mult = tk->tkr.mult; - vdso_data->tk_shift = tk->tkr.shift; + vdso_data->tk_mult = tk->tkr_mono.mult; + vdso_data->tk_shift = tk->tkr_mono.shift; smp_wmb(); ++vdso_data->tb_update_count; } diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index d412b0856c0a..00178ecf9aea 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -257,34 +257,34 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { - if (tk->tkr.clock != &cycle_counter_cs) + if (tk->tkr_mono.clock != &cycle_counter_cs) return; write_seqcount_begin(&vdso_data->tb_seq); - vdso_data->cycle_last = tk->tkr.cycle_last; - vdso_data->mask = tk->tkr.mask; - vdso_data->mult = tk->tkr.mult; - vdso_data->shift = tk->tkr.shift; + vdso_data->cycle_last = tk->tkr_mono.cycle_last; + vdso_data->mask = tk->tkr_mono.mask; + vdso_data->mult = tk->tkr_mono.mult; + vdso_data->shift = tk->tkr_mono.shift; vdso_data->wall_time_sec = tk->xtime_sec; - vdso_data->wall_time_snsec = tk->tkr.xtime_nsec; + vdso_data->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdso_data->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->monotonic_time_snsec = tk->tkr.xtime_nsec + vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr.shift); + << tk->tkr_mono.shift); while (vdso_data->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdso_data->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr.shift; + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdso_data->monotonic_time_sec++; } vdso_data->wall_time_coarse_sec = tk->xtime_sec; - vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> - tk->tkr.shift); + vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdso_data->monotonic_time_coarse_sec = vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index c7d791f32b98..51e330416995 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk) gtod_write_begin(vdata); /* copy 
vsyscall data */ - vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr.cycle_last; - vdata->mask = tk->tkr.mask; - vdata->mult = tk->tkr.mult; - vdata->shift = tk->tkr.shift; + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr.xtime_nsec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr.xtime_nsec + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr.shift); + << tk->tkr_mono.shift); while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr.shift; + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> - tk->tkr.shift); + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..d7a300e0147f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1070,19 +1070,19 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = &pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); + boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr.cycle_last; - vdata->clock.mask = tk->tkr.mask; - vdata->clock.mult = tk->tkr.mult; - vdata->clock.shift = tk->tkr.shift; + vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr_mono.cycle_last; + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr.xtime_nsec; + vdata->nsec_base = tk->tkr_mono.xtime_nsec; write_seqcount_end(&vdata->seq); } diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 05af9a334893..73df17f1535f 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -16,16 +16,16 @@ * @read: Read function of @clock * @mask: Bitmask for two's complement subtraction of non 64bit clocks * @cycle_last: @clock cycle value at last update - * @mult: NTP adjusted multiplier for scaled math conversion + * @mult: (NTP adjusted) multiplier for scaled math conversion * @shift: Shift value for scaled math conversion * @xtime_nsec: Shifted (fractional) nano seconds offset for readout - * @base_mono: ktime_t (nanoseconds) base time for readout + * @base: ktime_t (nanoseconds) base time for readout * * This struct has size 56 byte on 64 bit. Together with a seqcount it * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used - * for a fast NMI safe accessor to clock monotonic. + * for a fast NMI safe accessors. 
*/ struct tk_read_base { struct clocksource *clock; @@ -35,12 +35,12 @@ struct tk_read_base { u32 mult; u32 shift; u64 xtime_nsec; - ktime_t base_mono; + ktime_t base; }; /** * struct timekeeper - Structure holding internal timekeeping values. - * @tkr: The readout base structure + * @tkr_mono: The readout base structure for CLOCK_MONOTONIC * @xtime_sec: Current CLOCK_REALTIME time in seconds * @ktime_sec: Current CLOCK_MONOTONIC time in seconds * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset @@ -76,7 +76,7 @@ struct tk_read_base { * used instead. */ struct timekeeper { - struct tk_read_base tkr; + struct tk_read_base tkr_mono; u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 892f6cbf1e67..1405091f3acb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -68,8 +68,8 @@ bool __read_mostly persistent_clock_exist = false; static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { - tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } } @@ -79,20 +79,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) struct timespec64 ts; ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } @@ -136,8 +136,8 @@ static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { - cycle_t max_cycles = tk->tkr.clock->max_cycles; - const char *name = tk->tkr.clock->name; + cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; if (offset > max_cycles) { printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", @@ -246,11 +246,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->tkr.clock; - tk->tkr.clock = clock; - tk->tkr.read = clock->read; - tk->tkr.mask = clock->mask; - tk->tkr.cycle_last = tk->tkr.read(clock); + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.read = clock->read; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -274,11 +274,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->tkr.xtime_nsec >>= -shift_change; + tk->tkr_mono.xtime_nsec >>= -shift_change; else - tk->tkr.xtime_nsec <<= shift_change; + 
tk->tkr_mono.xtime_nsec <<= shift_change; } - tk->tkr.shift = clock->shift; + tk->tkr_mono.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -289,7 +289,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->tkr.mult = clock->mult; + tk->tkr_mono.mult = clock->mult; tk->ntp_err_mult = 0; } @@ -318,11 +318,11 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; cycle_t delta; s64 nsec; - delta = timekeeping_get_delta(&tk->tkr); + delta = timekeeping_get_delta(&tk->tkr_mono); /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); @@ -428,7 +428,7 @@ u64 notrace ktime_get_mono_fast_ns(void) do { seq = raw_read_seqcount(&tk_fast_mono.seq); tkr = tk_fast_mono.base + (seq & 0x01); - now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); return now; @@ -456,7 +456,7 @@ static cycle_t dummy_clock_read(struct clocksource *cs) static void halt_fast_timekeeper(struct timekeeper *tk) { static struct tk_read_base tkr_dummy; - struct tk_read_base *tkr = &tk->tkr; + struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tkr->read(tkr->clock); @@ -472,8 +472,8 @@ static inline void update_vsyscall(struct timekeeper *tk) xt = timespec64_to_timespec(tk_xtime(tk)); wm = timespec64_to_timespec(tk->wall_to_monotonic); - update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, - tk->tkr.cycle_last); + update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, + tk->tkr_mono.cycle_last); } static inline void old_vsyscall_fixup(struct timekeeper *tk) @@ -490,11 +490,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD * users are removed, this can be killed. */ - remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); - tk->tkr.xtime_nsec -= remainder; - tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; + remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) @@ -559,7 +559,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; - tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ tk->base_raw = timespec64_to_ktime(tk->raw_time); @@ -569,7 +569,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. 
*/ - nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; @@ -592,7 +592,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - update_fast_timekeeper(&tk->tkr); + update_fast_timekeeper(&tk->tkr_mono); } /** @@ -604,18 +604,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; s64 nsec; - cycle_now = tk->tkr.read(clock); - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); - tk->tkr.cycle_last = cycle_now; + cycle_now = tk->tkr_mono.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; - tk->tkr.xtime_nsec += delta * tk->tkr.mult; + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; tk_normalize_xtime(tk); @@ -640,7 +640,7 @@ int __getnstimeofday64(struct timespec64 *ts) seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(&tk->tkr); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -680,8 +680,8 @@ ktime_t ktime_get(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -706,8 +706,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); - base = ktime_add(tk->tkr.base_mono, *offset); - nsecs = timekeeping_get_ns(&tk->tkr); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -777,7 +777,7 @@ void ktime_get_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr); + nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -863,7 +863,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(&tk->tkr); + nsecs_real = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1046,7 +1046,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->tkr.clock; + old = tk->tkr_mono.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -1074,11 +1074,11 @@ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; - if (tk->tkr.clock == clock) + if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->tkr.clock == clock ? 0 : -1; + return tk->tkr_mono.clock == clock ? 
0 : -1; } /** @@ -1119,7 +1119,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1138,7 +1138,7 @@ u64 timekeeping_max_deferment(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->max_idle_ns; + ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1303,7 +1303,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; struct timespec tmp; @@ -1331,16 +1331,16 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = tk->tkr.read(clock); + cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > tk->tkr.cycle_last) { + cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, - tk->tkr.mask); + cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1366,7 +1366,7 @@ void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->tkr.cycle_last = cycle_now; + tk->tkr_mono.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1519,15 +1519,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * * XXX - TODO: Doc ntp_error calculation. */ - if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } - tk->tkr.mult += mult_adj; + tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; - tk->tkr.xtime_nsec -= offset; + tk->tkr_mono.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } @@ -1589,13 +1589,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->ntp_err_mult = 0; } - if (unlikely(tk->tkr.clock->maxadj && - (abs(tk->tkr.mult - tk->tkr.clock->mult) - > tk->tkr.clock->maxadj))) { + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->tkr.clock->name, (long)tk->tkr.mult, - (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* @@ -1612,9 +1612,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
*/ - if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr.xtime_nsec; - tk->tkr.xtime_nsec = 0; + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr_mono.xtime_nsec; + tk->tkr_mono.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } } @@ -1629,13 +1629,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; - while (tk->tkr.xtime_nsec >= nsecps) { + while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; - tk->tkr.xtime_nsec -= nsecps; + tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ @@ -1680,9 +1680,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->tkr.cycle_last += interval; + tk->tkr_mono.cycle_last += interval; - tk->tkr.xtime_nsec += tk->xtime_interval << shift; + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1725,8 +1725,8 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), - tk->tkr.cycle_last, tk->tkr.mask); + offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif /* Check if there's really nothing to do */ @@ -1890,8 +1890,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + base = tk->tkr_mono.base; + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; @@ -1922,8 +1922,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; From 4a4ad80d32cea69ee93bd4589f24dc478804cd80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:28:44 +0100 Subject: [PATCH 94/98] time: Add timekeeper::tkr_raw Introduce tkr_raw and make use of it. base_raw -> tkr_raw.base clock->{mult,shift} -> tkr_raw.{mult,shift} Kill timekeeping_get_ns_raw() in favour of timekeeping_get_ns(&tkr_raw); this removes all mono_raw special casing. Duplicate the updates to tkr_mono.cycle_last into tkr_raw.cycle_last; both need the same value. Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.422589590@infradead.org Signed-off-by: Ingo Molnar --- include/linux/timekeeper_internal.h | 4 +-- kernel/time/timekeeping.c | 41 +++++++++++++---------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 73df17f1535f..fb86963859c7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -41,6 +41,7 @@ struct tk_read_base { /** * struct timekeeper - Structure holding internal timekeeping values.
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC + * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @xtime_sec: Current CLOCK_REALTIME time in seconds * @ktime_sec: Current CLOCK_MONOTONIC time in seconds * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset @@ -48,7 +49,6 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds - * @base_raw: Monotonic raw base time in ktime_t format * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -77,6 +77,7 @@ struct tk_read_base { */ struct timekeeper { struct tk_read_base tkr_mono; + struct tk_read_base tkr_raw; u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; @@ -84,7 +85,6 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; - ktime_t base_raw; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1405091f3acb..cbb612ee813f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -252,6 +252,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_raw.clock = clock; + tk->tkr_raw.read = clock->read; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; + /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; @@ -278,7 +283,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) else tk->tkr_mono.xtime_nsec <<= shift_change; } + tk->tkr_raw.xtime_nsec = 0; + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -290,6 +298,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; } @@ -316,21 +325,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) -{ - struct clocksource *clock = tk->tkr_mono.clock; - cycle_t delta; - s64 nsec; - - delta = timekeeping_get_delta(&tk->tkr_mono); - - /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); - - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 
* @tkr: Timekeeping readout base from which we take the update @@ -562,7 +556,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ - tk->base_raw = timespec64_to_ktime(tk->raw_time); + tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); /* * The sum of the nanoseconds portions of xtime and @@ -611,6 +605,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; @@ -619,7 +614,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); timespec64_add_ns(&tk->raw_time, nsec); } @@ -748,8 +743,8 @@ ktime_t ktime_get_raw(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_raw; - nsecs = timekeeping_get_ns_raw(tk); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -862,7 +857,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); nsecs_real = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1096,7 +1091,7 @@ void getrawmonotonic64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - nsecs = timekeeping_get_ns_raw(tk); + nsecs = timekeeping_get_ns(&tk->tkr_raw); ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1217,7 +1212,6 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; - tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -1367,6 +1361,8 @@ void timekeeping_resume(void) /* Re-base the last cycle value */ tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1681,6 +1677,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); From 4498e7467e9e441c18ca12f1ca08460356e0508a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:36:19 +0100 Subject: [PATCH 95/98] time: Parametrize all tk_fast_mono users In preparation for more tk_fast instances, remove all hard-coded tk_fast_mono references. 
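For reference, the latch scheme these helpers rely on can be sketched stand-alone roughly as below. This is only an illustration of the idiom with made-up demo_* names (memory-ordering subtleties glossed over), not the kernel implementation; the relevant point is that the writer and readers now take the fast-timekeeper instance as a parameter.

  #include <stdatomic.h>
  #include <string.h>

  /* Two copies of the readout data; readers pick one via the seq low bit. */
  struct demo_base { unsigned long long base_ns; };

  struct demo_fast {
          atomic_uint      seq;
          struct demo_base base[2];
  };

  static void demo_update(struct demo_fast *tkf, const struct demo_base *src)
  {
          /* seq goes odd: readers are forced off to base[1] */
          atomic_fetch_add_explicit(&tkf->seq, 1, memory_order_release);
          memcpy(&tkf->base[0], src, sizeof(*src));
          /* seq goes even again: readers come back to base[0] */
          atomic_fetch_add_explicit(&tkf->seq, 1, memory_order_release);
          memcpy(&tkf->base[1], src, sizeof(*src));
  }

  static unsigned long long demo_read(struct demo_fast *tkf)
  {
          unsigned int seq;
          unsigned long long ns;

          do {
                  seq = atomic_load_explicit(&tkf->seq, memory_order_acquire);
                  ns  = tkf->base[seq & 1].base_ns;
          } while (seq != atomic_load_explicit(&tkf->seq, memory_order_acquire));

          return ns;
  }

With the instance passed in explicitly, the same update/read paths can later serve additional tk_fast structures without further surgery.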
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.484279927@infradead.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbb612ee813f..278373edb472 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -364,18 +364,18 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ -static void update_fast_timekeeper(struct tk_read_base *tkr) +static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) { - struct tk_read_base *base = tk_fast_mono.base; + struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); @@ -413,20 +413,25 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 notrace ktime_get_mono_fast_ns(void) +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { - seq = raw_read_seqcount(&tk_fast_mono.seq); - tkr = tk_fast_mono.base + (seq & 0x01); + seq = raw_read_seqcount(&tkf->seq); + tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + } while (read_seqcount_retry(&tkf->seq, seq)); - } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); return now; } + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /* Suspend-time cycles value for halted fast timekeeper. */ @@ -455,7 +460,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tkr->read(tkr->clock); tkr_dummy.read = dummy_clock_read; - update_fast_timekeeper(&tkr_dummy); + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD @@ -586,7 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - update_fast_timekeeper(&tk->tkr_mono); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); } /** From f09cb9a1808e35ad7502ea39b6bfb443c7fa0f19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:39:08 +0100 Subject: [PATCH 96/98] time: Introduce tk_fast_raw Add the NMI safe CLOCK_MONOTONIC_RAW accessor.. 
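A caller of the new accessor might then look roughly like the sketch below; only ktime_get_mono_fast_ns() and ktime_get_raw_fast_ns() are real interfaces here, the demo_* names are made up for illustration.

  #include <linux/timekeeping.h>
  #include <linux/types.h>

  /* Hypothetical example: stamp a record in both fast, NMI-safe time bases. */
  struct demo_stamp {
          u64 mono_ns;
          u64 raw_ns;
  };

  static void demo_take_stamp(struct demo_stamp *s)
  {
          /* Both calls are usable even from NMI context, e.g. a PMU overflow handler. */
          s->mono_ns = ktime_get_mono_fast_ns();
          s->raw_ns  = ktime_get_raw_fast_ns();
  }

The per-event clock support added later in this series is one such user of the raw accessor.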
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.562746929@infradead.org Signed-off-by: Ingo Molnar --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3eaae4754275..f36b1edf3f73 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -220,6 +220,7 @@ static inline u64 ktime_get_raw_ns(void) } extern u64 ktime_get_mono_fast_ns(void); +extern u64 ktime_get_raw_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 278373edb472..c3fcff06d30a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -59,6 +59,7 @@ struct tk_fast { }; static struct tk_fast tk_fast_mono ____cacheline_aligned; +static struct tk_fast tk_fast_raw ____cacheline_aligned; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -434,6 +435,12 @@ u64 ktime_get_mono_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; @@ -461,6 +468,11 @@ static void halt_fast_timekeeper(struct timekeeper *tk) cycles_at_suspend = tkr->read(tkr->clock); tkr_dummy.read = dummy_clock_read; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD @@ -592,6 +604,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) sizeof(tk_core.timekeeper)); update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** From fe5fba05b46c791c95a9f34228ac495f81f72fc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Mar 2015 12:39:10 +0100 Subject: [PATCH 97/98] time: Add ktime_get_tai_ns() Because it was the only clock for which we didn't have a _ns() accessor yet. Signed-off-by: Peter Zijlstra (Intel) Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/timekeeping.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index f36b1edf3f73..5047b83483d6 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -214,6 +214,11 @@ static inline u64 ktime_get_boot_ns(void) return ktime_to_ns(ktime_get_boottime()); } +static inline u64 ktime_get_tai_ns(void) +{ + return ktime_to_ns(ktime_get_clocktai()); +} + static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: [PATCH 98/98] perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. 
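Very roughly, both uses currently funnel through the same local_clock() based helper. The sketch below is a simplified stand-alone illustration, not the actual core.c code; the demo_* names are made up, while local_clock() and the perf_sample_data time field are real.

  #include <linux/perf_event.h>
  #include <linux/sched.h>

  /* One global time base feeds both uses today. */
  static inline u64 demo_perf_clock(void)
  {
          return local_clock();
  }

  /* use 1: relative bookkeeping, only the deltas matter */
  static void demo_account_enabled(u64 *enabled, u64 start)
  {
          *enabled += demo_perf_clock() - start;
  }

  /* use 2: absolute timestamps that end up in the data records */
  static void demo_stamp_sample(struct perf_sample_data *data)
  {
          data->time = demo_perf_clock();
  }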
The first is all about tracking time deltas; nobody should really care in what time base that happens, it's all relative information, and as long as it's internally consistent it works. The second, however, is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.: MONOTONIC is good for correlating between machines (static offset), while MONOTONIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events, which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter; perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock(), which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++- include/linux/perf_event.h | 2 + include/uapi/linux/perf_event.h | 6 +-- kernel/events/core.c | 77 ++++++++++++++++++++++++++++++-- 4 files changed, 91 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records.
+ */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b16eac5f54ce..401554074de9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -173,6 +173,7 @@ struct perf_event; * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 +#define PERF_PMU_CAP_NO_NMI 0x02 /** * struct pmu - generic performance monitoring unit @@ -457,6 +458,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3bb40ddadbe5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -326,7 +326,8 @@ struct perf_event_attr { exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ - __reserved_1 : 39; + use_clockid : 1, /* use @clockid for time fields */ + __reserved_1 : 38; union { __u32 wakeup_events; /* wakeup every n events */ @@ -355,8 +356,7 @@ struct perf_event_attr { */ __u32 sample_stack_user; - /* Align to u64. */ - __u32 __reserved_2; + __s32 clockid; /* * Defines set of regs to dump for each sample * state captured on: diff --git a/kernel/events/core.c b/kernel/events/core.c index bb1a7c36e794..c40c2cac2d8e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -327,6 +327,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct 
perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. + */ + if (output_event->clock != event->clock) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: