diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
index 8e201219f525..b38cc0d0c71a 100644
--- a/arch/i386/kernel/timers/common.c
+++ b/arch/i386/kernel/timers/common.c
@@ -139,6 +139,15 @@ bad_calibration:
 }
 #endif
 
+
+unsigned long read_timer_tsc(void)
+{
+	unsigned long retval;
+	rdtscl(retval);
+	return retval;
+}
+
+
 /* calculate cpu_khz */
 void init_cpu_khz(void)
 {
diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c
index a3d6a288088b..7e39ed8e33f8 100644
--- a/arch/i386/kernel/timers/timer.c
+++ b/arch/i386/kernel/timers/timer.c
@@ -64,3 +64,12 @@ struct timer_opts* __init select_timer(void)
 	panic("select_timer: Cannot find a suitable timer\n");
 	return NULL;
 }
+
+int read_current_timer(unsigned long *timer_val)
+{
+	if (cur_timer->read_timer) {
+		*timer_val = cur_timer->read_timer();
+		return 0;
+	}
+	return -1;
+}
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index f778f471a09a..15a7d727bd6f 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -186,6 +186,7 @@ static struct timer_opts timer_hpet = {
 	.get_offset =		get_offset_hpet,
 	.monotonic_clock =	monotonic_clock_hpet,
 	.delay = 		delay_hpet,
+	.read_timer = 		read_timer_tsc,
 };
 
 struct init_timer_opts __initdata timer_hpet_init = {
diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
index d77f22030fe6..4ef20e663498 100644
--- a/arch/i386/kernel/timers/timer_pm.c
+++ b/arch/i386/kernel/timers/timer_pm.c
@@ -246,6 +246,7 @@ static struct timer_opts timer_pmtmr = {
 	.get_offset		= get_offset_pmtmr,
 	.monotonic_clock 	= monotonic_clock_pmtmr,
 	.delay 			= delay_pmtmr,
+	.read_timer 		= read_timer_tsc,
 };
 
 struct init_timer_opts __initdata timer_pmtmr_init = {
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
index 180444d87824..27f08ae9120b 100644
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -572,6 +572,7 @@ static struct timer_opts timer_tsc = {
 	.get_offset = get_offset_tsc,
 	.monotonic_clock = monotonic_clock_tsc,
 	.delay = delay_tsc,
+	.read_timer = read_timer_tsc,
 };
 
 struct init_timer_opts __initdata timer_tsc_init = {
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index aed61a668a1b..33a873a3c223 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <asm/delay.h>
+#include <asm/msr.h>
 
 #ifdef CONFIG_SMP
 #include <asm/smp.h>
@@ -19,6 +20,12 @@
 
 int x86_udelay_tsc = 0;		/* Delay via TSC */
 
+int read_current_timer(unsigned long *timer_value)
+{
+	rdtscll(*timer_value);
+	return 0;
+}
+
 void __delay(unsigned long loops)
 {
 	unsigned bclock, now;
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index c34709849839..dcf1e07db08a 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -22,6 +22,7 @@ struct timer_opts {
 	unsigned long (*get_offset)(void);
 	unsigned long long (*monotonic_clock)(void);
 	void (*delay)(unsigned long);
+	unsigned long (*read_timer)(void);
 };
 
 struct init_timer_opts {
@@ -52,6 +53,7 @@ extern struct init_timer_opts timer_cyclone_init;
 #endif
 
 extern unsigned long calibrate_tsc(void);
+extern unsigned long read_timer_tsc(void);
 extern void init_cpu_khz(void);
 extern int recalibrate_cpu_khz(void);
 #ifdef CONFIG_HPET_TIMER
diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h
index b41e484c3445..e370f907bd39 100644
--- a/include/asm-i386/timex.h
+++ b/include/asm-i386/timex.h
@@ -49,4 +49,7 @@ static inline cycles_t get_cycles (void)
 
 extern unsigned long cpu_khz;
 
+extern int read_current_timer(unsigned long *timer_value);
+#define ARCH_HAS_READ_CURRENT_TIMER	1
+
 #endif
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index 34f31a18f90b..24ecf6a637cb 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -26,6 +26,9 @@ static inline cycles_t get_cycles (void)
 
 extern unsigned int cpu_khz;
 
+extern int read_current_timer(unsigned long *timer_value);
+#define ARCH_HAS_READ_CURRENT_TIMER	1
+
 extern struct vxtime_data vxtime;
 
 #endif
diff --git a/init/calibrate.c b/init/calibrate.c
index c698e04a3dbe..d206c7548fe6 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -8,6 +8,8 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 
+#include <asm/timex.h>
+
 static unsigned long preset_lpj;
 static int __init lpj_setup(char *str)
 {
@@ -17,6 +19,92 @@ static int __init lpj_setup(char *str)
 
 __setup("lpj=", lpj_setup);
 
+#ifdef ARCH_HAS_READ_CURRENT_TIMER
+
+/* This routine uses the read_current_timer() routine and gets the
+ * loops per jiffy directly, instead of guessing it using delay().
+ * Also, this code tries to handle non-maskable asynchronous events
+ * (like SMIs)
+ */
+#define DELAY_CALIBRATION_TICKS			((HZ < 100) ? 1 : (HZ/100))
+#define MAX_DIRECT_CALIBRATION_RETRIES		5
+
+static unsigned long __devinit calibrate_delay_direct(void)
+{
+	unsigned long pre_start, start, post_start;
+	unsigned long pre_end, end, post_end;
+	unsigned long start_jiffies;
+	unsigned long tsc_rate_min, tsc_rate_max;
+	unsigned long good_tsc_sum = 0;
+	unsigned long good_tsc_count = 0;
+	int i;
+
+	if (read_current_timer(&pre_start) < 0 )
+		return 0;
+
+	/*
+	 * A simple loop like
+	 *	while ( jiffies < start_jiffies+1)
+	 *		start = read_current_timer();
+	 * will not do. As we don't really know whether jiffy switch
+	 * happened first or timer_value was read first. And some asynchronous
+	 * event can happen between these two events introducing errors in lpj.
+	 *
+	 * So, we do
+	 * 1. pre_start <- When we are sure that jiffy switch hasn't happened
+	 * 2. check jiffy switch
+	 * 3. start <- timer value before or after jiffy switch
+	 * 4. post_start <- When we are sure that jiffy switch has happened
+	 *
+	 * Note, we don't know anything about order of 2 and 3.
+	 * Now, by looking at post_start and pre_start difference, we can
+	 * check whether any asynchronous event happened or not
+	 */
+
+	for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) {
+		pre_start = 0;
+		read_current_timer(&start);
+		start_jiffies = jiffies;
+		while (jiffies <= (start_jiffies + 1)) {
+			pre_start = start;
+			read_current_timer(&start);
+		}
+		read_current_timer(&post_start);
+
+		pre_end = 0;
+		end = post_start;
+		while (jiffies <=
+		       (start_jiffies + 1 + DELAY_CALIBRATION_TICKS)) {
+			pre_end = end;
+			read_current_timer(&end);
+		}
+		read_current_timer(&post_end);
+
+		tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS;
+		tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS;
+
+		/*
+	 	 * If the upper limit and lower limit of the tsc_rate is
+		 * >= 12.5% apart, redo calibration.
+		 */
+		if (pre_start != 0 && pre_end != 0 &&
+		    (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) {
+			good_tsc_count++;
+			good_tsc_sum += tsc_rate_max;
+		}
+	}
+
+	if (good_tsc_count)
+		return (good_tsc_sum/good_tsc_count);
+
+	printk(KERN_WARNING "calibrate_delay_direct() failed to get a good "
+	       "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n");
+	return 0;
+}
+#else
+static unsigned long __devinit calibrate_delay_direct(void) {return 0;}
+#endif
+
 /*
  * This is the number of bits of precision for the loops_per_jiffy.  Each
  * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
@@ -35,6 +123,12 @@ void __devinit calibrate_delay(void)
 			"%lu.%02lu BogoMIPS preset\n",
 			loops_per_jiffy/(500000/HZ),
 			(loops_per_jiffy/(5000/HZ)) % 100);
+	} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
+		printk("Calibrating delay using timer specific routine.. ");
+		printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
+			loops_per_jiffy/(500000/HZ),
+			(loops_per_jiffy/(5000/HZ)) % 100,
+			loops_per_jiffy);
 	} else {
 		loops_per_jiffy = (1<<12);